Commit 0e3e1041 authored by Amandine PERRIN
Browse files

Add script to extract dataset, and description of dataset

parents
SAEN1
SAEN.1116.00001.i001_01127 1
SAEN.1116.00001.i001_04078 4
SAEN.1116.00001.i001_00297 5
SAEN.1116.00001.i001_02366 6
SAEN.1116.00001.i001_00685 7
SAEN.1116.00001.i001_02115 10
SAEN.1116.00001.i001_03203 11
SAEN.1116.00001.i001_00953 12
SAEN.1116.00001.i001_02734 13
SAEN.1116.00001.i001_03171 14
SAEN.1116.00001.i001_02692 15
SAEN.1116.00001.i001_02692 15
SAEN2
SAEN.1116.00002.i001_01841 1
SAEN.1116.00002.i001_04441 2
SAEN.1116.00002.i001_03624 4
SAEN.1116.00002.i001_00528 6
SAEN.1116.00002.i001_02223 7
SAEN.1116.00002.i001_01619 8
SAEN.1116.00002.i001_00785 10
SAEN.1116.00002.i001_03143 11
SAEN.1116.00002.i001_02028 12
SAEN.1116.00002.i001_03474 13
SAEN.1116.00002.i001_04402 13
SAEN.1116.00002.i001_03107 14
SAEN.1116.00002.i001_02668 15
SAEN3
SAEN.1116.00003.i001_01139 1
SAEN.1116.00003.i001_04563 3
SAEN.1116.00003.i001_04145 4
SAEN.1116.00003.i001_02469 6
SAEN.1116.00003.i001_01791 8
SAEN.1116.00003.i002_04824 9
SAEN.1116.00003.i001_02204 10
SAEN.1116.00003.i001_03278 11
SAEN.1116.00003.i001_00967 12
SAEN.1116.00003.i001_00967 12
SAEN.1116.00003.i001_03244 14
SAEN.1116.00003.i001_02816 15
SAEN4
SAEN.1116.00004.i001_00986 1
SAEN.1116.00004.i001_00987 16
SAEN.1116.00004.i001_03943 4
SAEN.1116.00004.i001_02373 6
SAEN.1116.00004.i001_02131 10
SAEN.1116.00004.i001_03089 11
SAEN.1116.00004.i001_00861 12
SAEN.1116.00004.i001_03057 14
SAEN.1116.00004.i001_02647 15
#!/usr/bin/env python3
# coding: utf-8
import sys
def main(infile, outfile, toextract):
    """
    Extract selected sequences from a fasta file.

    infile: original fasta file, with all sequences
    outfile: file where extracted sequences must be saved
    toextract: path to a text file listing the headers to extract, one per
        line. Only the first whitespace-separated field of each line is used
        as header; blank lines are ignored. A header listed k times is
        written k times to outfile.

    Each extracted sequence is written as '>header' followed by its sequence
    on a single line. Only the first word of a fasta header line (without
    the leading '>') is compared to the requested headers.
    """
    # header -> number of copies to write
    to_extract = {}
    with open(toextract, "r") as toef:
        for line in toef:
            fields = line.split()
            if not fields:
                # blank line: nothing to extract (used to raise IndexError)
                continue
            to_extract[fields[0]] = to_extract.get(fields[0], 0) + 1

    with open(infile, 'r') as inf, open(outfile, 'w') as outf:

        def flush(head, chunks):
            # Write the record currently being collected, if any,
            # as many times as its header was requested.
            if head is None:
                return
            record = ">" + head + "\n" + "".join(chunks) + "\n"
            outf.write(record * to_extract[head])

        cur_head = None   # header of the record being collected, or None
        cur_cont = []     # sequence lines of that record, already stripped
        for inline in inf:
            if inline.startswith(">"):
                # A new record starts: save the previous one if it was wanted
                flush(cur_head, cur_cont)
                # first word of the header line, without the leading '>'
                header = inline.split()[0][1:]
                cur_head = header if header in to_extract else None
                cur_cont = []
            elif cur_head is not None:
                cur_cont.append(inline.strip())
        # The last record of the file is not followed by any '>' line,
        # so it must be saved explicitly once the input is exhausted.
        flush(cur_head, cur_cont)
if __name__ == '__main__':
    # Script usage: <script> infile outfile extract
    # i.e. exactly three arguments after the script name.
    if len(sys.argv) == 4:
        infile, outfile, extract = sys.argv[1:]
        main(infile, outfile, extract)
    else:
        print("provide infile outfile extract")
        sys.exit(1)
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment