Skip to content
Snippets Groups Projects
Commit 0e3e1041 authored by Amandine  PERRIN's avatar Amandine PERRIN
Browse files

Add script to extract dataset, and description of dataset

parents
No related branches found
No related tags found
No related merge requests found
SAEN1
SAEN.1116.00001.i001_01127 1
SAEN.1116.00001.i001_04078 4
SAEN.1116.00001.i001_00297 5
SAEN.1116.00001.i001_02366 6
SAEN.1116.00001.i001_00685 7
SAEN.1116.00001.i001_02115 10
SAEN.1116.00001.i001_03203 11
SAEN.1116.00001.i001_00953 12
SAEN.1116.00001.i001_02734 13
SAEN.1116.00001.i001_03171 14
SAEN.1116.00001.i001_02692 15
SAEN.1116.00001.i001_02692 15
SAEN2
SAEN.1116.00002.i001_01841 1
SAEN.1116.00002.i001_04441 2
SAEN.1116.00002.i001_03624 4
SAEN.1116.00002.i001_00528 6
SAEN.1116.00002.i001_02223 7
SAEN.1116.00002.i001_01619 8
SAEN.1116.00002.i001_00785 10
SAEN.1116.00002.i001_03143 11
SAEN.1116.00002.i001_02028 12
SAEN.1116.00002.i001_03474 13
SAEN.1116.00002.i001_04402 13
SAEN.1116.00002.i001_03107 14
SAEN.1116.00002.i001_02668 15
SAEN3
SAEN.1116.00003.i001_01139 1
SAEN.1116.00003.i001_04563 3
SAEN.1116.00003.i001_04145 4
SAEN.1116.00003.i001_02469 6
SAEN.1116.00003.i001_01791 8
SAEN.1116.00003.i002_04824 9
SAEN.1116.00003.i001_02204 10
SAEN.1116.00003.i001_03278 11
SAEN.1116.00003.i001_00967 12
SAEN.1116.00003.i001_00967 12
SAEN.1116.00003.i001_03244 14
SAEN.1116.00003.i001_02816 15
SAEN4
SAEN.1116.00004.i001_00986 1
SAEN.1116.00004.i001_00987 16
SAEN.1116.00004.i001_03943 4
SAEN.1116.00004.i001_02373 6
SAEN.1116.00004.i001_02131 10
SAEN.1116.00004.i001_03089 11
SAEN.1116.00004.i001_00861 12
SAEN.1116.00004.i001_03057 14
SAEN.1116.00004.i001_02647 15
#!/usr/bin/env python3
# coding: utf-8
import sys
def _write_record(outf, header, chunks, copies):
    """Write one fasta record (header line + one-line sequence) `copies` times."""
    sequence = "".join(chunks)
    for _ in range(copies):
        outf.write(">" + header + "\n")
        outf.write(sequence + "\n")


def main(infile, outfile, toextract):
    """
    Extract selected sequences from a fasta file.

    infile: original file, with all sequences
    outfile: file where extracted sequences must be saved
    toextract: path to a file listing the headers to extract, one per line.
        Only the first whitespace-separated field of each line is used, and
        blank lines are skipped. A header listed N times is written N times.

    Each extracted record is written as '>' + the first word of its original
    header line, followed by its whole sequence joined on a single line.
    """
    # Map each requested header to the number of times it was requested.
    to_extract = {}
    with open(toextract, "r") as toef:
        for line in toef:
            fields = line.split()
            if not fields:
                # Blank line: nothing to request (would otherwise IndexError).
                continue
            header = fields[0]
            to_extract[header] = to_extract.get(header, 0) + 1

    with open(infile, 'r') as inf, open(outfile, 'w') as outf:
        # Header of the record being collected ("" when not collecting),
        # and the stripped sequence lines seen so far for it.
        cur_head = ""
        cur_cont = []
        for inline in inf:
            if inline.startswith(">"):
                # A new record starts: write out the previous one if it was
                # requested.
                if cur_head != "":
                    _write_record(outf, cur_head, cur_cont,
                                  to_extract[cur_head])
                cur_head = ""
                cur_cont = []
                # Keep only the first word of the header line, without the
                # leading '>'.
                header = inline.split()[0][1:]
                if header in to_extract:
                    cur_head = header
            elif cur_head != "":
                cur_cont.append(inline.strip())
        # The last record has no following header line to trigger its
        # output, so it must be written here.
        if cur_head != "":
            _write_record(outf, cur_head, cur_cont, to_extract[cur_head])
if __name__ == '__main__':
    # Exactly three positional arguments are expected after the script name:
    # the input fasta file, the output file, and the file listing the headers.
    cli_args = sys.argv[1:]
    if len(cli_args) != 3:
        print("provide infile outfile extract")
        sys.exit(1)
    infile, outfile, extract = cli_args
    main(infile, outfile, extract)
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment