Add script to extract dataset, and description of dataset

0e3e1041 · Amandine PERRIN · 0e3e1041 · 0e3e1041
Commit 0e3e1041 authored 7 years ago by Amandine PERRIN
--- a/construction_test_datasets/dataset.txt
+++ b/construction_test_datasets/dataset.txt
+SAEN1
+SAEN.1116.00001.i001_01127 1
+SAEN.1116.00001.i001_04078 4
+SAEN.1116.00001.i001_00297 5
+SAEN.1116.00001.i001_02366 6
+SAEN.1116.00001.i001_00685 7 
+SAEN.1116.00001.i001_02115 10
+SAEN.1116.00001.i001_03203 11
+SAEN.1116.00001.i001_00953 12
+SAEN.1116.00001.i001_02734 13
+SAEN.1116.00001.i001_03171 14
+SAEN.1116.00001.i001_02692 15
+SAEN.1116.00001.i001_02692 15
+
+SAEN2
+SAEN.1116.00002.i001_01841 1
+SAEN.1116.00002.i001_04441 2
+SAEN.1116.00002.i001_03624 4
+SAEN.1116.00002.i001_00528 6
+SAEN.1116.00002.i001_02223 7
+SAEN.1116.00002.i001_01619 8
+SAEN.1116.00002.i001_00785 10
+SAEN.1116.00002.i001_03143 11
+SAEN.1116.00002.i001_02028 12
+SAEN.1116.00002.i001_03474 13
+SAEN.1116.00002.i001_04402 13
+SAEN.1116.00002.i001_03107 14
+SAEN.1116.00002.i001_02668 15
+
+SAEN3
+SAEN.1116.00003.i001_01139 1
+SAEN.1116.00003.i001_04563 3
+SAEN.1116.00003.i001_04145 4
+SAEN.1116.00003.i001_02469 6
+SAEN.1116.00003.i001_01791 8
+SAEN.1116.00003.i002_04824 9
+SAEN.1116.00003.i001_02204 10
+SAEN.1116.00003.i001_03278 11
+SAEN.1116.00003.i001_00967 12
+SAEN.1116.00003.i001_00967 12
+SAEN.1116.00003.i001_03244 14
+SAEN.1116.00003.i001_02816 15
+
+SAEN4
+SAEN.1116.00004.i001_00986 1
+SAEN.1116.00004.i001_00987 16
+SAEN.1116.00004.i001_03943 4
+SAEN.1116.00004.i001_02373 6
+SAEN.1116.00004.i001_02131 10
+SAEN.1116.00004.i001_03089 11
+SAEN.1116.00004.i001_00861 12
+SAEN.1116.00004.i001_03057 14
+SAEN.1116.00004.i001_02647 15
--- a/construction_test_datasets/get_seq.py
+++ b/construction_test_datasets/get_seq.py
+#!/usr/bin/env python3
+# coding: utf-8
+
+import sys
+
def main(infile, outfile, toextract):
    """
    Copy selected sequences from a sequence file into a new file.

    infile: original file, with all sequences. A line starting with '>' opens
        a new record; the record name is the first whitespace-separated field
        of that line without its leading '>'. All following lines, up to the
        next '>' line, are the record's sequence.
    outfile: file where extracted sequences must be saved (overwritten).
    toextract: file listing the headers to extract, one per line. Only the
        first whitespace-separated field of each line is used; blank lines
        are ignored. A header listed k times is written k times.

    Each extracted record is written as '>' + name on one line, followed by
    its sequence joined on a single line.
    """
    # Map each requested header to the number of times it was requested:
    # duplicates in the list are reproduced in the output.
    wanted = {}
    with open(toextract, "r") as toef:
        for line in toef:
            fields = line.split()
            if not fields:
                # Blank separator line: indexing fields[0] would fail here.
                continue
            wanted[fields[0]] = wanted.get(fields[0], 0) + 1

    def write_record(outf, head, chunks):
        """Write one record as many times as its header was requested."""
        sequence = "".join(chunks)
        for _ in range(wanted[head]):
            outf.write(">" + head + "\n")
            outf.write(sequence + "\n")

    with open(infile, 'r') as inf, open(outfile, 'w') as outf:
        cur_head = None   # name of the record being collected, if requested
        cur_chunks = []   # sequence lines of that record, stripped
        for inline in inf:
            if inline.startswith(">"):
                # A new record starts: emit the previous one if it was wanted.
                if cur_head is not None:
                    write_record(outf, cur_head, cur_chunks)
                # Keep only the first field and drop the leading '>'.
                header = inline.split()[0][1:]
                cur_head = header if header in wanted else None
                cur_chunks = []
            elif cur_head is not None:
                cur_chunks.append(inline.strip())
        # The last record has no following '>' line to trigger its output,
        # so it must be written once the input is exhausted.
        if cur_head is not None:
            write_record(outf, cur_head, cur_chunks)
+
+
if __name__ == '__main__':
    # Usage: get_seq.py <infile> <outfile> <extract>
    # sys.argv[0] is the script name, hence 4 entries for 3 arguments.
    if len(sys.argv) == 4:
        infile, outfile, extract = sys.argv[1:]
        main(infile, outfile, extract)
    else:
        print("provide infile outfile extract")
        sys.exit(1)
\ No newline at end of file