# Unified diff against XML_Subsetter/extract_uniprot_XML.py.
#
# Summary of the hunks below:
#   * Drops the `import sys` and `import getopt` lines (and the blank line after them).
#   * FASTA branch: after taking the accession from the second "|"-separated
#     field of a ">" header line, keeps only the text before the first "-"
#     and prints the resulting accession before appending it to l_acc.
#   * After the accession list is built, removes duplicate accessions while
#     keeping first-seen order (list(dict.fromkeys(l_acc))).
#   * XML scan loop: every tag test now uses line.strip().startswith(...)
#     instead of line.startswith(...), so a tag preceded by whitespace on its
#     line is still recognised. The unstripped line is still what gets stored
#     in temp_entry and passed to re.split.
#
# NOTE(review): `if acc.find("-"):` is a truthiness test on str.find, which
#   returns -1 (truthy) when "-" is absent and 0 (falsy) when "-" is the first
#   character, so it is not a "contains" check. Presumably `"-" in acc` (or an
#   unconditional split) was intended -- TODO confirm.
# NOTE(review): the unchanged context line `if line.find("</sequence>"):` has
#   the same pattern; it is true whenever "</sequence>" is missing from the
#   line -- verify that is the intended condition.
# NOTE(review): the added `print(acc)` emits one line per FASTA header; it
#   looks like leftover debug output -- confirm it is meant to ship.
diff --git a/XML_Subsetter/extract_uniprot_XML.py b/XML_Subsetter/extract_uniprot_XML.py index 801b995c0b795085ddbfd0e35c020b23a53f41bb..53b3c6b3ccdcc66cd6bb7413d096df0bdf0dcf28 100644 --- a/XML_Subsetter/extract_uniprot_XML.py +++ b/XML_Subsetter/extract_uniprot_XML.py @@ -47,9 +47,6 @@ import re -import sys -import getopt - def define_db(database): @@ -107,6 +104,9 @@ if isFasta: for line in lines: if line.startswith(">"): acc=line.split("|")[1] + if acc.find("-"): + acc=acc.split("-")[0] + print(acc) l_acc.append(acc) print("\n") print("Number of sequence in fasta file: "+str(len(l_acc))+"\n") @@ -116,6 +116,8 @@ else: print("\n") print("Number of accessions in file: "+str(len(l_acc))+"\n") +l_acc = list(dict.fromkeys(l_acc)) + xmlo=open(options.output_name+".xml","w") @@ -130,7 +132,7 @@ flag_entry=0 temp_entry=[] toprint=1 for line in xml_lines: - if line.startswith("<entry"): + if line.strip().startswith("<entry"): acc=[] seq=[] name="" @@ -139,21 +141,21 @@ for line in xml_lines: temp_entry=[line] flag_entry=1 toprint=1 - elif line.startswith("<accession"): + elif line.strip().startswith("<accession"): acc.append(re.split(">|<",line)[2]) temp_entry.append(line) - elif line.startswith("<sequence"): + elif line.strip().startswith("<sequence"): temp_entry.append(line) if line.find("</sequence>"): seq=re.split(">|<",line)[2] - elif line.startswith("<name>"): + elif line.strip().startswith("<name>"): temp_entry.append(line) name=re.split(">|<",line)[2] - elif line.startswith("<fullName>"): + elif line.strip().startswith("<fullName>"): temp_entry.append(line) if fullname=="": fullname=re.split(">|<",line)[2] - elif line.startswith('</entry'): + elif line.strip().startswith('</entry'): temp_entry.append(line) flag_entry=0 # we test if we have to keep this proteoform