From 4bd4005b3f66fa0836e33cae1b80e7dccfd57583 Mon Sep 17 00:00:00 2001
From: Karen  DRUART <karen.druart@pasteur.fr>
Date: Thu, 6 Oct 2022 15:53:29 +0200
Subject: [PATCH] Upload New File

---
 XML_Subsetter/extract_uniprot_XML.py | 183 +++++++++++++++++++++++++++
 1 file changed, 183 insertions(+)
 create mode 100644 XML_Subsetter/extract_uniprot_XML.py

diff --git a/XML_Subsetter/extract_uniprot_XML.py b/XML_Subsetter/extract_uniprot_XML.py
new file mode 100644
index 0000000..801b995
--- /dev/null
+++ b/XML_Subsetter/extract_uniprot_XML.py
@@ -0,0 +1,183 @@
+#!/usr/bin/python3
+# Extract the XML entries matching a subset fasta file, OR extract the XML entries matching a list of accession IDs and also create a fasta file for them
+# usage from a fasta file:  ./extract_uniprot_XML.py -x uniprot.xml -f file.fasta -o output_name
+# usage from a list of accessions:  ./extract_uniprot_XML.py -x uniprot.xml -l accessions.txt -o output_name
+# created Jan 22; KD
+
+
+#############################################################################
+# Author: Karen Druart -- karen.druart@pasteur.fr                           #
+# https://research.pasteur.fr/fr/member/karen-druart/                       #
+# Copyright (c) 2022 Institut Pasteur                                       #
+#                 				                                            #
+#                                                                           #
+#  Redistribution and use in source and binary forms, with or without       #
+#  modification, are permitted provided that the following conditions       #
+#  are met:                                                                 #
+#                                                                           #
+#  1. Redistributions of source code must retain the above copyright        #
+#  notice, this list of conditions and the following disclaimer.            #
+#  2. Redistributions in binary form must reproduce the above copyright     #
+#  notice, this list of conditions and the following disclaimer in the      #
+#  documentation and/or other materials provided with the distribution.     #
+#  3. Neither the name of the copyright holder nor the names of its         #
+#  contributors may be used to endorse or promote products derived from     #
+#  this software without specific prior written permission.                 #
+#                                                                           #
+#  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS      #
+#  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT        #
+#  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR    #
+#  A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT     #
+#  HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,   #
+#  SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT         #
+#  LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,    #
+#  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY    #
+#  THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      #
+#  (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE    #
+#  OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.     #
+#                                                                           #
+#  This program is free software: you can redistribute it and/or modify     #
+#                                                                           #
+#############################################################################
+
+
+
+
+
+
+
+import re
+import sys
+import getopt
+
+
+
+def define_db(database):
+    """Return the FASTA prefix ('sp' or 'tr') for a dataset name, else None."""
+    prefixes = {"Swiss-Prot": "sp", "TrEMBL": "tr"}
+    # dict.get yields None for any other dataset name, like the if/elif chain.
+    return prefixes.get(database)
+
+def wrap(seq, size):
+    """Return seq with a newline inserted after every `size` characters.
+
+    No trailing newline is added. A non-positive size returns seq unwrapped.
+    """
+    if size <= 0:
+        return seq
+    # Slice-and-join is linear; per-character += concatenation is quadratic.
+    return "\n".join(seq[i:i + size] for i in range(0, len(seq), size))
+
+
+import os.path
+from optparse import OptionParser
+
+# Command-line interface: -x is required, plus exactly one of -f / -l.
+parser = OptionParser()
+parser.add_option("-f", "--fasta",  dest="fasta_name",    help="fasta file containing the list of proteins of interest")
+parser.add_option("-l", "--list",   dest="list_name",    help="file containing the list of accessions of interest")
+parser.add_option("-x", "--xml",    dest="xml_file",    help="XML file to extract proteins of interest")
+parser.add_option("-o", "--output", dest="output_name", help="the name of the outputfile. If empty, the name of the fasta or list file will be used")
+parser.epilog="For more help on 'extract_uniprot_XML' see the README."
+
+(options, args) = parser.parse_args()
+
+if not options.xml_file:   # if filename is not given
+    parser.error('XML file not given\n')
+if not options.fasta_name and not options.list_name:   # if filename is not given
+    parser.error('Please give a fasta file or a list of accessions to extract from the XML file\n')
+if options.fasta_name and options.list_name:
+    # Only one input source is supported; with both, `filename` would be undefined.
+    parser.error('Please give either a fasta file or a list of accessions, not both\n')
+
+# isFasta=1: accessions are read from FASTA headers and only the XML subset is
+# written; isFasta=0: accessions come from a list and a FASTA file is written too.
+isFasta = 1 if options.fasta_name else 0
+filename = options.fasta_name if isFasta else options.list_name
+if not options.output_name:
+    # Store the default on `options` too, because the output files are opened
+    # from options.output_name; strip only the final extension of the input path.
+    options.output_name = os.path.splitext(filename)[0]
+output_name = options.output_name
+
+# Gather the accessions of interest: one per FASTA header, or one per list line.
+l_acc=[]
+with open(filename) as filin:
+    for line in filin:
+        if isFasta:
+            if not line.startswith(">"):
+                continue
+            # Accession is the 2nd "|" field; pipe-less headers fall back to the first word.
+            fields=line[1:].split("|")
+            acc=fields[1] if len(fields)>1 else (fields[0].split() or [""])[0]
+        else:
+            acc=line
+        acc=acc.strip()   # strip() instead of [:-1]: safe without a trailing newline
+        if acc:   # skip blank lines / empty identifiers
+            l_acc.append(acc)
+print("\n")        
+print(("Number of sequence in fasta file: " if isFasta else "Number of accessions in file: ")+str(len(l_acc))+"\n")
+
+# Stream the XML once: lines outside <entry> elements are copied verbatim, each
+# entry is buffered and written only if one of its accessions was requested.
+wanted=set(l_acc)   # set membership is O(1); the list was rescanned per accession
+xmlo=open(options.output_name+".xml","w")
+fastao=None
+try:
+    if isFasta==0:
+        fastao=open(options.output_name+".fasta","w")
+    flag_entry=0
+    temp_entry=[]
+    with open(options.xml_file) as filin:
+        for line in filin:   # read lazily instead of loading the whole file
+            if line.startswith("<entry"):
+                acc=[]
+                seq=[]
+                name=""
+                fullname=""
+                # dataset attribute of the <entry> tag (mapped to a FASTA prefix by define_db)
+                database=line.split('dataset="')[1].split('"')[0]
+                temp_entry=[line]
+                flag_entry=1
+            elif line.startswith("<accession"):
+                acc.append(re.split(">|<",line)[2])
+                temp_entry.append(line)
+            elif line.startswith("<sequence"):
+                temp_entry.append(line)
+                # str.find() returns -1 (truthy) when the tag is absent, so the
+                # old test was always true; `in` checks what was intended.
+                # NOTE(review): a <sequence> spanning several lines is not captured.
+                if "</sequence>" in line:
+                    seq=re.split(">|<",line)[2]
+            elif line.startswith("<name>"):
+                temp_entry.append(line)
+                name=re.split(">|<",line)[2]
+            elif line.startswith("<fullName>"):
+                temp_entry.append(line)
+                if fullname=="":   # keep only the first fullName of the entry
+                    fullname=re.split(">|<",line)[2]
+            elif line.startswith('</entry'):
+                temp_entry.append(line)
+                flag_entry=0
+                # Keep the entry if any of its accessions was requested; the
+                # first matching accession is the one used in the FASTA header.
+                hit=next((a for a in acc if a in wanted),None)
+                if hit is not None:
+                    xmlo.writelines(temp_entry)
+                    if fastao is not None:
+                        db=define_db(database)
+                        seq_fasta=wrap(seq,80)
+                        fastao.write(">"+db+"|"+hit+"|"+name+" "+fullname+'\n'+seq_fasta+"\n")
+            # outside any entry (e.g. XML header/footer lines): copy straight through
+            elif flag_entry==0:
+                xmlo.write(line)
+            else:
+                temp_entry.append(line)
+finally:
+    # Close the outputs even if parsing fails part-way through.
+    xmlo.close()
+    if fastao is not None:
+        fastao.close()
+
+
+
-- 
GitLab