From 4bd4005b3f66fa0836e33cae1b80e7dccfd57583 Mon Sep 17 00:00:00 2001 From: Karen DRUART <karen.druart@pasteur.fr> Date: Thu, 6 Oct 2022 15:53:29 +0200 Subject: [PATCH] Upload New File --- XML_Subsetter/extract_uniprot_XML.py | 183 +++++++++++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 XML_Subsetter/extract_uniprot_XML.py diff --git a/XML_Subsetter/extract_uniprot_XML.py b/XML_Subsetter/extract_uniprot_XML.py new file mode 100644 index 0000000..801b995 --- /dev/null +++ b/XML_Subsetter/extract_uniprot_XML.py @@ -0,0 +1,183 @@ +#!/usr/bin/python3 +# extract XML sequences from a subset fasta file OR extract XML and create a fasta file from a list of accessions id +# usage from a fasta file ./extract_uniprot_XML.py -x uniprot.xml -f file.fasta -o output_name +# usage from a list of accession ./extract_uniprot_XML.py -x uniprot.xml -l accessions.txt -o output_name +# created Jan 22; KD + + +############################################################################# +# Author: Karen Druart -- karen.druart@pasteur.fr # +# https://research.pasteur.fr/fr/member/karen-druart/ # +# Copyright (c) 2022 Institut Pasteur # +# # +# # +# Redistribution and use in source and binary forms, with or without # +# modification, are permitted provided that the following conditions # +# are met: # +# # +# 1. Redistributions of source code must retain the above copyright # +# notice, this list of conditions and the following disclaimer. # +# 2. Redistributions in binary form must reproduce the above copyright # +# notice, this list of conditions and the following disclaimer in the # +# documentation and/or other materials provided with the distribution. # +# 3. Neither the name of the copyright holder nor the names of its # +# contributors may be used to endorse or promote products derived from # +# this software without specific prior written permission. 
#                                                                           #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS       #
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT         #
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR     #
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT      #
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,    #
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT          #
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,     #
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY     #
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT       #
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE     #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.      #
#                                                                           #
# This program is free software: you can redistribute it and/or modify      #
#                                                                           #
#############################################################################


import re
import sys
import getopt
from optparse import OptionParser


# FASTA header prefix for each value of the <entry dataset="..."> attribute.
_DB_PREFIX = {"Swiss-Prot": "sp", "TrEMBL": "tr"}


def define_db(database):
    """Return the FASTA header prefix for a dataset name.

    Args:
        database: value of the ``dataset`` attribute of an ``<entry>`` tag.

    Returns:
        ``'sp'`` for ``"Swiss-Prot"``, ``'tr'`` for ``"TrEMBL"``, and ``None``
        for any other value (callers must handle the ``None`` case).
    """
    return _DB_PREFIX.get(database)


def wrap(seq, size):
    """Break a sequence into lines of at most ``size`` characters.

    Args:
        seq: the sequence string to wrap (an empty sequence yields ``""``).
        size: maximum number of characters per line; must be at least 1.

    Returns:
        The sequence with a ``"\\n"`` inserted after every ``size`` characters.
        No newline is appended after the last chunk.

    Raises:
        ValueError: if ``size`` is smaller than 1.
    """
    if size < 1:
        raise ValueError("size must be a positive integer, got %r" % (size,))
    # Slicing + join is linear, unlike growing a string one character at a time.
    return "\n".join(seq[start:start + size] for start in range(0, len(seq), size))


# Command-line interface; the options are parsed by the main part of the script.
parser = OptionParser()
parser.add_option("-f", "--fasta", dest="fasta_name", help="fasta file containing the list of proteins of interest")
parser.add_option("-l", "--list", dest="list_name", help="file containing the list of accessions of interest")
parser.add_option("-x", "--xml", dest="xml_file", help="XML file to extract proteins of interest")
parser.add_option("-o", "--output", dest="output_name", help="the name of the outputfile. If empty, the name of the fasta or list file will be used")
parser.epilog="For more help on 'extract_uniprot_XML' see the README."


def element_text(line):
    """Return the text between the first '>' and the following '<' of a line.

    For ``<accession>P12345</accession>`` this is ``"P12345"``. A line without
    such text yields an empty string (or the remainder of the line when no
    closing tag follows).
    """
    return line.partition(">")[2].partition("<")[0]


def load_accessions(lines, is_fasta):
    """Collect the accessions of interest from a FASTA file or a plain list.

    Args:
        lines: iterable of text lines (an open file works).
        is_fasta: when true, accessions are read from ``>`` header lines;
            otherwise every non-blank line is taken as one accession.

    Returns:
        A list of accession strings in input order (duplicates are kept so
        that the reported count reflects the input file).
    """
    accessions = []
    for line in lines:
        if is_fasta:
            if not line.startswith(">"):
                continue
            header = line[1:].strip()
            fields = header.split("|")
            if len(fields) > 1:
                # UniProt style header: >db|ACCESSION|NAME description
                acc = fields[1].strip()
            else:
                # Header without pipes: fall back to the first token.
                acc = header.split(None, 1)[0] if header else ""
        else:
            # strip() instead of line[:-1]: the last line may lack a newline.
            acc = line.strip()
        if acc:
            accessions.append(acc)
    return accessions


def subset_xml(xml_lines, accessions, xml_out, fasta_out=None):
    """Copy the wanted ``<entry>`` records of a UniProt XML stream.

    Lines outside of any entry (XML prolog, root element, footer) are copied
    unchanged. An entry is buffered until its closing tag and written only
    when one of its ``<accession>`` values is in ``accessions``. Tags are
    recognised only when they start at the beginning of a line.

    Args:
        xml_lines: iterable of lines of the source XML (streamed, not stored).
        accessions: iterable of accession strings to keep.
        xml_out: writable text file receiving the subset XML.
        fasta_out: optional writable text file; when given, one FASTA record
            is written per kept entry, labelled with the first matching
            accession. This uses ``define_db`` and ``wrap`` defined earlier
            in the script.

    Returns:
        The number of entries written.

    Raises:
        ValueError: if a FASTA record is requested for an entry whose dataset
            is not known to ``define_db``.
    """
    wanted = set(accessions)  # O(1) membership test per accession
    in_entry = False
    entry_lines = []
    entry_accs = []
    dataset = name = full_name = sequence = ""
    kept = 0

    for line in xml_lines:
        if line.startswith("<entry"):
            in_entry = True
            entry_lines = [line]
            entry_accs = []
            name = full_name = sequence = ""
            dataset = line.partition('dataset="')[2].partition('"')[0]
            continue
        if not in_entry:
            xml_out.write(line)
            continue

        entry_lines.append(line)
        if line.startswith("<accession"):
            entry_accs.append(element_text(line))
        elif line.startswith("<sequence"):
            # Only the real sequence element carries a closing tag on the same
            # line; self-closing stubs such as <sequence type="displayed"/>
            # must not be parsed.
            if "</sequence>" in line:
                sequence = element_text(line)
        elif line.startswith("<name>"):
            # Keep the first <name> only so that a later <name> element
            # (presumably an isoform name) cannot replace the entry name.
            if not name:
                name = element_text(line)
        elif line.startswith("<fullName>"):
            if not full_name:
                full_name = element_text(line)
        elif line.startswith("</entry"):
            in_entry = False
            match = next((acc for acc in entry_accs if acc in wanted), None)
            if match is None:
                continue
            xml_out.writelines(entry_lines)
            kept += 1
            if fasta_out is not None:
                db = define_db(dataset)
                if db is None:
                    raise ValueError(
                        "entry %s has an unsupported dataset %r" % (match, dataset)
                    )
                fasta_out.write(">"+db+"|"+match+"|"+name+" "+full_name+'\n'+wrap(sequence, 80)+"\n")
    return kept


def main(argv=None):
    """Run the extraction described by the command-line options.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.

    With ``-f`` the accessions come from a FASTA file and only the subset XML
    is written. With ``-l`` they come from a list file and both the subset XML
    and a FASTA file are written.
    """
    import os  # local import: only needed to derive the default output name

    options, _ = parser.parse_args(argv)

    if not options.xml_file:  # if filename is not given
        parser.error('XML file not given\n')
    if not options.fasta_name and not options.list_name:  # if filename is not given
        parser.error('Please give a fasta file or a list of accessions to extract from the XML file\n')
    if options.fasta_name and options.list_name:
        # The two input modes are mutually exclusive.
        parser.error('Please give either a fasta file or a list of accessions, not both\n')

    is_fasta = bool(options.fasta_name)
    filename = options.fasta_name if is_fasta else options.list_name
    # Fall back to the input file name (without extension) when -o is absent.
    output_name = options.output_name or os.path.splitext(filename)[0]

    with open(filename) as filin:
        accessions = load_accessions(filin, is_fasta)
    print("\n")
    if is_fasta:
        print("Number of sequence in fasta file: "+str(len(accessions))+"\n")
    else:
        print("Number of accessions in file: "+str(len(accessions))+"\n")

    with open(options.xml_file, encoding="utf-8") as xml_in, \
            open(output_name+".xml", "w", encoding="utf-8") as xml_out:
        if is_fasta:
            subset_xml(xml_in, accessions, xml_out)
        else:
            with open(output_name+".fasta", "w", encoding="utf-8") as fasta_out:
                subset_xml(xml_in, accessions, xml_out, fasta_out)


if __name__ == "__main__":
    main()