Skip to content
Snippets Groups Projects
Commit 4bd4005b authored by Karen DRUART
Browse files

Upload New File

parent 511e254f
Branches icy-3.0.0
No related merge requests found
#!/usr/bin/python3
# Extract XML entries matching a subset FASTA file, OR extract XML entries and create a FASTA file from a list of accession IDs
# usage from a fasta file: ./extract_uniprot_XML.py -x uniprot.xml -f file.fasta -o output_name
# usage from a list of accessions: ./extract_uniprot_XML.py -x uniprot.xml -l accessions.txt -o output_name
# created Jan 22; KD
#############################################################################
# Author: Karen Druart -- karen.druart@pasteur.fr #
# https://research.pasteur.fr/fr/member/karen-druart/ #
# Copyright (c) 2022 Institut Pasteur #
# #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions #
# are met: #
# #
# 1. Redistributions of source code must retain the above copyright #
# notice, this list of conditions and the following disclaimer. #
# 2. Redistributions in binary form must reproduce the above copyright #
# notice, this list of conditions and the following disclaimer in the #
# documentation and/or other materials provided with the distribution. #
# 3. Neither the name of the copyright holder nor the names of its #
# contributors may be used to endorse or promote products derived from #
# this software without specific prior written permission. #
# #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS #
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT #
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR #
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT #
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, #
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT #
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, #
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY #
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT #
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
# This program is free software: you can redistribute it and/or modify #
# #
#############################################################################
import getopt
import os
import re
import sys
def define_db(database):
    """Return the FASTA header prefix for a UniProt dataset name.

    Args:
        database: value of the ``dataset`` attribute of an ``<entry>`` tag.

    Returns:
        ``'sp'`` for ``"Swiss-Prot"``, ``'tr'`` for ``"TrEMBL"``, and ``None``
        for any other value.
    """
    if database == "Swiss-Prot":
        return 'sp'
    if database == "TrEMBL":
        return 'tr'
    # Unrecognised dataset: make the fall-through explicit so callers can see
    # that None is a possible result.
    return None
def wrap(seq, size):
    """Break a sequence string into lines of at most ``size`` characters.

    Args:
        seq: the sequence to wrap (a string).
        size: maximum number of characters per line.

    Returns:
        The sequence with a newline inserted after every ``size`` characters.
        No trailing newline is added. If ``size`` is not positive, the
        sequence is returned unwrapped.
    """
    if size <= 0:
        # A non-positive width cannot define a line length; leave the
        # sequence on a single line rather than emitting stray newlines.
        return seq
    # Slice fixed-width chunks and join them once instead of growing the
    # result one character at a time.
    return "\n".join(seq[start:start + size] for start in range(0, len(seq), size))
from optparse import OptionParser

# Command-line interface: one (short flag, long flag, destination, help) row
# per option, registered in the order they should appear in --help.
parser = OptionParser(epilog="For more help on 'extract_uniprot_XML' see the README.")
for short_flag, long_flag, dest_name, help_text in (
    ("-f", "--fasta", "fasta_name", "fasta file containing the list of proteins of interest"),
    ("-l", "--list", "list_name", "file containing the list of accessions of interest"),
    ("-x", "--xml", "xml_file", "XML file to extract proteins of interest"),
    ("-o", "--output", "output_name", "the name of the outputfile. If empty, the name of the fasta or list file will be used"),
):
    parser.add_option(short_flag, long_flag, dest=dest_name, help=help_text)

options, args = parser.parse_args()

# The XML file to extract from is mandatory.
if not options.xml_file:
    parser.error('XML file not given\n')
# At least one source of accessions (fasta file or accession list) is needed.
if not (options.fasta_name or options.list_name):
    parser.error('Please give a fasta file or a list of accessions to extract from the XML file\n')
# Work out which input drives the extraction: a FASTA subset (-f) or a plain
# list of accessions (-l). Supplying both is ambiguous, so reject it up front
# instead of failing later with an undefined input file name.
if options.fasta_name and options.list_name:
    parser.error('Please give either a fasta file or a list of accessions, not both\n')
isFasta = 1 if options.fasta_name else 0
filename = options.fasta_name if isFasta else options.list_name

# Default the output base name to the input path without its extension.
# It is stored on `options` because the output files are opened from
# options.output_name.
if not options.output_name:
    options.output_name = os.path.splitext(filename)[0]
output_name = options.output_name

# Collect the accessions of interest, one per FASTA header or per list line.
l_acc = []
with open(filename) as filin:
    for line in filin:
        if isFasta:
            if not line.startswith(">"):
                continue
            # The accession is the second "|"-separated field of the header.
            fields = line.split("|")
            if len(fields) < 2:
                sys.stderr.write("Warning: skipping fasta header without accession: " + line.rstrip() + "\n")
                continue
            l_acc.append(fields[1].strip())
        else:
            # strip() copes with a missing final newline and with Windows
            # line endings; blank lines carry no accession.
            accession = line.strip()
            if accession:
                l_acc.append(accession)

print("\n")
if isFasta:
    print("Number of sequence in fasta file: "+str(len(l_acc))+"\n")
else:
    print("Number of accessions in file: "+str(len(l_acc))+"\n")
# Filter the XML file: lines outside <entry> elements are copied through
# unchanged, and an entry is kept only when one of its accessions is in the
# requested list. In accession-list mode a FASTA record is also written for
# each kept entry.

# Splitting a line on angle brackets puts the element text at index 2,
# e.g. "<name>X</name>" -> ['', 'name', 'X', '/name', ...].
tag_split = re.compile(">|<")
# Set membership keeps the per-entry accession test constant-time.
wanted = set(l_acc)

xmlo = open(options.output_name + ".xml", "w")
fastao = None
try:
    if isFasta == 0:
        fastao = open(options.output_name + ".fasta", "w")

    # Per-entry state, reset at every <entry ...> line.
    acc = []
    seq = ""
    name = ""
    fullname = ""
    database = ""
    temp_entry = []
    flag_entry = 0

    # Iterate over the file object so the XML is streamed line by line
    # rather than loaded into memory in one go.
    with open(options.xml_file) as filin:
        for line in filin:
            if line.startswith("<entry"):
                acc = []
                seq = ""
                name = ""
                fullname = ""
                database = line.split('dataset="')[1].split('"')[0]
                temp_entry = [line]
                flag_entry = 1
            elif line.startswith("<accession"):
                acc.append(tag_split.split(line)[2])
                temp_entry.append(line)
            elif line.startswith("<sequence"):
                temp_entry.append(line)
                # Only read the residues when the closing tag is on this
                # line; str.find() returns -1 (truthy) when the tag is
                # absent, so it cannot be used as the test.
                if "</sequence>" in line:
                    seq = tag_split.split(line)[2]
            elif line.startswith("<name>"):
                temp_entry.append(line)
                name = tag_split.split(line)[2]
            elif line.startswith("<fullName>"):
                temp_entry.append(line)
                # Keep the first full name seen in the entry.
                if fullname == "":
                    fullname = tag_split.split(line)[2]
            elif line.startswith('</entry'):
                temp_entry.append(line)
                flag_entry = 0
                # Keep this entry if any of its accessions was requested.
                # It is written once, using the first matching accession.
                matched = next((a for a in acc if a in wanted), None)
                if matched is not None:
                    xmlo.writelines(temp_entry)
                    if fastao is not None:
                        db = define_db(database)
                        seq_fasta = wrap(seq, 80)
                        fastao.write(">"+db+"|"+matched+"|"+name+" "+fullname+'\n'+seq_fasta+"\n")
            else:
                if flag_entry == 0:
                    # Outside any entry: copy straight to the output.
                    xmlo.write(line)
                else:
                    # Inside an entry: buffer until we know whether to keep it.
                    temp_entry.append(line)
finally:
    # Close the outputs even if parsing fails part-way through.
    xmlo.close()
    if fastao is not None:
        fastao.close()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment