Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
T
TDP Comparative Tools
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Wiki
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
MSBio
TDP Comparative Tools
Commits
4bd4005b
Commit
4bd4005b
authored
2 years ago
by
Karen DRUART
Browse files
Options
Downloads
Patches
Plain Diff
Upload New File
parent
511e254f
No related branches found
Branches containing commit
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
XML_Subsetter/extract_uniprot_XML.py
+183
-0
183 additions, 0 deletions
XML_Subsetter/extract_uniprot_XML.py
with
183 additions
and
0 deletions
XML_Subsetter/extract_uniprot_XML.py
0 → 100644
+
183
−
0
View file @
4bd4005b
#!/usr/bin/python3
# Extract the XML entries matching a subset FASTA file, OR extract XML entries and create a FASTA file from a list of accession IDs
# usage from a fasta file ./extract_uniprot_XML.py -x uniprot.xml -f file.fasta -o output_name
# usage from a list of accession ./extract_uniprot_XML.py -x uniprot.xml -l accessions.txt -o output_name
# created Jan 22; KD
#############################################################################
# Author: Karen Druart -- karen.druart@pasteur.fr #
# https://research.pasteur.fr/fr/member/karen-druart/ #
# Copyright (c) 2022 Institut Pasteur #
# #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions #
# are met: #
# #
# 1. Redistributions of source code must retain the above copyright #
# notice, this list of conditions and the following disclaimer. #
# 2. Redistributions in binary form must reproduce the above copyright #
# notice, this list of conditions and the following disclaimer in the #
# documentation and/or other materials provided with the distribution. #
# 3. Neither the name of the copyright holder nor the names of its #
# contributors may be used to endorse or promote products derived from #
# this software without specific prior written permission. #
# #
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS #
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT #
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR #
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT #
# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, #
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT #
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, #
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY #
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT #
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE #
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
# This program is free software: you can redistribute it and/or modify #
# #
#############################################################################
import
re
import
sys
import
getopt
def define_db(database):
    """Translate a dataset name into its short FASTA header prefix.

    Returns 'sp' for "Swiss-Prot" and 'tr' for "TrEMBL". Any other value
    yields None.
    """
    known_prefixes = (("Swiss-Prot", 'sp'), ("TrEMBL", 'tr'))
    for dataset_name, prefix in known_prefixes:
        if database == dataset_name:
            return prefix
    # Unknown dataset: no prefix available.
    return None
def wrap(seq, size):
    """Break *seq* into lines of at most *size* characters.

    Parameters:
        seq: the sequence to wrap; a string, or a sequence of strings
            (an empty list is accepted and yields "").
        size: maximum number of items per output line.

    Returns:
        The wrapped text, with "\n" between chunks and no trailing
        newline. When *size* is smaller than 1 no wrapping is possible,
        so the sequence is returned as one unbroken string.
    """
    if size < 1:
        return "".join(seq)
    # Slice whole chunks instead of appending one character at a time,
    # which was quadratic on long sequences. The inner join keeps list
    # inputs working the same way as string inputs.
    return "\n".join(
        "".join(seq[start:start + size])
        for start in range(0, len(seq), size)
    )
import os
from optparse import OptionParser


def _element_text(line):
    """Return the text found between the first '>' and the next '<' of *line*."""
    return re.split(r">|<", line)[2]


def _read_accessions(path, is_fasta):
    """Collect the accessions of interest listed in *path*.

    For a fasta file, the accession is the second '|'-separated field of
    every header line (lines starting with '>'). Otherwise every non-blank
    line of the file is taken as one accession.
    """
    accessions = []
    with open(path) as handle:
        for line in handle:
            if is_fasta:
                if line.startswith(">"):
                    accessions.append(line.split("|")[1])
            else:
                # strip() rather than [:-1]: the last line may lack a
                # trailing newline, and blank lines are not accessions.
                accession = line.strip()
                if accession:
                    accessions.append(accession)
    return accessions


# ---------------------------------------------------------------------------
# Command line
# ---------------------------------------------------------------------------
parser = OptionParser()
parser.add_option("-f", "--fasta", dest="fasta_name",
                  help="fasta file containing the list of proteins of interest")
parser.add_option("-l", "--list", dest="list_name",
                  help="file containing the list of accessions of interest")
parser.add_option("-x", "--xml", dest="xml_file",
                  help="XML file to extract proteins of interest")
parser.add_option("-o", "--output", dest="output_name",
                  help="the name of the outputfile. If empty, the name of the fasta or list file will be used")
parser.epilog = "For more help on 'extract_uniprot_XML' see the README."
(options, args) = parser.parse_args()

if not options.xml_file:  # if filename is not given
    parser.error('XML file not given\n')
if not options.fasta_name and not options.list_name:  # if filename is not given
    parser.error('Please give a fasta file or a list of accessions to extract from the XML file\n')
if options.fasta_name and options.list_name:
    # Previously this combination left `filename` undefined (NameError).
    parser.error('Please give either a fasta file or a list of accessions, not both\n')

isFasta = 1 if options.fasta_name else 0
filename = options.fasta_name if isFasta else options.list_name

if not options.output_name:
    # Store the default on `options`, because that is what the output paths
    # below are built from. splitext keeps directory parts such as "./"
    # intact, unlike split(".")[0].
    options.output_name = os.path.splitext(filename)[0]
output_name = options.output_name

# ---------------------------------------------------------------------------
# Accessions of interest
# ---------------------------------------------------------------------------
l_acc = _read_accessions(filename, isFasta)
print("\n")
if isFasta:
    print("Number of sequence in fasta file: " + str(len(l_acc)) + "\n")
else:
    print("Number of accessions in file: " + str(len(l_acc)) + "\n")

# Set for O(1) membership tests; l_acc is kept as a list for the count above.
wanted_acc = set(l_acc)

# ---------------------------------------------------------------------------
# XML filtering
# ---------------------------------------------------------------------------
# Lines outside any <entry> ... </entry> span are copied to the output as-is.
# Lines inside an entry are buffered and written only when one of the
# entry's accessions is wanted. When the input was an accession list, a
# fasta record is written as well for every kept entry.
with open(options.output_name + ".xml", "w") as xmlo:
    fastao = None
    if isFasta == 0:
        fastao = open(options.output_name + ".fasta", "w")
    try:
        with open(options.xml_file) as filin:
            flag_entry = 0      # 1 while inside an <entry> element
            temp_entry = []     # buffered lines of the current entry
            for line in filin:
                if line.startswith("<entry"):
                    acc = []
                    seq = ""
                    name = ""
                    fullname = ""
                    database = line.split('dataset="')[1].split('"')[0]
                    temp_entry = [line]
                    flag_entry = 1
                elif line.startswith("<accession"):
                    acc.append(_element_text(line))
                    temp_entry.append(line)
                elif line.startswith("<sequence"):
                    temp_entry.append(line)
                    # Only lines that really carry a closed sequence
                    # element; str.find() returns -1 (truthy) when the tag
                    # is absent, so the old test accepted every line.
                    if "</sequence>" in line:
                        seq = _element_text(line)
                elif line.startswith("<name>"):
                    temp_entry.append(line)
                    # Keep the first <name> of the entry, as is done for
                    # <fullName>; presumably later <name> lines belong to
                    # nested elements - TODO confirm against the XML used.
                    if name == "":
                        name = _element_text(line)
                elif line.startswith("<fullName>"):
                    temp_entry.append(line)
                    if fullname == "":
                        fullname = _element_text(line)
                elif line.startswith('</entry'):
                    temp_entry.append(line)
                    flag_entry = 0
                    # we test if we have to keep this proteoform: the
                    # first wanted accession decides, and the entry is
                    # written at most once.
                    matched = next((a for a in acc if a in wanted_acc), None)
                    if matched is not None:
                        xmlo.writelines(temp_entry)
                        if fastao is not None:
                            db = define_db(database)
                            seq_fasta = wrap(seq, 80)
                            fastao.write(">" + db + "|" + matched + "|" + name + " " + fullname + '\n' + seq_fasta + "\n")
                elif flag_entry == 0:
                    xmlo.write(line)
                else:
                    temp_entry.append(line)
    finally:
        if fastao is not None:
            fastao.close()
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment