Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
hub-courses
python_one_week_4_biologists_solutions
Commits
01528585
Commit
01528585
authored
Sep 08, 2014
by
Bertrand NÉRON
Browse files
fix different versions of fasta reader
parent
8f9dd96c
Changes
3
Hide whitespace changes
Inline
Side-by-side
source/_static/code/fasta_iterator.py
View file @
01528585
...
...
@@ -15,8 +15,9 @@ def fasta_iter(fasta_file):
with
open
(
fasta_path
)
as
fasta_file
:
# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.
group
=
(
x
[
1
]
for
x
in
groupby
(
fasta_file
,
lambda
line
:
line
[
0
]
==
">"
))
group
=
(
x
[
1
]
for
x
in
groupby
(
fasta_file
,
lambda
line
:
line
.
startswith
(
">"
))
)
for
header
in
group
:
print
header
# drop the ">"
header
=
header
.
next
()[
1
:].
strip
()
header
=
header
.
split
()
...
...
@@ -35,4 +36,20 @@ def fasta_iter(fasta_file):
# something goes wrong in do something with seq
# but we don't quit the program (we catch the exception for instance)
# the fasta file is still open
# it's better to put the fasta file opening out the fasta reader see fasta filter
\ No newline at end of file
# it's better to put the fasta file opening out the fasta reader see fasta filter
if __name__ == '__main__':
    # Small command line driver: print every record yielded by fasta_iter
    # for the fasta file given as the single command line argument.
    import sys
    import os.path

    arguments = sys.argv[1:]
    if len(arguments) != 1:
        sys.exit("usage multiple_fasta fasta_path")

    fasta_path = arguments[0]
    if not os.path.exists(fasta_path):
        sys.exit("No such file: {}".format(fasta_path))

    # The file is opened here, outside the reader, so it is closed
    # as soon as we leave the with block whatever happens in the loop.
    with open(fasta_path, 'r') as fasta_input:
        for sequence in fasta_iter(fasta_input):
            print("----------------")
            print(sequence)
\ No newline at end of file
source/_static/code/multiple_fasta_reader.py
View file @
01528585
...
...
@@ -2,7 +2,7 @@ from collections import namedtuple
Sequence
=
namedtuple
(
"Sequence"
,
"id comment sequence"
)
def
fasta_reader
(
fasta_
file
):
def
fasta_reader
(
fasta_
path
):
"""
:param fasta_path: the path to the file to parse
:type fasta_path: string
...
...
@@ -10,24 +10,28 @@ def fasta_reader(fasta_file):
:rtype: list of Sequence
"""
sequences
=
[]
id_
=
''
comment
=
''
sequence
=
''
for
line
in
fasta_infile
:
if
line
.
startswith
(
'>'
):
# a new sequence begin
if
id_
!=
''
:
# a sequence was already parsed so add it to the list
with
open
(
fasta_path
,
'r'
)
as
fasta_infile
:
id_
=
''
comment
=
''
sequence
=
''
for
line
in
fasta_infile
:
if
line
.
startswith
(
'>'
):
# a new sequence begin
if
id_
!=
''
:
# a sequence was already parsed so add it to the list
sequences
.
append
(
Sequence
(
id_
,
comment
,
sequence
))
sequence
=
''
header
=
line
.
split
()
id_
=
header
[
0
]
comment
=
' '
.
join
(
header
[
1
:])
else
:
sequence
+=
line
.
strip
()
sequences
.
append
(
Sequence
(
id_
,
comment
,
sequence
))
sequence
=
''
header
=
line
.
split
()
id_
=
header
[
0
]
comment
=
' '
.
join
(
header
[
1
:])
else
:
sequence
+=
line
.
strip
()
return
Sequence
(
id_
,
comment
,
sequence
)
return
sequences
# if we open the file in the fasta reader we are forced
# to read all the sequences and load them in memory, which can take a huge amount of space.
# it's better to read the sequences one by one and process each one as soon as it is ready.
# see fasta_filter.py
\ No newline at end of file
# The problem with this implementation is that we have to load all
# the sequences in memory before we can start to work with them.
# It is better to return the sequences one by one
# and process them as they are loaded.
\ No newline at end of file
source/_static/code/multiple_fasta_reader2.py
0 → 100644
View file @
01528585
from
collections
import
namedtuple
Sequence = namedtuple("Sequence", "id comment sequence")


def fasta_reader(fasta_file):
    """
    Read the next sequence from an already opened fasta file.

    Each call consumes exactly one record and leaves the cursor of
    *fasta_file* at the beginning of the following header line, so
    successive calls return the sequences of the file one by one.

    The id is the first word of the header line (the leading ``>`` is kept),
    the comment is the rest of the header line, and the sequence is the
    concatenation of the stripped lines found until the next header.

    :param fasta_file: the file in fasta format to parse. It must support
                       ``readline``, ``tell`` and ``seek``.
    :type fasta_file: file object
    :return: the next sequence, or None when there is nothing left to read
    :rtype: a Sequence or None
    """
    id_ = ''
    comment = ''
    chunks = []  # pieces of the sequence, joined only once at the end

    # We need tell/seek to step back one line, so we cannot use
    # "for line in fasta_file": the read-ahead done by file iteration
    # makes tell() unusable. So readline is used instead.
    # The position *before* each line is remembered, because seeking back
    # to an absolute position returned by tell() is always allowed, whereas
    # a relative seek(-len(line), 1) is rejected by text streams in Python 3
    # and is wrong when len(line) differs from the number of bytes read.
    line_start = fasta_file.tell()
    line = fasta_file.readline()
    while line:
        if line.startswith('>'):
            if id_ != '':
                # A sequence was already parsed, so this header is the
                # beginning of the next one: put the cursor back at the
                # start of this line for the next fasta_reader call
                # and return the previous sequence.
                fasta_file.seek(line_start)
                return Sequence(id_, comment, ''.join(chunks))
            # a new sequence begins
            header = line.split()
            id_ = header[0]
            comment = ' '.join(header[1:])
        else:
            chunks.append(line.strip())
        line_start = fasta_file.tell()
        line = fasta_file.readline()

    sequence = ''.join(chunks)
    if id_ == '' and sequence == '':
        # nothing was read: the end of the file was already reached
        return None
    return Sequence(id_, comment, sequence)
# To return the sequences one by one, the file must be opened outside fasta_reader.
# Each call to fasta_reader returns one sequence,
# until the end of the file is reached.
if __name__ == '__main__':
    # Small command line driver: print the sequences of the fasta file
    # given as the single command line argument, one by one.
    import sys
    import os.path

    if len(sys.argv) != 2:
        sys.exit("usage multiple_fasta fasta_path")
    fasta_path = sys.argv[1]
    if not os.path.exists(fasta_path):
        sys.exit("No such file: {}".format(fasta_path))

    with open(fasta_path, 'r') as fasta_input:
        # fasta_reader returns None once the end of the file is reached.
        # Read before testing, so that the final None is never printed
        # (nor preceded by a useless separator line).
        sequence = fasta_reader(fasta_input)
        while sequence is not None:
            # single-argument print(...) behaves the same on Python 2 and 3
            print("----------------")
            print(sequence)
            sequence = fasta_reader(fasta_input)
\ No newline at end of file
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment