Commit 01528585 authored by Bertrand  NÉRON's avatar Bertrand NÉRON
Browse files

fix different versions of fasta reader

parent 8f9dd96c
......@@ -15,8 +15,9 @@ def fasta_iter(fasta_file):
with open(fasta_path) as fasta_file:
# ditch the boolean (x[0]) and just keep the header or sequence since
# we know they alternate.
group = (x[1] for x in groupby(fasta_file , lambda line: line[0] == ">"))
group = (x[1] for x in groupby(fasta_file , lambda line: line.startswith(">")))
for header in group:
print header
# drop the ">"
header = header.next()[1:].strip()
header = header.split()
......@@ -35,4 +36,20 @@ def fasta_iter(fasta_file):
# something goes wrong in do something with seq
# but we don't quit the program (we catch the exception for instance)
# the fasta file is still open
# it's better to put the fasta file opening out the fasta reader see fasta filter
\ No newline at end of file
# it's better to put the fasta file opening out the fasta reader see fasta filter
if __name__ == '__main__':
    # Command-line driver: print every sequence found in the fasta file
    # whose path is given as the single command-line argument.
    import os.path
    import sys

    # Everything after the script name; exactly one argument is expected.
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        sys.exit("usage multiple_fasta fasta_path")
    fasta_path = arguments[0]
    if not os.path.exists(fasta_path):
        sys.exit("No such file: {}".format(fasta_path))
    # The file is opened here, outside the reader, so that it is closed
    # by the "with" block whatever happens while the sequences are used.
    with open(fasta_path, 'r') as fasta_input:
        for sequence in fasta_iter(fasta_input):
            # A parenthesised single argument prints the same text
            # whether print is a statement or a function.
            print("----------------")
            print(sequence)
\ No newline at end of file
......@@ -2,7 +2,7 @@ from collections import namedtuple
Sequence = namedtuple("Sequence", "id comment sequence")
def fasta_reader(fasta_file):
def fasta_reader(fasta_path):
"""
:param fasta_path: the path to the file to parse
:type fasta_path: string
......@@ -10,24 +10,28 @@ def fasta_reader(fasta_file):
:rtype: list of Sequence
"""
sequences = []
id_ = ''
comment = ''
sequence = ''
for line in fasta_infile:
if line.startswith('>'):
# a new sequence begin
if id_ != '':
# a sequence was already parsed so add it to the list
with open(fasta_path, 'r') as fasta_infile:
id_ = ''
comment = ''
sequence = ''
for line in fasta_infile:
if line.startswith('>'):
# a new sequence begin
if id_ != '':
# a sequence was already parsed so add it to the list
sequences.append(Sequence(id_ , comment, sequence))
sequence = ''
header = line.split()
id_ = header[0]
comment = ' '.join(header[1:])
else:
sequence += line.strip()
sequences.append(Sequence(id_ , comment, sequence))
sequence = ''
header = line.split()
id_ = header[0]
comment = ' '.join(header[1:])
else:
sequence += line.strip()
return Sequence(id_ , comment, sequence)
return sequences
# If we open the file inside the fasta reader we are forced
# to read all the sequences and load them into memory, which can take a huge amount of space.
# It is better to read the sequences one by one and process each one as soon as it is ready.
# See fasta_filter.py
\ No newline at end of file
# The problem with this implementation is that we have to load all
# the sequences into memory before we can start to work with them.
# It is better to return the sequences one by one
# and process them as they are loaded.
\ No newline at end of file
from collections import namedtuple
# One fasta record: the identifier (first word of the header line, leading
# ">" included), the rest of the header line, and the concatenated sequence.
Sequence = namedtuple("Sequence", "id comment sequence")


def fasta_reader(fasta_file):
    """
    Read the next sequence from an already opened fasta file.

    Each call consumes one record and leaves the file positioned at the
    beginning of the following header line, so the function can be called
    repeatedly on the same file object until it returns None.

    :param fasta_file: the file in fasta format to parse
    :type fasta_file: file object supporting readline, tell and seek
    :return: the next sequence, or None when there are no more sequences
             in the file
    :rtype: a Sequence or None
    """
    id_ = ''
    comment = ''
    # The sequence lines are collected in a list and joined once at the end,
    # which avoids rebuilding a growing string for every line.
    chunks = []
    # tell/seek cannot be combined with "for line in fasta_file": iterating
    # over a file makes the position reported by tell unusable.
    # So the lines are read explicitly with readline.
    line_start = fasta_file.tell()
    line = fasta_file.readline()
    while line:
        if line.startswith('>'):
            if id_ != '':
                # A sequence was already being parsed, so this header marks
                # the end of the previous sequence.
                # Put the cursor back at the beginning of this header for the
                # next fasta_reader call. The position saved by tell is used
                # rather than a relative seek of -len(line): the length in
                # characters is not always the length on disk (newline
                # translation, multi-byte encodings) and text streams may
                # refuse relative seeks.
                fasta_file.seek(line_start)
                break
            # a new sequence begins
            header = line.split()
            id_ = header[0]
            comment = ' '.join(header[1:])
        else:
            chunks.append(line.strip())
        line_start = fasta_file.tell()
        line = fasta_file.readline()
    sequence = ''.join(chunks)
    if id_ == '' and sequence == '':
        # nothing left to read
        return None
    return Sequence(id_, comment, sequence)
# To return the sequences one by one, the file has to be opened outside fasta_reader.
# Each call to fasta_reader returns one sequence,
# until the end of the file is reached.
if __name__ == '__main__':
    # Command-line driver: print every sequence found in the fasta file
    # whose path is given as the single command-line argument.
    import sys
    import os.path

    if len(sys.argv) != 2:
        sys.exit("usage multiple_fasta fasta_path")
    fasta_path = sys.argv[1]
    if not os.path.exists(fasta_path):
        sys.exit("No such file: {}".format(fasta_path))
    # The file is opened here, outside the reader, so that fasta_reader can
    # be called again and again on the same file object.
    with open(fasta_path, 'r') as fasta_input:
        # Read before testing: fasta_reader returns None once the file is
        # exhausted, and that None must end the loop without being printed.
        sequence = fasta_reader(fasta_input)
        while sequence is not None:
            print("----------------")
            print(sequence)
            sequence = fasta_reader(fasta_input)
\ No newline at end of file
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment