From 6f9f4bf8a4096219690f71762fe80b12ad14ab9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bertrand=20N=C3=A9ron?= <bneron@pasteur.fr>
Date: Sun, 7 Sep 2014 22:05:00 +0200
Subject: [PATCH] refactor fasta_reader to put open file outside function
---
source/_static/code/fasta_filter.py | 83 ++++++++++----------
source/_static/code/fasta_iterator.py | 9 ++-
source/_static/code/fasta_reader.py | 2 +-
source/_static/code/multiple_fasta_reader.py | 42 +++++-----
source/_static/code/similarity.py | 10 +--
5 files changed, 77 insertions(+), 69 deletions(-)
diff --git a/source/_static/code/fasta_filter.py b/source/_static/code/fasta_filter.py
index 657f6f5..a4a5e27 100644
--- a/source/_static/code/fasta_filter.py
+++ b/source/_static/code/fasta_filter.py
@@ -5,57 +5,58 @@ from itertools import groupby
Sequence = namedtuple("Sequence", "id comment sequence")
-def fasta_iter(fasta_path):
- """
- :param fasta_file: the file containing all input sequences in fasta format.
- :type fasta_file: file object
- :author: http://biostar.stackexchange.com/users/36/brentp
- :return: for a given fasta file, it returns an iterator which yields tuples
+def fasta_iter(fasta_file):
+ """
+ :param fasta_file: the file containing all input sequences in fasta format.
+ :type fasta_file: file object
+ :author: http://biostar.stackexchange.com/users/36/brentp
+ :return: for a given fasta file, it returns an iterator which yields tuples
(string id, string comment, int sequence length)
- :rtype: iterator
- """
- with open(fasta_path) as fasta_file:
- # ditch the boolean (x[0]) and just keep the header or sequence since
- # we know they alternate.
- group = (x[1] for x in groupby(fasta_file , lambda line: line[0] == ">"))
- for header in group:
- # drop the ">"
- header = header.next()[1:].strip()
- header = header.split()
- _id = header[0]
- comment = ' '.join(header[1:])
- seq = ''.join(s.strip() for s in group.next())
- yield Sequence(_id, comment, seq)
+ :rtype: iterator
+ """
+ # ditch the boolean (x[0]) and just keep the header or sequence since
+ # we know they alternate.
+ group = (x[1] for x in groupby(fasta_file , lambda line: line[0] == ">"))
+ for header in group:
+ # drop the ">"
+ header = header.next()[1:].strip()
+ header = header.split()
+ _id = header[0]
+ comment = ' '.join(header[1:])
+ seq = ''.join(s.strip() for s in group.next())
+ yield Sequence(_id, comment, seq)
+
-def fasta_writer(sequence, fasta_path):
+def fasta_writer(sequence, output_file):
"""
write a sequence in a file in fasta format
:param sequence: the sequence to print
:type sequence: Sequence instance
- :param fasta_path: the path to the file to print the sequence in
- :type fasta_path: string
+ :param output_file: the file to print the sequence in
+ :type output_file: file object
"""
- print "appel de fasta_writer ",sequence.id, " ",fasta_path
- with open(fasta_path, 'w') as output:
- output.write('>{0.id} {0.comment}\n'.format(seq))
- start = 0
- while start < len(seq.sequence):
- end = start + 80
- print start, " : ", end
- output.write(seq.sequence[start: end + 1] + '\n')
- start = end
+ output_file.write('>{0.id} {0.comment}\n'.format(seq))
+ start = 0
+ while start < len(seq.sequence):
+ end = start + 80
+ print start, " : ", end
+ output_file.write(seq.sequence[start: end + 1] + '\n')
+ start = end
+
if __name__ == '__main__':
if len(sys.argv) != 2:
sys.exit("usage: fasta_filter path/to/fasta/file/to/read")
input_path = sys.argv[1]
-
- for seq in fasta_iter(input_path):
- if seq.sequence.startswith('M') and seq.sequence.count('W') > 6:
- if os.path.exists(seq.id):
- print >> sys.stderr , "file {0} already exist: sequence {0} skipped".format(seq.id)
- continue
- else:
- output_fasta = seq.id + ".fa"
- fasta_writer(seq, output_fasta)
\ No newline at end of file
+
+ with open(input_path, 'r') as input_file:
+ for seq in fasta_iter(input_path):
+ if seq.sequence.startswith('M') and seq.sequence.count('W') > 6:
+ if os.path.exists(seq.id):
+ print >> sys.stderr , "file {0} already exist: sequence {0} skipped".format(seq.id)
+ continue
+ else:
+ output_fasta = seq.id + ".fa"
+ with open(output_path, 'w') as output_file:
+ fasta_writer(seq, output_file)
\ No newline at end of file
diff --git a/source/_static/code/fasta_iterator.py b/source/_static/code/fasta_iterator.py
index 65b2a40..b613157 100644
--- a/source/_static/code/fasta_iterator.py
+++ b/source/_static/code/fasta_iterator.py
@@ -3,7 +3,7 @@ from itertools import groupby
Sequence = namedtuple("Sequence", "id comment sequence")
-def fasta_iter(fasta_path):
+def fasta_iter(fasta_file):
"""
:param fasta_file: the file containing all input sequences in fasta format.
:type fasta_file: file object
@@ -30,4 +30,9 @@ def fasta_iter(fasta_path):
#f.next()
#or
# for seq in fasta_iter('seq.fasta'):
-# do something with seq
\ No newline at end of file
+# do something with seq
+# The problem with this implementation is that
+# if something goes wrong in "do something with seq"
+# and we do not quit the program (we catch the exception, for instance),
+# the fasta file is still open.
+# It is better to open the fasta file outside the fasta reader; see fasta_filter.py
\ No newline at end of file
diff --git a/source/_static/code/fasta_reader.py b/source/_static/code/fasta_reader.py
index cc9a8a0..fe4da17 100644
--- a/source/_static/code/fasta_reader.py
+++ b/source/_static/code/fasta_reader.py
@@ -21,4 +21,4 @@ def fasta_reader(fasta_path):
in_sequence = True
else:
sequence += line.strip()
- return Sequence(id_ , comment, sequence)
\ No newline at end of file
+ return Sequence(id_ , comment, sequence)
\ No newline at end of file
diff --git a/source/_static/code/multiple_fasta_reader.py b/source/_static/code/multiple_fasta_reader.py
index 70776e8..6797a79 100644
--- a/source/_static/code/multiple_fasta_reader.py
+++ b/source/_static/code/multiple_fasta_reader.py
@@ -2,7 +2,7 @@ from collections import namedtuple
Sequence = namedtuple("Sequence", "id comment sequence")
-def fasta_reader(fasta_path):
+def fasta_reader(fasta_file):
"""
:param fasta_path: the path to the file to parse
:type fasta_path: string
@@ -10,22 +10,24 @@ def fasta_reader(fasta_path):
:rtype: list of Sequence
"""
sequences = []
- with open(fasta_path, 'r') as fasta_infile:
- id_ = ''
- comment = ''
- sequence = ''
- for line in fasta_infile:
- if line.startswith('>'):
- # a new sequence begin
- if id_ != '':
- # a sequence was already parsed so add it to the list
- sequences.append(Sequence(id_ , comment, sequence))
- sequence = ''
- header = line.split()
- id_ = header[0]
- comment = ' '.join(header[1:])
- else:
- sequence += line.strip()
- #append the last sequence of the file to the list
- sequences.append(Sequence(id_ , comment, sequence))
- return sequences
\ No newline at end of file
+ id_ = ''
+ comment = ''
+ sequence = ''
+ for line in fasta_infile:
+ if line.startswith('>'):
+ # a new sequence begin
+ if id_ != '':
+ # a sequence was already parsed so add it to the list
+ sequences.append(Sequence(id_ , comment, sequence))
+ sequence = ''
+ header = line.split()
+ id_ = header[0]
+ comment = ' '.join(header[1:])
+ else:
+ sequence += line.strip()
+ return Sequence(id_ , comment, sequence)
+
+# If we open the file in the fasta reader, we are forced
+# to read all the sequences and load them into memory, which can take a huge amount of space.
+# It is better to read the sequences one by one and process each one as soon as it is ready.
+# See fasta_filter.py
\ No newline at end of file
diff --git a/source/_static/code/similarity.py b/source/_static/code/similarity.py
index ab45dc9..b46d1a0 100644
--- a/source/_static/code/similarity.py
+++ b/source/_static/code/similarity.py
@@ -1,4 +1,4 @@
-from matrix import *
+import matrix
lines = iter( [' A G C U\n',
'A 1.0 0.5 0.0 0.0\n',
@@ -11,14 +11,14 @@ def parse_similarity_file():
"""
parse file containing RNA similarity matrix and return a matrix
"""
- sim_matrix = matrix_maker(4, 4)
+ sim_matrix = matrix.create(4, 4)
#skip first line
lines.next()
for row_no, line in enumerate(lines):
line = line.strip()
fields = line.split()
values = [float(val) for val in fields[1:]]
- matrix_replace_row(sim_matrix, row_no, values)
+ matrix.replace_row(sim_matrix, row_no, values)
return sim_matrix
def get_similarity(b1, b2, sim_matrix):
@@ -39,7 +39,7 @@ def get_similarity(b1, b2, sim_matrix):
raise KeyError("unknown base b1: " + str(b1))
if not b2 in bases:
raise KeyError("unknown base b2: " + str(b2))
- return matrix_get_cell(sim_matrix, bases[b1], bases[b2])
+ return matrix.get_cell(sim_matrix, bases[b1], bases[b2])
def compute_similarity(seq1, seq2, sim_matrix):
"""
@@ -63,7 +63,7 @@ if __name__ == '__main__':
seq1 = 'AGCAUCUA'
seq2 = 'ACCGUUCU'
sim_matrix = parse_similarity_file()
- print matrix_to_str(sim_matrix)
+ print matrix.to_str(sim_matrix)
similarity = compute_similarity(seq1, seq2, sim_matrix)
print similarity
\ No newline at end of file
--
GitLab