diff --git a/source/Input_Output.rst b/source/Input_Output.rst index f4eae1430a3ab28629a483059ea0ac651b1dc420..c6c79cc8983e4c8e2ce5976dd602e0a884d1a03c 100644 --- a/source/Input_Output.rst +++ b/source/Input_Output.rst @@ -13,29 +13,35 @@ Exercises Exercise -------- -Write a function which take the path of file as parameter -and display it's content on the screen. +Write a function that takes the path of a file as parameter +and displays its content on the screen. -We wait same behavior as the shell *cat* command. :: +We expect the same behavior as the shell ``cat`` command. + +:: import sys import os - + def cat(path): if not os.path.exists(path): - sys.exit("no such file: {0}".format(path) + # Exit Python with a non-zero value + # to signify a failure + sys.exit("no such file: {0}".format(path)) with open(path, 'r') as infile: for line in infile: - print line - + # By default, print adds a "\n" to what it prints + # lines from a file already end with "\n". + print(line, end="") + Exercise -------- -Write a function which take the path of a file in rebase format -and return in a dictionary the collection of the enzyme contains in the file. +Write a function that takes the path of a file in rebase format +and returns in a dictionary the collection of the enzymes contained in the file. The sequence of the binding site must be cleaned up. -use the file :download:`rebase_light.txt <_static/data/rebase_light.txt>` to test your code. +Use the file :download:`rebase_light.txt <_static/data/rebase_light.txt>` to test your code. .. 
literalinclude:: _static/code/rebase.py :linenos: @@ -46,11 +52,11 @@ use the file :download:`rebase_light.txt <_static/data/rebase_light.txt>` to tes Exercise -------- -write a function which take the path of a fasta file (containing only one sequence) -and return a data structure of your choice that allow to stock +Write a function that takes the path of a fasta file +and returns a data structure of your choice that allows you to store the id of the sequence and the sequence itself. -use the file :download:`seq.fasta <_static/data/seq.fasta>` to test your code. +Use the file :download:`seq.fasta <_static/data/seq.fasta>` to test your code. .. literalinclude:: _static/code/fasta_reader.py :linenos: @@ -81,7 +87,7 @@ Write sequences with 80 aa/line Exercise -------- -we ran a blast with the folowing command *blastall -p blastp -d uniprot_sprot -i query_seq.fasta -e 1e-05 -m 8 -o blast2.txt* +We ran a blast with the following command *blastall -p blastp -d uniprot_sprot -i query_seq.fasta -e 1e-05 -m 8 -o blast2.txt* -m 8 is the tabular output. So each fields is separate to the following by a '\t' @@ -114,7 +120,7 @@ Hint: ^^^^^ Use the module csv in python https://docs.python.org/3/library/csv.html#module-csv -use a reader like below :: +Use a reader, as follows:: >>> reader = csv.reader(input, quotechar='"') @@ -134,6 +140,7 @@ use the file :download:`abcd.fasta <_static/data/abcd.fasta>` to test your code. solution 1 ^^^^^^^^^^ + .. literalinclude:: _static/code/multiple_fasta_reader.py :linenos: :language: python @@ -142,6 +149,7 @@ solution 1 solution 2 ^^^^^^^^^^ + .. literalinclude:: _static/code/multiple_fasta_reader2.py :linenos: :language: python @@ -150,6 +158,7 @@ solution 2 solution 3 ^^^^^^^^^^ + .. literalinclude:: _static/code/fasta_iterator.py :linenos: :language: python @@ -162,19 +171,19 @@ if the file is huge (>G0) it can be a problem. The third version allow to red sequences one by one. 
To do that we have to open the file outside the reader function The fasta format is very convenient for human but not for parser. -The end of a sequence is indicated by the end of file or the begining of a new one. +The end of a sequence is indicated by the end of file or the beginning of a new one. So with this version we have play with the cursor to place the cursor backward -when we encouter a new sequence. then the cursor is placed at the right place +when we encounter a new sequence. Then the cursor is placed at the right place for the next sequence. The third version is an iterator and use generator. -generators are functions which keep a state between to calls. -generators does not use return to return a value but the keyword yield. -Thus this implementation retrun sequence by sequence without to play with the cursor. +Generators are functions which keep a state between two calls. +Generators do not use return to return a value but the keyword yield. +Thus this implementation returns sequence by sequence without having to play with the cursor. You can call this function and put in in a loop or call next. Work with the sequence and pass to the next sequence on so on. -for instance which is a very convenient way to use it: :: +For instance, a very convenient way to use it is:: for seq in fasta_iter('my_fast_file.fasta'): print seq diff --git a/source/Object_Oriented_Programming.rst b/source/Object_Oriented_Programming.rst index c063e63f891046200b379b58887320ca1a023ded..4bf0d8b6e98b9061930ccb3c5f054d59a01dd50b 100644 --- a/source/Object_Oriented_Programming.rst +++ b/source/Object_Oriented_Programming.rst @@ -116,6 +116,9 @@ A tutorial is available https://biopython.org/wiki/SeqIO :: print("sequence =", sv40_rcd.seq) +Another example of usage of ``SeqIO``: :download:`seq_io.py <_static/code/seq_io.py>` + + Exercise -------- @@ -228,4 +231,4 @@ What is the benefit to use oop style instead of procedural style? 
:linenos: :language: python -:download:`fasta_object.py <_static/code/fasta_object.py>` . \ No newline at end of file +:download:`fasta_object.py <_static/code/fasta_object.py>` . diff --git a/source/_static/code/rebase.py b/source/_static/code/rebase.py index 030125401c01e043baed19266b583210f99b36c3..7fcc0b9b55d45bdb488c76cd1f6afb87b9bf935b 100644 --- a/source/_static/code/rebase.py +++ b/source/_static/code/rebase.py @@ -1,9 +1,10 @@ +#!/usr/bin/env python3 def rebase_parser(rebase_file): """ :param rebase_file: the rebase file to parse :type rebase_file: file object - :return: at each call return a tuple (str enz name, str binding site) + :return: at each call yields a tuple (str enz name, str binding site) :rtype: iterator """ def clean_seq(seq): @@ -15,25 +16,38 @@ def rebase_parser(rebase_file): if char in 'ACGT': clean_seq += char return clean_seq - + for line in rebase_file: fields = line.split() - #fields = fields.split() name = fields[0] seq = clean_seq(fields[2]) yield (name, seq) - - + + +def rebase2dict(rebase_path): + """ + :param rebase_path: the path to rebase file to parse + :type rebase_path: str + :return: a dict with items (str enz name, str binding site) + """ + with open(rebase_path, 'r') as rebase_input: + # enz_dict = {} + # for (name, seq) in rebase_parser(rebase_input): + # enz_dict[name] = seq + enz_dict = dict(rebase_parser(rebase_input)) + return enz_dict + + if __name__ == '__main__': import sys import os.path - + if len(sys.argv) != 2: - sys.exit("usage multiple_fasta fasta_path") + sys.exit("Usage: rebase.py rebase_file") rebase_path = sys.argv[1] if not os.path.exists(rebase_path): sys.exit("No such file: {}".format(rebase_path)) - - with open(rebase_path, 'r') as rebase_input: - for enz in rebase_parser(rebase_input): - print enz \ No newline at end of file + + enz_dict = rebase2dict(rebase_path) + print(enz_dict) + diff --git a/source/_static/code/seq_io.py b/source/_static/code/seq_io.py new file mode 100644 index 
0000000000000000000000000000000000000000..84de0cb2786c07f38bf7f1e8434ee42b8cfd5a0c --- /dev/null +++ b/source/_static/code/seq_io.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python3 +"""Example of use of SeqIO from biopython + +Here, we parse a fasta file, put the records in a list, search for a motif in +the sequence of the first record, and create a subsequence around this motif. +""" + +from Bio import SeqIO + +records = list(SeqIO.parse(fasta_filename, "fasta")) +records +# [SeqRecord(seq=Seq('GCCTCGGCCTCTGCATAAATAAAAAAAATTAGTCAGCCATGGGGCGGAGAATGG...GCG', SingleLetterAlphabet()), id='gi|965480|gb|J02400.1|SV4CG', name='gi|965480|gb|J02400.1|SV4CG', description='gi|965480|gb|J02400.1|SV4CG Simian virus 40 complete genome', dbxrefs=[])] +sequence = records[0].seq +sequence +# Seq('GCCTCGGCCTCTGCATAAATAAAAAAAATTAGTCAGCCATGGGGCGGAGAATGG...GCG', SingleLetterAlphabet()) +motif = "TAAAT" +sequence.find(motif) +# 15 +motif_pos = sequence.find(motif) +subseq_start = motif_pos - 10 +subseq_end = motif_pos + len(motif) + 11 +subseq = sequence[subseq_start:subseq_end] +subseq +# Seq('GGCCTCTGCATAAATAAAAAAAATTA', SingleLetterAlphabet()) + diff --git a/source/_static/code/translate.py b/source/_static/code/translate.py index 288a0659da8990079d3e8958f7946b125aabd0d3..42168ea403672d2791cab617b8811da9d6454d92 100644 --- a/source/_static/code/translate.py +++ b/source/_static/code/translate.py @@ -1,4 +1,4 @@ -genetic_code = { 'ttt': 'F', 'tct': 'S', 'tat': 'Y', 'tgt': 'C', +genetic_code = { 'ttt': 'F', 'tct': 'S', 'tat': 'Y', 'tgt': 'C', 'ttc': 'F', 'tcc': 'S', 'tac': 'Y', 'tgc': 'C', 'tta': 'L', 'tca': 'S', 'taa': '*', 'tga': '*', 'ttg': 'L', 'tcg': 'S', 'tag': '*', 'tgg': 'W', @@ -23,7 +23,11 @@ def translate(nuc_seq, code): # to avoid to compute len(seq)/3 at each loop # I compute it once and use a reference # it could be expensive if the sequence is very long. 
- cycle = len(nuc_seq)/3 + + # another way to determine the end of looping + # stop_iteration = len(nuc_seq) + # while (start + 2) < stop_iteration: + cycle = len(nuc_seq)//3 while n < cycle: start = n * 3 end = start + 3 @@ -34,23 +38,18 @@ def translate(nuc_seq, code): else: raise RuntimeError("unknow codon: " + codon) n += 1 + # if use the other looping solution + # n += 3 return prot_seq def translate2(nuc_seq, code, phase = 1): prot_seq = '' if 0 < phase < 4 : start = phase - 1 + nuc_seq = nuc_seq[start:] elif -4 < phase < 0: start = -phase - 1 nuc_seq = nuc_seq[::-1] - # an other way to determine the end of looping - stop_iteration = len(nuc_seq) - while (start + 2) < stop_iteration: - end = start + 3 - codon = nuc_seq[start:end].lower() - if codon in code: - prot_seq += code[codon] - else: - raise RuntimeError("unknow codon") - start += 3 + nuc_seq = nuc_seq[start:] + prot_seq = translate(nuc_seq, code) return prot_seq