# Display the largest element of a list containing only floats or integers.
l = [1, 2, 3, 4, 58, 9]

# The running maximum must be initialised once, *before* the loop: setting it
# inside the loop body would reset it to the first element at every iteration
# and the comparison would then only ever look at the current element.
highest = l[0]
for i in l:
    if i > highest:
        highest = i

# Print the maximum itself (not the loop variable), once the scan is over.
# The parenthesised single-argument form works as a statement in Python 2
# and as a function call in Python 3.
print(highest)
def cat(path):
    """
    Display the content of a file on the standard output, like the shell
    *cat* command.

    :param path: the path of the file to display.
    :type path: string
    :raise SystemExit: with the message ``no such file: <path>`` when *path*
                       does not exist.
    """
    if not os.path.exists(path):
        sys.exit("no such file: {0}".format(path))
    with open(path, 'r') as infile:
        for line in infile:
            # Each line read from the file still carries its own end-of-line
            # character. ``print`` would append a second one and double-space
            # the output, so write the line verbatim instead.
            sys.stdout.write(line)
+The sequence of the binding site must be cleaned up. + +:download:`rebase_light.txt <_static/data/rebase_light.txt>` . + +Exercise +-------- + +Write a function which takes the path of a fasta file +and returns a data structure of your choice that allows you to store +the id of the sequence and the sequence itself. + +:download:`seq.fasta <_static/data/seq.fasta>` . + +solution 1 +"""""""""" +.. literalinclude:: _static/code/fasta_reader.py + :linenos: + :language: python + +:download:`fasta_reader.py <_static/code/fasta_reader.py>` . + +solution 2 +"""""""""" + +.. literalinclude:: _static/code/fasta_iterator.py + :linenos: + :language: python + +:download:`fasta_iterator.py <_static/code/fasta_iterator.py>` . + + +The second version is an iterator, so it returns the sequences one by one. +The advantage of this version is that if the file contains a lot of sequences, you do not have to load the whole file in memory. +You can call this function in a loop, or call next on it: work with one sequence, then move on to the next one, and so on. +For instance: :: + + for seq in fasta_iter('my_fast_file.fasta'): + print seq + +Exercise +-------- + +We ran a blast with the following command *blastall -p blastp -d uniprot_sprot -i query_seq.fasta -e 1e-05 -m 8 -o blast2.txt* + +-m 8 is the tabular output, so each field is separated from the next one by a ``'\t'``. + +The fields are: query id, database sequence (subject) id, percent identity, alignment length, number of mismatches, number of gap openings, +query start, query end, subject start, subject end, Expect value, HSP bit score. + +:download:`blast2.txt <_static/data/blast2.txt>` . + +| parse the file +| sort the hits by their *percent identity* in the descending order. +| write the results in a new file. + +(adapted from *managing your biological data with python* p138) + +.. literalinclude:: _static/code/parse_blast.py + :linenos: + :language: python + +:download:`parse_blast.py <_static/code/parse_blast.py>` . 
# Lightweight record shared by both fasta parsers below.
Sequence = namedtuple("Sequence", "id comment sequence")


def fasta_iter(fasta_path):
    """
    Lazily parse a fasta file, yielding one record at a time so that the
    whole file never has to be held in memory.

    Usage::

        for seq in fasta_iter('seq.fasta'):
            do_something_with(seq)

    :param fasta_path: the path of the file containing the sequences in
                       fasta format.
    :type fasta_path: string
    :author: adapted from http://biostar.stackexchange.com/users/36/brentp
    :return: an iterator which yields ``Sequence`` named tuples
             (string id, string comment, string sequence). The id does not
             include the leading ``>``.
    :rtype: iterator
    """
    with open(fasta_path) as fasta_file:
        # groupby splits the file into alternating runs of header lines and
        # sequence lines; the boolean key tells which kind of run we are on.
        groups = groupby(fasta_file, lambda line: line.startswith(">"))
        for is_header, lines in groups:
            if not is_header:
                # Text found before the first header belongs to no record.
                continue
            # Drop the ">" and split the header into id and free comment.
            # The builtin next() works on both Python 2 and Python 3.
            fields = next(lines)[1:].split()
            seq_id = fields[0] if fields else ''
            comment = ' '.join(fields[1:])
            # The run following a header run is necessarily sequence lines;
            # the default covers a file that ends right after a header.
            _, seq_lines = next(groups, (False, ()))
            seq = ''.join(s.strip() for s in seq_lines)
            yield Sequence(seq_id, comment, seq)


def fasta_reader(fasta_path):
    """
    Read the first record of a fasta file.

    Only the first record is returned: when the file holds several
    sequences, parsing stops at the second header instead of gluing all
    the sequences together under the last id. Use an iterator-based parser
    to walk through a multi-sequence file.

    :param fasta_path: the path of the file in fasta format.
    :type fasta_path: string
    :return: the first record of the file; every field is an empty string
             when the file contains no header line. The id does not
             include the leading ``>``.
    :rtype: ``Sequence`` named tuple (string id, string comment,
            string sequence)
    """
    seq_id = ''
    comment = ''
    # Collect the sequence lines and join them once at the end rather than
    # growing a string line after line.
    chunks = []
    in_sequence = False
    with open(fasta_path, 'r') as fasta_infile:
        for line in fasta_infile:
            if line.startswith('>'):
                if in_sequence:
                    # A second header marks the end of the first record.
                    break
                # Drop the ">" so that it does not end up in the id.
                fields = line[1:].split()
                seq_id = fields[0] if fields else ''
                comment = ' '.join(fields[1:])
                in_sequence = True
            elif in_sequence:
                chunks.append(line.strip())
            # Lines found before the first header are ignored.
    return Sequence(seq_id, comment, ''.join(chunks))


def parse_blast_output(input_file, output_file):
    """
    Sort the hits of a tabular blast report (``blastall -m 8``) by their
    percent identity, in descending order, and write them in a new file.

    Hits with the same percent identity keep their relative input order.
    Blank lines are skipped.

    :param input_file: the path of the tabular blast report to parse.
    :type input_file: string
    :param output_file: the path of the file where to write the sorted hits.
    :type output_file: string
    :raise RuntimeError: when a row has fewer than 3 columns or when its
                         third column (percent identity) is not a number.
    """
    table = []
    with open(input_file, 'r') as infile:
        for line in infile:
            # Strip only the end-of-line marker: slicing off the last
            # character would eat real data when the final line of the file
            # has no trailing newline.
            line = line.rstrip('\r\n')
            if not line:
                continue
            col = line.split('\t')
            try:
                col[2] = float(col[2])
            except (IndexError, ValueError) as err:
                raise RuntimeError("error in parsing {} : {}".format(input_file, err))
            table.append(col)
    # From this point the input_file is closed.
    table_sorted = sorted(table, key=itemgetter(2), reverse=True)
    # alternative:
    # table_sorted = sorted(table, key=lambda x: x[2], reverse=True)
    with open(output_file, 'w') as output:
        for row in table_sorted:
            output.write("\t".join(str(x) for x in row) + "\n")