From e7ea101ef897ef3316944932a27779bcefe95bae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bertrand=20N=C3=A9ron?= <bneron@pasteur.fr> Date: Tue, 12 Aug 2014 23:56:53 +0200 Subject: [PATCH] simplify exercise on searching restriction enzyme site in dna --- source/Collection_Data_Types.rst | 61 +++++++++----------------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/source/Collection_Data_Types.rst b/source/Collection_Data_Types.rst index 7a4e7b7..2584f1c 100644 --- a/source/Collection_Data_Types.rst +++ b/source/Collection_Data_Types.rst @@ -203,62 +203,35 @@ and the 2 dna fragments: :: dna_1 = dna_1.replace('\n', '') dans_2 = dna_2.replace('\n', '') - + +algorithm 1 :: + enzymes = [ecor1, ecor5, bamh1, hind3, taq1, not1, sau3a1, hae3, sma1] digest_1 = [] for enz in enzymes: - pos = dna_1.find(enz.sequence) - if pos != -1: + if enz.sequence in dna_1: digest_1.append(enz) -with this first algorithm we find if an enzyme cut the dna but we cannot find all cuts in the dna for an enzyme. -If we find a cutting site, we must search again starting at the first nucleotid after the begining of the match -until the end of the the dna, for this we use the start parameter of the find function, and so on. 
-As we don't know how many loop we need to scan the dna until the end we use a ``while`` loop testing for the presence of a cutting site.:: - +algorithm 2: we can determine the position of the site :: + + enzymes = [ecor1, ecor5, bamh1, hind3, taq1, not1, sau3a1, hae3, sma1] digest_1 = [] for enz in enzymes: pos = dna_1.find(enz.sequence) - while pos != -1: + if pos != -1: digest_1.append(enz) - pos = dna_1.find(enz.sequence, pos + 1) - - digest_2 = [] - for enz in enzymes: - pos = dna_2.find(enz.sequence) - while pos != -1: - digest_2.append(enz) - pos = dna_2.find(enz.sequence, pos + 1) - - cut_dna_1 = set(digest_1) - cut_dna_2 = set(digest_2) - cut_dna_1_not_dna_2 = cut_dna_1 - cut_dna_2 - -If we want also the position, for instance to compute the fragments of dna. :: + +with these algorithms we find whether an enzyme cuts the dna, but we cannot find all the cuts in the dna for an enzyme. :: + + enzymes = [ecor1, ecor5, bamh1, hind3, taq1, not1, sau3a1, hae3, sma1] digest_1 = [] for enz in enzymes: - pos = dna_1.find(enz.sequence) - while pos != -1: - digest_1.append((enz, pos)) - pos = dna_1.find(enz.sequence, pos + 1) - - from operator import itemgetter - digest_1.sort(key=itemgetter(1)) - [(e.name, d) for e, d in digest_1] - - digest_2 = [] - for enz in enzymes: - pos = dna_2.find(enz.sequence) - while pos != -1: - digest_2.append((enz, pos)) - pos = dna_2.find(enz.sequence, pos + 1) - - cut_dna_1 = set([e.name for e in digest_1]) - cut_dna_2 = set([e.name for e in digest_2]) - cut_dna_1_not_dna_2 = cut_dna_1 - cut_dna_2 - - + print enz.name, dna_1.count(enz.sequence) + +the latter algorithm displays the number of occurrences of each enzyme site, but we cannot determine the position of every site. +We will see how to do this later. + Exercise -- GitLab