Collection_Data_Types.rst

# Reading an item binds a name to the object stored in the list; rebinding
# that name afterwards leaves the list untouched.
x = [1, 2, 3, 4]
y = x[1]  # y refers to the int 2
y = 3.14  # rebinds y only; x is still [1, 2, 3, 4]
x[1] = 'foo'  # item assignment changes the list itself: [1, 'foo', 3, 4]
# Augmented assignment with a list appends the new items to x.
x = [1, 2, 3, 4]
x += [5, 6]  # x is now [1, 2, 3, 4, 5, 6]
>>> x = [1, 2, 3, 4]
>>> id(x)
139950507563632
>>> x += [5,6]
>>> id(x)
139950507563632
x = [1, 2, 3, 4]
x[3] = -4 # what is the value of x now ?
y = sum(x)/len(x) #what is the value of y ? why ?

# Answer: x is [1, 2, 3, -4], so sum(x) is 2 and len(x) is 4. Both operands
# are ints, and under Python 2 `/` between two ints is floor division, hence:
y = 0
# Converting the operands to float yields the true mean of x (0.5).
float(sum(x)) / float(len(x))
# y is bound to the same inner list object that x[1] holds; no copy is made.
x = [1, ['a','b','c'], 3, 4]
y = x[1]
y[2] = 'z'  # mutates the shared inner list, so the change is visible via x
# what is the value of x ?
x = [1, ['a','b','z'], 3, 4]  # the answer, written out as a literal
# Split the integers 1..9 into the items found at even and at odd indexes.
l = list(range(1, 10))
l1 = l[0::2]  # every second item, starting at index 0
l2 = l[1::2]  # every second item, starting at index 1
# Build every codon: all ordered triplets drawn from the four bases,
# in the order 'aaa', 'aac', ..., 'ttt'.
bases = 'acgt'
codons = []
for a in bases:
   for b in bases:
      for c in bases:
         codon = a + b + c
         codons.append(codon)
>>> l = [5,2,3,2,2,3,5,1]
>>> uniqify(l)
[1, 2, 3, 5]  # is one of the solutions
>>> list(set(l))
import collections

# Record describing one restriction enzyme. Fields, in positional order:
# name, comment, sequence (recognition site), cut, end.
RestrictEnzyme = collections.namedtuple("RestrictEnzyme", "name comment sequence cut end")

ecor1 = RestrictEnzyme(
   name="EcoRI",
   comment="Ecoli restriction enzime I",
   sequence="gaattc",
   cut=1,
   end="sticky",
)
ecor5 = RestrictEnzyme(
   name="EcoRV",
   comment="Ecoli restriction enzime V",
   sequence="gatatc",
   cut=3,
   end="blunt",
)
bamh1 = RestrictEnzyme(
   name="BamHI",
   comment="type II restriction endonuclease from Bacillus amyloliquefaciens ",
   sequence="ggatcc",
   cut=1,
   end="sticky",
)
hind3 = RestrictEnzyme(
   name="HindIII",
   comment="type II site-specific nuclease from Haemophilus influenzae",
   sequence="aagctt",
   cut=1,
   end="sticky",
)
taq1 = RestrictEnzyme(
   name="TaqI",
   comment="Thermus aquaticus",
   sequence="tcga",
   cut=1,
   end="sticky",
)
not1 = RestrictEnzyme(
   name="NotI",
   comment="Nocardia otitidis",
   sequence="gcggccgc",
   cut=2,
   end="sticky",
)
sau3a1 = RestrictEnzyme(
   name="Sau3aI",
   comment="Staphylococcus aureus",
   sequence="gatc",
   cut=0,
   end="sticky",
)
hae3 = RestrictEnzyme(
   name="HaeIII",
   comment="Haemophilus aegyptius",
   sequence="ggcc",
   cut=2,
   end="blunt",
)
sma1 = RestrictEnzyme(
   name="SmaI",
   comment="Serratia marcescens",
   sequence="cccggg",
   cut=3,
   end="blunt",
)
# Two DNA fragments, written over several lines for readability.
dna_1 = """tcgcgcaacgtcgcctacatctcaagattcagcgccgagatccccgggggttgagcgatccccgtcagttggcgtgaattcag
cagcagcgcaccccgggcgtagaattccagttgcagataatagctgatttagttaacttggatcacagaagcttccaga
ccaccgtatggatcccaacgcactgttacggatccaattcgtacgtttggggtgatttgattcccgctgcctgccagg"""

dna_2 = """gagcatgagcggaattctgcatagcgcaagaatgcggccgcttagagcgatgctgccctaaactctatgcagcgggcgtgagg
attcagtggcttcagaattcctcccgggagaagctgaatagtgaaacgattgaggtgttgtggtgaaccgagtaag
agcagcttaaatcggagagaattccatttactggccagggtaagagttttggtaaatatatagtgatatctggcttg"""

# Strip the line breaks from BOTH sequences: a newline left inside a sequence
# would hide any recognition site that straddles two lines of the literal.
dna_1 = dna_1.replace('\n', '')
dna_2 = dna_2.replace('\n', '')

# Candidate restriction enzymes to try against the sequences.
enzymes = [
   ecor1, ecor5, bamh1,
   hind3, taq1, not1,
   sau3a1, hae3, sma1,
]

# digest_1: the enzymes whose recognition site is found in dna_1, each listed
# once, in the order of `enzymes`.
digest_1 = [enz for enz in enzymes if dna_1.find(enz.sequence) != -1]
def _all_hits(dna):
   """Return the matching enzyme once per occurrence of its site in dna.

   Enzymes are visited in the order of `enzymes`; the search restarts one
   character after each hit, so overlapping occurrences are counted too.
   """
   hits = []
   for candidate in enzymes:
      start = dna.find(candidate.sequence)
      while start != -1:
         hits.append(candidate)
         start = dna.find(candidate.sequence, start + 1)
   return hits


digest_1 = _all_hits(dna_1)
digest_2 = _all_hits(dna_2)

# Collapse repeated hits, then keep the enzymes that cut dna_1 but not dna_2.
cut_dna_1 = set(digest_1)
cut_dna_2 = set(digest_2)
cut_dna_1_not_dna_2 = cut_dna_1.difference(cut_dna_2)
# Record every occurrence of a recognition site in dna_1 as an
# (enzyme, position) pair.
digest_1 = []
for enz in enzymes:
   pos = dna_1.find(enz.sequence)
   while pos != -1:
      digest_1.append((enz, pos))
      # restart one character further so overlapping sites are found too
      pos = dna_1.find(enz.sequence, pos + 1)

from operator import itemgetter
# Order the hits by their position (second tuple item) along the sequence.
digest_1.sort(key=itemgetter(1))
[(e.name, d) for e, d in digest_1]

# Same search on dna_2.
digest_2 = []
for enz in enzymes:
   pos = dna_2.find(enz.sequence)
   while pos != -1:
      digest_2.append((enz, pos))
      pos = dna_2.find(enz.sequence, pos + 1)

# The digest lists hold (enzyme, position) tuples, so each item has to be
# unpacked before its enzyme's name can be read.
cut_dna_1 = {enz.name for enz, _ in digest_1}
cut_dna_2 = {enz.name for enz, _ in digest_2}
# Names of the enzymes that cut dna_1 but not dna_2.
cut_dna_1_not_dna_2 = cut_dna_1 - cut_dna_2
>>> l = [5,2,3,2,2,3,5,1]
>>> uniqify_with_order(l)
[5, 2, 3, 1]
>>> uniq = []
>>> for item in l:
...   if item not in uniq:
...      uniq.append(item)
>>> uniq_items = set()
>>> l_uniq = [x for x in l if x not in uniq_items and not uniq_items.add(x)]
s = """gtcagaccttcctcctcagaagctcacagaaaaacacgctttctgaaagattccacactcaatgccaaaatataccacag
gaaaattttgcaaggctcacggatttccagtgcaccactggctaaccaagtaggagcacctcttctactgccatgaaagg
aaaccttcaaaccctaccactgagccattaactaccatcctgtttaagatctgaaaaacatgaagactgtattgctcctg
atttgtcttctaggatctgctttcaccactccaaccgatccattgaactaccaatttggggcccatggacagaaaactgc
agagaagcataaatatactcattctgaaatgccagaggaagagaacacagggtttgtaaacaaaggtgatgtgctgtctg
gccacaggaccataaaagcagaggtaccggtactggatacacagaaggatgagccctgggcttccagaagacaaggacaa
ggtgatggtgagcatcaaacaaaaaacagcctgaggagcattaacttccttactctgcacagtaatccagggttggcttc
tgataaccaggaaagcaactctggcagcagcagggaacagcacagctctgagcaccaccagcccaggaggcacaggaaac
acggcaacatggctggccagtgggctctgagaggagaaagtccagtggatgctcttggtctggttcgtgagcgcaacaca"""
# Remove the line breaks so that 3-mers spanning two lines are seen.
s = s.replace('\n', '')

# Count every overlapping 3-mer of s.
kmers = {}
# The last 3-mer starts at index len(s) - 3, so the range must stop at
# len(s) - 2 to include it.
for i in range(len(s) - 2):
   kmer = s[i:i+3]
   kmers[kmer] = kmers.get(kmer, 0) + 1

for kmer, occurence in kmers.items():
   # a single pre-formatted argument prints the same under Python 2 and 3
   print("%s  =  %s" % (kmer, occurence))
import collections
from operator import itemgetter

s = s.replace('\n', '')
# defaultdict(int) starts every missing 3-mer at 0, so no .get() is needed.
kmers = collections.defaultdict(int)
# The last 3-mer starts at index len(s) - 3, so the range must stop at
# len(s) - 2 to include it.
for i in range(len(s) - 2):
   kmer = s[i:i+3]
   kmers[kmer] += 1

# sorted() accepts any iterable, so this works whether items() returns a list
# or a view; the pairs come out by increasing count.
list_of_kmers = sorted(kmers.items(), key=itemgetter(1))
for kmer, occurence in list_of_kmers:
   # a single pre-formatted argument prints the same under Python 2 and 3
   print("%s  =  %s" % (kmer, occurence))

solution bonus ::

  # Same ordering with a lambda as the sort key: each item is a
  # (kmer, count) pair and the pairs are ordered by count. sorted() accepts
  # any iterable, so it works whether items() returns a list or a view.
  list_of_kmers = sorted(kmers.items(), key = lambda kmer: kmer[1])
  for kmer, occurence in list_of_kmers:
     # a single pre-formatted argument prints the same under Python 2 and 3
     print("%s  =  %s" % (kmer, occurence))
seq = 'acggcaacatggctggccagtgggctctgagaggagaaagtccagtggatgctcttggtctggttcgtgagcgcaacaca'

# Pairing partner of each base.
base_comp = { 'a' : 't',
              'c' : 'g',
              'g' : 'c',
              't' : 'a'}
# Complement base by base; join() builds the string in a single pass.
complement = ''.join(base_comp[base] for base in seq)

# The reverse complement is the complement read backwards.
reverse_comp = complement[::-1]
print(reverse_comp)
# expected output:
# tgtgttgcgctcacgaaccagaccaagagcatccactggactttctcctctcagagcccactggccagccatgttgccgt
d = {1 : 'a', 2 : 'b', 3 : 'c' , 4 : 'd'}
# expected result: inverted_d == {'a': 1, 'c': 3, 'b': 2, 'd': 4}

# Solution 1: walk the keys and look each value up.
inverted_d = {}
for key in d:
    inverted_d[d[key]] = key

# Solution 2: walk the (key, value) pairs directly.
inverted_d = {}
for key, value in d.items():
    inverted_d[value] = key

# Solution 3: the same thing as a dict comprehension.
inverted_d = {v : k for k, v in d.items()}