Verified Commit ec869e63 authored by Bertrand  NÉRON's avatar Bertrand NÉRON
Browse files

add exercises using biopython

parent 6c4218f1
......@@ -93,6 +93,85 @@ The following python code provides an example of the expected behaviour of objec
:download:`point.py <_static/code/point.py>` .
Exercise
--------
Use Biopython to read a FASTA file (:download:`sv40.fasta <_static/data/sv40.fasta>`)
and display the following attributes of the record:
* id
* name
* description
* seq
Use the ``SeqIO`` module of Biopython.
A tutorial is available at https://biopython.org/wiki/SeqIO ::
from Bio import SeqIO
sv40_rcd = SeqIO.read("sv40.fasta", "fasta")
print("id =", sv40_rcd.id)
print("name =", sv40_rcd.name)
print("description =", sv40_rcd.description)
print("sequence =", sv40_rcd.seq)
Exercise
--------
Translate the sequence in phases (reading frames) 1, 2 and -2 ::
sv40_seq_phase1 = sv40_rcd.seq
sv40_seq_phase2 = sv40_rcd[1:]
sv40_seq_phase_2 = sv40_rcd[1:].reverse_complement(id=True)
Exercise
--------
* Create a sequence with the first 42 nucleotides
* Translate this sequence
* Mutate the nucleotide in position 18 'A' -> 'C'
* Translate the mutated sequence
See the tutorial: http://biopython.org/DIST/docs/tutorial/Tutorial.html#htoc28 ::
short_seq = sv40_seq_phase2[0:42]
short_seq.translate()
mutable_seq = short_seq.seq.tomutable()
mutable_seq[19] = 'C'
mutate_seq = mutable_seq.toseq()
mutate_seq.translate()
Exercise
--------
Open the file abcd.fasta (:download:`abcd.fasta <_static/data/abcd.fasta>`) and convert it to GenBank format.
**Hint**:
the ``alphabet`` attribute of each sequence must be set to ``extended_protein``;
see the ``Bio.Alphabet.IUPAC`` module ::
from Bio.Alphabet.IUPAC import extended_protein
with open("abcd.fasta", "r") as fasta, open('abcd.gb', 'w') as genbank:
for record in SeqIO.parse(fasta, "fasta"):
record.seq.alphabet = extended_protein
print(len(record.seq))
SeqIO.write(record, genbank, 'genbank')
Exercise
--------
Open the file abcd.fasta (:download:`abcd.fasta <_static/data/abcd.fasta>`) and filter out the sequences whose length is <= 700.
Write the remaining sequences to a FASTA file ::
with open("abcd.fasta", "r") as input, open("abcd_short.fasta", "w") as output:
for record in SeqIO.parse(input, "fasta"):
if len(record.seq) > 700:
SeqIO.write(record, output, 'fasta')
Exercise
--------
......
# Worked solutions for the Biopython exercises.
# Reads sv40.fasta and abcd.fasta from the current directory and writes
# abcd.gb and abcd_short.fasta next to them.
from Bio import SeqIO

# Read the single record stored in sv40.fasta and display its main attributes.
sv40_rcd = SeqIO.read("sv40.fasta", "fasta")
print("id =", sv40_rcd.id)
print("name =", sv40_rcd.name)
print("description =", sv40_rcd.description)
print("sequence =", sv40_rcd.seq)

# Translate the sequence in phase 1, 2, -2.
# NOTE(review): the statements below only build the shifted / reverse-complemented
# sequences; no translate() call is made on them here -- confirm whether that is intended.
sv40_seq_phase1 = sv40_rcd.seq
sv40_seq_phase2 = sv40_rcd[1:]
sv40_seq_phase_2 = sv40_rcd[1:].reverse_complement(id=True)

# Create a sequence with the first 42 nucleotides in phase 2,
# translate this sequence,
# mutate nucleotide 18 'A' -> 'C',
# and translate the mutated sequence.
short_seq = sv40_seq_phase2[0:42]
# A bare expression: its value is only echoed in an interactive session.
short_seq.translate()
# Mutation is done on a mutable copy obtained from the sequence.
mutable_seq = short_seq.seq.tomutable()
# NOTE(review): the exercise text says nucleotide 18 but index 19 is assigned here --
# confirm which position is intended.
mutable_seq[19] = 'C'
mutate_seq = mutable_seq.toseq()
mutate_seq.translate()

# Convert abcd.fasta to GenBank format.
# The alphabet of every sequence is set to extended_protein before the record is written.
from Bio.Alphabet.IUPAC import extended_protein
with open("abcd.fasta", "r") as fasta, open('abcd.gb', 'w') as genbank:
    for record in SeqIO.parse(fasta, "fasta"):
        record.seq.alphabet = extended_protein
        SeqIO.write(record, genbank, 'genbank')

# Keep only the sequences longer than 700 and write them to abcd_short.fasta.
# The file handles are not called `input`/`output`, so the built-in input() stays usable.
with open("abcd.fasta", "r") as fasta_in, open("abcd_short.fasta", "w") as fasta_out:
    for record in SeqIO.parse(fasta_in, "fasta"):
        if len(record.seq) > 700:
            SeqIO.write(record, fasta_out, 'fasta')
>gi|965480|gb|J02400.1|SV4CG Simian virus 40 complete genome
GCCTCGGCCTCTGCATAAATAAAAAAAATTAGTCAGCCATGGGGCGGAGAATGGGCGGAACTGGGCGGAG
TTAGGGGCGGGATGGGCGGAGTTAGGGGCGGGACTATGGTTGCTGACTAATTGAGATGCATGCTTTGCAT
ACTTCTGCCTGCTGGGGAGCCTGGGGACTTTCCACACCTGGTTGCTGACTAATTGAGATGCATGCTTTGC
ATACTTCTGCCTGCTGGGGAGCCTGGGGACTTTCCACACCCTAACTGACACACATTCCACAGCTGGTTCT
TTCCGCCTCAGAAGGTACCTAACCAAGTTCCTCTTTCAGAGGTTATTTCAGGCCATGGTGCTGCGCCGGC
TGTCACGCCAGGCCTCCGTTAAGGTTCGTAGGTCATGGACTGAAAGTAAAAAAACAGCTCAACGCCTTTT
TGTGTTTGTTTTAGAGCTTTTGCTGCAATTTTGTGAAGGGGAAGATACTGTTGACGGGAAACGCAAAAAA
CCAGAAAGGTTAACTGAAAAACCAGAAAGTTAACTGGTAAGTTTAGTCTTTTTGTCTTTTATTTCAGGTC
CATGGGTGCTGCTTTAACACTGTTGGGGGACCTAATTGCTACTGTGTCTGAAGCTGCTGCTGCTACTGGA
TTTTCAGTAGCTGAAATTGCTGCTGGAGAGGCCGCTGCTGCAATTGAAGTGCAACTTGCATCTGTTGCTA
CTGTTGAAGGCCTAACAACCTCTGAGGCAATTGCTGCTATAGGCCTCACTCCACAGGCCTATGCTGTGAT
ATCTGGGGCTCCTGCTGCTATAGCTGGATTTGCAGCTTTACTGCAAACTGTGACTGGTGTGAGCGCTGTT
GCTCAAGTGGGGTATAGATTTTTTAGTGACTGGGATCACAAAGTTTCTACTGTTGGTTTATATCAACAAC
CAGGAATGGCTGTAGATTTGTATAGGCCAGATGATTACTATGATATTTTATTTCCTGGAGTACAAACCTT
TGTTCACAGTGTTCAGTATCTTGACCCCAGACATTGGGGTCCAACACTTTTTAATGCCATTTCTCAAGCT
TTTTGGCGTGTAATACAAAATGACATTCCTAGGCTCACCTCACAGGAGCTTGAAAGAAGAACCCAAAGAT
ATTTAAGGGACAGTTTGGCAAGGTTTTTAGAGGAAACTACTTGGACAGTAATTAATGCTCCTGTTAATTG
GTATAACTCTTTACAAGATTACTACTCTACTTTGTCTCCCATTAGGCCTACAATGGTGAGACAAGTAGCC
AACAGGGAAGGGTTGCAAATATCATTTGGGCACACCTATGATAATATTGATGAAGCAGACAGTATTCAGC
AAGTAACTGAGAGGTGGGAAGCTCAAAGCCAAAGTCCTAATGTGCAGTCAGGTGAATTTATTGAAAAATT
TGAGGCTCCTGGTGGTGCAAATCAAAGAACTGCTCCTCAGTGGATGTTGCCTTTACTTCTAGGCCTGTAC
GGAAGTGTTACTTCTGCTCTAAAAGCTTATGAAGATGGCCCCAACAAAAAGAAAAGGAAGTTGTCCAGGG
GCAGCTCCCAAAAAACCAAAGGAACCAGTGCAAGTGCCAAAGCTCGTCATAAAAGGAGGAATAGAAGTTC
TAGGAGTTAAAACTGGAGTAGACAGCTTCACTGAGGTGGAGTGCTTTTTAAATCCTCAAATGGGCAATCC
TGATGAACATCAAAAAGGCTTAAGTAAAAGCTTAGCAGCTGAAAAACAGTTTACAGATGACTCTCCAGAC
AAAGAACAACTGCCTTGCTACAGTGTGGCTAGAATTCCTTTGCCTAATTTAAATGAGGACTTAACCTGTG
GAAATATTTTGATGTGGGAAGCTGTTACTGTTAAAACTGAGGTTATTGGGGTAACTGCTATGTTAAACTT
GCATTCAGGGACACAAAAAACTCATGAAAATGGTGCTGGAAAACCCATTCAAGGGTCAAATTTTCATTTT
TTTGCTGTTGGTGGGGAACCTTTGGAGCTGCAGGGTGTGTTAGCAAACTACAGGACCAAATATCCTGCTC
AAACTGTAACCCCAAAAAATGCTACAGTTGACAGTCAGCAGATGAACACTGACCACAAGGCTGTTTTGGA
TAAGGATAATGCTTATCCAGTGGAGTGCTGGGTTCCTGATCCAAGTAAAAATGAAAACACTAGATATTTT
GGAACCTACACAGGTGGGGAAAATGTGCCTCCTGTTTTGCACATTACTAACACAGCAACCACAGTGCTTC
TTGATGAGCAGGGTGTTGGGCCCTTGTGCAAAGCTGACAGCTTGTATGTTTCTGCTGTTGACATTTGTGG
GCTGTTTACCAACACTTCTGGAACACAGCAGTGGAAGGGACTTCCCAGATATTTTAAAATTACCCTTAGA
AAGCGGTCTGTGAAAAACCCCTACCCAATTTCCTTTTTGTTAAGTGACCTAATTAACAGGAGGACACAGA
GGGTGGATGGGCAGCCTATGATTGGAATGTCCTCTCAAGTAGAGGAGGTTAGGGTTTATGAGGACACAGA
GGAGCTTCCTGGGGATCCAGACATGATAAGATACATTGATGAGTTTGGACAAACCACAACTAGAATGCAG
TGAAAAAAATGCTTTATTTGTGAAATTTGTGATGCTATTGCTTTATTTGTAACCATTATAAGCTGCAATA
AACAAGTTAACAACAACAATTGCATTCATTTTATGTTTCAGGTTCAGGGGGAGGTGTGGGAGGTTTTTTA
AAGCAAGTAAAACCTCTACAAATGTGGTATGGCTGATTATGATCATGAACAGACTGTGAGGACTGAGGGG
CCTGAAATGAGCCTTGGGACTGTGAATCAATGCCTGTTTCATGCCCTGAGTCTTCCATGTTCTTCTCCCC
ACCATCTTCATTTTTATCAGCATTTTCCTGGCTGTCTTCATCATCATCATCACTGTTTCTTAGCCAATCT
AAAACTCCAATTCCCATAGCCACATTAAACTTCATTTTTTGATACACTGACAAACTAAACTCTTTGTCCA
ATCTCTCTTTCCACTCCACAATTCTGCTCTGAATACTTTGAGCAAACTCAGCCACAGGTCTGTACCAAAT
TAACATAAGAAGCAAAGCAATGCCACTTTGAATTATTCTCTTTTCTAACAAAAACTCACTGCGTTCCAGG
CAATGCTTTAAATAATCTTTGGGCCTAAAATCTATTTGTTTTACAAATCTGGCCTGCAGTGTTTTAGGCA
CACTGTACTCATTCATGGTGACTATTCCAGGGGGAAATATTTGAGTTCTTTTATTTAGGTGTTTCTTTTC
TAAGTTTACCTTAACACTGCCATCCAAATAATCCCTTAAATTGTCCAGGTTATTAATTCCCTGACCTGAA
GGCAAATCTCTGGACTCCCCTCCAGTGCCCTTTACATCCTCAAAAACTACTAAAAACTGGTCAATAGCTA
CTCCTAGCTCAAAGTTCAGCCTGTCCAAGGGCAAATTAACATTTAAAGCTTTCCCCCCACATAATTCAAG
CAAAGCAGCTGCTAATGTAGTTTTACCACTATCAATTGGTCCTTTAAACAGCCAGTATCTTTTTTTAGGA
ATGTTGTACACCATGCATTTTAAAAAGTCATACACCACTGAATCCATTTTGGGCAACAAACAGTGTAGCC
AAGCAACTCCAGCCATCCATTCTTCTATGTCAGCAGAGCCTGTAGAACCAAACATTATATCCATCCTATC
CAAAAGATCATTAAATCTGTTTGTTAACATTTGTTCTCTAGTTAATTGTAGGCTATCAACCCGCTTTTTA
GCTAAAACAGTATCAACAGCCTGTTGGCATATGGTTTTTTGGTTTTTGCTGTCAGCAAATATAGCAGCAT
TTGCATAATGCTTTTCATGGTACTTATAGTGGCTGGGCTGTTCTTTTTTAATACATTTTAAACACATTTC
AAAACTGTACTGAAATTCCAAGTACATCCCAAGCAATAACAACACATCATCACATTTTGTTTCCATTGCA
TACTCTGTTACAAGCTTCCAGGACACTTGTTTAGTTTCCTCTGCTTCTTCTGGATTAAAATCATGCTCCT
TTAACCCACCTGGCAAACTTTCCTCAATAACAGAAAATGGATCTCTAGTCAAGGCACTATACATCAAATA
TTCCTTATTAACCCCTTTACAAATTAAAAAGCTAAAGGTACACAATTTTTGAGCATAGTTATTAATAGCA
GACACTCTATGCCTGTGTGGAGTAAGAAAAAACAGTATGTTATGATTATAACTGTTATGCCTACTTATAA
AGGTTACAGAATATTTTTCCATAATTTTCTTGTATAGCAGTGCAGCTTTTTCCTTTGTGGTGTAAATAGC
AAAGCAAGCAAGAGTTCTATTACTAAACACAGCATGACTCAAAAAACTTAGCAATTCTGAAGGAAAGTCC
TTGGGGTCTTCTACCTTTCTCTTCTTTTTTGGAGGAGTAGAATGTTGAGAGTCAGCAGTAGCCTCATCAT
CACTAGATGGCATTTCTTCTGAGCAAAACAGGTTTTCCTCATTAAAGGCATTCCACCACTGCTCCCATTC
ATCAGTTCCATAGGTTGGAATCTAAAATACACAAACAATTAGAATCAGTAGTTTAACACATTATACACTT
AAAAATTTTATATTTACCTTAGAGCTTTAAATCTCTGTAGGTAGTTTGTCCAATTATGTCACACCACAGA
AGTAAGGTTCCTTCACAAAGATCAAGTCCAAACCACATTCTAAAGCAATCGAAGCAGTAGCAATCAACCC
ACACAAGTGGATCTTTCCTGTATAATTTTCTATTTTCATGCTTCATCCTCAGTAAGCACAGCAAGCATAT
GCAGTTAGCAGACATTTTCTTTGCACACTCAGGCCATTGTTTGCAGTACATTGCATCAACACCAGGATTT
AAGGAAGAAGCAAATACCTCAGTTGCATCCCAGAAGCCTCCAAAGTCAGGTTGATGAGCATATTTTACTC
CATCTTCCATTTTCTTGTACAGAGTATTCATTTTCTTCATTTTTTCTTCATCTCCTCCTTTATCAGGATG
AAACTCCTTGCATTTTTTTAAATATGCCTTTCTCATCAGAGGAATATTCCCCCAGGCACTCCTTTCAAGA
CCTAGAAGGTCCATTAGCTGCAAAGATTCCTCTCTGTTTAAAACTTTATCCATCTTTGCAAAGCTTTTTG
CAAAAGCCTAGGCCTCCAAAAAAGCCTCCTCACTACTTCTGGAATAGCTCAGAGGCCGAGGCG
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment