Skip to content
Snippets Groups Projects
Commit fbd57554 authored by Blaise Li's avatar Blaise Li
Browse files

Creating genome with added mCherry chomosome.

parent 0c812075
Branches
No related tags found
No related merge requests found
...@@ -28,6 +28,7 @@ def main(): ...@@ -28,6 +28,7 @@ def main():
# input files # input files
genes_gtf = OPJ(annot_dir, "genes.gtf") genes_gtf = OPJ(annot_dir, "genes.gtf")
spikein_gtf = OPJ(annot_dir, "spike_ins.gtf") spikein_gtf = OPJ(annot_dir, "spike_ins.gtf")
mCherry_gtf = OPJ(annot_dir, "mCherry.gtf")
dte_bed = OPJ(annot_dir, "DNA_transposons_rmsk.bed") dte_bed = OPJ(annot_dir, "DNA_transposons_rmsk.bed")
rte_bed = OPJ(annot_dir, "RNA_transposons_rmsk.bed") rte_bed = OPJ(annot_dir, "RNA_transposons_rmsk.bed")
satel_bed = OPJ(annot_dir, "satellites_rmsk.bed") satel_bed = OPJ(annot_dir, "satellites_rmsk.bed")
...@@ -37,6 +38,7 @@ def main(): ...@@ -37,6 +38,7 @@ def main():
pd.concat(( pd.concat((
gtf_2_genes_exon_lengths(genes_gtf), gtf_2_genes_exon_lengths(genes_gtf),
spikein_gtf_2_lengths(spikein_gtf), spikein_gtf_2_lengths(spikein_gtf),
gtf_2_genes_exon_lengths(mCherry_gtf),
repeat_bed_2_lengths(dte_bed), repeat_bed_2_lengths(dte_bed),
repeat_bed_2_lengths(rte_bed), repeat_bed_2_lengths(rte_bed),
repeat_bed_2_lengths(satel_bed), repeat_bed_2_lengths(satel_bed),
......
...@@ -163,7 +163,7 @@ def repeat_bed_2_lengths(repeat_bed): ...@@ -163,7 +163,7 @@ def repeat_bed_2_lengths(repeat_bed):
def spikein_gtf_2_lengths(spikein_gtf): def spikein_gtf_2_lengths(spikein_gtf):
"""Computes the lengths of spike-ins, grouped by families. """Computes the lengths of spike-ins, grouped by families.
Returns a DataFrame associating the summed lengths to the spike-in namse. Returns a DataFrame associating the summed lengths to the spike-in names.
""" """
spikein_ends = {} spikein_ends = {}
with open(spikein_gtf) as gtf_file: with open(spikein_gtf) as gtf_file:
......
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
""""""
import sys
from collections import OrderedDict
from mappy import fastx_read
# https://docs.pyfilesystem.org/en/latest/guide.html
from fs.osfs import OSFS as FS
#from fs.tempfs import TempFS as FS
fasta_transgene = """\
>mex5_promoter
ATATCAGTTTTTAAAAAATTAAACCATAAAACAAATAATATAACCCAATTTTTACATCAAACCACAAGAAAAAAATACATTTGGGCCCACGGATAAAGAAATTAAAAAAATACATTTTTTAAAGGCGCACCGAATTAAAATTCATTTGGGTCTTACCGCGTATACCGTACTCCGTTTGTTTGATCATTTTTGTCAGCGCTGGCGGTTGTTTTTTCATTTCATTTCTGCTTCAAAGACGTTTTCTCGAATAATTTTTCGTTTATTCTCTTTTTTAAAATTAATTTCTAGCCGTAAATGTTATAAATTCACCCATTTAACGCAAATTTCATGGTAATCTCATGGAAAAATGCAGTTTCTTTGTTAAAGAAAGCTTAAATAGCAAAAATTCCCCGACTTTCCCCAAAATCCTGCTCGATTTTCCGTTTTCTCATTGTATTCTCTCTTAATTAATTTTATCGATAATCAATTGAATGTTTCAGACAGAGA
>exon_1
ATGGTCTCAAAGGGTGAAGAAGATAACATGGCAATTATTAAAGAGTTTATGCGTTTCAAGGTGCATATGGAGGGATCTGTCAATGGGCATGAGTTTGAAATTGAAGGTGAAGGAGAAGGCCGACCATATGAGGGAACACAAACCGCAAAACTAAAG
>intron_1
GTAAGTTTAAACATATATATACTAACTAACCCTGATTATTTAAATTTTCAG
>exon_2
GTAACTAAAGGCGGACCATTACCATTCGCCTGGGACATCCTCTCTCCACAGTTCATGTATGGAAGTAAAGCTTATGTTAAACATCCGGCAGATATACCAGATTATTTGAAACTTTCATTCCCGGAGGGTTTTAAGTGGGAACGCGTAATGAATTTTGAAGACGGAGGAGTTGTTACAGTGACGCAAGACTCAAG
>intron_2
GTAAGTTTAAACAGTTCGGTACTAACTAACCATACATATTTAAATTTTCAG
>exon_3
CCTCCAAGATGGAGAATTTATTTATAAAGTCAAACTTCGAGGAACGAATTTCCCCTCGGATGGACCTGTTATGCAGAAGAAGACTATGGGATGGGAAGCTTCAAGTGAAAGAATGTACCCTGAAGACGGTGCTCTTAAGGGAGAGATTAAACAACGTCTTAAATTGAAAGATGGAGGACATTACGATGCTGAG
>intron_3
GTAAGTTTAAACATGATTTTACTAACTAACTAATCTGATTTAAATTTTCAG
>exon_4
GTGAAGACAACTTACAAAGCCAAAAAACCAGTTCAGCTGCCAGGAGCGTACAATGTTAATATTAAACTGGATATCACCTCCCACAACGAGGATTACACTATCGTTGAGCAATATGAAAGAGCTGAAGGGCGGCACTCGACAGGTGGCATGGATGAATTGTATAAGG
>his11
ATGCCACCAAAGCCATCTGCCAAGGGAGCCAAGAAGGCCGCCAAGACCGTTACGAAGCCAAAGGACGGAAAGAAGAGACGTCATGCCCGTAAGGAATCATACTCCGTCTACATCTACCGTGTCCTCAAGCAAGTTCATCCAGACACTGGAGTTTCCTCCAAAGCCATGTCTATCATGAACTCTTTTGTCAACGATGTCTTCGAGCGTATTGCTGCTGAAGCATCCCGTCTTGCTCACTACAACAAGCGTTCCACAATCTCATCCCGCGAAATTCAGACCGCTGTCCGTCTGATCCTTCCAGGAGAGCTTGCCAAGCACGCCGTGTCTGAGGGAACCAAGGCCGTTACCAAGTACACTTCCAGCAAGTAA
>tbb2_3UTR
ATGCAAAATCCTTTCAAGCATTCCCTTCTTCTCTATCACTCTTCTTTCTTTTTGTCAAAAAATTCTCTCGCTAATTTATTTGCTTTTTTAATGTTATTATTTTATGACTTTTTATAGTCACTGAAAAGTTTGCATCTGAGTGAAGTGAATGCTATCAAAATGTGATTCTGTCTGATGTACTTTCACAATCTCTCTTCAATTCCATTTTGAAGTGCTTTAAACCCGAAAGGTTGAGAAAAATGCGAGCGCTCAAATATTTGTATTGTGTTCGTTGAGTGACCCAACAAAAAGAGGAAACTTTATTGTGCCGCCAAGAAAAAAGTC
"""
mCherry_transcript_parts = ["exon_1", "intron_1", "exon_2", "intron_2", "exon_3", "intron_3", "exon_4"]
"/pasteur/entites/Mhe/Genomes"
def main():
"""Main function of the program."""
with FS("/pasteur/entites/Mhe/Genomes/transgenes") as fs:
with fs.open("mCherry_transgene_parts.fa", "w") as transgene_file:
transgene_file.write(fasta_transgene)
sequence_parts = OrderedDict((seq_name, seq) for (seq_name, seq, _) in fastx_read(transgene_file.name))
# for (seq_name, seq, _) in fastx_read(fhw.name):
# print(seq_name, seq)
mCherry_seq = "".join(sequence_parts[seq_name] for seq_name in mCherry_transcript_parts)
with fs.open("mCherry_transcript.fa", "w") as transcript_file:
transcript_file.write(f">mCherry\n{mCherry_seq}\n")
with fs.open("mCherry.gtf", "w") as transcript_gtf_file:
annotations = "gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"mCherry\";"
transcript_gtf_file.write("\t".join([
"mCherry", "local", "transcript", "1", str(len(mCherry_seq)),
"0", "+", ".", annotations]))
transcript_gtf_file.write("\n")
with fs.open("mCherry_transgene.gtf", "w") as transgene_gtf_file:
last_pos_in_transgene = 0
last_pos_in_transcript = 0
exon_number = 0
for (seq_name, seq) in sequence_parts.items():
start_in_transgene = last_pos_in_transgene + 1
end_in_transgene = last_pos_in_transgene + len(seq)
if seq_name == "mex5_promoter":
annotations = "gene_id \"mex5\"; transcript_id \"mex5\"; gene_biotype \"transgene\";"
transgene_gtf_file.write("\t".join([
"mCherry_transgene", "local", "UTR", str(start_in_transgene), str(end_in_transgene),
"0", "+", ".", annotations]))
transgene_gtf_file.write("\n")
elif seq_name.startswith("exon"):
start_in_transcript = last_pos_in_transcript + 1
end_in_transcript = last_pos_in_transcript + len(seq)
exon_number += 1
annotations = f"exon_id \"mCherry.e{exon_number}\"; exon_number \"{exon_number}\"; gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"transgene\";"
transgene_gtf_file.write("\t".join([
"mCherry_transgene", "local", "exon", str(start_in_transgene), str(end_in_transgene),
"0", "+", ".", annotations]))
transgene_gtf_file.write("\n")
annotations = f"exon_id \"mCherry.e{exon_number}\"; exon_number \"{exon_number}\"; gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"mCherry\";"
transcript_gtf_file.write("\t".join([
"mCherry", "local", "exon", str(start_in_transcript), str(end_in_transcript),
"0", "+", ".", annotations]))
transcript_gtf_file.write("\n")
last_pos_in_transcript = end_in_transcript
elif seq_name.startswith("intron"):
start_in_transcript = last_pos_in_transcript + 1
end_in_transcript = last_pos_in_transcript + len(seq)
annotations = "gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"transgene\";"
transgene_gtf_file.write("\t".join([
"mCherry_transgene", "local", "intron", str(start_in_transgene), str(end_in_transgene),
"0", "+", ".", annotations]))
transgene_gtf_file.write("\n")
last_pos_in_transcript = end_in_transcript
elif seq_name == "his11":
annotations = "gene_id \"his11\"; transcript_id \"his11\"; gene_biotype \"transgene\";"
transgene_gtf_file.write("\t".join([
"mCherry_transgene", "local", "transcript", str(start_in_transgene), str(end_in_transgene),
"0", "+", ".", annotations]))
transgene_gtf_file.write("\n")
elif seq_name == "tbb2_3UTR":
annotations = "gene_id \"tbb2\"; transcript_id \"tbb2\"; gene_biotype \"transgene\";"
transgene_gtf_file.write("\t".join([
"mCherry_transgene", "local", "UTR", str(start_in_transgene), str(end_in_transgene),
"0", "+", ".", annotations]))
transgene_gtf_file.write("\n")
else:
raise ValueError(f"{seq_name} does not belong to the transgene.")
last_pos_in_transgene = end_in_transgene
return 0
if __name__ == "__main__":
sys.exit(main())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment