diff --git a/compute_genes_exon_lengths.py b/compute_genes_exon_lengths.py index 71a8676c583bc94330a0579ad7796bded668560e..e0f91e7aff58071f6fc23b1ebc5c5a9ad9a5bf70 100755 --- a/compute_genes_exon_lengths.py +++ b/compute_genes_exon_lengths.py @@ -28,6 +28,7 @@ def main(): # input files genes_gtf = OPJ(annot_dir, "genes.gtf") spikein_gtf = OPJ(annot_dir, "spike_ins.gtf") + mCherry_gtf = OPJ(annot_dir, "mCherry.gtf") dte_bed = OPJ(annot_dir, "DNA_transposons_rmsk.bed") rte_bed = OPJ(annot_dir, "RNA_transposons_rmsk.bed") satel_bed = OPJ(annot_dir, "satellites_rmsk.bed") @@ -37,6 +38,7 @@ def main(): pd.concat(( gtf_2_genes_exon_lengths(genes_gtf), spikein_gtf_2_lengths(spikein_gtf), + gtf_2_genes_exon_lengths(mCherry_gtf), repeat_bed_2_lengths(dte_bed), repeat_bed_2_lengths(rte_bed), repeat_bed_2_lengths(satel_bed), diff --git a/libhts/libhts/libhts.py b/libhts/libhts/libhts.py index ff1641de9c70163534644701aa973aac1838602e..eb91dc97f5b9c7b19ac82b1c7b4dec982d2d6acc 100644 --- a/libhts/libhts/libhts.py +++ b/libhts/libhts/libhts.py @@ -163,7 +163,7 @@ def repeat_bed_2_lengths(repeat_bed): def spikein_gtf_2_lengths(spikein_gtf): """Computes the lengths of spike-ins, grouped by families. - Returns a DataFrame associating the summed lengths to the spike-in namse. + Returns a DataFrame associating the summed lengths to the spike-in names. """ spikein_ends = {} with open(spikein_gtf) as gtf_file: diff --git a/make_mCherry_transgene.py b/make_mCherry_transgene.py new file mode 100755 index 0000000000000000000000000000000000000000..7cca3308fbb84a24fc6b8c7786cfae9ca5b39fa8 --- /dev/null +++ b/make_mCherry_transgene.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +# vim: set fileencoding=<utf-8> : +"""""" + +import sys +from collections import OrderedDict +from mappy import fastx_read + +# https://docs.pyfilesystem.org/en/latest/guide.html +from fs.osfs import OSFS as FS +#from fs.tempfs import TempFS as FS + +fasta_transgene = """\ +>mex5_promoter +ATATCAGTTTTTAAAAAATTAAACCATAAAACAAATAATATAACCCAATTTTTACATCAAACCACAAGAAAAAAATACATTTGGGCCCACGGATAAAGAAATTAAAAAAATACATTTTTTAAAGGCGCACCGAATTAAAATTCATTTGGGTCTTACCGCGTATACCGTACTCCGTTTGTTTGATCATTTTTGTCAGCGCTGGCGGTTGTTTTTTCATTTCATTTCTGCTTCAAAGACGTTTTCTCGAATAATTTTTCGTTTATTCTCTTTTTTAAAATTAATTTCTAGCCGTAAATGTTATAAATTCACCCATTTAACGCAAATTTCATGGTAATCTCATGGAAAAATGCAGTTTCTTTGTTAAAGAAAGCTTAAATAGCAAAAATTCCCCGACTTTCCCCAAAATCCTGCTCGATTTTCCGTTTTCTCATTGTATTCTCTCTTAATTAATTTTATCGATAATCAATTGAATGTTTCAGACAGAGA +>exon_1 +ATGGTCTCAAAGGGTGAAGAAGATAACATGGCAATTATTAAAGAGTTTATGCGTTTCAAGGTGCATATGGAGGGATCTGTCAATGGGCATGAGTTTGAAATTGAAGGTGAAGGAGAAGGCCGACCATATGAGGGAACACAAACCGCAAAACTAAAG +>intron_1 +GTAAGTTTAAACATATATATACTAACTAACCCTGATTATTTAAATTTTCAG +>exon_2 +GTAACTAAAGGCGGACCATTACCATTCGCCTGGGACATCCTCTCTCCACAGTTCATGTATGGAAGTAAAGCTTATGTTAAACATCCGGCAGATATACCAGATTATTTGAAACTTTCATTCCCGGAGGGTTTTAAGTGGGAACGCGTAATGAATTTTGAAGACGGAGGAGTTGTTACAGTGACGCAAGACTCAAG +>intron_2 +GTAAGTTTAAACAGTTCGGTACTAACTAACCATACATATTTAAATTTTCAG +>exon_3 +CCTCCAAGATGGAGAATTTATTTATAAAGTCAAACTTCGAGGAACGAATTTCCCCTCGGATGGACCTGTTATGCAGAAGAAGACTATGGGATGGGAAGCTTCAAGTGAAAGAATGTACCCTGAAGACGGTGCTCTTAAGGGAGAGATTAAACAACGTCTTAAATTGAAAGATGGAGGACATTACGATGCTGAG +>intron_3 +GTAAGTTTAAACATGATTTTACTAACTAACTAATCTGATTTAAATTTTCAG +>exon_4 +GTGAAGACAACTTACAAAGCCAAAAAACCAGTTCAGCTGCCAGGAGCGTACAATGTTAATATTAAACTGGATATCACCTCCCACAACGAGGATTACACTATCGTTGAGCAATATGAAAGAGCTGAAGGGCGGCACTCGACAGGTGGCATGGATGAATTGTATAAGG +>his11 +ATGCCACCAAAGCCATCTGCCAAGGGAGCCAAGAAGGCCGCCAAGACCGTTACGAAGCCAAAGGACGGAAAGAAGAGACGTCATGCCCGTAAGGAATCATACTCCGTCTACATCTACCGTGTCCTCAAGCAAGTTCATCCAGACACTGGAGTTTCCTCCAAAGCCATGTCTATCATGAACTCTTTTGTCAACGATGTCTTCGAGCGTATTGCTGCTGAAGCATCCCGTCTTGCTCACTACAACAAGCGTTCCACAATCTCATCCCGCGAAATTCAGACCGCTGTCCGTCTGATCCTTCCAGGAGAGCTTGCCAAGCACGCCGTGTCTGAGGGAACCAAGGCCGTTACCAAGTACACTTCCAGCAAGTAA +>tbb2_3UTR +ATGCAAAATCCTTTCAAGCATTCCCTTCTTCTCTATCACTCTTCTTTCTTTTTGTCAAAAAATTCTCTCGCTAATTTATTTGCTTTTTTAATGTTATTATTTTATGACTTTTTATAGTCACTGAAAAGTTTGCATCTGAGTGAAGTGAATGCTATCAAAATGTGATTCTGTCTGATGTACTTTCACAATCTCTCTTCAATTCCATTTTGAAGTGCTTTAAACCCGAAAGGTTGAGAAAAATGCGAGCGCTCAAATATTTGTATTGTGTTCGTTGAGTGACCCAACAAAAAGAGGAAACTTTATTGTGCCGCCAAGAAAAAAGTC +""" + +mCherry_transcript_parts = ["exon_1", "intron_1", "exon_2", "intron_2", "exon_3", "intron_3", "exon_4"] + +"/pasteur/entites/Mhe/Genomes" + +def main(): + """Main function of the program.""" + with FS("/pasteur/entites/Mhe/Genomes/transgenes") as fs: + with fs.open("mCherry_transgene_parts.fa", "w") as transgene_file: + transgene_file.write(fasta_transgene) + + sequence_parts = OrderedDict((seq_name, seq) for (seq_name, seq, _) in fastx_read(transgene_file.name)) + # for (seq_name, seq, _) in fastx_read(fhw.name): + # print(seq_name, seq) + mCherry_seq = "".join(sequence_parts[seq_name] for seq_name in mCherry_transcript_parts) + with fs.open("mCherry_transcript.fa", "w") as transcript_file: + transcript_file.write(f">mCherry\n{mCherry_seq}\n") + with fs.open("mCherry.gtf", "w") as transcript_gtf_file: + annotations = "gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"mCherry\";" + transcript_gtf_file.write("\t".join([ + "mCherry", "local", "transcript", "1", str(len(mCherry_seq)), + "0", "+", ".", annotations])) + transcript_gtf_file.write("\n") + with fs.open("mCherry_transgene.gtf", "w") as transgene_gtf_file: + last_pos_in_transgene = 0 + last_pos_in_transcript = 0 + exon_number = 0 + for (seq_name, seq) in sequence_parts.items(): + start_in_transgene = last_pos_in_transgene + 1 + end_in_transgene = last_pos_in_transgene + len(seq) + if seq_name == "mex5_promoter": + annotations = "gene_id \"mex5\"; transcript_id \"mex5\"; gene_biotype \"transgene\";" + transgene_gtf_file.write("\t".join([ + "mCherry_transgene", "local", "UTR", str(start_in_transgene), str(end_in_transgene), + "0", "+", ".", annotations])) + transgene_gtf_file.write("\n") + elif seq_name.startswith("exon"): + start_in_transcript = last_pos_in_transcript + 1 + end_in_transcript = last_pos_in_transcript + len(seq) + exon_number += 1 + annotations = f"exon_id \"mCherry.e{exon_number}\"; exon_number \"{exon_number}\"; gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"transgene\";" + transgene_gtf_file.write("\t".join([ + "mCherry_transgene", "local", "exon", str(start_in_transgene), str(end_in_transgene), + "0", "+", ".", annotations])) + transgene_gtf_file.write("\n") + annotations = f"exon_id \"mCherry.e{exon_number}\"; exon_number \"{exon_number}\"; gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"mCherry\";" + transcript_gtf_file.write("\t".join([ + "mCherry", "local", "exon", str(start_in_transcript), str(end_in_transcript), + "0", "+", ".", annotations])) + transcript_gtf_file.write("\n") + last_pos_in_transcript = end_in_transcript + elif seq_name.startswith("intron"): + start_in_transcript = last_pos_in_transcript + 1 + end_in_transcript = last_pos_in_transcript + len(seq) + annotations = "gene_id \"mCherry\"; transcript_id \"mCherry\"; gene_biotype \"transgene\";" + transgene_gtf_file.write("\t".join([ + "mCherry_transgene", "local", "intron", str(start_in_transgene), str(end_in_transgene), + "0", "+", ".", annotations])) + transgene_gtf_file.write("\n") + last_pos_in_transcript = end_in_transcript + elif seq_name == "his11": + annotations = "gene_id \"his11\"; transcript_id \"his11\"; gene_biotype \"transgene\";" + transgene_gtf_file.write("\t".join([ + "mCherry_transgene", "local", "transcript", str(start_in_transgene), str(end_in_transgene), + "0", "+", ".", annotations])) + transgene_gtf_file.write("\n") + elif seq_name == "tbb2_3UTR": + annotations = "gene_id \"tbb2\"; transcript_id \"tbb2\"; gene_biotype \"transgene\";" + transgene_gtf_file.write("\t".join([ + "mCherry_transgene", "local", "UTR", str(start_in_transgene), str(end_in_transgene), + "0", "+", ".", annotations])) + transgene_gtf_file.write("\n") + else: + raise ValueError(f"{seq_name} does not belong to the transgene.") + last_pos_in_transgene = end_in_transgene + return 0 + + +if __name__ == "__main__": + sys.exit(main())