Select Git revision
compute_genes_exon_lengths.py
compute_genes_exon_lengths.py 1.62 KiB
#!/usr/bin/env python3
# vim: set fileencoding=utf-8 :
"""This script reads gene, spike-in, mCherry and repeat annotations from a directory and writes their lengths in a single tab-separated file (union_exon_lengths.txt)."""
import argparse
import os
import sys
import pandas as pd
from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths, spikein_gtf_2_lengths
# Short module-level aliases for frequently used callables.
# OPJ builds the input and output paths in main().
OPJ = os.path.join
# NOTE(review): STRIP and SPLIT are not used in the visible part of this
# script -- confirm whether they can be dropped.
STRIP = str.strip
SPLIT = str.split
def main():
    """Build the combined lengths table for one annotation directory.

    The directory is taken from the required -a/--annot_dir command-line
    option.  The gene, spike-in and mCherry gtf files and the four
    repeat bed files located there are each handed to the matching
    libhts reader, the results are concatenated in that order with
    pandas, and the outcome is saved tab-separated as
    "union_exon_lengths.txt" in the same directory.

    Returns 0 (used as the exit status when run as a script).
    """
    cli = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument(
        "-a", "--annot_dir",
        required=True,
        help="Directory in which to read annotations and write the resulting file.")
    annot_dir = cli.parse_args().annot_dir

    def in_annot_dir(file_name):
        """Return the path of *file_name* inside the annotation directory."""
        return OPJ(annot_dir, file_name)

    # The readers are called in the order in which their results are
    # concatenated: genes, spike-ins, mCherry, then the repeat families.
    length_tables = [
        gtf_2_genes_exon_lengths(in_annot_dir("genes.gtf")),
        spikein_gtf_2_lengths(in_annot_dir("spike_ins.gtf")),
        gtf_2_genes_exon_lengths(in_annot_dir("mCherry.gtf")),
    ]
    repeat_beds = (
        "DNA_transposons_rmsk.bed",
        "RNA_transposons_rmsk.bed",
        "satellites_rmsk.bed",
        "simple_repeats_rmsk.bed",
    )
    length_tables.extend(
        repeat_bed_2_lengths(in_annot_dir(bed_name))
        for bed_name in repeat_beds)

    # Single output file, written next to the inputs.
    merged = pd.concat(length_tables)
    merged.to_csv(in_annot_dir("union_exon_lengths.txt"), sep="\t")
    return 0
# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    exit_status = main()
    sys.exit(exit_status)