Commit 259967e5 authored by Blaise Li's avatar Blaise Li
Browse files

Script to optimize codon composition of a gene.

parent d66e7bd3
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""
This script reads:
* a gtf file containing the CDS parts of exons of a gene isoform
* the fasta file containing the sequence of the corresponding genome
* a list of codons ranked according to some optimality order (more optimal first)
It outputs a fasta sequence corresponding to the portions between the start of
the first CDS and the end of the last one, where codons in CDS are replaced by
the most optimal for a given amino-acid class.
TODO:
* Test on small example that it is correct.
* Compare RNA structure between original and optimized.
* Check the splice sites can still work.
"""
import argparse
import sys
import warnings
from collections import namedtuple
from Bio.Data import CodonTable
from mappy import fastx_read
from blimisc import reverse_complement
def formatwarning(message, category, filename, lineno, line):
"""Format warning messages."""
return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
warnings.formatwarning = formatwarning
Exon = namedtuple("Exon", ("chrom", "start", "stop", "strand"))
DNA2AA = CodonTable.unambiguous_dna_by_name["Standard"].forward_table
def make_codon_optimizer(codon_ranking):
"""Create a dict giving the most optimal codon by which a given
codon can be substituted.
*codon_ranking* should list codons with the most optimal first."""
# What is the most optimal codon for this aa?
aa2optimal = {}
# What is the most optimal codon that can be substituted to this one?
codon2optimal = {}
for codon in codon_ranking:
codon = codon.upper().replace("U", "T")
msg = f"Codon {codon} was seen more than once in the ranking."
assert codon not in codon2optimal, msg
aa = DNA2AA[codon]
# The first codon seen for this aa is the optimized one.
if aa not in aa2optimal:
aa2optimal[aa] = codon
codon2optimal[codon] = aa2optimal[aa]
return codon2optimal
def main():
"""Main function of the program."""
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-e", "--exons",
help="Path to the gtf file containing CDS annotations.")
parser.add_argument(
"-i", "--fasta_in",
help="Path to a file containing the fasta sequence of the genome with "
"respect to which CDS annotations provided by --exons are defined.")
parser.add_argument(
"-o", "--fasta_out",
help="Path to the fasta file to generate.")
parser.add_argument(
"-s", "--fasta_out_spliced",
help="Path to the fasta file in which to write the spliced optimized sequence.")
parser.add_argument(
"-a", "--fasta_header",
help="Text to use in the output fasta header.")
parser.add_argument(
"-r", "--codon_ranking",
help="Path to a file containing the codons in the desired optimality "
"order. There should be one codon per line, with the most optimal "
"at the top")
args = parser.parse_args()
######################
# Load codon ranking #
######################
with open(args.codon_ranking) as codon_ranking_file:
codon2optimal = make_codon_optimizer(
[line.strip() for line in codon_ranking_file])
#################
# Reading exons #
#################
exons = []
with open(args.exons) as gtf_file:
for line in gtf_file:
[chrom, _, _, start, stop, _, strand, *_] = line.split("\t")
exons.append(Exon(chrom, int(start) - 1, int(stop), strand))
# print(exons[-1])
ref_chrom = exons[0].chrom
msg = "All exons are supposed to be located on the same chromosome."
assert all([exon.chrom == ref_chrom for exon in exons]), msg
ref_strand = exons[0].strand
msg = "All exons are supposed to be located on the same strand."
assert all([exon.strand == ref_strand for exon in exons]), msg
########################
# Determining CDS span #
########################
# Sorting will be by chrom, then start, then stop
exons = sorted(exons)
first_start = exons[0].start
last_stop = exons[-1].stop
###############################
# Getting chromosome sequence #
###############################
for (chrom_name, seq, *_) in fastx_read(args.fasta_in):
if chrom_name == ref_chrom:
break
msg = (f"The gene of interest is located on chromosome {ref_chrom}, "
"which was not found in {args.fasta_in}")
assert chrom_name == ref_chrom, msg
seq_len = len(seq)
seq = seq.upper()
if ref_strand == "-":
seq = reverse_complement(seq)
(first_start, last_stop) = (seq_len - last_stop, seq_len - first_start)
# Reverse exons too.
exons = [
Exon(chrom, seq_len - stop, seq_len - start, "+")
for (chrom, start, stop, strand) in reversed(exons)]
# Break seq into a mutable list of letters.
seq = list(seq)
# Number of nucleotides that were missing to make a complete
# codon at the end of the previously considered exon.
prev_excess = 0
prev_stop = None
# incomplete_codon = []
for (_, start, stop, _) in exons:
# Coding frame can be split across exons
# msg = "CDS should have a length multiple of 3."
# assert not (stop - start) % 3, msg
if prev_excess:
split_codon = "".join(
seq[prev_stop-(3-prev_excess):prev_stop]
+ seq[start:start+prev_excess])
assert len(split_codon) == 3, "Wrong split_codon length."
optimized = list(codon2optimal.get(split_codon, split_codon))
# TODO: check that we are not disrupting a splice site
seq[prev_stop-(3-prev_excess):prev_stop] = optimized[:3-prev_excess]
seq[start:start+prev_excess] = optimized[3-prev_excess:]
remaining_exon_len = stop - (start + prev_excess)
# Reset excess
new_excess = 0
optimized_len = 0
for pos in range(start+prev_excess, stop, 3):
optimized_len += 3
if optimized_len > remaining_exon_len:
new_excess = optimized_len - remaining_exon_len
else:
codon = "".join(seq[pos:pos+3])
# If the codon is not recorded, don't change it.
seq[pos:pos+3] = list(codon2optimal.get(codon, codon))
prev_stop = stop
prev_excess = new_excess
assert prev_excess == 0, "Last exon should end in-frame."
with open(args.fasta_out, "w") as optimized_fasta_file:
optimized_fasta_file.write(f">{args.fasta_header}\n")
# TODO: should we go back to the non-reverse-complement?
# -> Likely unnecessary
optimized_fasta_file.write(f"{''.join(seq[first_start:last_stop])}\n")
with open(args.fasta_out_spliced, "w") as optimized_fasta_file:
optimized_fasta_file.write(f">{args.fasta_header}_spliced\n")
# TODO: should we go back to the non-reverse-complement?
# -> Likely unnecessary
for (_, start, stop, _) in exons:
optimized_fasta_file.write(f"{''.join(seq[start:stop])}\n")
return 0
if __name__ == "__main__":
sys.exit(main())
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment