Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Blaise LI
bioinfo_utils
Commits
259967e5
Commit
259967e5
authored
Jul 16, 2019
by
Blaise Li
Browse files
Script to optimize codon composition of a gene.
parent
d66e7bd3
Changes
1
Hide whitespace changes
Inline
Side-by-side
make_codon_optimized_seq.py
0 → 100755
View file @
259967e5
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""
This script reads:
* a gtf file containing the CDS parts of exons of a gene isoform
* the fasta file containing the sequence of the corresponding genome
* a list of codons ranked according to some optimality order (more optimal first)
It outputs a fasta sequence corresponding to the portions between the start of
the first CDS and the end of the last one, where codons in CDS are replaced by
the most optimal for a given amino-acid class.
TODO:
* Test on small example that it is correct.
* Compare RNA structure between original and optimized.
* Check the splice sites can still work.
"""
import
argparse
import
sys
import
warnings
from
collections
import
namedtuple
from
Bio.Data
import
CodonTable
from
mappy
import
fastx_read
from
blimisc
import
reverse_complement
def
formatwarning
(
message
,
category
,
filename
,
lineno
,
line
):
"""Format warning messages."""
return
"%s:%s: %s: %s
\n
"
%
(
filename
,
lineno
,
category
.
__name__
,
message
)
warnings
.
formatwarning
=
formatwarning
Exon
=
namedtuple
(
"Exon"
,
(
"chrom"
,
"start"
,
"stop"
,
"strand"
))
DNA2AA
=
CodonTable
.
unambiguous_dna_by_name
[
"Standard"
].
forward_table
def
make_codon_optimizer
(
codon_ranking
):
"""Create a dict giving the most optimal codon by which a given
codon can be substituted.
*codon_ranking* should list codons with the most optimal first."""
# What is the most optimal codon for this aa?
aa2optimal
=
{}
# What is the most optimal codon that can be substituted to this one?
codon2optimal
=
{}
for
codon
in
codon_ranking
:
codon
=
codon
.
upper
().
replace
(
"U"
,
"T"
)
msg
=
f
"Codon
{
codon
}
was seen more than once in the ranking."
assert
codon
not
in
codon2optimal
,
msg
aa
=
DNA2AA
[
codon
]
# The first codon seen for this aa is the optimized one.
if
aa
not
in
aa2optimal
:
aa2optimal
[
aa
]
=
codon
codon2optimal
[
codon
]
=
aa2optimal
[
aa
]
return
codon2optimal
def
main
():
"""Main function of the program."""
parser
=
argparse
.
ArgumentParser
(
description
=
__doc__
,
formatter_class
=
argparse
.
ArgumentDefaultsHelpFormatter
)
parser
.
add_argument
(
"-e"
,
"--exons"
,
help
=
"Path to the gtf file containing CDS annotations."
)
parser
.
add_argument
(
"-i"
,
"--fasta_in"
,
help
=
"Path to a file containing the fasta sequence of the genome with "
"respect to which CDS annotations provided by --exons are defined."
)
parser
.
add_argument
(
"-o"
,
"--fasta_out"
,
help
=
"Path to the fasta file to generate."
)
parser
.
add_argument
(
"-s"
,
"--fasta_out_spliced"
,
help
=
"Path to the fasta file in which to write the spliced optimized sequence."
)
parser
.
add_argument
(
"-a"
,
"--fasta_header"
,
help
=
"Text to use in the output fasta header."
)
parser
.
add_argument
(
"-r"
,
"--codon_ranking"
,
help
=
"Path to a file containing the codons in the desired optimality "
"order. There should be one codon per line, with the most optimal "
"at the top"
)
args
=
parser
.
parse_args
()
######################
# Load codon ranking #
######################
with
open
(
args
.
codon_ranking
)
as
codon_ranking_file
:
codon2optimal
=
make_codon_optimizer
(
[
line
.
strip
()
for
line
in
codon_ranking_file
])
#################
# Reading exons #
#################
exons
=
[]
with
open
(
args
.
exons
)
as
gtf_file
:
for
line
in
gtf_file
:
[
chrom
,
_
,
_
,
start
,
stop
,
_
,
strand
,
*
_
]
=
line
.
split
(
"
\t
"
)
exons
.
append
(
Exon
(
chrom
,
int
(
start
)
-
1
,
int
(
stop
),
strand
))
# print(exons[-1])
ref_chrom
=
exons
[
0
].
chrom
msg
=
"All exons are supposed to be located on the same chromosome."
assert
all
([
exon
.
chrom
==
ref_chrom
for
exon
in
exons
]),
msg
ref_strand
=
exons
[
0
].
strand
msg
=
"All exons are supposed to be located on the same strand."
assert
all
([
exon
.
strand
==
ref_strand
for
exon
in
exons
]),
msg
########################
# Determining CDS span #
########################
# Sorting will be by chrom, then start, then stop
exons
=
sorted
(
exons
)
first_start
=
exons
[
0
].
start
last_stop
=
exons
[
-
1
].
stop
###############################
# Getting chromosome sequence #
###############################
for
(
chrom_name
,
seq
,
*
_
)
in
fastx_read
(
args
.
fasta_in
):
if
chrom_name
==
ref_chrom
:
break
msg
=
(
f
"The gene of interest is located on chromosome
{
ref_chrom
}
, "
"which was not found in {args.fasta_in}"
)
assert
chrom_name
==
ref_chrom
,
msg
seq_len
=
len
(
seq
)
seq
=
seq
.
upper
()
if
ref_strand
==
"-"
:
seq
=
reverse_complement
(
seq
)
(
first_start
,
last_stop
)
=
(
seq_len
-
last_stop
,
seq_len
-
first_start
)
# Reverse exons too.
exons
=
[
Exon
(
chrom
,
seq_len
-
stop
,
seq_len
-
start
,
"+"
)
for
(
chrom
,
start
,
stop
,
strand
)
in
reversed
(
exons
)]
# Break seq into a mutable list of letters.
seq
=
list
(
seq
)
# Number of nucleotides that were missing to make a complete
# codon at the end of the previously considered exon.
prev_excess
=
0
prev_stop
=
None
# incomplete_codon = []
for
(
_
,
start
,
stop
,
_
)
in
exons
:
# Coding frame can be split across exons
# msg = "CDS should have a length multiple of 3."
# assert not (stop - start) % 3, msg
if
prev_excess
:
split_codon
=
""
.
join
(
seq
[
prev_stop
-
(
3
-
prev_excess
):
prev_stop
]
+
seq
[
start
:
start
+
prev_excess
])
assert
len
(
split_codon
)
==
3
,
"Wrong split_codon length."
optimized
=
list
(
codon2optimal
.
get
(
split_codon
,
split_codon
))
# TODO: check that we are not disrupting a splice site
seq
[
prev_stop
-
(
3
-
prev_excess
):
prev_stop
]
=
optimized
[:
3
-
prev_excess
]
seq
[
start
:
start
+
prev_excess
]
=
optimized
[
3
-
prev_excess
:]
remaining_exon_len
=
stop
-
(
start
+
prev_excess
)
# Reset excess
new_excess
=
0
optimized_len
=
0
for
pos
in
range
(
start
+
prev_excess
,
stop
,
3
):
optimized_len
+=
3
if
optimized_len
>
remaining_exon_len
:
new_excess
=
optimized_len
-
remaining_exon_len
else
:
codon
=
""
.
join
(
seq
[
pos
:
pos
+
3
])
# If the codon is not recorded, don't change it.
seq
[
pos
:
pos
+
3
]
=
list
(
codon2optimal
.
get
(
codon
,
codon
))
prev_stop
=
stop
prev_excess
=
new_excess
assert
prev_excess
==
0
,
"Last exon should end in-frame."
with
open
(
args
.
fasta_out
,
"w"
)
as
optimized_fasta_file
:
optimized_fasta_file
.
write
(
f
">
{
args
.
fasta_header
}
\n
"
)
# TODO: should we go back to the non-reverse-complement?
# -> Likely unnecessary
optimized_fasta_file
.
write
(
f
"
{
''
.
join
(
seq
[
first_start
:
last_stop
])
}
\n
"
)
with
open
(
args
.
fasta_out_spliced
,
"w"
)
as
optimized_fasta_file
:
optimized_fasta_file
.
write
(
f
">
{
args
.
fasta_header
}
_spliced
\n
"
)
# TODO: should we go back to the non-reverse-complement?
# -> Likely unnecessary
for
(
_
,
start
,
stop
,
_
)
in
exons
:
optimized_fasta_file
.
write
(
f
"
{
''
.
join
(
seq
[
start
:
stop
])
}
\n
"
)
return
0
if
__name__
==
"__main__"
:
sys
.
exit
(
main
())
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment