Skip to content
Snippets Groups Projects
Commit 0ea7c404 authored by Blaise Li's avatar Blaise Li
Browse files

Reorganize code.

parent 6a808558
No related branches found
No related tags found
No related merge requests found
......@@ -105,15 +105,12 @@ def load_counts_table(table_path, index_col="old_locus_tag"):
return codon_counts
def detect_fishy_genes(codon_counts):
def compute_criteria(codon_counts):
"""
Run diagnostics about genes included in table *codon_counts*.
These should help decide whether to exclude some of the genes.
Prepare a Boolean table and corresponging gene sets.
A table of boolean criteria is returned, with one line per gene.
There are one column and one gene set for each criterion.
"""
render_md("### Searching for mis-annotated genes")
wrong_start_col = codon_counts["expected_start_aa"] == "-"
start_stop_col = codon_counts["expected_start_aa"] == "*"
no_met_start_col = codon_counts["expected_start_aa"] != "M"
......@@ -149,20 +146,35 @@ def detect_fishy_genes(codon_counts):
show_counts=True).plot(fig=fig)
display(fig)
plt.close(fig)
wrong_start = set(criteria[wrong_start_col].index)
start_stop = set(criteria[start_stop_col].index)
no_met_start = set(criteria[no_met_start_col].index)
start_upstream = set(criteria[start_upstream_col].index)
# Not used
# has_stops = set(criteria[has_stops_col].index)
start_upstream_met_start = set(
criteria[start_upstream_met_start_col].index)
start_upstream_met_start_nostops = set(
criteria[start_upstream_met_start_nostops_col].index)
good_met_start = set(criteria[good_met_start_col].index)
has_stops_good_met_start = set(
criteria[has_stops_good_met_start_col].index)
no_stop_good_met_start = set(criteria[no_stop_good_met_start_col].index)
gene_sets = {
"wrong_start": set(criteria[wrong_start_col].index),
"start_stop": set(criteria[start_stop_col].index),
"no_met_start": set(criteria[no_met_start_col].index),
"start_upstream": set(criteria[start_upstream_col].index),
# Not used
# "has_stops": set(criteria[has_stops_col].index),
"start_upstream_met_start": set(
criteria[start_upstream_met_start_col].index),
"start_upstream_met_start_nostops": set(
criteria[start_upstream_met_start_nostops_col].index),
"good_met_start": set(criteria[good_met_start_col].index),
"has_stops_good_met_start": set(
criteria[has_stops_good_met_start_col].index),
"no_stop_good_met_start": set(
criteria[no_stop_good_met_start_col].index)}
return (criteria, gene_sets)
def detect_fishy_genes(codon_counts):
"""
Run diagnostics about genes included in table *codon_counts*.
These should help decide whether to exclude some of the genes.
A table of boolean criteria is returned, with one line per gene.
"""
render_md("### Searching for mis-annotated genes")
(criteria, gene_sets) = compute_criteria(codon_counts)
render_md(
"""
We should avoid genes that are not in-frame. We can likely exclude
......@@ -172,13 +184,16 @@ def detect_fishy_genes(codon_counts):
If it is "*" (`start_stop`) or "-" (`wrong_start`),
the codon is not a valid start codon.
""")
wrong_start = gene_sets["wrong_start"]
render_md(f"There are {len(wrong_start)} genes that start with a `-`:")
display(wrong_start)
start_stop = gene_sets["start_stop"]
render_md(
f"There are {len(start_stop)} genes that start with a stop codon:")
display(start_stop)
no_met_start = gene_sets["no_met_start"]
if no_met_start == wrong_start | start_stop:
render_md(
"""
......@@ -190,25 +205,34 @@ def detect_fishy_genes(codon_counts):
"There are genes that start with something else than 'M', "
"'-' or '*'.")
start_upstream = gene_sets["start_upstream"]
render_md(
f"{len(start_upstream)} genes have a possibly ill-defined"
" start position:")
display(start_upstream)
start_upstream_met_start = gene_sets["start_upstream_met_start"]
if start_upstream_met_start:
render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'"
" start:")
display(start_upstream_met_start)
start_upstream_met_start_nostops = gene_sets[
"start_upstream_met_start_nostops"]
if start_upstream_met_start_nostops:
render_md(f"{len(start_upstream_met_start_nostops)} of them"
" also do not contain any stop:")
display(start_upstream_met_start_nostops)
render_md("Those genes could probably be kept.")
has_stops_good_met_start = gene_sets["has_stops_good_met_start"]
render_md(
f"{len(has_stops_good_met_start)} genes contain stops"
" but have a well defined start position with 'M'.")
if len(has_stops_good_met_start) <= 10:
display(has_stops_good_met_start)
good_met_start = gene_sets["good_met_start"]
render_md(
f"{len(good_met_start)} genes have a well defined "
"start position with 'M'.")
......@@ -216,6 +240,8 @@ def detect_fishy_genes(codon_counts):
"""If genes that have stop readthrough are a known phenomenon,
only those among them that do not have a valid start codon
might be excluded.""")
no_stop_good_met_start = gene_sets["no_stop_good_met_start"]
render_md(
f"{len(no_stop_good_met_start)} genes have a well defined"
" start position with 'M' and contain no stop codon.")
......@@ -232,7 +258,8 @@ def make_counts_only(counts_table):
counts_table.index.name,
*counts_table.columns.difference(codon2aa)]
assert set(info_cols) == {
"old_locus_tag", "locus_tag", "length", "start_codon", "expected_start_aa",
"old_locus_tag", "locus_tag", "length",
"start_codon", "expected_start_aa",
"first_stop", "nb_stops", "start_upstream", "end_downstream"}
return counts_table.reset_index().set_index(info_cols)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment