Skip to content
Snippets Groups Projects
Commit 0ea7c404 authored by Blaise Li's avatar Blaise Li
Browse files

Reorganize code.

parent 6a808558
Branches
Tags
No related merge requests found
......@@ -105,15 +105,12 @@ def load_counts_table(table_path, index_col="old_locus_tag"):
return codon_counts
def detect_fishy_genes(codon_counts):
def compute_criteria(codon_counts):
"""
Run diagnostics about genes included in table *codon_counts*.
These should help decide whether to exclude some of the genes.
Prepare a Boolean table and corresponging gene sets.
A table of boolean criteria is returned, with one line per gene.
There are one column and one gene set for each criterion.
"""
render_md("### Searching for mis-annotated genes")
wrong_start_col = codon_counts["expected_start_aa"] == "-"
start_stop_col = codon_counts["expected_start_aa"] == "*"
no_met_start_col = codon_counts["expected_start_aa"] != "M"
......@@ -149,20 +146,35 @@ def detect_fishy_genes(codon_counts):
show_counts=True).plot(fig=fig)
display(fig)
plt.close(fig)
wrong_start = set(criteria[wrong_start_col].index)
start_stop = set(criteria[start_stop_col].index)
no_met_start = set(criteria[no_met_start_col].index)
start_upstream = set(criteria[start_upstream_col].index)
gene_sets = {
"wrong_start": set(criteria[wrong_start_col].index),
"start_stop": set(criteria[start_stop_col].index),
"no_met_start": set(criteria[no_met_start_col].index),
"start_upstream": set(criteria[start_upstream_col].index),
# Not used
# has_stops = set(criteria[has_stops_col].index)
start_upstream_met_start = set(
criteria[start_upstream_met_start_col].index)
start_upstream_met_start_nostops = set(
criteria[start_upstream_met_start_nostops_col].index)
good_met_start = set(criteria[good_met_start_col].index)
has_stops_good_met_start = set(
criteria[has_stops_good_met_start_col].index)
no_stop_good_met_start = set(criteria[no_stop_good_met_start_col].index)
# "has_stops": set(criteria[has_stops_col].index),
"start_upstream_met_start": set(
criteria[start_upstream_met_start_col].index),
"start_upstream_met_start_nostops": set(
criteria[start_upstream_met_start_nostops_col].index),
"good_met_start": set(criteria[good_met_start_col].index),
"has_stops_good_met_start": set(
criteria[has_stops_good_met_start_col].index),
"no_stop_good_met_start": set(
criteria[no_stop_good_met_start_col].index)}
return (criteria, gene_sets)
def detect_fishy_genes(codon_counts):
"""
Run diagnostics about genes included in table *codon_counts*.
These should help decide whether to exclude some of the genes.
A table of boolean criteria is returned, with one line per gene.
"""
render_md("### Searching for mis-annotated genes")
(criteria, gene_sets) = compute_criteria(codon_counts)
render_md(
"""
We should avoid genes that are not in-frame. We can likely exclude
......@@ -172,13 +184,16 @@ def detect_fishy_genes(codon_counts):
If it is "*" (`start_stop`) or "-" (`wrong_start`),
the codon is not a valid start codon.
""")
wrong_start = gene_sets["wrong_start"]
render_md(f"There are {len(wrong_start)} genes that start with a `-`:")
display(wrong_start)
start_stop = gene_sets["start_stop"]
render_md(
f"There are {len(start_stop)} genes that start with a stop codon:")
display(start_stop)
no_met_start = gene_sets["no_met_start"]
if no_met_start == wrong_start | start_stop:
render_md(
"""
......@@ -190,25 +205,34 @@ def detect_fishy_genes(codon_counts):
"There are genes that start with something else than 'M', "
"'-' or '*'.")
start_upstream = gene_sets["start_upstream"]
render_md(
f"{len(start_upstream)} genes have a possibly ill-defined"
" start position:")
display(start_upstream)
start_upstream_met_start = gene_sets["start_upstream_met_start"]
if start_upstream_met_start:
render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'"
" start:")
display(start_upstream_met_start)
start_upstream_met_start_nostops = gene_sets[
"start_upstream_met_start_nostops"]
if start_upstream_met_start_nostops:
render_md(f"{len(start_upstream_met_start_nostops)} of them"
" also do not contain any stop:")
display(start_upstream_met_start_nostops)
render_md("Those genes could probably be kept.")
has_stops_good_met_start = gene_sets["has_stops_good_met_start"]
render_md(
f"{len(has_stops_good_met_start)} genes contain stops"
" but have a well defined start position with 'M'.")
if len(has_stops_good_met_start) <= 10:
display(has_stops_good_met_start)
good_met_start = gene_sets["good_met_start"]
render_md(
f"{len(good_met_start)} genes have a well defined "
"start position with 'M'.")
......@@ -216,6 +240,8 @@ def detect_fishy_genes(codon_counts):
"""If genes that have stop readthrough are a known phenomenon,
only those among them that do not have a valid start codon
might be excluded.""")
no_stop_good_met_start = gene_sets["no_stop_good_met_start"]
render_md(
f"{len(no_stop_good_met_start)} genes have a well defined"
" start position with 'M' and contain no stop codon.")
......@@ -232,7 +258,8 @@ def make_counts_only(counts_table):
counts_table.index.name,
*counts_table.columns.difference(codon2aa)]
assert set(info_cols) == {
"old_locus_tag", "locus_tag", "length", "start_codon", "expected_start_aa",
"old_locus_tag", "locus_tag", "length",
"start_codon", "expected_start_aa",
"first_stop", "nb_stops", "start_upstream", "end_downstream"}
return counts_table.reset_index().set_index(info_cols)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment