diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index b8ee559137a1be78796f0174e2ff433e0ae026d8..04128d8917521a7cf428afd323cee6c1cd44d68c 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -105,15 +105,12 @@ def load_counts_table(table_path, index_col="old_locus_tag"): return codon_counts -def detect_fishy_genes(codon_counts): +def compute_criteria(codon_counts): """ - Run diagnostics about genes included in table *codon_counts*. - - These should help decide whether to exclude some of the genes. + Prepare a Boolean table and corresponging gene sets. - A table of boolean criteria is returned, with one line per gene. + There are one column and one gene set for each criterion. """ - render_md("### Searching for mis-annotated genes") wrong_start_col = codon_counts["expected_start_aa"] == "-" start_stop_col = codon_counts["expected_start_aa"] == "*" no_met_start_col = codon_counts["expected_start_aa"] != "M" @@ -149,20 +146,35 @@ def detect_fishy_genes(codon_counts): show_counts=True).plot(fig=fig) display(fig) plt.close(fig) - wrong_start = set(criteria[wrong_start_col].index) - start_stop = set(criteria[start_stop_col].index) - no_met_start = set(criteria[no_met_start_col].index) - start_upstream = set(criteria[start_upstream_col].index) - # Not used - # has_stops = set(criteria[has_stops_col].index) - start_upstream_met_start = set( - criteria[start_upstream_met_start_col].index) - start_upstream_met_start_nostops = set( - criteria[start_upstream_met_start_nostops_col].index) - good_met_start = set(criteria[good_met_start_col].index) - has_stops_good_met_start = set( - criteria[has_stops_good_met_start_col].index) - no_stop_good_met_start = set(criteria[no_stop_good_met_start_col].index) + gene_sets = { + "wrong_start": set(criteria[wrong_start_col].index), + "start_stop": set(criteria[start_stop_col].index), + "no_met_start": set(criteria[no_met_start_col].index), + "start_upstream": set(criteria[start_upstream_col].index), + # Not used + # "has_stops": set(criteria[has_stops_col].index), + "start_upstream_met_start": set( + criteria[start_upstream_met_start_col].index), + "start_upstream_met_start_nostops": set( + criteria[start_upstream_met_start_nostops_col].index), + "good_met_start": set(criteria[good_met_start_col].index), + "has_stops_good_met_start": set( + criteria[has_stops_good_met_start_col].index), + "no_stop_good_met_start": set( + criteria[no_stop_good_met_start_col].index)} + return (criteria, gene_sets) + + +def detect_fishy_genes(codon_counts): + """ + Run diagnostics about genes included in table *codon_counts*. + + These should help decide whether to exclude some of the genes. + + A table of boolean criteria is returned, with one line per gene. + """ + render_md("### Searching for mis-annotated genes") + (criteria, gene_sets) = compute_criteria(codon_counts) render_md( """ We should avoid genes that are not in-frame. We can likely exclude @@ -172,13 +184,16 @@ def detect_fishy_genes(codon_counts): If it is "*" (`start_stop`) or "-" (`wrong_start`), the codon is not a valid start codon. """) + wrong_start = gene_sets["wrong_start"] render_md(f"There are {len(wrong_start)} genes that start with a `-`:") display(wrong_start) + start_stop = gene_sets["start_stop"] render_md( f"There are {len(start_stop)} genes that start with a stop codon:") display(start_stop) + no_met_start = gene_sets["no_met_start"] if no_met_start == wrong_start | start_stop: render_md( """ @@ -190,25 +205,34 @@ def detect_fishy_genes(codon_counts): "There are genes that start with something else than 'M', " "'-' or '*'.") + start_upstream = gene_sets["start_upstream"] render_md( f"{len(start_upstream)} genes have a possibly ill-defined" " start position:") display(start_upstream) + + start_upstream_met_start = gene_sets["start_upstream_met_start"] if start_upstream_met_start: render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'" " start:") display(start_upstream_met_start) + start_upstream_met_start_nostops = gene_sets[ + "start_upstream_met_start_nostops"] if start_upstream_met_start_nostops: render_md(f"{len(start_upstream_met_start_nostops)} of them" " also do not contain any stop:") display(start_upstream_met_start_nostops) render_md("Those genes could probably be kept.") + + has_stops_good_met_start = gene_sets["has_stops_good_met_start"] render_md( f"{len(has_stops_good_met_start)} genes contain stops" " but have a well defined start position with 'M'.") if len(has_stops_good_met_start) <= 10: display(has_stops_good_met_start) + + good_met_start = gene_sets["good_met_start"] render_md( f"{len(good_met_start)} genes have a well defined " "start position with 'M'.") @@ -216,6 +240,8 @@ def detect_fishy_genes(codon_counts): """If genes that have stop readthrough are a known phenomenon, only those among them that do not have a valid start codon might be excluded.""") + + no_stop_good_met_start = gene_sets["no_stop_good_met_start"] render_md( f"{len(no_stop_good_met_start)} genes have a well defined" " start position with 'M' and contain no stop codon.") @@ -232,7 +258,8 @@ def make_counts_only(counts_table): counts_table.index.name, *counts_table.columns.difference(codon2aa)] assert set(info_cols) == { - "old_locus_tag", "locus_tag", "length", "start_codon", "expected_start_aa", + "old_locus_tag", "locus_tag", "length", + "start_codon", "expected_start_aa", "first_stop", "nb_stops", "start_upstream", "end_downstream"} return counts_table.reset_index().set_index(info_cols)