diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 7c3e67e48f1f874e642a5d600b9779bcd90c9bcb..3eed697de8eaffd4b3af6ad444367b2ab57533b7 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,10 +1,12 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = 0.1 +__version__ = 0.3 from .libcodonusage import ( aa2colour, columns_by_aa, + detect_fishy_genes, load_bias_table, + load_counts_table, render_md, violin_usage, violin_usage_vertical, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 088e124aed24b01011470505a9b39bcfcaf10246..de8d3c081a7f5302ee70cd03ae3885ac215dcec7 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -42,6 +42,12 @@ import numpy as np import pandas as pd # python3 -m pip install cytoolz from cytoolz import groupby +# Python module that facilitates the exploration of tbular data +# python3 -m pip install plydata +# from plydata import define, pull, query +# Python module to vizualize set intersections +# python3 -m pip install upsetplot +from upsetplot import from_indicators, UpSet def render_md(md_str): @@ -76,8 +82,141 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath( # value: hexadecimal html colour code aa2colour = {**colscheme["colors"], "*": '#000000'} + def load_counts_table(table_path, index_col="old_locus_tag"): + """ + Load a table or pre-computed codon counts at *table_path*. + + The lines correspond to genes. + Besides the columns containing the counts for each codon, + there are other columns containing various pieces of information + regarding those genes. + """ + render_md(f"Loading data from {table_path}...\n") codon_counts = pd.read_table(table_path, index_col=index_col) + nb_genes = len(codon_counts) + render_md( + f""" + There are {nb_genes} genes in the table. + + The table looks as follows (first 3 lines): + """) + display(codon_counts.head(3)) + return codon_counts + + +def detect_fishy_genes(codon_counts): + """ + Run diagnostics about genes included in table *codon_counts*. + + These should help decide whether to exclude some of the genes. + + A table of boolean criteria is returned, with one line per gene. + """ + render_md("### Searching for mis-annotated genes") + wrong_start_col = codon_counts["expected_start_aa"] == "-" + start_stop_col = codon_counts["expected_start_aa"] == "*" + no_met_start_col = codon_counts["expected_start_aa"] != "M" + start_upstream_col = codon_counts["start_upstream"] + has_stops_col = codon_counts["nb_stops"] > 0 + start_upstream_met_start_col = start_upstream_col & ~no_met_start_col + start_upstream_met_start_nostops_col = ( + start_upstream_met_start_col & ~has_stops_col) + has_stops_good_met_start_col = ( + has_stops_col & ~no_met_start_col + & ~start_upstream_met_start_col) + no_stop_good_met_start_col = ( + ~has_stops_col & ~no_met_start_col + & ~start_upstream_met_start_col) + criteria = pd.DataFrame({ + "wrong_start": wrong_start_col, + "start_stop": start_stop_col, + "no_met_start": no_met_start_col, + "start_upstream": start_upstream_col, + "has_stops": has_stops_col, + "start_upstream_met_start": start_upstream_met_start_col, + "start_upstream_met_start_nostops": ( + start_upstream_met_start_nostops_col), + "has_stops_good_met_start": has_stops_good_met_start_col}) + render_md("Number of genes in potentially fishy categories:\n\n") + display(criteria.agg(sum)) + render_md("Upset plot of the non-empty categories:\n\n") + fig = plt.figure() + UpSet( + from_indicators(*[criteria.columns], data=criteria), + show_counts=True).plot(fig=fig) + display(fig) + plt.close(fig) + wrong_start = set(criteria[wrong_start_col].index) + start_stop = set(criteria[start_stop_col].index) + no_met_start = set(criteria[no_met_start_col].index) + start_upstream = set(criteria[start_upstream_col].index) + # Not used + # has_stops = set(criteria[has_stops_col].index) + start_upstream_met_start = set( + criteria[start_upstream_met_start_col].index) + start_upstream_met_start_nostops = set( + criteria[start_upstream_met_start_nostops_col].index) + has_stops_good_met_start = set( + criteria[has_stops_good_met_start_col].index) + no_stop_good_met_start = set(criteria[no_stop_good_met_start_col].index) + render_md( + """ + We should avoid genes that are not in-frame. We can likely exclude + those that do not start with a valid start codon. + Here, this will be seen by looking at the translation of the first + codon. + If it is "*" (`start_stop`) or "-" (`wrong_start`), + the codon is not a valid start codon. + """) + render_md(f"There are {len(wrong_start)} genes that start with a `-`:") + display(wrong_start) + + render_md( + f"There are {len(start_stop)} genes that start with a stop codon:") + display(start_stop) + + if no_met_start == wrong_start | start_stop: + render_md( + """ + Genes not belonging to the `start_stop` or `wrong_start` + category all start with a methionine. + """) + else: + raise NotImplementedError( + "There are genes that start with something else than 'M', " + "'-' or '*'.") + + render_md( + f"{len(start_upstream)} genes have a possibly ill-defined" + " start position:") + display(start_upstream) + if start_upstream_met_start: + render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'" + " start:") + display(start_upstream_met_start) + + if start_upstream_met_start_nostops: + render_md(f"{len(start_upstream_met_start_nostops)} of them" + " also do not contain any stop:") + display(start_upstream_met_start_nostops) + render_md("Those genes could probably be kept.") + render_md( + f"{len(has_stops_good_met_start)} genes contain stops" + " but have a well defined start position with 'M'.") + if len(has_stops_good_met_start) <= 10: + display(has_stops_good_met_start) + render_md( + """If genes that have stop readthrough are a known phenomenon, + only those among them that do not have a valid start codon + might be excluded.""") + render_md( + f"{len(no_stop_good_met_start)} genes have a well defined" + " start position with 'M' and contain no stop codon.") + if len(no_stop_good_met_start) <= 10: + display(no_stop_good_met_start) + return criteria + def load_bias_table(table_path, nb_cluster_series=2): """ @@ -109,7 +248,7 @@ def load_bias_table(table_path, nb_cluster_series=2): # * `cluster_{aa}_full_bias` for each amino-acid # having more than one codon index_col=list(range(9 + nb_cluster_series * (len(aa2colour) - 2))), - header=[0,1]) + header=[0, 1]) def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):