Skip to content
Snippets Groups Projects
Commit 705a8246 authored by Blaise Li's avatar Blaise Li
Browse files

Code to run diagnostics about genes.

parent f43bdd71
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = 0.1
__version__ = 0.3
from .libcodonusage import (
aa2colour,
columns_by_aa,
detect_fishy_genes,
load_bias_table,
load_counts_table,
render_md,
violin_usage,
violin_usage_vertical,
......
......@@ -42,6 +42,12 @@ import numpy as np
import pandas as pd
# python3 -m pip install cytoolz
from cytoolz import groupby
# Python module that facilitates the exploration of tbular data
# python3 -m pip install plydata
# from plydata import define, pull, query
# Python module to vizualize set intersections
# python3 -m pip install upsetplot
from upsetplot import from_indicators, UpSet
def render_md(md_str):
......@@ -76,8 +82,141 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath(
# value: hexadecimal html colour code
aa2colour = {**colscheme["colors"], "*": '#000000'}
def load_counts_table(table_path, index_col="old_locus_tag"):
"""
Load a table or pre-computed codon counts at *table_path*.
The lines correspond to genes.
Besides the columns containing the counts for each codon,
there are other columns containing various pieces of information
regarding those genes.
"""
render_md(f"Loading data from {table_path}...\n")
codon_counts = pd.read_table(table_path, index_col=index_col)
nb_genes = len(codon_counts)
render_md(
f"""
There are {nb_genes} genes in the table.
The table looks as follows (first 3 lines):
""")
display(codon_counts.head(3))
return codon_counts
def detect_fishy_genes(codon_counts):
"""
Run diagnostics about genes included in table *codon_counts*.
These should help decide whether to exclude some of the genes.
A table of boolean criteria is returned, with one line per gene.
"""
render_md("### Searching for mis-annotated genes")
wrong_start_col = codon_counts["expected_start_aa"] == "-"
start_stop_col = codon_counts["expected_start_aa"] == "*"
no_met_start_col = codon_counts["expected_start_aa"] != "M"
start_upstream_col = codon_counts["start_upstream"]
has_stops_col = codon_counts["nb_stops"] > 0
start_upstream_met_start_col = start_upstream_col & ~no_met_start_col
start_upstream_met_start_nostops_col = (
start_upstream_met_start_col & ~has_stops_col)
has_stops_good_met_start_col = (
has_stops_col & ~no_met_start_col
& ~start_upstream_met_start_col)
no_stop_good_met_start_col = (
~has_stops_col & ~no_met_start_col
& ~start_upstream_met_start_col)
criteria = pd.DataFrame({
"wrong_start": wrong_start_col,
"start_stop": start_stop_col,
"no_met_start": no_met_start_col,
"start_upstream": start_upstream_col,
"has_stops": has_stops_col,
"start_upstream_met_start": start_upstream_met_start_col,
"start_upstream_met_start_nostops": (
start_upstream_met_start_nostops_col),
"has_stops_good_met_start": has_stops_good_met_start_col})
render_md("Number of genes in potentially fishy categories:\n\n")
display(criteria.agg(sum))
render_md("Upset plot of the non-empty categories:\n\n")
fig = plt.figure()
UpSet(
from_indicators(*[criteria.columns], data=criteria),
show_counts=True).plot(fig=fig)
display(fig)
plt.close(fig)
wrong_start = set(criteria[wrong_start_col].index)
start_stop = set(criteria[start_stop_col].index)
no_met_start = set(criteria[no_met_start_col].index)
start_upstream = set(criteria[start_upstream_col].index)
# Not used
# has_stops = set(criteria[has_stops_col].index)
start_upstream_met_start = set(
criteria[start_upstream_met_start_col].index)
start_upstream_met_start_nostops = set(
criteria[start_upstream_met_start_nostops_col].index)
has_stops_good_met_start = set(
criteria[has_stops_good_met_start_col].index)
no_stop_good_met_start = set(criteria[no_stop_good_met_start_col].index)
render_md(
"""
We should avoid genes that are not in-frame. We can likely exclude
those that do not start with a valid start codon.
Here, this will be seen by looking at the translation of the first
codon.
If it is "*" (`start_stop`) or "-" (`wrong_start`),
the codon is not a valid start codon.
""")
render_md(f"There are {len(wrong_start)} genes that start with a `-`:")
display(wrong_start)
render_md(
f"There are {len(start_stop)} genes that start with a stop codon:")
display(start_stop)
if no_met_start == wrong_start | start_stop:
render_md(
"""
Genes not belonging to the `start_stop` or `wrong_start`
category all start with a methionine.
""")
else:
raise NotImplementedError(
"There are genes that start with something else than 'M', "
"'-' or '*'.")
render_md(
f"{len(start_upstream)} genes have a possibly ill-defined"
" start position:")
display(start_upstream)
if start_upstream_met_start:
render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'"
" start:")
display(start_upstream_met_start)
if start_upstream_met_start_nostops:
render_md(f"{len(start_upstream_met_start_nostops)} of them"
" also do not contain any stop:")
display(start_upstream_met_start_nostops)
render_md("Those genes could probably be kept.")
render_md(
f"{len(has_stops_good_met_start)} genes contain stops"
" but have a well defined start position with 'M'.")
if len(has_stops_good_met_start) <= 10:
display(has_stops_good_met_start)
render_md(
"""If genes that have stop readthrough are a known phenomenon,
only those among them that do not have a valid start codon
might be excluded.""")
render_md(
f"{len(no_stop_good_met_start)} genes have a well defined"
" start position with 'M' and contain no stop codon.")
if len(no_stop_good_met_start) <= 10:
display(no_stop_good_met_start)
return criteria
def load_bias_table(table_path, nb_cluster_series=2):
"""
......@@ -109,7 +248,7 @@ def load_bias_table(table_path, nb_cluster_series=2):
# * `cluster_{aa}_full_bias` for each amino-acid
# having more than one codon
index_col=list(range(9 + nb_cluster_series * (len(aa2colour) - 2))),
header=[0,1])
header=[0, 1])
def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment