Skip to content
Snippets Groups Projects
Commit 57e7c060 authored by Blaise Li's avatar Blaise Li
Browse files

Functions to compute usage biases.

parent fa1283b6
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = 0.7
__version__ = "0.10"
from .libcodonusage import (
aa2colour,
by_aa_codon_usage,
codon2aa,
columns_by_aa,
detect_fishy_genes,
gene_wide_codon_usage,
load_bias_table,
load_counts_table,
make_aa_codon_columns,
......
......@@ -45,6 +45,10 @@ from cytoolz import groupby
# Python module that facilitates the exploration of tbular data
# python3 -m pip install plydata
# from plydata import define, pull, query
# Python library with useful data-processing features
# python3 -m pip install scikit-learn
# https://scikit-learn.org/stable/install.html
from sklearn.preprocessing import normalize
# Python module to vizualize set intersections
# python3 -m pip install upsetplot
from upsetplot import from_indicators, UpSet
......@@ -334,36 +338,171 @@ def save_counts_table(counts_table, table_path):
render_md(f"The table was saved at [{table_path}]({table_path}).")
def load_bias_table(table_path, nb_cluster_series=2):
def gene_wide_codon_usage(codon_counts, verbose=False):
"""
"""
render_md("""
We will compute codon usage "gene-wide" (i.e. not by amino-acid),
by looking at the proportion of codons within a gene's CDS.
""")
render_md("""
To compute codon proportions, we can divide each line by its sum,
or, equivalently (and hopefully more efficiently), normalize the data
using the "l1" norm (which, for positive-only values amounts to the sum).
""")
codon_proportions = pd.DataFrame(
normalize(codon_counts, norm="l1"),
index=codon_counts.index,
columns=codon_counts.columns)
# Doesn't seem to be working:
# codon_proportions.style.hide(axis="index")
if verbose:
display(codon_proportions.head(3))
# Check that the sum of proportions (columns) for a gene is 1
colsums = codon_proportions.sum(axis=1).values
# Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1
assert np.allclose(colsums, np.full(len(colsums), 1))
render_md("""
To compute the usage biases, we also need to compute codon proportions
in the global usage.
""")
render_md("We compute the global usage, as the sum on columns.")
global_usage = codon_counts.sum(axis=0)
render_md("Then we normalize it the same way as for the individual genes.")
# The values.reshape(1, -1) turns the Series data into a 2D array
# with one line. flatten() turns the data back to 1D
global_proportions = pd.Series(
normalize(global_usage.values.reshape(1, -1), norm="l1").flatten(),
index=global_usage.index)
assert global_proportions.sum() == 1.
render_md("""
Codon usage biases in genes can then be computed by subtracting
the corresponding global proportion to a codon's proportion.
""")
codon_usage_biases = codon_proportions.sub(global_proportions)
render_md("""
We can standardize these values (by dividing by their standard deviation
across genes) so that they are more comparable between codons.
""")
standardized_codon_usage_biases = codon_usage_biases.div(
codon_usage_biases.std())
# Doesn't seem to be working:
# standardized_codon_usage_biases.style.hide(axis="index")
if verbose:
display(standardized_codon_usage_biases.head(3))
return standardized_codon_usage_biases
def by_aa_codon_usage(codon_counts, verbose=False):
"""
"""
render_md("""
We will compute codon usage "by amino-acid", by looking at the
proportion of codons for each amino-acid within a gene's CDS.
""")
render_md("""
We first need to compute, for a given gene, the total number of codons
corresponding to each amino-acid.
""")
summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
render_md("""
To compute codon proportions, we can then divide each codon counts by
the total for the corresponding amino-acid.
""")
codon_proportions = codon_counts.div(
summed_by_aa)
# Doesn't seem to be working:
# codon_proportions.style.hide(axis="index")
if verbose:
display(codon_proportions.head(3))
# There are some (genes, aa) pairs where the sum is zero.
# This is normal if that gene does not have that amino-acid.
sums = codon_proportions.groupby(level=0, axis=1).sum()
# Values in sums should be either zero or one.
all_ones = np.full(sums.shape, 1.)
all_zeroes = np.full(sums.shape, 0.)
# Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1 or to 0
assert np.array_equal(
np.isclose(
sums.values, all_ones),
np.logical_not(
np.isclose(
sums.values, all_zeroes)))
render_md("""
To compute the usage biases, we also need to compute codon proportions
in the global usage.
""")
render_md("""
We compute the global usage, as the sum of the counts for a given codon,
across genes.
""")
global_usage = codon_counts.sum(axis=0)
render_md("Then we sum over codons corresponding to the same amino-acid.")
global_summed_by_aa = global_usage.groupby(level=0).sum()
render_md("""
Dividing the global usage for each codon by the total for the corresponding
amino-acid, we can now compute the global proportions the same way as for
the individual genes.
""")
global_proportions = global_usage.div(
global_summed_by_aa)
# The proportions of codons for a given amino-acid should sum to 1
assert all(
val == 1.
for val in global_proportions.groupby(level=0).sum())
render_md("""
Codon usage biases in genes can then be computed by subtracting
the corresponding global proportion to a codon's proportion.
""")
codon_usage_biases = codon_proportions.sub(global_proportions)
render_md("""
We can standardize these values (by dividing by their standard deviation
across genes) so that they are more comparable between codons.
""")
standardized_codon_usage_biases = codon_usage_biases.div(
codon_usage_biases.std())
# Doesn't seem to be working:
# standardized_codon_usage_biases.style.hide(axis="index")
if verbose:
display(standardized_codon_usage_biases.head(3))
return standardized_codon_usage_biases
def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
"""
Load a table containing by-amino-acid codon usage biases.
The table, located at *table_path*, should have tab-separated columns.
It is expected to contain a series of informative columns
preceding the columns containing the codon usage bias values.
The first nine columns contain information about gene identifiers
and features extracted from CDS annotation data.
The first *nb_info_cols* columns contain information about gene
identifiers and features extracted from CDS annotation data.
Then, there can be series of gene-clustering information, where each
column of the series corresponds to clusters defined based on codon-usage
biases among the codons coding an amino-acid. There are no columns for
amino-acids coded by only one codon (M and W).
column of the series corresponds to clusters defined based on
codon-usage biases among the codons coding an amino-acid.
There are no columns for amino-acids coded by only one codon (M and W).
*nb_cluster_series* specifies the number of such series.
The result is a pandas DataFrame object, where all the above information
is encoded as a MultiIndex, the codon usage biases being in the remaining
columns.
is encoded as a MultiIndex, the codon usage biases being in the
remaining columns.
"""
return pd.read_csv(
table_path,
sep="\t",
# 9 starting levels from the initial table,
# *nb_info_cols* starting levels from the initial table,
# plus levels corresponging to clustering information.
# By default, from 2 methods (nb_cluster_series=2):
# * `cluster_{aa}_kmeans` for each amino-acid
# having more than one codon
# * `cluster_{aa}_full_bias` for each amino-acid
# having more than one codon
index_col=list(range(9 + nb_cluster_series * (len(aa2colour) - 2))),
index_col=list(
range(nb_info_cols + nb_cluster_series * (len(aa2colour) - 2))),
header=[0, 1])
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment