Skip to content
Snippets Groups Projects
Commit e486cb54 authored by Blaise Li's avatar Blaise Li
Browse files

Computation of cluster centroids.

parent 3b410576
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = "0.12"
__version__ = "0.13"
from .libcodonusage import (
aa2colour,
aa_usage,
by_aa_codon_usage,
centroid_usage,
codon2aa,
columns_by_aa,
detect_fishy_genes,
......
......@@ -17,6 +17,8 @@
import json
from operator import itemgetter
from pathlib import Path
# python3 -m pip install cytoolz
from cytoolz import groupby
# To render mardown in a Jupyter notebook on gitlab
from IPython.core.display import display, HTML
# python3 -m pip install markdown
......@@ -40,8 +42,6 @@ import numpy as np
# python3 -m pip install pandas
# See https://pandas.pydata.org/pandas-docs/stable/index.html
import pandas as pd
# python3 -m pip install cytoolz
from cytoolz import groupby
# Python module that facilitates the exploration of tbular data
# python3 -m pip install plydata
# from plydata import define, pull, query
......@@ -569,6 +569,72 @@ methionine (M) and tryptophan (W).
all_nan_cols)
def centroid_usage(codon_counts, all_nan_cols):
"""
Define "centroids" for gene clusters, one per codon in *codon_counts*.
The standardized usage biases of these centroids are computed so that
the codon proportions for a given amino-acid are the same as the global
proportions, except for the codons corresponding to a certain amino-acid.
For each amino-acid, there is one centroid per codon, where the
proportion for this codon is set to 1.0, and 0.0 for the other codons.
"""
summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
global_usage = codon_counts.sum(axis=0)
global_summed_by_aa = global_usage.groupby(level=0).sum()
global_proportions_by_aa = global_usage.div(
global_summed_by_aa)
codon_proportions_by_aa = codon_counts.div(summed_by_aa)
codon_usage_biases_by_aa = codon_proportions_by_aa.sub(
global_proportions_by_aa)
# _columns_by_aa may be different from columns_by_aa
# Groups of column headers for each amino-acid
# key: aa, value: list of (aa, codon) pairs
_columns_by_aa = groupby(itemgetter(0), codon_proportions_by_aa.columns)
def generate_centroid_proportions():
for (aa1, columns) in _columns_by_aa.items():
for (_, codon1) in columns:
proportions = global_proportions_by_aa.copy()
for (aa2, codon2) in columns:
proportions[(aa2, codon2)] = 0.0
proportions[(aa1, codon1)] = 1.0
proportions.name = (
f"{aa1}_{codon1}", f"{aa1}_{codon1}",
*np.full(
codon_proportions_by_aa.index.nlevels - 2,
np.nan))
yield proportions
render_md("""
For each codon, a "centroid" (future center of a cluster of genes)
is defined where the proportion of this codon is set to 1.0.
For the other codons corresponding to the same amino-acid, the proportion
is therefore set to 0.0.
For codons corresponding to other amino-acids, the proportions are set to
the proportions observed in the global data.
""")
centroids_proportions_by_aa = pd.DataFrame(
generate_centroid_proportions())
centroids_proportions_by_aa.index.rename(
codon_proportions_by_aa.index.names,
inplace=True)
render_md("""
The biases for the centroids are then computed by subtracting to those
proportions the proportions in the global data, and the biases are
standardized by dividing by the standard deviation of codon usage biases
in the data.
""")
# Compute biases using the same global proportions as the real data
# and standardize by the same standard deviations as the real data
centroids_scub_by_aa = centroids_proportions_by_aa.sub(
global_proportions_by_aa).div(codon_usage_biases_by_aa.std()).drop(
columns=all_nan_cols)
# centroids_scub_by_aa.head(3)
return centroids_scub_by_aa
def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
"""
Load a table containing by-amino-acid codon usage biases.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment