Computation of cluster centroids.

e486cb54 · Blaise Li · 3b410576 · e486cb54 · e486cb54
Commit e486cb54 authored 3 years ago by Blaise Li
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.12"
+__version__ = "0.13"
 from .libcodonusage import (
    aa2colour,
    aa_usage,
    by_aa_codon_usage,
+    centroid_usage,
    codon2aa,
    columns_by_aa,
    detect_fishy_genes,

--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -17,6 +17,8 @@
 import json
 from operator import itemgetter
 from pathlib import Path
+# python3 -m pip install cytoolz
+from cytoolz import groupby
 # To render mardown in a Jupyter notebook on gitlab
 from IPython.core.display import display, HTML
 # python3 -m pip install markdown
@@ -40,8 +42,6 @@ import numpy as np
 # python3 -m pip install pandas
 # See https://pandas.pydata.org/pandas-docs/stable/index.html
 import pandas as pd
-# python3 -m pip install cytoolz
-from cytoolz import groupby
 # Python module that facilitates the exploration of tbular data
 # python3 -m pip install plydata
 # from plydata import define, pull, query
@@ -569,6 +569,72 @@ methionine (M) and tryptophan (W).
        all_nan_cols)


+def centroid_usage(codon_counts, all_nan_cols):
+    """
+    Define "centroids" for gene clusters, one per codon in *codon_counts*.
+
+    The standardized usage biases of these centroids are computed so that
+    the codon proportions for a given amino-acid are the same as the global
+    proportions, except for the codons corresponding to a certain amino-acid.
+    For each amino-acid, there is one centroid per codon, where the
+    proportion for this codon is set to 1.0, and 0.0 for the other codons.
+    """
+    summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
+    global_usage = codon_counts.sum(axis=0)
+    global_summed_by_aa = global_usage.groupby(level=0).sum()
+    global_proportions_by_aa = global_usage.div(
+            global_summed_by_aa)
+    codon_proportions_by_aa = codon_counts.div(summed_by_aa)
+    codon_usage_biases_by_aa = codon_proportions_by_aa.sub(
+        global_proportions_by_aa)
+    # _columns_by_aa may be different from columns_by_aa
+    # Groups of column headers for each amino-acid
+    # key: aa, value: list of (aa, codon) pairs
+    _columns_by_aa = groupby(itemgetter(0), codon_proportions_by_aa.columns)
+
+    def generate_centroid_proportions():
+        for (aa1, columns) in _columns_by_aa.items():
+            for (_, codon1) in columns:
+                proportions = global_proportions_by_aa.copy()
+                for (aa2, codon2) in columns:
+                    proportions[(aa2, codon2)] = 0.0
+                proportions[(aa1, codon1)] = 1.0
+                proportions.name = (
+                    f"{aa1}_{codon1}", f"{aa1}_{codon1}",
+                    *np.full(
+                        codon_proportions_by_aa.index.nlevels - 2,
+                        np.nan))
+                yield proportions
+
+    render_md("""
+For each codon, a "centroid" (future center of a cluster of genes)
+is defined where the proportion of this codon is set to 1.0.
+For the other codons corresponding to the same amino-acid, the proportion
+is therefore set to 0.0.
+For codons corresponding to other amino-acids, the proportions are set to
+the proportions observed in the global data.
+""")
+    centroids_proportions_by_aa = pd.DataFrame(
+        generate_centroid_proportions())
+    centroids_proportions_by_aa.index.rename(
+        codon_proportions_by_aa.index.names,
+        inplace=True)
+
+    render_md("""
+The biases for the centroids are then computed by subtracting to those
+proportions the proportions in the global data, and the biases are
+standardized by dividing by the standard deviation of codon usage biases
+in the data.
+""")
+    # Compute biases using the same global proportions as the real data
+    # and standardize by the same standard deviations as the real data
+    centroids_scub_by_aa = centroids_proportions_by_aa.sub(
+        global_proportions_by_aa).div(codon_usage_biases_by_aa.std()).drop(
+            columns=all_nan_cols)
+    # centroids_scub_by_aa.head(3)
+    return centroids_scub_by_aa
+
+
 def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
    """
    Load a table containing by-amino-acid codon usage biases.