From 57e7c060e7602d3f532e4e75daeb4101ef160f90 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Mon, 14 Mar 2022 09:16:07 +0100
Subject: [PATCH] Functions to compute usage biases.

---
 libcodonusage/__init__.py      |   4 +-
 libcodonusage/libcodonusage.py | 159 ++++++++++++++++++++++++++++++---
 2 files changed, 152 insertions(+), 11 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 33f7a90..b2e5210 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,11 +1,13 @@
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = 0.7
+__version__ = "0.10"
 from .libcodonusage import (
     aa2colour,
+    by_aa_codon_usage,
     codon2aa,
     columns_by_aa,
     detect_fishy_genes,
+    gene_wide_codon_usage,
     load_bias_table,
     load_counts_table,
     make_aa_codon_columns,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index fe0429c..f9ab307 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -45,6 +45,10 @@ from cytoolz import groupby
 # Python module that facilitates the exploration of tbular data
 # python3 -m pip install plydata
 # from plydata import define, pull, query
+# Python library with useful data-processing features
+# python3 -m pip install scikit-learn
+# https://scikit-learn.org/stable/install.html
+from sklearn.preprocessing import normalize
 # Python module to vizualize set intersections
 # python3 -m pip install upsetplot
 from upsetplot import from_indicators, UpSet
@@ -334,36 +338,171 @@ def save_counts_table(counts_table, table_path):
     render_md(f"The table was saved at [{table_path}]({table_path}).")
 
 
-def load_bias_table(table_path, nb_cluster_series=2):
+def gene_wide_codon_usage(codon_counts, verbose=False):
+    """
+    """
+    render_md("""
+We will compute codon usage "gene-wide" (i.e. not by amino-acid),
+by looking at the proportion of codons within a gene's CDS.
+""")
+    render_md("""
+To compute codon proportions, we can divide each line by its sum,
+or, equivalently (and hopefully more efficiently), normalize the data
+using the "l1" norm (which, for positive-only values amounts to the sum).
+""")
+    codon_proportions = pd.DataFrame(
+        normalize(codon_counts, norm="l1"),
+        index=codon_counts.index,
+        columns=codon_counts.columns)
+    # Doesn't seem to be working:
+    # codon_proportions.style.hide(axis="index")
+    if verbose:
+        display(codon_proportions.head(3))
+    # Check that the sum of proportions (columns) for a gene is 1
+    colsums = codon_proportions.sum(axis=1).values
+    # Due to imprecision in float arithmetics,
+    # we can only check that the sums are close to 1
+    assert np.allclose(colsums, np.full(len(colsums), 1))
+    render_md("""
+To compute the usage biases, we also need to compute codon proportions
+in the global usage.
+""")
+    render_md("We compute the global usage, as the sum on columns.")
+    global_usage = codon_counts.sum(axis=0)
+    render_md("Then we normalize it the same way as for the individual genes.")
+    # The values.reshape(1, -1) turns the Series data into a 2D array
+    # with one line. flatten() turns the data back to 1D
+    global_proportions = pd.Series(
+        normalize(global_usage.values.reshape(1, -1), norm="l1").flatten(),
+        index=global_usage.index)
+    assert global_proportions.sum() == 1.
+    render_md("""
+Codon usage biases in genes can then be computed by subtracting
+the corresponding global proportion to a codon's proportion.
+""")
+    codon_usage_biases = codon_proportions.sub(global_proportions)
+    render_md("""
+We can standardize these values (by dividing by their standard deviation
+across genes) so that they are more comparable between codons.
+""")
+    standardized_codon_usage_biases = codon_usage_biases.div(
+        codon_usage_biases.std())
+    # Doesn't seem to be working:
+    # standardized_codon_usage_biases.style.hide(axis="index")
+    if verbose:
+        display(standardized_codon_usage_biases.head(3))
+    return standardized_codon_usage_biases
+
+
+def by_aa_codon_usage(codon_counts, verbose=False):
+    """
+    """
+    render_md("""
+We will compute codon usage "by amino-acid", by looking at the
+proportion of codons for each amino-acid within a gene's CDS.
+""")
+    render_md("""
+We first need to compute, for a given gene, the total number of codons
+corresponding to each amino-acid.
+""")
+    summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
+    render_md("""
+To compute codon proportions, we can then divide each codon counts by
+the total for the corresponding amino-acid.
+""")
+    codon_proportions = codon_counts.div(
+        summed_by_aa)
+    # Doesn't seem to be working:
+    # codon_proportions.style.hide(axis="index")
+    if verbose:
+        display(codon_proportions.head(3))
+    # There are some (genes, aa) pairs where the sum is zero.
+    # This is normal if that gene does not have that amino-acid.
+    sums = codon_proportions.groupby(level=0, axis=1).sum()
+    # Values in sums should be either zero or one.
+    all_ones = np.full(sums.shape, 1.)
+    all_zeroes = np.full(sums.shape, 0.)
+    # Due to imprecision in float arithmetics,
+    # we can only check that the sums are close to 1 or to 0
+    assert np.array_equal(
+        np.isclose(
+            sums.values, all_ones),
+        np.logical_not(
+            np.isclose(
+                sums.values, all_zeroes)))
+    render_md("""
+To compute the usage biases, we also need to compute codon proportions
+in the global usage.
+""")
+    render_md("""
+We compute the global usage, as the sum of the counts for a given codon,
+across genes.
+""")
+    global_usage = codon_counts.sum(axis=0)
+    render_md("Then we sum over codons corresponding to the same amino-acid.")
+    global_summed_by_aa = global_usage.groupby(level=0).sum()
+    render_md("""
+Dividing the global usage for each codon by the total for the corresponding
+amino-acid, we can now compute the global proportions the same way as for
+the individual genes.
+""")
+    global_proportions = global_usage.div(
+        global_summed_by_aa)
+    # The proportions of codons for a given amino-acid should sum to 1
+    assert all(
+        val == 1.
+        for val in global_proportions.groupby(level=0).sum())
+    render_md("""
+Codon usage biases in genes can then be computed by subtracting
+the corresponding global proportion to a codon's proportion.
+""")
+    codon_usage_biases = codon_proportions.sub(global_proportions)
+    render_md("""
+We can standardize these values (by dividing by their standard deviation
+across genes) so that they are more comparable between codons.
+""")
+    standardized_codon_usage_biases = codon_usage_biases.div(
+        codon_usage_biases.std())
+    # Doesn't seem to be working:
+    # standardized_codon_usage_biases.style.hide(axis="index")
+    if verbose:
+        display(standardized_codon_usage_biases.head(3))
+    return standardized_codon_usage_biases
+
+
+def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
     """
     Load a table containing by-amino-acid codon usage biases.
 
     The table, located at *table_path*, should have tab-separated columns.
     It is expected to contain a series of informative columns
     preceding the columns containing the codon usage bias values.
-    The first nine columns contain information about gene identifiers
-    and features extracted from CDS annotation data.
+
+    The first *nb_info_cols* columns contain information about gene
+    identifiers and features extracted from CDS annotation data.
+
     Then, there can be series of gene-clustering information, where each
-    column of the series corresponds to clusters defined based on codon-usage
-    biases among the codons coding an amino-acid. There are no columns for
-    amino-acids coded by only one codon (M and W).
+    column of the series corresponds to clusters defined based on
+    codon-usage biases among the codons coding an amino-acid.
+    There are no columns for amino-acids coded by only one codon (M and W).
     *nb_cluster_series* specifies the number of such series.
 
     The result is a pandas DataFrame object, where all the above information
-    is encoded as a MultiIndex, the codon usage biases being in the remaining
-    columns.
+    is encoded as a MultiIndex, the codon usage biases being in the
+    remaining columns.
     """
     return pd.read_csv(
         table_path,
         sep="\t",
-        # 9 starting levels from the initial table,
+        # *nb_info_cols* starting levels from the initial table,
         # plus levels corresponging to clustering information.
         # By default, from 2 methods (nb_cluster_series=2):
         # * `cluster_{aa}_kmeans` for each amino-acid
         #   having more than one codon
         # * `cluster_{aa}_full_bias` for each amino-acid
         #   having more than one codon
-        index_col=list(range(9 + nb_cluster_series * (len(aa2colour) - 2))),
+        index_col=list(
+            range(nb_info_cols + nb_cluster_series * (len(aa2colour) - 2))),
         header=[0, 1])
 
 
-- 
GitLab