From a602b0bfe75d8ca4cd9bb2639b421e752aba513b Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Mon, 14 Mar 2022 12:04:02 +0100
Subject: [PATCH] Function computing aa usage biases.

---
 libcodonusage/__init__.py      |  3 +-
 libcodonusage/libcodonusage.py | 77 +++++++++++++++++++++++++++++++++-
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index b2e5210..5b810c1 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,8 +1,9 @@
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.10"
+__version__ = "0.11"
 from .libcodonusage import (
     aa2colour,
+    aa_usage,
     by_aa_codon_usage,
     codon2aa,
     columns_by_aa,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index f9ab307..761e24e 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -105,7 +105,7 @@ def load_counts_table(
     """
     render_md(f"Loading data from [{table_path}]({table_path})...\n")
     codon_counts = pd.read_table(table_path, index_col=index_col)
-    if index_unique and not(codon_counts.index.is_unique):
+    if index_unique and not codon_counts.index.is_unique:
         raise ValueError(f"Index {index_col} contains repeated values.\n")
     nb_genes = len(codon_counts)
     render_md(
@@ -340,6 +340,9 @@ def save_counts_table(counts_table, table_path):
 
 def gene_wide_codon_usage(codon_counts, verbose=False):
     """
+    Compute codon usage biases "gene-wide" as the standardized
+    difference between a gene's codon proportions and global
+    codon proportions.
     """
     render_md("""
 We will compute codon usage "gene-wide" (i.e. not by amino-acid),
@@ -396,6 +399,10 @@ across genes) so that they are more comparable between codons.
 
 def by_aa_codon_usage(codon_counts, verbose=False):
     """
+    Compute codon usage biases "by amino-acid" as the standardized
+    difference between a gene's codon proportions and global
+    codon proportions, where proportions are computed within
+    groups of same-amino-acid-coding codons instead of gene-wide.
     """
     render_md("""
 We will compute codon usage "by amino-acid", by looking at the
@@ -470,6 +477,74 @@ across genes) so that they are more comparable between codons.
     return standardized_codon_usage_biases
 
 
+def aa_usage(codon_counts, verbose=False):
+    """
+    Compute amino-acid usage biases as the standardized
+    difference between a gene's amino-acid proportions
+    and global amino-acid proportions.
+    """
+    render_md("""
+We will compute amino-acid usage, by looking at the
+proportions of amino-acids within a gene's CDS.
+""")
+    render_md("""
+We first need to compute, for a given gene, the total number of codons
+corresponding to each amino-acid.
+""")
+    summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
+    render_md("""
+To compute amino-acid proportions, we can divide each line by its sum,
+or, equivalently (and hopefully more efficiently), normalize the data
+using the "l1" norm (which, for positive-only values amounts to the sum).
+""")
+    aa_proportions = pd.DataFrame(
+        normalize(summed_by_aa, norm="l1"),
+        index=summed_by_aa.index,
+        columns=summed_by_aa.columns)
+    # Doesn't seem to be working:
+    # aa_proportions.style.hide(axis="index")
+    if verbose:
+        display(aa_proportions.head(3))
+    # Checking that proportions sum to 1
+    colsums = aa_proportions.sum(axis=1)
+    assert np.allclose(colsums, np.full(len(colsums), 1))
+    # Then, computing the global amino-acid proportions
+    render_md("""
+To compute the usage biases, we also need to compute amino-acid proportions
+in the global usage.
+""")
+    render_md("""
+We compute the global usage, as the sum of the counts for a given codon,
+across genes.
+""")
+    global_usage = codon_counts.sum(axis=0)
+    render_md("Then we sum over codons corresponding to the same amino-acid.")
+    global_summed_by_aa = global_usage.groupby(level=0).sum()
+    render_md("Then we normalize it the same way as for the individual genes.")
+    # The values.reshape(1, -1) turns the Series data into a 2D array
+    # with one line. flatten() turns the data back to 1D
+    global_aa_proportions = pd.Series(
+        normalize(
+            global_summed_by_aa.values.reshape(1, -1),
+            norm="l1").flatten(),
+        index=global_summed_by_aa.index)
+    render_md("""
+Amino-acid usage biases can be computed by subtracting the corresponding
+global proportion to an amino-acid's proportion.
+""")
+    aa_usage_biases = aa_proportions.sub(global_aa_proportions)
+    render_md("""
+We can standardize these values (by dividing by their standard deviation
+across genes) so that they are more comparable between amino-acids.
+""")
+    standardized_aa_usage_biases = aa_usage_biases.div(aa_usage_biases.std())
+    # Doesn't seem to be working:
+    # standardized_aa_usage_biases.style.hide(axis="index")
+    if verbose:
+        display(standardized_aa_usage_biases.head(3))
+    return standardized_aa_usage_biases
+
+
 def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
     """
     Load a table containing by-amino-acid codon usage biases.
-- 
GitLab