From 2f2bba77a9b1867bb45d6035a794c0cf956c972c Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Tue, 3 Oct 2023 12:45:57 +0200
Subject: [PATCH] Also compute global RSCU.

Hopefully, this can be used to fill NaNs for PCA.
---
 libcodonusage/__init__.py      |  2 +-
 libcodonusage/libcodonusage.py | 19 +++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index bcdaab8..2065ff6 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,6 +1,6 @@
 __copyright__ = "Copyright (C) 2022-2023 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.27.4"
+__version__ = "0.27.5"
 from .libcodonusage import (
     aa2colour,
     aa_usage,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 2408e4a..2be726c 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -531,7 +531,7 @@ def check_aa_codon_columns(table):
     Check that the columns of *table* correspond to (aa, codon) pairs.
     """
     msg = "Codon proportions table should have two levels: 'aa' and 'codon'"
-    if table.columns.nlevels !=2:
+    if table.columns.nlevels != 2:
         raise ValueError(msg)
     if table.columns.names[0] != "aa":
         raise ValueError(msg)
@@ -549,6 +549,9 @@ def compute_rscu(codon_proportions_by_aa):
     same amino-acid. The column index should be a pandas MultiIndex
     where the first level is the amino-acid name, and the second level
     the codon.
+
+    Return a pair where the first element is the rscu table, and
+    the second is the degeneracy table.
     """
     check_aa_codon_columns(codon_proportions_by_aa)
     degeneracy = pd.Series(
@@ -578,7 +581,9 @@ def compute_rscu(codon_proportions_by_aa):
                     itemgetter(0),
                     codon_proportions_by_aa.columns)).items()])),
         index=codon_proportions_by_aa.columns)
-    return codon_proportions_by_aa.mul(degeneracy)
+    return (
+        codon_proportions_by_aa.mul(degeneracy),
+        degeneracy)
 
 
 def by_aa_codon_usage(
@@ -640,13 +645,14 @@ This amounts to computing codon counts relative to the average codon counts
 for that amino-acid.
 (This corresponds to R3 in {SUZUKI_LINK})
 """)
-        rscu = compute_rscu(codon_proportions)
+        (rscu, degeneracy) = compute_rscu(codon_proportions)
         render_md(f"""
 We will also compute R4 ({SUZUKI_LINK}) by computing codon counts relative
 to the maxium across synonymous codons of the counts for that amino-acid.
 """)
+        max_counts = codon_counts.T.groupby("aa").max().T
         r4_table = codon_counts.div(
-            codon_counts.T.groupby("aa").max().T)
+            max_counts)
     if ref_filter_dict is None:
         counts_for_global = codon_counts
     else:
@@ -695,7 +701,8 @@ across genes) so that they are more comparable between codons.
             "proportions": codon_proportions,
             "rscu": rscu,
             "r4_table": r4_table,
-            "global_proportions": global_proportions}
+            "global_proportions": global_proportions,
+            "global_rscu": global_proportions.mul(degeneracy)}
     return standardized_codon_usage_biases
 
 
@@ -944,7 +951,7 @@ def centroid_usage(codon_counts, all_nan_cols):
     global_usage = codon_counts.sum(axis=0)
     global_summed_by_aa = global_usage.groupby(level=0).sum()
     global_proportions_by_aa = global_usage.div(
-            global_summed_by_aa)
+        global_summed_by_aa)
     codon_proportions_by_aa = codon_counts.div(summed_by_aa)
     codon_usage_biases_by_aa = codon_proportions_by_aa.sub(
         global_proportions_by_aa)
-- 
GitLab