Also compute global RSCU.

Hopefully, this can be used to fill NaNs for PCA.

Also compute global RSCU.
2f2bba77 · Blaise Li · d83084c7 · 2f2bba77 · 2f2bba77
Commit 2f2bba77 authored 1 year ago by Blaise Li
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
 __copyright__ = "Copyright (C) 2022-2023 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.27.4"
+__version__ = "0.27.5"
 from .libcodonusage import (
    aa2colour,
    aa_usage,

--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -531,7 +531,7 @@ def check_aa_codon_columns(table):
    Check that the columns of *table* correspond to (aa, codon) pairs.
    """
    msg = "Codon proportions table should have two levels: 'aa' and 'codon'"
-    if table.columns.nlevels !=2:
+    if table.columns.nlevels != 2:
        raise ValueError(msg)
    if table.columns.names[0] != "aa":
        raise ValueError(msg)
@@ -549,6 +549,9 @@ def compute_rscu(codon_proportions_by_aa):
    same amino-acid. The column index should be a pandas MultiIndex
    where the first level is the amino-acid name, and the second level
    the codon.
+
+    Return a pair where the first element is the rscu table, and
+    the second is the degeneracy table.
    """
    check_aa_codon_columns(codon_proportions_by_aa)
    degeneracy = pd.Series(
@@ -578,7 +581,9 @@ def compute_rscu(codon_proportions_by_aa):
                    itemgetter(0),
                    codon_proportions_by_aa.columns)).items()])),
        index=codon_proportions_by_aa.columns)
-    return codon_proportions_by_aa.mul(degeneracy)
+    return (
+        codon_proportions_by_aa.mul(degeneracy),
+        degeneracy)


 def by_aa_codon_usage(
@@ -640,13 +645,14 @@ This amounts to computing codon counts relative to the average codon counts
 for that amino-acid.
 (This corresponds to R3 in {SUZUKI_LINK})
 """)
-        rscu = compute_rscu(codon_proportions)
+        (rscu, degeneracy) = compute_rscu(codon_proportions)
        render_md(f"""
 We will also compute R4 ({SUZUKI_LINK}) by computing codon counts relative
 to the maxium across synonymous codons of the counts for that amino-acid.
 """)
+        max_counts = codon_counts.T.groupby("aa").max().T
        r4_table = codon_counts.div(
-            codon_counts.T.groupby("aa").max().T)
+            max_counts)
    if ref_filter_dict is None:
        counts_for_global = codon_counts
    else:
@@ -695,7 +701,8 @@ across genes) so that they are more comparable between codons.
            "proportions": codon_proportions,
            "rscu": rscu,
            "r4_table": r4_table,
-            "global_proportions": global_proportions}
+            "global_proportions": global_proportions,
+            "global_rscu": global_proportions.mul(degeneracy)}
    return standardized_codon_usage_biases


@@ -944,7 +951,7 @@ def centroid_usage(codon_counts, all_nan_cols):
    global_usage = codon_counts.sum(axis=0)
    global_summed_by_aa = global_usage.groupby(level=0).sum()
    global_proportions_by_aa = global_usage.div(
-            global_summed_by_aa)
+        global_summed_by_aa)
    codon_proportions_by_aa = codon_counts.div(summed_by_aa)
    codon_usage_biases_by_aa = codon_proportions_by_aa.sub(
        global_proportions_by_aa)