From 30025b0ae2b0dbc2bfcc3f7294a0895c11cc1eb0 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Thu, 14 Sep 2023 11:49:08 +0200
Subject: [PATCH] Added code to compute RSCU.

Not tested.
---
 libcodonusage/__init__.py      |  5 ++-
 libcodonusage/libcodonusage.py | 68 ++++++++++++++++++++++++++++++----
 2 files changed, 63 insertions(+), 10 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 1254cf9..b6c2fcc 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,6 +1,6 @@
-__copyright__ = "Copyright (C) 2022 Blaise Li"
+__copyright__ = "Copyright (C) 2022-2023 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.27"
+__version__ = "0.27.1"
 from .libcodonusage import (
     aa2colour,
     aa_usage,
@@ -10,6 +10,7 @@ from .libcodonusage import (
     codon_usage_pca,
     columns_by_aa,
     compare_clusterings,
+    compute_rscu,
     detect_fishy_genes,
     exclude_all_nan_cols,
     extract_top_genes_from_cluster,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 9e60d26..848efc2 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright (C) 2021-2022 Blaise Li
+# Copyright (C) 2021-2023 Blaise Li
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@ import json
 from operator import attrgetter, itemgetter
 from pathlib import Path
 # python3 -m pip install cytoolz
-from cytoolz import concat, groupby, unique
+from cytoolz import concat, groupby, unique, valmap
 # To render mardown in a Jupyter notebook on gitlab
 from IPython.display import display
 from IPython.core.display import HTML
@@ -65,7 +65,8 @@ fmt_metadata = {
     "png": {"creationDate": None},
     "svg": {
         "Creator": None, "Date": None,
-        "Title": "Distribution of standardized codon usages biases (by aa), by chromosome"}
+        "Title": ("Distribution of standardized codon usages biases"
+                  " (by aa), by chromosome")}
 }
 
 
@@ -153,7 +154,7 @@ def compute_criteria(codon_counts):
     start_upstream_met_start_col = start_upstream_col & ~no_met_start_col
     start_upstream_met_start_nostops_col = (
         start_upstream_met_start_col & ~has_stops_col)
-    good_met_start_col = (~start_upstream_col & ~no_met_start_col)
+    good_met_start_col = ~start_upstream_col & ~no_met_start_col
     has_stops_good_met_start_col = (
         has_stops_col & ~no_met_start_col
         & ~start_upstream_met_start_col)
@@ -465,7 +466,47 @@ across genes) so that they are more comparable between codons.
     return standardized_codon_usage_biases
 
 
-# TODO: add option to output RSCU instead of / besides proportions
+def compute_rscu(codon_proportions_by_aa):
+    """
+    Compute Relative Syninymous Codon Usage from proportions in genes.
+
+    *codon_proportions_by_aa* should be a pandas DataFrame where
+    rows correspond to genes, and columns to codons. It contains
+    the proportions in the gene of the codon among those coding the
+    same amino-acid. The column index should be a pandas MultiIndex
+    where the first level is the amino-acid name, and the second level
+    the codon.
+    """
+    degeneracy = pd.Series(
+        # concat "flattens" the list of iterables given as arguments
+        # (list of tuples of repeated degeneracy values)
+        list(concat([
+            # we want stretches of the same degeneracy values
+            # of as many elements as there are codons in this degeneracy group,
+            # hence the tuple multiplication
+            ((degen),) * degen
+            # valmap transforms values in a dict given as second argument
+            # using the function given as first argument
+            # We extract only the value from the (key, value) pairs
+            # generated by the .items() methods of the resulting dict
+            # (Actually, we could probably have just used the .values() method)
+            for (_, degen) in valmap(
+                # len gives the length of each group,
+                # where a group contains the (aa, codon) pairs for a given aa,
+                # that is, the degeneracy for this group of codons
+                len,
+                # groupby creates a dict where keys are obtained from
+                # the function given as first argument and values are
+                # groups made from the iterable given as second argument
+                # (here, column names, which are (aa, codon) pairs)
+                # itemgetter(0) extracts the amino-acid from a pair (aa, codon)
+                groupby(
+                    itemgetter(0),
+                    codon_proportions_by_aa.columns)).items()])),
+        index=codon_proportions_by_aa.columns)
+    return codon_proportions_by_aa.mul(degeneracy)
+
+
 def by_aa_codon_usage(
         codon_counts,
         verbose=False, return_more=False, ref_filter_dict=None):
@@ -515,6 +556,16 @@ the total for the corresponding amino-acid.
         np.logical_not(
             np.isclose(
                 sums.values, all_zeroes)))
+    if return_more:
+        render_md(f"""
+We will also compute RSCU (Relative Synonymous Codon Usage) by multiplying
+codon proportions by the degeneracy of the corresponding amino-acid, that is,
+by the number of different codons existing for that amino-acid.
+This amounts to computing codon counts relative to the average codon counts
+for that amino-acid.
+(This corresponds to R3 in {SUZUKI_LINK})
+""")
+        rscu = compute_rscu(codon_proportions)
     if ref_filter_dict is None:
         counts_for_global = codon_counts
     else:
@@ -561,6 +612,7 @@ across genes) so that they are more comparable between codons.
         return {
             "biases": standardized_codon_usage_biases,
             "proportions": codon_proportions,
+            "rscu": rscu,
             "global_proportions": global_proportions}
     return standardized_codon_usage_biases
 
@@ -711,7 +763,7 @@ def codon_influence_in_components(
         pd.Series(
             components[component],
             index=colnames).plot.bar(
-                ax=axes[component],
+                ax=axis,
                 # colname is supposed to end with the 3-letters codon
                 color=[
                     nuc2colour[colname[-1]]
@@ -871,7 +923,7 @@ def make_centroids_cluster_finder(
     biases for the centroids.
     """
     # The columns that contain data pertaining to codons coding aa:
-    cols_for_aa = (centroids_table.columns.get_level_values(0) == aa)
+    cols_for_aa = centroids_table.columns.get_level_values(0) == aa
     cluster_names = [
         f"{aa}_{codon}"
         for (aa, codon) in centroids_table.columns[cols_for_aa]]
@@ -1115,7 +1167,6 @@ def extract_top_genes_from_cluster(
     aa_dir = clusters_dir.joinpath(star2stop(aa))
     # Files where to save the results
     top_dir = aa_dir.joinpath("top_genes")
-    top_dir.mkdir(parents=True, exist_ok=True)
     path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
     path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
     path_to_fig = top_dir.joinpath(star2stop(
@@ -1165,6 +1216,7 @@ def extract_top_genes_from_cluster(
     # Save the figure and close it.
     plt.tight_layout()
     if savefig:
+        top_dir.mkdir(parents=True, exist_ok=True)
         plt.savefig(path_to_fig, metadata={'creationDate': None})
         plt.close()
         # Save the list of the top genes
-- 
GitLab