From 7b63a9e271af7b6ed18179a9f174e90519a1a552 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Mon, 2 May 2022 12:16:22 +0200
Subject: [PATCH] Optional codon influence in PCA.

---
 libcodonusage/__init__.py      |  2 +-
 libcodonusage/libcodonusage.py | 87 ++++++++++++++++++++++++----------
 2 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 4fcad87..a79d9a5 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,6 +1,6 @@
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.25"
+__version__ = "0.26"
 from .libcodonusage import (
     aa2colour,
     aa_usage,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 49d5c6b..3bb2a5e 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -674,10 +674,62 @@ methionine (M) and tryptophan (W).
         all_nan_cols)
 
 
+def codon_influence_in_components(
+        components, colnames,
+        figs_dir=None, formats=None):
+    """
+    Plot the influence of the columns in the first 4 principal axes of a PCA.
+
+    Each column should correspond to a codon, and will be represented as a
+    colour bar whose colour is based on the last letter of the codon.
+
+    *components* should be a numpy array representing the principal
+    axes of a PCA, such as the `components_` attribute of a fitted
+    `sklearn.decomposition.PCA` object.
+
+    *colnames* should be the names of the columns in the *components*
+    array and are expected to match the following pattern:
+    <aa>_<codon>, where <aa> is a single-letter code for an amino-acid,
+    and <codon> is one of the 3-letter codons for this amino-acid,
+    in capital letters among A, T, G and C (i.e. in the DNA alphabet).
+    The last letter will be used to set the colour of a bar in the plots.
+
+    *figs_dir* should be a path to a directory that will be used
+    to save graphics representing the influence of each data column
+    on the first four principal components.
+
+    *formats* should be a list of formats in which the figures should
+    be saved, such as "svg" or "png".
+    """
+    render_md(
+        "Vizualizing the influence of codons in the first 4 components\n")
+    # TODO: *figsize* could be adapted depending on the number of columns
+    (fig, axes) = plt.subplots(4, 1, figsize=(16, 16))
+    for (component, axis) in enumerate(axes):
+        pd.Series(
+            components[component],
+            index=colnames).plot.bar(
+                ax=axes[component],
+                # colname is supposed to end with the 3-letter codon
+                color=[
+                    nuc2colour[colname[-1]]
+                    for colname in colnames])
+        axis.set_ylabel(f"weight in component {component}")
+        # axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
+    fig.subplots_adjust(hspace=.5)
+    if figs_dir is not None and formats is not None:
+        for ext in formats:
+            plt.savefig(
+                figs_dir.joinpath(f"PCA_components.{ext}"),
+                metadata=fmt_metadata[ext])
+    display(fig)
+    plt.close(fig)
+
+
 def codon_usage_pca(
         usage_data,
         figs_dir=None, hue="chrom", exclude_cols=None,
-        formats=None):
+        formats=None, cols_are_codons=True):
     """
     Perform Principal Component Analysis on *usage_data*.
 
@@ -696,9 +748,12 @@ def codon_usage_pca(
 
     If *figs_dir* is not None, this path to a directory will be used
     to save graphics representing the projection of the observations
-    in the first four principal components (0 vs. 1 and 2 vs. 3)
-    as well as graphics representing the influence of each data column
+    in the first four principal components (0 vs. 1 and 2 vs. 3).
+
+    Unless *cols_are_codons* is set to False, there will also be
+    graphics representing the influence of each data column
     on the first four principal components.
+
     *formats* should be a list of formats in which the figures should
     be saved, such as "svg" or "png".
 
@@ -730,28 +785,10 @@ def codon_usage_pca(
                 metadata=fmt_metadata[ext])
     display(fig)
     plt.close(fig)
-    render_md(
-        "Vizualizing the influence of codons in the first 4 components\n")
-    (fig, axes) = plt.subplots(4, 1, figsize=(16, 16))
-    for (component, axis) in enumerate(axes):
-        pd.Series(
-            pca.components_[component],
-            index=usage_data.columns).plot.bar(
-                ax=axes[component],
-                # colname is supposed to end with the 3-letters codon
-                color=[
-                    nuc2colour[colname[-1]]
-                    for colname in usage_data.columns])
-        axis.set_ylabel(f"weight in component {component}")
-        # axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
-    fig.subplots_adjust(hspace=.5)
-    if figs_dir is not None and formats is not None:
-        for ext in formats:
-            plt.savefig(
-                figs_dir.joinpath(f"PCA_components.{ext}"),
-                metadata=fmt_metadata[ext])
-    display(fig)
-    plt.close(fig)
+    if cols_are_codons:
+        codon_influence_in_components(
+            pca.components_, usage_data.columns,
+            figs_dir=figs_dir, formats=formats)
     return (pca, transformed_data)
 
 
-- 
GitLab