From 80d5d3ae590e19f2460f621afda4ec1f509b8576 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Thu, 17 Mar 2022 12:34:24 +0100
Subject: [PATCH] Functions to cluster around "full-bias" centroids.

---
 libcodonusage/__init__.py      |  4 ++-
 libcodonusage/libcodonusage.py | 50 +++++++++++++++++++++++++++++++++-
 2 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 3626e45..9e9c9d7 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,6 +1,6 @@
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.13"
+__version__ = "0.15"
 from .libcodonusage import (
     aa2colour,
     aa_usage,
@@ -14,6 +14,8 @@ from .libcodonusage import (
     load_bias_table,
     load_counts_table,
     make_aa_codon_columns,
+    make_cluster_table,
+    make_centroids_cluster_finder,
     make_counts_only,
     render_md,
     save_counts_table,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index d4531c0..72bd6d1 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -18,7 +18,7 @@ import json
 from operator import itemgetter
 from pathlib import Path
 # python3 -m pip install cytoolz
-from cytoolz import groupby
+from cytoolz import groupby, unique
 # To render mardown in a Jupyter notebook on gitlab
 from IPython.core.display import display, HTML
 # python3 -m pip install markdown
@@ -45,6 +45,8 @@ import pandas as pd
 # Python module that facilitates the exploration of tbular data
 # python3 -m pip install plydata
 # from plydata import define, pull, query
+# python3 -m pip install scipy
+from scipy.spatial.distance import sqeuclidean as sqdist
 # Python library with useful data-processing features
 # python3 -m pip install scikit-learn
 # https://scikit-learn.org/stable/install.html
@@ -635,6 +637,52 @@ in the data.
     return centroids_scub_by_aa
 
 
+def make_centroids_cluster_finder(centroids_table, aa):
+    """
+    Return a function that maps one row (gene) of the standardized
+    codon usage bias table to the name ("{aa}_{codon}") of the closest
+    centroid for *aa*, by squared Euclidean distance over the *aa*
+    columns. NOTE(review): `.loc[name].iloc[:, mask]` needs `.loc` to
+    yield a 2-D selection from *centroids_table* -- confirm its index.
+    """
+    # Boolean mask of the columns whose first-level label equals aa:
+    cols_for_aa = (centroids_table.columns.get_level_values(0) == aa)
+    cluster_names = [
+        f"{aa}_{codon}"
+        for (aa, codon) in centroids_table.columns[cols_for_aa]]
+
+    def closest_centroid(gene):
+        dists_to_centroids = {}
+        for cluster_name in cluster_names:
+            dists_to_centroids[cluster_name] = sqdist(
+                gene[cols_for_aa].values,
+                centroids_table.loc[cluster_name].iloc[:, cols_for_aa].values)
+        # (dist, name) tuples compare by distance first, then by name on
+        # ties; the trailing `[1]` extracts the name of the smallest one.
+        return min(
+            (dist, cluster_name)
+            for (cluster_name, dist)
+            in dists_to_centroids.items())[1]
+    return closest_centroid
+
+
+def make_cluster_table(scub_table, centroids_scub_table):
+    """
+    Build a table indexed like *scub_table* (standardized codon usage
+    biases, one row per gene) with one column per amino-acid, keyed
+    ("cluster_{aa}_full_bias", ""), holding the name of the closest
+    centroid in *centroids_scub_table* for that amino-acid.
+    """
+    return pd.DataFrame(
+        {
+            (f"cluster_{aa}_full_bias", ""): scub_table.apply(
+                make_centroids_cluster_finder(centroids_scub_table, aa),
+                axis=1).values
+            for aa
+            in unique(centroids_scub_table.columns.get_level_values(0))},
+        index=scub_table.index)
+
+
 def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
     """
     Load a table containing by-amino-acid codon usage biases.
-- 
GitLab