From 80d5d3ae590e19f2460f621afda4ec1f509b8576 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Thu, 17 Mar 2022 12:34:24 +0100 Subject: [PATCH] Functions to cluster around "full-bias" centroids. --- libcodonusage/__init__.py | 4 ++- libcodonusage/libcodonusage.py | 50 +++++++++++++++++++++++++++++++++- 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 3626e45..9e9c9d7 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.13" +__version__ = "0.15" from .libcodonusage import ( aa2colour, aa_usage, @@ -14,6 +14,8 @@ from .libcodonusage import ( load_bias_table, load_counts_table, make_aa_codon_columns, + make_cluster_table, + make_centroids_cluster_finder, make_counts_only, render_md, save_counts_table, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index d4531c0..72bd6d1 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -18,7 +18,7 @@ import json from operator import itemgetter from pathlib import Path # python3 -m pip install cytoolz -from cytoolz import groupby +from cytoolz import groupby, unique # To render mardown in a Jupyter notebook on gitlab from IPython.core.display import display, HTML # python3 -m pip install markdown @@ -45,6 +45,8 @@ import pandas as pd # Python module that facilitates the exploration of tbular data # python3 -m pip install plydata # from plydata import define, pull, query +# python3 -m pip install scipy +from scipy.spatial.distance import sqeuclidean as sqdist # Python library with useful data-processing features # python3 -m pip install scikit-learn # https://scikit-learn.org/stable/install.html @@ -635,6 +637,52 @@ in the data. 
def make_centroids_cluster_finder(centroids_table, aa):
    """
    Make a function that, when applied to a row in the standardized
    codon bias table, determines to what centroid among those
    corresponding to *aa* it is the closest.

    The *centroids_table* should contain standardized codon usage
    biases for the centroids. Its columns are (amino-acid, codon)
    pairs, and its rows are looked up by cluster names of the form
    "<amino-acid>_<codon>", either in a flat index or in the first
    level of a multi-level index.

    The returned function takes one row *gene* laid out along the same
    columns as *centroids_table*, and returns the name of the cluster
    whose centroid has the smallest squared Euclidean distance to
    *gene*, using only the columns for *aa*. In case of equal
    distances, the smallest cluster name is returned.

    Raise KeyError if *centroids_table* has no row for one of the
    clusters of *aa*, and ValueError if it has more than one.
    """
    # The columns that contain data pertaining to codons coding aa:
    cols_for_aa = (centroids_table.columns.get_level_values(0) == aa)
    nb_codons = int(cols_for_aa.sum())
    # The centroids do not depend on the gene being classified:
    # their coordinates are extracted once here, and not at each call
    # of the returned function.
    centroid_biases = {}
    for (col_aa, codon) in centroids_table.columns[cols_for_aa]:
        cluster_name = f"{col_aa}_{codon}"
        # Selecting with a list always yields a DataFrame, be the row
        # index flat or multi-level. The values are flattened because
        # the distance function requires 1-D vectors.
        biases = centroids_table.loc[[cluster_name]].iloc[
            :, cols_for_aa].values.ravel()
        if biases.shape != (nb_codons,):
            raise ValueError(
                f"Expected exactly one centroid named {cluster_name}.")
        centroid_biases[cluster_name] = biases

    def closest_centroid(gene):
        gene_biases = gene[cols_for_aa].values
        # We return the cluster_name (`min(...))[1]`)
        # associated with the minimum distance
        return min(
            (sqdist(gene_biases, biases), cluster_name)
            for (cluster_name, biases)
            in centroid_biases.items())[1]
    return closest_centroid


def make_cluster_table(scub_table, centroids_scub_table):
    """
    Make a table for the genes in standardized codon usage bias
    table *scub_table* where each column tells, for a given
    amino-acid, to which centroid in *centroids_scub_table*
    it is the closest.

    The result has the same index as *scub_table*, and one column
    ("cluster_<amino-acid>_full_bias", "") for each amino-acid found in
    the first level of the columns of *centroids_scub_table*, in order
    of first appearance.
    """
    # Index.unique() keeps the values in their order of appearance.
    amino_acids = centroids_scub_table.columns.get_level_values(0).unique()
    return pd.DataFrame(
        {
            (f"cluster_{aa}_full_bias", ""): scub_table.apply(
                make_centroids_cluster_finder(centroids_scub_table, aa),
                axis=1).values
            for aa in amino_acids},
        index=scub_table.index)