Skip to content
Snippets Groups Projects
Commit 80d5d3ae authored by Blaise Li's avatar Blaise Li
Browse files

Functions to cluster around "full-bias" centroids.

parent e486cb54
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = "0.13"
__version__ = "0.15"
from .libcodonusage import (
aa2colour,
aa_usage,
......@@ -14,6 +14,8 @@ from .libcodonusage import (
load_bias_table,
load_counts_table,
make_aa_codon_columns,
make_cluster_table,
make_centroids_cluster_finder,
make_counts_only,
render_md,
save_counts_table,
......
......@@ -18,7 +18,7 @@ import json
from operator import itemgetter
from pathlib import Path
# python3 -m pip install cytoolz
from cytoolz import groupby
from cytoolz import groupby, unique
# To render markdown in a Jupyter notebook on gitlab
from IPython.core.display import display, HTML
# python3 -m pip install markdown
......@@ -45,6 +45,8 @@ import pandas as pd
# Python module that facilitates the exploration of tabular data
# python3 -m pip install plydata
# from plydata import define, pull, query
# python3 -m pip install scipy
from scipy.spatial.distance import sqeuclidean as sqdist
# Python library with useful data-processing features
# python3 -m pip install scikit-learn
# https://scikit-learn.org/stable/install.html
......@@ -635,6 +637,52 @@ in the data.
return centroids_scub_by_aa
def make_centroids_cluster_finder(centroids_table, aa):
    """
    Make a function that, when applied to a row in the standardized
    codon bias table, determines to what centroid among those
    corresponding to *aa* it is the closest.

    The *centroids_table* should contain standardized codon usage
    biases for the centroids. Its columns are expected to be
    (amino-acid, codon) pairs, and ``centroids_table.loc[name]`` must
    yield a one-row table for each cluster name of the form
    ``"{aa}_{codon}"`` (for instance when the row index is a MultiIndex
    whose first level holds the cluster names).

    The centroid coordinates are extracted once, when this factory is
    called, so a cluster name missing from the row index is reported
    (``KeyError``) at that point and not when the returned function is
    first applied.

    The returned function takes a row *gene* whose entries are laid out
    like the columns of *centroids_table* (the same boolean column mask
    is applied to both) and returns the name of the closest centroid,
    using the squared Euclidean distance restricted to the codons of
    *aa*. In case of equal distances, the alphabetically first cluster
    name is returned. It raises ``ValueError`` if *aa* has no column in
    *centroids_table* (there is then nothing to take the minimum of).
    """
    # The columns that contain data pertaining to codons coding aa:
    cols_for_aa = (centroids_table.columns.get_level_values(0) == aa)
    cluster_names = [
        f"{col_aa}_{codon}"
        for (col_aa, codon) in centroids_table.columns[cols_for_aa]]
    # The centroid coordinates do not depend on the gene being examined:
    # look them up once here instead of once per gene and per cluster.
    # `.ravel()` turns the one-row (1, k) selection into the 1-D vector
    # that scipy distance functions require (recent scipy versions
    # no longer silently drop length-1 dimensions).
    centroid_vectors = {
        cluster_name: (
            centroids_table.loc[cluster_name]
            .iloc[:, cols_for_aa].values.ravel())
        for cluster_name in cluster_names}

    def closest_centroid(gene):
        """Return the name of the centroid for *aa* closest to *gene*."""
        gene_vector = gene[cols_for_aa].values
        # We return the cluster_name (`min(...))[1]`)
        # associated with the minimum distance
        return min(
            (sqdist(gene_vector, centroid_vector), cluster_name)
            for (cluster_name, centroid_vector)
            in centroid_vectors.items())[1]

    return closest_centroid
def make_cluster_table(scub_table, centroids_scub_table):
    """
    Build a table telling, for each gene (row) of the standardized
    codon usage bias table *scub_table* and for each amino-acid, which
    centroid of *centroids_scub_table* the gene is the closest to.

    The amino-acids are the distinct values found in the first level of
    the columns of *centroids_scub_table*. The result has one column
    per amino-acid, keyed by the pair
    ``("cluster_{aa}_full_bias", "")``, and shares its index with
    *scub_table*.
    """
    amino_acids = unique(centroids_scub_table.columns.get_level_values(0))
    closest_by_column = {}
    for aa in amino_acids:
        # One dedicated "closest centroid" function per amino-acid,
        # applied to every gene (row) of the bias table.
        find_closest = make_centroids_cluster_finder(
            centroids_scub_table, aa)
        column_key = (f"cluster_{aa}_full_bias", "")
        assigned = scub_table.apply(find_closest, axis=1)
        # Only the raw values are kept: the index is set explicitly
        # when assembling the final table.
        closest_by_column[column_key] = assigned.values
    return pd.DataFrame(closest_by_column, index=scub_table.index)
def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
"""
Load a table containing by-amino-acid codon usage biases.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment