diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 3107f30870c7f07d494f2065049442353da363ab..3626e45e9d0ad826a9e7e0c9f0be9f0409d4d4e6 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,10 +1,11 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.12" +__version__ = "0.13" from .libcodonusage import ( aa2colour, aa_usage, by_aa_codon_usage, + centroid_usage, codon2aa, columns_by_aa, detect_fishy_genes, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index edc2ecd9757d2a0701d8d92432dfa12bd256e097..d4531c0c429747903ac603c6af54181afe28e25b 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -17,6 +17,8 @@ import json from operator import itemgetter from pathlib import Path +# python3 -m pip install cytoolz +from cytoolz import groupby # To render mardown in a Jupyter notebook on gitlab from IPython.core.display import display, HTML # python3 -m pip install markdown @@ -40,8 +42,6 @@ import numpy as np # python3 -m pip install pandas # See https://pandas.pydata.org/pandas-docs/stable/index.html import pandas as pd -# python3 -m pip install cytoolz -from cytoolz import groupby # Python module that facilitates the exploration of tbular data # python3 -m pip install plydata # from plydata import define, pull, query @@ -569,6 +569,72 @@ methionine (M) and tryptophan (W). all_nan_cols) +def centroid_usage(codon_counts, all_nan_cols): + """ + Define "centroids" for gene clusters, one per codon in *codon_counts*. + + The standardized usage biases of these centroids are computed so that + the codon proportions for a given amino-acid are the same as the global + proportions, except for the codons corresponding to a certain amino-acid. + For each amino-acid, there is one centroid per codon, where the + proportion for this codon is set to 1.0, and 0.0 for the other codons. + """ + summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() + global_usage = codon_counts.sum(axis=0) + global_summed_by_aa = global_usage.groupby(level=0).sum() + global_proportions_by_aa = global_usage.div( + global_summed_by_aa) + codon_proportions_by_aa = codon_counts.div(summed_by_aa) + codon_usage_biases_by_aa = codon_proportions_by_aa.sub( + global_proportions_by_aa) + # _columns_by_aa may be different from columns_by_aa + # Groups of column headers for each amino-acid + # key: aa, value: list of (aa, codon) pairs + _columns_by_aa = groupby(itemgetter(0), codon_proportions_by_aa.columns) + + def generate_centroid_proportions(): + for (aa1, columns) in _columns_by_aa.items(): + for (_, codon1) in columns: + proportions = global_proportions_by_aa.copy() + for (aa2, codon2) in columns: + proportions[(aa2, codon2)] = 0.0 + proportions[(aa1, codon1)] = 1.0 + proportions.name = ( + f"{aa1}_{codon1}", f"{aa1}_{codon1}", + *np.full( + codon_proportions_by_aa.index.nlevels - 2, + np.nan)) + yield proportions + + render_md(""" +For each codon, a "centroid" (future center of a cluster of genes) +is defined where the proportion of this codon is set to 1.0. +For the other codons corresponding to the same amino-acid, the proportion +is therefore set to 0.0. +For codons corresponding to other amino-acids, the proportions are set to +the proportions observed in the global data. +""") + centroids_proportions_by_aa = pd.DataFrame( + generate_centroid_proportions()) + centroids_proportions_by_aa.index.rename( + codon_proportions_by_aa.index.names, + inplace=True) + + render_md(""" +The biases for the centroids are then computed by subtracting to those +proportions the proportions in the global data, and the biases are +standardized by dividing by the standard deviation of codon usage biases +in the data. +""") + # Compute biases using the same global proportions as the real data + # and standardize by the same standard deviations as the real data + centroids_scub_by_aa = centroids_proportions_by_aa.sub( + global_proportions_by_aa).div(codon_usage_biases_by_aa.std()).drop( + columns=all_nan_cols) + # centroids_scub_by_aa.head(3) + return centroids_scub_by_aa + + def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2): """ Load a table containing by-amino-acid codon usage biases.