From 2f2bba77a9b1867bb45d6035a794c0cf956c972c Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Tue, 3 Oct 2023 12:45:57 +0200 Subject: [PATCH] Also compute global RSCU. Hopefully, this can be used to fill NaNs for PCA. --- libcodonusage/__init__.py | 2 +- libcodonusage/libcodonusage.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index bcdaab8..2065ff6 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022-2023 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.27.4" +__version__ = "0.27.5" from .libcodonusage import ( aa2colour, aa_usage, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 2408e4a..2be726c 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -531,7 +531,7 @@ def check_aa_codon_columns(table): Check that the columns of *table* correspond to (aa, codon) pairs. """ msg = "Codon proportions table should have two levels: 'aa' and 'codon'" - if table.columns.nlevels !=2: + if table.columns.nlevels != 2: raise ValueError(msg) if table.columns.names[0] != "aa": raise ValueError(msg) @@ -549,6 +549,9 @@ def compute_rscu(codon_proportions_by_aa): same amino-acid. The column index should be a pandas MultiIndex where the first level is the amino-acid name, and the second level the codon. + + Return a pair where the first element is the rscu table, and + the second is the degeneracy table. """ check_aa_codon_columns(codon_proportions_by_aa) degeneracy = pd.Series( @@ -578,7 +581,9 @@ def compute_rscu(codon_proportions_by_aa): itemgetter(0), codon_proportions_by_aa.columns)).items()])), index=codon_proportions_by_aa.columns) - return codon_proportions_by_aa.mul(degeneracy) + return ( + codon_proportions_by_aa.mul(degeneracy), + degeneracy) def by_aa_codon_usage( @@ -640,13 +645,14 @@ This amounts to computing codon counts relative to the average codon counts for that amino-acid. (This corresponds to R3 in {SUZUKI_LINK}) """) - rscu = compute_rscu(codon_proportions) + (rscu, degeneracy) = compute_rscu(codon_proportions) render_md(f""" We will also compute R4 ({SUZUKI_LINK}) by computing codon counts relative to the maxium across synonymous codons of the counts for that amino-acid. """) + max_counts = codon_counts.T.groupby("aa").max().T r4_table = codon_counts.div( - codon_counts.T.groupby("aa").max().T) + max_counts) if ref_filter_dict is None: counts_for_global = codon_counts else: @@ -695,7 +701,8 @@ across genes) so that they are more comparable between codons. "proportions": codon_proportions, "rscu": rscu, "r4_table": r4_table, - "global_proportions": global_proportions} + "global_proportions": global_proportions, + "global_rscu": global_proportions.mul(degeneracy)} return standardized_codon_usage_biases @@ -944,7 +951,7 @@ def centroid_usage(codon_counts, all_nan_cols): global_usage = codon_counts.sum(axis=0) global_summed_by_aa = global_usage.groupby(level=0).sum() global_proportions_by_aa = global_usage.div( - global_summed_by_aa) + global_summed_by_aa) codon_proportions_by_aa = codon_counts.div(summed_by_aa) codon_usage_biases_by_aa = codon_proportions_by_aa.sub( global_proportions_by_aa) -- GitLab