From a602b0bfe75d8ca4cd9bb2639b421e752aba513b Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Mon, 14 Mar 2022 12:04:02 +0100 Subject: [PATCH] Function computing aa usage biases. --- libcodonusage/__init__.py | 3 +- libcodonusage/libcodonusage.py | 77 +++++++++++++++++++++++++++++++++- 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index b2e5210..5b810c1 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,8 +1,9 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.10" +__version__ = "0.11" from .libcodonusage import ( aa2colour, + aa_usage, by_aa_codon_usage, codon2aa, columns_by_aa, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index f9ab307..761e24e 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -105,7 +105,7 @@ def load_counts_table( """ render_md(f"Loading data from [{table_path}]({table_path})...\n") codon_counts = pd.read_table(table_path, index_col=index_col) - if index_unique and not(codon_counts.index.is_unique): + if index_unique and not codon_counts.index.is_unique: raise ValueError(f"Index {index_col} contains repeated values.\n") nb_genes = len(codon_counts) render_md( @@ -340,6 +340,9 @@ def save_counts_table(counts_table, table_path): def gene_wide_codon_usage(codon_counts, verbose=False): """ + Compute codon usage biases "gene-wide" as the standardized + difference between a gene's codon proportions and global + codon proportions. """ render_md(""" We will compute codon usage "gene-wide" (i.e. not by amino-acid), @@ -396,6 +399,10 @@ across genes) so that they are more comparable between codons. def by_aa_codon_usage(codon_counts, verbose=False): """ + Compute codon usage biases "by amino-acid" as the standardized + difference between a gene's codon proportions and global + codon proportions, where proportions are computed within + groups of same-amino-acid-coding codons instead of gene-wide. """ render_md(""" We will compute codon usage "by amino-acid", by looking at the @@ -470,6 +477,74 @@ across genes) so that they are more comparable between codons. return standardized_codon_usage_biases +def aa_usage(codon_counts, verbose=False): + """ + Compute amino-acid usage biases as the standardized + difference between a gene's amino-acid proportions + and global amino-acid proportions. + """ + render_md(""" +We will compute amino-acid usage, by looking at the +proportions of amino-acids within a gene's CDS. +""") + render_md(""" +We first need to compute, for a given gene, the total number of codons +corresponding to each amino-acid. +""") + summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() + render_md(""" +To compute amino-acid proportions, we can divide each line by its sum, +or, equivalently (and hopefully more efficiently), normalize the data +using the "l1" norm (which, for positive-only values amounts to the sum). +""") + aa_proportions = pd.DataFrame( + normalize(summed_by_aa, norm="l1"), + index=summed_by_aa.index, + columns=summed_by_aa.columns) + # Doesn't seem to be working: + # aa_proportions.style.hide(axis="index") + if verbose: + display(aa_proportions.head(3)) + # Checking that proportions sum to 1 + colsums = aa_proportions.sum(axis=1) + assert np.allclose(colsums, np.full(len(colsums), 1)) + # Then, computing the global amino-acid proportions + render_md(""" +To compute the usage biases, we also need to compute amino-acid proportions +in the global usage. +""") + render_md(""" +We compute the global usage, as the sum of the counts for a given codon, +across genes. +""") + global_usage = codon_counts.sum(axis=0) + render_md("Then we sum over codons corresponding to the same amino-acid.") + global_summed_by_aa = global_usage.groupby(level=0).sum() + render_md("Then we normalize it the same way as for the individual genes.") + # The values.reshape(1, -1) turns the Series data into a 2D array + # with one line. flatten() turns the data back to 1D + global_aa_proportions = pd.Series( + normalize( + global_summed_by_aa.values.reshape(1, -1), + norm="l1").flatten(), + index=global_summed_by_aa.index) + render_md(""" +Amino-acid usage biases can be computed by subtracting the corresponding +global proportion to an amino-acid's proportion. +""") + aa_usage_biases = aa_proportions.sub(global_aa_proportions) + render_md(""" +We can standardize these values (by dividing by their standard deviation +across genes) so that they are more comparable between amino-acids. +""") + standardized_aa_usage_biases = aa_usage_biases.div(aa_usage_biases.std()) + # Doesn't seem to be working: + # standardized_aa_usage_biases.style.hide(axis="index") + if verbose: + display(standardized_aa_usage_biases.head(3)) + return standardized_aa_usage_biases + + def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2): """ Load a table containing by-amino-acid codon usage biases. -- GitLab