diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 1254cf9ee64479b766bfb0f1403c8b2100dabd41..b6c2fcc1a58d8c00aad8265a5bee03e67810dd49 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ -__copyright__ = "Copyright (C) 2022 Blaise Li" +__copyright__ = "Copyright (C) 2022-2023 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.27" +__version__ = "0.27.1" from .libcodonusage import ( aa2colour, aa_usage, @@ -10,6 +10,7 @@ from .libcodonusage import ( codon_usage_pca, columns_by_aa, compare_clusterings, + compute_rscu, detect_fishy_genes, exclude_all_nan_cols, extract_top_genes_from_cluster, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 9e60d26b39ea4b993692e499a159729147339af5..848efc2bee85e8089d53bac09e1100e576c8f442 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -# Copyright (C) 2021-2022 Blaise Li +# Copyright (C) 2021-2023 Blaise Li # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ import json from operator import attrgetter, itemgetter from pathlib import Path # python3 -m pip install cytoolz -from cytoolz import concat, groupby, unique +from cytoolz import concat, groupby, unique, valmap # To render mardown in a Jupyter notebook on gitlab from IPython.display import display from IPython.core.display import HTML @@ -65,7 +65,8 @@ fmt_metadata = { "png": {"creationDate": None}, "svg": { "Creator": None, "Date": None, - "Title": "Distribution of standardized codon usages biases (by aa), by chromosome"} + "Title": ("Distribution of standardized codon usages biases" + " (by aa), by chromosome")} } @@ -153,7 +154,7 @@ def compute_criteria(codon_counts): start_upstream_met_start_col = start_upstream_col & ~no_met_start_col start_upstream_met_start_nostops_col = ( start_upstream_met_start_col & ~has_stops_col) - good_met_start_col = (~start_upstream_col & ~no_met_start_col) + good_met_start_col = ~start_upstream_col & ~no_met_start_col has_stops_good_met_start_col = ( has_stops_col & ~no_met_start_col & ~start_upstream_met_start_col) @@ -465,7 +466,47 @@ across genes) so that they are more comparable between codons. return standardized_codon_usage_biases -# TODO: add option to output RSCU instead of / besides proportions +def compute_rscu(codon_proportions_by_aa): + """ + Compute Relative Syninymous Codon Usage from proportions in genes. + + *codon_proportions_by_aa* should be a pandas DataFrame where + rows correspond to genes, and columns to codons. It contains + the proportions in the gene of the codon among those coding the + same amino-acid. The column index should be a pandas MultiIndex + where the first level is the amino-acid name, and the second level + the codon. + """ + degeneracy = pd.Series( + # concat "flattens" the list of iterables given as arguments + # (list of tuples of repeated degeneracy values) + list(concat([ + # we want stretches of the same degeneracy values + # of as many elements as there are codons in this degeneracy group, + # hence the tuple multiplication + ((degen),) * degen + # valmap transforms values in a dict given as second argument + # using the function given as first argument + # We extract only the value from the (key, value) pairs + # generated by the .items() methods of the resulting dict + # (Actually, we could probably have just used the .values() method) + for (_, degen) in valmap( + # len gives the length of each group, + # where a group contains the (aa, codon) pairs for a given aa, + # that is, the degeneracy for this group of codons + len, + # groupby creates a dict where keys are obtained from + # the function given as first argument and values are + # groups made from the iterable given as second argument + # (here, column names, which are (aa, codon) pairs) + # itemgetter(0) extracts the amino-acid from a pair (aa, codon) + groupby( + itemgetter(0), + codon_proportions_by_aa.columns)).items()])), + index=codon_proportions_by_aa.columns) + return codon_proportions_by_aa.mul(degeneracy) + + def by_aa_codon_usage( codon_counts, verbose=False, return_more=False, ref_filter_dict=None): @@ -515,6 +556,16 @@ the total for the corresponding amino-acid. np.logical_not( np.isclose( sums.values, all_zeroes))) + if return_more: + render_md(f""" +We will also compute RSCU (Relative Synonymous Codon Usage) by multiplying +codon proportions by the degeneracy of the corresponding amino-acid, that is, +by the number of different codons existing for that amino-acid. +This amounts to computing codon counts relative to the average codon counts +for that amino-acid. +(This corresponds to R3 in {SUZUKI_LINK}) +""") + rscu = compute_rscu(codon_proportions) if ref_filter_dict is None: counts_for_global = codon_counts else: @@ -561,6 +612,7 @@ across genes) so that they are more comparable between codons. return { "biases": standardized_codon_usage_biases, "proportions": codon_proportions, + "rscu": rscu, "global_proportions": global_proportions} return standardized_codon_usage_biases @@ -711,7 +763,7 @@ def codon_influence_in_components( pd.Series( components[component], index=colnames).plot.bar( - ax=axes[component], + ax=axis, # colname is supposed to end with the 3-letters codon color=[ nuc2colour[colname[-1]] @@ -871,7 +923,7 @@ def make_centroids_cluster_finder( biases for the centroids. """ # The columns that contain data pertaining to codons coding aa: - cols_for_aa = (centroids_table.columns.get_level_values(0) == aa) + cols_for_aa = centroids_table.columns.get_level_values(0) == aa cluster_names = [ f"{aa}_{codon}" for (aa, codon) in centroids_table.columns[cols_for_aa]] @@ -1115,7 +1167,6 @@ def extract_top_genes_from_cluster( aa_dir = clusters_dir.joinpath(star2stop(aa)) # Files where to save the results top_dir = aa_dir.joinpath("top_genes") - top_dir.mkdir(parents=True, exist_ok=True) path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) path_to_fig = top_dir.joinpath(star2stop( @@ -1165,6 +1216,7 @@ def extract_top_genes_from_cluster( # Save the figure and close it. plt.tight_layout() if savefig: + top_dir.mkdir(parents=True, exist_ok=True) plt.savefig(path_to_fig, metadata={'creationDate': None}) plt.close() # Save the list of the top genes