From b9011ed25e3de307bdbca326c1b72e629cb58317 Mon Sep 17 00:00:00 2001 From: Marie Anselmet <marie.anselmet@pasteur.fr> Date: Fri, 20 Oct 2023 11:26:02 +0200 Subject: [PATCH] Add little docu to new fonctions --- build/lib/libcodonusage/__init__.py | 45 + build/lib/libcodonusage/libcodonusage.py | 1797 ++++++++++++++++++++++ libcodonusage/libcodonusage.py | 18 +- 3 files changed, 1856 insertions(+), 4 deletions(-) create mode 100644 build/lib/libcodonusage/__init__.py create mode 100644 build/lib/libcodonusage/libcodonusage.py diff --git a/build/lib/libcodonusage/__init__.py b/build/lib/libcodonusage/__init__.py new file mode 100644 index 0000000..45862d8 --- /dev/null +++ b/build/lib/libcodonusage/__init__.py @@ -0,0 +1,45 @@ +__copyright__ = "Copyright (C) 2022-2023 Blaise Li" +__licence__ = "GNU GPLv3" +__version__ = "0.28.1" +from .libcodonusage import ( + aa2colour, + aa_usage, + by_aa_codon_usage, + centroid_usage, + codon2aa, + codon_usage_pca, + columns_by_aa, + compare_clusterings, + compute_rscu, + detect_fishy_genes, + exclude_all_nan_cols, + extract_top_genes_from_cluster, + filter_on_idx_levels, + find_most_biased_genes, + find_valley, + format_codon_labels, + gene_wide_codon_usage, + group_codons_by_sum, + load_bias_table, + load_counts_table, + load_table_with_info_index, + make_aa_codon_columns, + make_cluster_table, + make_centroids_cluster_finder, + make_counts_only, + plot_codon_usage_for_gene_list, + remove_codons, + render_md, + save_counts_table, + sort_counts_by_aa, + split_info_index, + star2stop, + sum_codon_counts, + to_long_form, + violin_usage, + violin_usage_vertical, + violin_usage_by_clusters, + violin_usage_by_clusters_splitby, + violin_with_thresh, + write_cluster_lists, + ) diff --git a/build/lib/libcodonusage/libcodonusage.py b/build/lib/libcodonusage/libcodonusage.py new file mode 100644 index 0000000..603a8bd --- /dev/null +++ b/build/lib/libcodonusage/libcodonusage.py @@ -0,0 +1,1797 @@ +#!/usr/bin/env python3 +# Copyright (C) 2021-2023 Blaise Li +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +"""Functions used in Jupyter notebooks.""" +from http.cookies import Morsel +from itertools import combinations +import json +from operator import attrgetter, itemgetter +from pathlib import Path +from unittest import mock +# python3 -m pip install cytoolz +from cytoolz import concat, groupby, unique, valmap +# To render mardown in a Jupyter notebook on gitlab +from IPython.display import display +from IPython.core.display import HTML +# python3 -m pip install markdown +from markdown import markdown +# Basic plotting library for Python +# python3 -m pip install matplotlib +# See https://matplotlib.org/ +import matplotlib.pyplot as plt +# Python library to make some nicer plots +# python3 -m pip install seaborn +# See https://seaborn.pydata.org/ +import seaborn as sns +# Useful Python package for bioinformatics. +# python3 -m pip install biotite +# See https://www.biotite-python.org/install.html +import biotite.sequence.graphics as bgraphs +from biotite.sequence import CodonTable, NucleotideSequence +# python3 -m pip install numpy +import numpy as np +# Python module to handle tabular data +# python3 -m pip install pandas +# See https://pandas.pydata.org/pandas-docs/stable/index.html +import pandas as pd +# Python module that facilitates the exploration of tbular data +# python3 -m pip install plydata +# from plydata import define, pull, query +# python3 -m pip install scipy +from scipy.linalg import LinAlgError +from scipy.spatial.distance import sqeuclidean as sqdist +from scipy.stats import gaussian_kde +# Python library with useful data-processing features +# python3 -m pip install scikit-learn +# https://scikit-learn.org/stable/install.html +from sklearn.decomposition import PCA +from sklearn.preprocessing import normalize +# Python module to vizualize set intersections +# python3 -m pip install upsetplot +from upsetplot import from_indicators, UpSet + + +fmt_metadata = { + "png": {"creationDate": None}, + "svg": { + "Creator": None, "Date": None, + "Title": ("Distribution of standardized codon usages biases" + " (by aa), by chromosome")} +} + + +def render_md(md_str): + """ + Render a markdown string *md_str* in a Jupyter notebook. + + More direct rendering is not possible on gitlab for security reasons. + """ + display(HTML(markdown(md_str))) + + +codon2aa = CodonTable.load(11).codon_dict() +# Column headers made from (aa, codon) pairs +codon_headers = pd.MultiIndex.from_tuples( + ((aa, codon) for (codon, aa) in codon2aa.items()), + names=("aa", "codon")) +# Groups of column headers for each amino-acid +# key: aa, value: list of (aa, codon) pairs +columns_by_aa = groupby(itemgetter(0), codon_headers) + + +###################################### +# Associating colours to amino-acids # +###################################### +# We load amino-acid colouring information to use in graphical representations. +# We'll use colour schemes provided by the biotite package: +# https://www.biotite-python.org/examples/gallery/sequence/color_schemes_protein.html +# We look directly at the json file +# instead of using the "native" biotite mechanisms +with Path(bgraphs.colorschemes._scheme_dir).joinpath( + "jalview_zappo.json").open() as fh: + colscheme = json.load(fh) +# We add the black colour for the stop "amino-acid" and associated codons +# key: amino-acid +# value: hexadecimal html colour code +aa2colour = {**colscheme["colors"], "*": '#000000'} +###################################### +# Associating colours to nucleotides # +###################################### +nuc_alphabet = NucleotideSequence.alphabet_unamb +nuc_colours = bgraphs.get_color_scheme("rainbow", nuc_alphabet) +nuc2colour = dict(zip(nuc_alphabet, nuc_colours)) + + +def load_counts_table( + table_path, index_col="old_locus_tag", index_unique=True): + """ + Load a table of pre-computed codon counts at *table_path*. + + The lines correspond to genes. + Besides the columns containing the counts for each codon, + there are other columns containing various pieces of information + regarding those genes. + + If *index_unique* is True, a ValueError error will be raised if + some elements in the the column *index_col* are not unique. + """ + render_md(f"Loading data from [{table_path}]({table_path})...\n") + codon_counts = pd.read_table(table_path, index_col=index_col) + if index_unique and not codon_counts.index.is_unique: + raise ValueError(f"Index {index_col} contains repeated values.\n") + nb_genes = len(codon_counts) + render_md( + f""" + There are {nb_genes} genes in the table. + + The table looks as follows (first 3 lines): +""") + display(codon_counts.head(3)) + return codon_counts + + +def compute_criteria(codon_counts): + """ + Prepare a Boolean table and corresponging gene sets. + + There are one column and one gene set for each criterion. + """ + wrong_start_col = codon_counts["expected_start_aa"] == "-" + start_stop_col = codon_counts["expected_start_aa"] == "*" + no_met_start_col = codon_counts["expected_start_aa"] != "M" + start_upstream_col = codon_counts["start_upstream"] + has_stops_col = codon_counts["nb_stops"] > 0 + start_upstream_met_start_col = start_upstream_col & ~no_met_start_col + start_upstream_met_start_nostops_col = ( + start_upstream_met_start_col & ~has_stops_col) + good_met_start_col = ~start_upstream_col & ~no_met_start_col + has_stops_good_met_start_col = ( + has_stops_col & ~no_met_start_col + & ~start_upstream_met_start_col) + no_stop_good_met_start_col = ( + ~has_stops_col & ~no_met_start_col + & ~start_upstream_met_start_col) + criteria = pd.DataFrame({ + "wrong_start": wrong_start_col, + "start_stop": start_stop_col, + "no_met_start": no_met_start_col, + "start_upstream": start_upstream_col, + "has_stops": has_stops_col, + "start_upstream_met_start": start_upstream_met_start_col, + "start_upstream_met_start_nostops": ( + start_upstream_met_start_nostops_col), + "good_met_start": good_met_start_col, + "has_stops_good_met_start": has_stops_good_met_start_col}) + render_md("Number of genes fitting various selection criteria:\n\n") + display(criteria.agg(sum)) + render_md("Upset plot of the non-empty categories:\n\n") + fig = plt.figure() + try: + UpSet( + from_indicators(*[criteria.columns], data=criteria), + show_counts=True).plot(fig=fig) + except AttributeError: + pass + display(fig) + plt.close(fig) + gene_sets = { + "wrong_start": set(criteria[wrong_start_col].index), + "start_stop": set(criteria[start_stop_col].index), + "no_met_start": set(criteria[no_met_start_col].index), + "start_upstream": set(criteria[start_upstream_col].index), + # Not used + # "has_stops": set(criteria[has_stops_col].index), + "start_upstream_met_start": set( + criteria[start_upstream_met_start_col].index), + "start_upstream_met_start_nostops": set( + criteria[start_upstream_met_start_nostops_col].index), + "good_met_start": set(criteria[good_met_start_col].index), + "has_stops_good_met_start": set( + criteria[has_stops_good_met_start_col].index), + "no_stop_good_met_start": set( + criteria[no_stop_good_met_start_col].index)} + return (criteria, gene_sets) + + +def detect_fishy_genes(codon_counts): + """ + Run diagnostics about genes included in table *codon_counts*. + + These should help decide whether to exclude some of the genes. + + A table of boolean criteria is returned, with one line per gene. + """ + + def display_gene_set(gene_set, max_size=10): + """ + Print out genes in a gene set, depending on their number. + """ + if gene_set and len(gene_set) <= max_size: + print(" ", ", ".join(gene_set)) + + render_md("### Searching for mis-annotated genes") + (criteria, gene_sets) = compute_criteria(codon_counts) + render_md( + """ +We should avoid genes that are not in-frame. We can likely exclude +those that do not start with a valid start codon. +Here, this will be seen by looking at the translation of the first +codon. +If it is "*" (`start_stop`) or "-" (`wrong_start`), +the codon is not a valid start codon. +""") + wrong_start = gene_sets["wrong_start"] + render_md(f"There are {len(wrong_start)} genes that start with a `-`.") + display_gene_set(wrong_start) + + start_stop = gene_sets["start_stop"] + render_md( + f"There are {len(start_stop)} genes that start with a stop codon.") + display_gene_set(start_stop) + + no_met_start = gene_sets["no_met_start"] + if no_met_start == wrong_start | start_stop: + render_md( + """ +Genes not belonging to the `start_stop` or `wrong_start` +category all start with a methionine. +""") + else: + raise NotImplementedError( + "There are genes that start with something else than 'M', " + "'-' or '*'.") + + start_upstream = gene_sets["start_upstream"] + render_md( + f"{len(start_upstream)} genes have a possibly ill-defined" + " start position.") + display_gene_set(start_upstream) + + start_upstream_met_start = gene_sets["start_upstream_met_start"] + if start_upstream_met_start: + render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'" + " start.") + display_gene_set(start_upstream_met_start) + + start_upstream_met_start_nostops = gene_sets[ + "start_upstream_met_start_nostops"] + if start_upstream_met_start_nostops: + render_md(f"{len(start_upstream_met_start_nostops)} of them" + " also do not contain any stop.") + display_gene_set(start_upstream_met_start_nostops) + render_md("Those genes could probably be kept.") + + has_stops_good_met_start = gene_sets["has_stops_good_met_start"] + render_md( + f"{len(has_stops_good_met_start)} genes contain stops" + " but have a well defined start position with 'M'.") + display_gene_set(has_stops_good_met_start) + + good_met_start = gene_sets["good_met_start"] + render_md( + f"{len(good_met_start)} genes have a well defined " + "start position with 'M'.") + render_md( + """ +If genes that have stop readthrough are a known phenomenon, +only those among them that do not have a valid start codon +might be excluded. +""") + + no_stop_good_met_start = gene_sets["no_stop_good_met_start"] + render_md( + f"{len(no_stop_good_met_start)} genes have a well defined" + " start position with 'M' and contain no stop codon.") + display_gene_set(no_stop_good_met_start) + return criteria + + +def make_counts_only( + counts_table, + seq_id_kw="locus_tag", alt_tag_kw="old_locus_tag", + ref_info_cols=None): + """ + Integrate "informative" columns of *counts_table* into the index. + """ + # To ensure a stable order: + if ref_info_cols is None: + ref_info_cols = [ + alt_tag_kw, seq_id_kw, + "start", "end", "length", + "start_codon", "expected_start_aa", + "first_stop", "nb_stops", "start_upstream", "end_downstream"] + # To ensure no info columns are lost: + info_cols = [ + counts_table.index.name, + *counts_table.columns.difference(codon2aa)] + assert set(info_cols) == set(ref_info_cols) + return counts_table.reset_index().set_index(ref_info_cols) + + +def make_aa_codon_columns(counts_table): + """Transform column headers into a (aa, codon) MultiIndex.""" + counts_table.columns = pd.MultiIndex.from_tuples( + ((codon2aa[codon], codon) for codon in counts_table.columns), + names=("aa", "codon")) + render_md( + "We associated amino-acid information to codons, " + "as an extra level in the table columns.") + return counts_table + + +def sort_counts_by_aa(counts_table): + """ + Sort columns so that codons are grouped by corresponding amino-acid. + """ + aacodons_order = list(aa2colour.keys()) + # We need to include both aas and codons in the list + # because it will be used for both the aa and the codon levels + # when sorting the (aa, codon) MultiIndex + aacodons_order.extend(codon2aa.keys()) + + def aa_sortkey(col): + """ + Key function to use when sorting a MultiIndex consisting in + (aa, codon) pairs. + """ + return col.map(aacodons_order.index) + sorted_counts = counts_table.sort_index( + axis=1, level=(0, 1), ascending=(True, True), key=aa_sortkey) + render_md( + "The columns were sorted in order to group codons" + " by associated amino-acid.") + return sorted_counts + + +def save_counts_table(counts_table, table_path): + """ + Save table *counts_table* at *table_path*. + """ + table_path.parent.mkdir(parents=True, exist_ok=True) + counts_table.to_csv(table_path, sep="\t") + render_md(f"The table was saved at [{table_path}]({table_path}).") + + +def load_table_with_info_index(table_path, nb_info_cols, nb_cluster_series=0): + """ + Load a table containing by-amino-acid codon counts or usage biases. + + The table, located at *table_path*, should have tab-separated columns. + It is expected to contain a series of informative columns + preceding the columns containing the codon usage bias values. + + The first *nb_info_cols* columns contain information about gene + identifiers and features extracted from CDS annotation data. + + Then, there can be series of gene-clustering information, where each + column of the series corresponds to clusters defined based on + codon-usage biases among the codons coding an amino-acid. + There are no columns for amino-acids coded by only one codon (M and W). + *nb_cluster_series* specifies the number of such series. + + The result is a pandas DataFrame object, where all the above information + is encoded as a MultiIndex, the codon counts or usage biases being in the + remaining columns. + """ + return pd.read_csv( + table_path, + sep="\t", + # *nb_info_cols* starting levels from the initial table, + # plus levels corresponging to clustering information. + # By default, from 2 methods (nb_cluster_series=2): + # * `cluster_{aa}_kmeans` for each amino-acid + # having more than one codon + # * `cluster_{aa}_full_bias` for each amino-acid + # having more than one codon + index_col=list( + range(nb_info_cols + nb_cluster_series * (len(aa2colour) - 2))), + header=[0, 1]) + + +def split_info_index(table, keep_index_cols): + """ + Split table *table* into info and data. + + Info is supposed to be contained in the index of DataFrame *table*, + data in its columns. + + Return a pair of tables, one for the data, one for the info, + where the index contains only the levels listed in *keep_index_cols* + """ + drop_info_cols = [ + colname for colname in table.index.names + if colname not in keep_index_cols] + info_table = table.reset_index()[ + [*keep_index_cols, *drop_info_cols]].set_index(keep_index_cols) + # To avoid loss of index levels in input by side effect: + out_table = table.copy() + out_table.index = out_table.index.droplevel(drop_info_cols) + # To ensure indices have their levels in the same order + # in out_table and in infoinfo_table: + out_table = out_table.reset_index().set_index(keep_index_cols) + return (out_table, info_table) + + +def filter_on_idx_levels(counts_table, filter_dict): + """ + Filter a table *counts_table* based on values of certain index levels. + + *filter_dict* should contain index level names as keys and values that + rows should have at this level to be included in the output + filtered table. + """ + row_filter = np.all([ + counts_table.index.get_level_values(idx_lvl) == idx_val + for (idx_lvl, idx_val) in filter_dict.items()], axis=0) + return counts_table[row_filter] + + +# Codon usage calculations can be done in various ways. +# Some examples are given at page 6500 (2 of the pdf) of +# [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032) +SUZUKI_DOI = "10.1016/j.febslet.2005.10.032" +SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})" + + +def remove_codons(codon_counts, codon_list): + """ + Filter out codons in a table *codon_counts* based on codons present in the list *codon_list* (like stop codons). + """ + codon_counts.drop(columns=codon_list, inplace=True) + return codon_counts + + +def sum_codon_counts(row, codons): + """ + Perform the row-wise sum of codon counts for the codons present in *codons* list given the row *row*. + """ + sum = 0 + for cod in codons: + sum += row[cod] + return sum + + +def group_codons_by_sum(codon_counts, group_name, dict_classes, filter): + """ + Group codons by sum given specific classes in *codon_counts* table. + + *group_name* contains the name of the grouping, and plays the role of aa names in the original + codon counts table. + *dict_classes* contains the different classes under this grouping as keys and the associated + list of codons as values. + *filter* is a boolean set to True if you want to filter out other codons than the ones specified in + dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for + the grouped_classes. + """ + list_classes = list(dict_classes.items()) + list_classes_names = [] + for key, value in dict_classes.items(): + codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1) + list_classes_names.append(key) + if filter: + return codon_counts.loc[:, ([group_name], list_classes_names)] + else: + return codon_counts + + +def gene_wide_codon_usage( + codon_counts, + verbose=False, return_more=False, ref_filter_dict=None): + """ + Compute codon usage biases "gene-wide" as the standardized + difference between a gene's codon proportions and global + codon proportions (default). + + If *ref_filter_dict* is not None, it should be a dict of + (index_level, index_value) pairs and the global codon proportions + will actually be proportions computed on the part of the data + restricted to the genes where the *index_level* has the *index_value* + for all those pairs. + """ + render_md(f""" +We will compute codon usage "gene-wide" (i.e. not by amino-acid), +by looking at the proportion of codons within a gene's CDS. +(This corresponds to R1 in {SUZUKI_LINK}) +""") + render_md(""" +To compute codon proportions, we can divide each line by its sum, +or, equivalently (and hopefully more efficiently), normalize the data +using the "l1" norm (which, for positive-only values amounts to the sum). +""") + codon_proportions = pd.DataFrame( + normalize(codon_counts, norm="l1"), + index=codon_counts.index, + columns=codon_counts.columns) + # Doesn't seem to be working: + # codon_proportions.style.hide(axis="index") + if verbose: + display(codon_proportions.head(3)) + # Check that the sum of proportions (columns) for a gene is 1 + colsums = codon_proportions.sum(axis=1).values + # Due to imprecision in float arithmetics, + # we can only check that the sums are close to 1 + #assert np.allclose(colsums, np.full(len(colsums), 1)) + print("mean", np.mean(colsums)) + assert np.isclose(np.mean(colsums), 1) + if ref_filter_dict is None: + counts_for_global = codon_counts + else: + counts_for_global = filter_on_idx_levels( + codon_counts, ref_filter_dict) + render_md(""" +To compute the usage biases, we also need to compute codon proportions +in the global usage. +""") + render_md("We compute the global usage, as the sum on columns.") + global_usage = counts_for_global.sum(axis=0) + render_md("Then we normalize it the same way as for the individual genes.") + # The values.reshape(1, -1) turns the Series data into a 2D array + # with one line. flatten() turns the data back to 1D + global_proportions = pd.Series( + normalize(global_usage.values.reshape(1, -1), norm="l1").flatten(), + index=global_usage.index) + # Can be 0.999999999999... + # if global_proportions.sum() != 1.: + # display(global_proportions) + # display(global_proportions.sum()) + assert np.isclose(global_proportions.sum(), 1.) + # assert global_proportions.sum() == 1. + render_md(""" +Codon usage biases in genes can then be computed by subtracting +the corresponding global proportion to a codon's proportion. +""") + codon_usage_biases = codon_proportions.sub(global_proportions) + render_md(""" +We can standardize these values (by dividing by their standard deviation +across genes) so that they are more comparable between codons. +""") + standardized_codon_usage_biases = codon_usage_biases.div( + codon_usage_biases.std()) + # Doesn't seem to be working: + # standardized_codon_usage_biases.style.hide(axis="index") + if verbose: + display(standardized_codon_usage_biases.head(3)) + if return_more: + return { + "biases": standardized_codon_usage_biases, + "proportions": codon_proportions, + "global_proportions": global_proportions} + return standardized_codon_usage_biases + + +def check_aa_codon_columns(table): + """ + Check that the columns of *table* correspond to (aa, codon) pairs. + """ + msg = "Codon proportions table should have two levels: 'aa' and 'codon'" + if table.columns.nlevels != 2: + raise ValueError(msg) + if table.columns.names[0] != "aa": + raise ValueError(msg) + if table.columns.names[1] != "codon": + raise ValueError(msg) + + +def compute_rscu(codon_proportions_by_aa): + """ + Compute Relative Synonymous Codon Usage from proportions in genes. + + *codon_proportions_by_aa* should be a pandas DataFrame where + rows correspond to genes, and columns to codons. It contains + the proportions in the gene of the codon among those coding the + same amino-acid. The column index should be a pandas MultiIndex + where the first level is the amino-acid name, and the second level + the codon. + + Return a pair where the first element is the rscu table, and + the second is the degeneracy table. + """ + check_aa_codon_columns(codon_proportions_by_aa) + degeneracy = pd.Series( + # concat "flattens" the list of iterables given as arguments + # (list of tuples of repeated degeneracy values) + list(concat([ + # we want stretches of the same degeneracy values + # of as many elements as there are codons in this degeneracy group, + # hence the tuple multiplication + ((degen),) * degen + # valmap transforms values in a dict given as second argument + # using the function given as first argument + # We extract only the value from the (key, value) pairs + # generated by the .items() methods of the resulting dict + # (Actually, we could probably have just used the .values() method) + for (_, degen) in valmap( + # len gives the length of each group, + # where a group contains the (aa, codon) pairs for a given aa, + # that is, the degeneracy for this group of codons + len, + # groupby creates a dict where keys are obtained from + # the function given as first argument and values are + # groups made from the iterable given as second argument + # (here, column names, which are (aa, codon) pairs) + # itemgetter(0) extracts the amino-acid from a pair (aa, codon) + groupby( + itemgetter(0), + codon_proportions_by_aa.columns)).items()])), + index=codon_proportions_by_aa.columns) + return ( + codon_proportions_by_aa.mul(degeneracy), + degeneracy) + + +def by_aa_codon_usage( + codon_counts, + verbose=False, return_more=False, ref_filter_dict=None): + """ + Compute codon usage biases "by amino-acid" as the standardized + difference between a gene's codon proportions and global + codon proportions, where proportions are computed within + groups of same-amino-acid-coding codons instead of gene-wide. + + If *ref_filter_dict* is not None, it should be a dict of + (index_level, index_value) pairs and the global codon proportions + will actually be proportions computed on the part of the data + restricted to the genes where the *index_level* has the *index_value* + for all those pairs. + """ + check_aa_codon_columns(codon_counts) + render_md(f""" +We will compute codon usage "by amino-acid", by looking at the +proportion of codons for each amino-acid within a gene's CDS. +(This corresponds to R2 in {SUZUKI_LINK}) +""") + render_md(""" +We first need to compute, for a given gene, the total number of codons +corresponding to each amino-acid. +""") + summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() + render_md(""" +To compute codon proportions, we can then divide each codon counts by +the total for the corresponding amino-acid. +""") + codon_proportions = codon_counts.div( + summed_by_aa) + # Doesn't seem to be working: + # codon_proportions.style.hide(axis="index") + if verbose: + display(codon_proportions.head(3)) + # There are some (genes, aa) pairs where the sum is zero. + # This is normal if that gene does not have that amino-acid. + sums = codon_proportions.groupby(level=0, axis=1).sum() + # Values in sums should be either zero or one. + all_ones = np.full(sums.shape, 1.) + all_zeroes = np.full(sums.shape, 0.) + # Due to imprecision in float arithmetics, + # we can only check that the sums are close to 1 or to 0 + assert np.array_equal( + np.isclose( + sums.values, all_ones), + np.logical_not( + np.isclose( + sums.values, all_zeroes))) + if return_more: + render_md(f""" +We will also compute RSCU (Relative Synonymous Codon Usage) by multiplying +codon proportions by the degeneracy of the corresponding amino-acid, that is, +by the number of different codons existing for that amino-acid. +This amounts to computing codon counts relative to the average codon counts +for that amino-acid. +(This corresponds to R3 in {SUZUKI_LINK}) +""") + (rscu, degeneracy) = compute_rscu(codon_proportions) + render_md(f""" +We will also compute R4 ({SUZUKI_LINK}) by computing codon counts relative +to the maxium across synonymous codons of the counts for that amino-acid. +""") + max_counts = codon_counts.T.groupby("aa").max().T + r4_table = codon_counts.div( + max_counts) + if ref_filter_dict is None: + counts_for_global = codon_counts + else: + counts_for_global = filter_on_idx_levels( + codon_counts, ref_filter_dict) + render_md(""" +To compute the usage biases, we also need to compute codon proportions +in the global usage. +""") + render_md(""" +We compute the global usage, as the sum of the counts for a given codon, +across genes. +""") + global_usage = counts_for_global.sum(axis=0) + if return_more: + # Same computation as for individual genes, + # but based on the total counts. + global_max_counts = global_usage.T.groupby("aa").max().T + global_r4 = global_usage.div( + global_max_counts) + render_md("Then we sum over codons corresponding to the same amino-acid.") + global_summed_by_aa = global_usage.groupby(level=0).sum() + render_md(""" +Dividing the global usage for each codon by the total for the corresponding +amino-acid, we can now compute the global proportions the same way as for +the individual genes. +""") + global_proportions = global_usage.div( + global_summed_by_aa) + # The proportions of codons for a given amino-acid should sum to 1 + assert all( + val == 1. + for val in global_proportions.groupby(level=0).sum()) + render_md(""" +Codon usage biases in genes can then be computed by subtracting +the corresponding global proportion to a codon's proportion. +""") + codon_usage_biases = codon_proportions.sub(global_proportions) + render_md(""" +We can standardize these values (by dividing by their standard deviation +across genes) so that they are more comparable between codons. +""") + standardized_codon_usage_biases = codon_usage_biases.div( + codon_usage_biases.std()) + # Doesn't seem to be working: + # standardized_codon_usage_biases.style.hide(axis="index") + if verbose: + display(standardized_codon_usage_biases.head(3)) + if return_more: + return { + "biases": standardized_codon_usage_biases, + "proportions": codon_proportions, + "rscu": rscu, + "r4_table": r4_table, + "global_proportions": global_proportions, + "global_rscu": global_proportions.mul(degeneracy), + "global_r4": global_r4} + return standardized_codon_usage_biases + + +def aa_usage( + codon_counts, + verbose=False, return_more=False, ref_filter_dict=None): + """ + Compute amino-acid usage biases as the standardized + difference between a gene's amino-acid proportions + and global amino-acid proportions. + + If *ref_filter_dict* is not None, it should be a dict of + (index_level, index_value) pairs and the global codon proportions + will actually be proportions computed on the part of the data + restricted to the genes where the *index_level* has the *index_value* + for all those pairs. + """ + check_aa_codon_columns(codon_counts) + render_md(""" +We will compute amino-acid usage, by looking at the +proportions of amino-acids within a gene's CDS. +""") + render_md(""" +We first need to compute, for a given gene, the total number of codons +corresponding to each amino-acid. +""") + summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() + render_md(""" +To compute amino-acid proportions, we can divide each line by its sum, +or, equivalently (and hopefully more efficiently), normalize the data +using the "l1" norm (which, for positive-only values amounts to the sum). +""") + aa_proportions = pd.DataFrame( + normalize(summed_by_aa, norm="l1"), + index=summed_by_aa.index, + columns=summed_by_aa.columns) + # Doesn't seem to be working: + # aa_proportions.style.hide(axis="index") + if verbose: + display(aa_proportions.head(3)) + # Checking that proportions sum to 1 + colsums = aa_proportions.sum(axis=1) + assert np.allclose(colsums, np.full(len(colsums), 1)) + # Then, computing the global amino-acid proportions + if ref_filter_dict is None: + counts_for_global = summed_by_aa + else: + counts_for_global = filter_on_idx_levels( + summed_by_aa, ref_filter_dict) + render_md(""" +To compute the usage biases, we also need to compute amino-acid proportions +in the global usage. +""") +# render_md(""" +# We compute the global usage, as the sum of the counts for a given codon, +# across genes. +# """) +# global_usage = codon_counts.sum(axis=0) + render_md("Then we sum over codons corresponding to the same amino-acid.") + # global_summed_by_aa = global_usage.groupby(level=0).sum() + global_summed_by_aa = counts_for_global.sum() + render_md("Then we normalize it the same way as for the individual genes.") + # The values.reshape(1, -1) turns the Series data into a 2D array + # with one line. flatten() turns the data back to 1D + global_aa_proportions = pd.Series( + normalize( + global_summed_by_aa.values.reshape(1, -1), + norm="l1").flatten(), + index=global_summed_by_aa.index) + render_md(""" +Amino-acid usage biases can be computed by subtracting the corresponding +global proportion to an amino-acid's proportion. +""") + aa_usage_biases = aa_proportions.sub(global_aa_proportions) + render_md(""" +We can standardize these values (by dividing by their standard deviation +across genes) so that they are more comparable between amino-acids. +""") + standardized_aa_usage_biases = aa_usage_biases.div(aa_usage_biases.std()) + # Doesn't seem to be working: + # standardized_aa_usage_biases.style.hide(axis="index") + if verbose: + display(standardized_aa_usage_biases.head(3)) + if return_more: + return { + "biases": standardized_aa_usage_biases, + "proportions": aa_proportions, + "global_proportions": global_aa_proportions} + return standardized_aa_usage_biases + + +def exclude_all_nan_cols(usage_table, fill_other_nas=np.nan): + """ + Detect columns in *usage_table* that contain only NaNs + and remove them from the table. Other NaN values are replaced + with *fill_other_nas*. + """ + render_md(""" +Standardization may result in division by zero for usage data +that have a zero standard deviation. +This is expected to be the case for "by amino-acid" usage biases +for codons corresponding to amino-acids having only one codon: +methionine (M) and tryptophan (W). +""") + all_nan_cols = usage_table.columns[ + usage_table.isna().all()] + if len(all_nan_cols): + render_md("The following columns contain only NaNs:") + display(all_nan_cols) + render_md("This likely resulted from a division by zero.") + render_md("These columns will be excluded.") + return ( + usage_table.drop(columns=all_nan_cols).fillna(fill_other_nas), + all_nan_cols) + + +def codon_influence_in_components( + components, colnames, + figs_dir=None, more_components=False, formats=None): + """ + Plot the influence of the columns in the first 4 principal axes of a PCA. + + Each column should correspond to a codon, and will be represented as a + colour bar whose coulour is based on the last letter of the codon. + + *components* should be a numpy array representing the principal + axes of a PCA, such as the `components_` attribute of a fitted + `sklearn.decomposition.PCA` object. + + *colnames* should be the names of the columns in the *components* + array and are expected to match the following pattern: + <aa>_<codon>, where <aa> is a single-letter code for an amino-acid, + and <codon> is one of the 3-letter codons for this amino-acid, + in capital letters among A, T, G and C (i.e. in the DNA alphabet). + The last letter will be used to set the colour of a bar in the plots + + *figs_dir* should be a path to a directory that will be used + to save graphics representing the influence of each data column + on the first four principal components. + + *formats* should be a list of formats in which the figures should + be saved, such as "svg" or "png". + """ + render_md( + "Vizualizing the influence of codons in the first components\n") + # TODO: *figsize* could be adapted depending on the number of columns + if more_components: + (fig, axes) = plt.subplots(12, 1, figsize=(16, 60)) + else: + (fig, axes) = plt.subplots(4, 1, figsize=(16, 16)) + for (component, axis) in enumerate(axes): + pd.Series( + components[component], + index=colnames).plot.bar( + ax=axis, + # colname is supposed to end with the 3-letters codon + color=[ + nuc2colour[colname[-1]] + for colname in colnames]) + axis.set_ylabel(f"weight in component {component}") + # axis.set_xticklabels(axis.get_xticklabels(), rotation=90) + fig.subplots_adjust(hspace=.5) + if figs_dir is not None and formats is not None: + for ext in formats: + plt.savefig( + figs_dir.joinpath(f"PCA_components.{ext}"), + metadata=fmt_metadata[ext]) + display(fig) + plt.close(fig) + + +def codon_usage_pca( + usage_data, + figs_dir=None, hue="chrom", exclude_cols=None, plot_more_components=False, + formats=None, cols_are_codons=True): + """ + Perform Principal Component Analysis on *usage_data*. + + DataFrame *usage_data* is expected to contain observations as lines + and contain numerical values in columns, without NaNs, corresponding + to codons, where the columns headers are supposed to match the following + pattern: <aa>_<codon>, where <aa> is a single-letter code for + an amino-acid, and <codon> is one of the 3-letter codons for this + amino-acid, in capital letters among A, T, G and C + (i.e. in the DNA alphabet). + + *hue* is expected to be one of the levels of the MultiIndex of + *usage_data*, and will be used to assign colours to the observations + in PCA plots. By default, colour is based on the content of a "chrom" + level in the index. + + If *figs_dir* is not None, this path to a directory will be used + to save graphics representing the projection of the observations + in the first four principal components (0 vs. 1 and 2 vs. 3). + + Unless *cols_are_codons* is set to False, there will also be + graphics representing the influence of each data column + on the first four principal components. + + *formats* should be a list of formats in which the figures should + be saved, such as "svg" or "png". + + If *exclude_cols* is not None, the columns with the names contained + in the iterable *exclude_cols* will not be included in the PCA analysis. + """ + if figs_dir is not None: + figs_dir = Path(figs_dir) + figs_dir.mkdir(parents=True, exist_ok=True) + if exclude_cols is not None: + usage_data = usage_data.drop(columns=exclude_cols) + pca = PCA().fit(usage_data) + transformed_data = pd.DataFrame( + pca.transform(usage_data), + index=usage_data.index).reset_index(level=hue) + render_md( + "Plotting genes on the first components\n") + if plot_more_components: + (fig, axes) = plt.subplots(3, 2, figsize=(16, 25)) + sns.scatterplot( + data=transformed_data, + x=0, y=1, hue=hue, marker=".", ax=axes[0,0]) + sns.scatterplot( + data=transformed_data, + x=2, y=3, hue=hue, marker=".", ax=axes[0,1]) + sns.scatterplot( + data=transformed_data, + x=4, y=5, hue=hue, marker=".", ax=axes[1,0]) + sns.scatterplot( + data=transformed_data, + x=6, y=7, hue=hue, marker=".", ax=axes[1,1]) + sns.scatterplot( + data=transformed_data, + x=8, y=9, hue=hue, marker=".", ax=axes[2,0]) + sns.scatterplot( + data=transformed_data, + x=10, y=11, hue=hue, marker=".", ax=axes[2,1]) + else: + (fig, axes) = plt.subplots(1, 2, figsize=(16, 8)) + sns.scatterplot( + data=transformed_data, + x=0, y=1, hue=hue, marker=".", ax=axes[0]) + sns.scatterplot( + data=transformed_data, + x=2, y=3, hue=hue, marker=".", ax=axes[1]) + + if figs_dir is not None and formats is not None: + for ext in formats: + plt.savefig( + figs_dir.joinpath(f"PCA_projections.{ext}"), + metadata=fmt_metadata[ext]) + display(fig) + plt.close(fig) + if cols_are_codons: + codon_influence_in_components( + pca.components_, usage_data.columns, + figs_dir=figs_dir, more_components=plot_more_components, formats=formats) + return (pca, transformed_data) + + +def centroid_usage(codon_counts, all_nan_cols): + """ + Define "centroids" for gene clusters, one per codon in *codon_counts*. + + The standardized usage biases of these centroids are computed so that + the codon proportions for a given amino-acid are the same as the global + proportions, except for the codons corresponding to a certain amino-acid. + For each amino-acid, there is one centroid per codon, where the + proportion for this codon is set to 1.0, and 0.0 for the other codons. + """ + check_aa_codon_columns(codon_counts) + summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() + global_usage = codon_counts.sum(axis=0) + global_summed_by_aa = global_usage.groupby(level=0).sum() + global_proportions_by_aa = global_usage.div( + global_summed_by_aa) + codon_proportions_by_aa = codon_counts.div(summed_by_aa) + codon_usage_biases_by_aa = codon_proportions_by_aa.sub( + global_proportions_by_aa) + # _columns_by_aa may be different from columns_by_aa + # Groups of column headers for each amino-acid + # key: aa, value: list of (aa, codon) pairs + _columns_by_aa = groupby(itemgetter(0), codon_proportions_by_aa.columns) + + def generate_centroid_proportions(): + for (aa1, columns) in _columns_by_aa.items(): + for (_, codon1) in columns: + proportions = global_proportions_by_aa.copy() + for (aa2, codon2) in columns: + proportions[(aa2, codon2)] = 0.0 + proportions[(aa1, codon1)] = 1.0 + proportions.name = ( + f"{aa1}_{codon1}", f"{aa1}_{codon1}", + *np.full( + codon_proportions_by_aa.index.nlevels - 2, + np.nan)) + yield proportions + + render_md(""" +For each codon, a "centroid" (future center of a cluster of genes) +is defined where the proportion of this codon is set to 1.0. +For the other codons corresponding to the same amino-acid, the proportion +is therefore set to 0.0. +For codons corresponding to other amino-acids, the proportions are set to +the proportions observed in the global data. +""") + centroids_proportions_by_aa = pd.DataFrame( + generate_centroid_proportions()) + centroids_proportions_by_aa.index.rename( + codon_proportions_by_aa.index.names, + inplace=True) + + render_md(""" +The biases for the centroids are then computed by subtracting to those +proportions the proportions in the global data, and the biases are +standardized by dividing by the standard deviation of codon usage biases +in the data. +""") + # Compute biases using the same global proportions as the real data + # and standardize by the same standard deviations as the real data + centroids_scub_by_aa = centroids_proportions_by_aa.sub( + global_proportions_by_aa).div(codon_usage_biases_by_aa.std()).drop( + columns=all_nan_cols) + # centroids_scub_by_aa.head(3) + return centroids_scub_by_aa + + +def make_centroids_cluster_finder( + centroids_table, + aa): # pylint: disable=C0103 + """ + Make a function that, when applied to a row in the standardized + codon bias table, determines to what centroid among those + corresponding to *aa* it is the closest. + The *centroids_table* should contain standardized codon usage + biases for the centroids. + """ + # The columns that contain data pertaining to codons coding aa: + cols_for_aa = centroids_table.columns.get_level_values(0) == aa + cluster_names = [ + f"{aa}_{codon}" + for (aa, codon) in centroids_table.columns[cols_for_aa]] + + def closest_centroid(gene): + dists_to_centroids = {} + for cluster_name in cluster_names: + dists_to_centroids[cluster_name] = sqdist( + gene[cols_for_aa].values, + centroids_table.loc[cluster_name].iloc[:, cols_for_aa].values) + # We return the cluster_name (`min(...))[1]`) + # associated with the minimum distance + return min( + (dist, cluster_name) + for (cluster_name, dist) + in dists_to_centroids.items())[1] + return closest_centroid + + +def make_cluster_table(scub_table, centroids_scub_table): + """ + Make a table for the genes in standardized codon usage bias + table *scub_table* where each column tells, for a given + amino-acid to which centroid in *centroids_scub_table* + it is the closest. + """ + return pd.DataFrame( + { + (f"cluster_{aa}_full_bias", ""): scub_table.apply( + make_centroids_cluster_finder(centroids_scub_table, aa), + axis=1).values + for aa # pylint: disable=C0103 + in unique(centroids_scub_table.columns.get_level_values(0))}, + index=scub_table.index) + + +def compare_clusterings( + usage_table, + clust1_template="cluster_{aa}_full_bias", + clust2_template="cluster_{aa}_highest_bias"): + """ + Compare lists of genes in *usage_table* for the clusters + found in the columns whose name conform to + *clust1_template* and *clust2_template*. + """ + crosstabs = {} + for aa in aa2colour.keys(): # pylint: disable=C0103 + if aa in {"M", "W"}: + continue + clust1_level = clust1_template.format(aa=aa) + clust2_level = clust2_template.format(aa=aa) + crosstabs[aa] = pd.crosstab( + usage_table.reset_index()[clust1_level], + usage_table.reset_index()[clust2_level]) + cols_for_aa = columns_by_aa[aa] + cluster_names = [ + f"{aa}_{codon}" + for (aa, codon) in usage_table[cols_for_aa].columns] + # Checking that the clusters are the same + same_clusters = True + for (clust_1, clust_2) in combinations(cluster_names, 2): + # assert crosstabs[aa][clust_1][clust_2] == 0 + try: + # if crosstabs[aa][clust_1][clust_2] != 0: + if (crosstabs[aa][clust_1][clust_2] != 0) \ + or (crosstabs[aa][clust_2][clust_1] != 0): + same_clusters = False + except KeyError: + # At least one cluster is absent. + # Is the absence symmetric? + miss_1_in_1 = clust_1 in usage_table.reset_index()[ + clust1_level] + miss_2_in_2 = clust_2 not in usage_table.reset_index()[ + clust2_level] + miss_1_in_2 = clust_1 in usage_table.reset_index()[ + clust2_level] + miss_2_in_1 = clust_2 not in usage_table.reset_index()[ + clust1_level] + if ( + (miss_1_in_1 != miss_1_in_2) + or (miss_2_in_1 != miss_2_in_2)): + same_clusters = False + if not same_clusters: + render_md(f"\nClusters for {aa} differ:\n") + else: + render_md(f"\nClusters for {aa} are identical:\n") + display(crosstabs[aa]) + + +def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2): + """ + Load a table containing by-amino-acid codon usage biases. + + The table, located at *table_path*, should have tab-separated columns. + It is expected to contain a series of informative columns + preceding the columns containing the codon usage bias values. + + The first *nb_info_cols* columns contain information about gene + identifiers and features extracted from CDS annotation data. + + Then, there can be series of gene-clustering information, where each + column of the series corresponds to clusters defined based on + codon-usage biases among the codons coding an amino-acid. + There are no columns for amino-acids coded by only one codon (M and W). + *nb_cluster_series* specifies the number of such series. + + The result is a pandas DataFrame object, where all the above information + is encoded as a MultiIndex, the codon usage biases being in the + remaining columns. + """ + return load_table_with_info_index( + table_path, nb_info_cols, nb_cluster_series) + + +def star2stop(text): + """ + Replace stars with "stop", for use in file paths. + """ + return text.replace("*", "stop") + + +def write_cluster_lists( + usage_table, + aa, # pylint: disable=C0103 + clusters_dir, + cluster_level_template, + y_label_template, + alt_tag_kw="old_locus_tag"): + """ + Extract lists of genes in *usage_table** for the clusters + associated with codons for amino-acid *aa* and write the + resulting lists in a sub-folder (named based on *aa*) + of the directory *clusters_dir*. + The clusters are found in the column whose name conforms to + *cluster_level_template* + Also generate violin plots for each cluster. + """ + md_report = f"* Clusters for {aa}:\n\n" + aa_dir = clusters_dir.joinpath(star2stop(aa)) + aa_dir.mkdir(parents=True, exist_ok=True) + # key: cluster + # value: relative path to a file containing old locus tags + # for genes belonging to this cluster + relpaths_to_cluster_lists = {} + for (cluster, gene_list) in groupby(itemgetter(0), zip( + usage_table.index.get_level_values( + cluster_level_template.format(aa=aa)), + usage_table.index.get_level_values(alt_tag_kw))).items(): + path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt")) + with path_to_clusterfile.open("w") as clust_fh: + clust_fh.write("\n".join(map(itemgetter(1), gene_list))) + relpaths_to_cluster_lists[cluster] = str( + path_to_clusterfile.relative_to('.')) + md_report += "\n\n".join([ + f" - [{cluster}]({relpath_to_list})" + for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()]) + violin_usage_by_clusters( + usage_table, + aa, + y_label_template, + cluster_level_template=cluster_level_template, + vertical=True) + path_to_fig = aa_dir.joinpath(star2stop( + f"usage_biases_violin_plots_by_cluster_for_{aa}.png")) + plt.savefig(path_to_fig, metadata=fmt_metadata["png"]) + plt.close() + relpath_to_fig = str(path_to_fig.relative_to('.')) + md_report += ( + f"\n\n - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n") + return md_report + + +def find_valley(values): + """ + Try to find a cutting threshold for the *values*, + where the corresponding density distribution is lowest. + """ + if len(set(values)) == 1: + (single_val,) = set(values) + return single_val + min_val = min(values) + max_val = max(values) + val_range = max_val - min_val + # We use 100 bins + try: + val_density = gaussian_kde(values).evaluate( + np.linspace(min_val, max_val, 100)) + except LinAlgError as err: + print(f"min: {min_val}, max: {max_val}, N: {len(values)}.") + print(str(err)) + raise + # Where is the density lowest? (excluding first and last) + min_density_idx = val_density[1:-1].argmin() + # Finding the value corresponding to the index in the `val_density` array: + return min_val + (min_density_idx * val_range / 100) + + +def find_most_biased_genes(data, do_retries=True, alt_tag_kw="old_locus_tag"): + """ + Compute a usage bias threshold hopefully separating the most biased genes + from the remainder of the cluster where *data* contains codon usage biases + for the genes in the cluster and has an *alt_tag_kw* level in its index. + Return the threshold in terms of usage bias and the list of genes above it. + + /!\ This is likely very sensitive to the shape + of the usage bias distribution. + """ + thresh = find_valley(data.values) + # Try avoiding cases with threshold "misplaced" at the bottom + # of the distribution + retries = 0 + while do_retries and thresh < 0: + retries += 1 + thresh = find_valley(data.sort_values().iloc[retries:].values) + top_genes = data[(data > thresh)].index.get_level_values(alt_tag_kw).values + return (thresh, top_genes, retries) + + +def extract_top_genes_from_cluster( + usage_table, + aa, # pylint: disable=C0103 + codon, + clusters_dir, + axis=None, savefig=True, alt_tag_kw="old_locus_tag"): + """ + Return a few lines of markdown with links pointing to + where the saved results are expected to be found. + """ + md_report = f" - Most biased genes for {codon} ({aa}):\n\n" + aa_dir = clusters_dir.joinpath(star2stop(aa)) + # Files where to save the results + top_dir = aa_dir.joinpath("top_genes") + path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) + path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) + path_to_fig = top_dir.joinpath(star2stop( + f"usage_biases_violin_plot_{aa}_{codon}.png")) + # Load the list of genes belonging to the cluster + # associated with a high usage bias for *codon* + path_to_clusterfile = aa_dir.joinpath(star2stop(f"{aa}_{codon}.txt")) + with open(path_to_clusterfile) as gene_list_fh: + genes = [line.strip() for line in gene_list_fh] + # Extract the usage bias data for these genes and for this codon + idx_level = usage_table.index.names.index(alt_tag_kw) + try: + # data = usage_table.loc[genes][(aa, codon)] + data = usage_table.loc[ + (*(slice(None) for _ in range(idx_level)), genes), + (aa, codon)] + except KeyError: + print(genes[0]) + print(usage_table.index.names) + raise + q3 = data.quantile(0.75) # pylint: disable=C0103 + top25 = data[(data > q3)].index.get_level_values( + alt_tag_kw).values + (thresh, top_genes, retries) = find_most_biased_genes( + data, do_retries=True, alt_tag_kw=alt_tag_kw) + if retries != 0: + print( + f"Bottom-most {retries} values had to be discarded in order " + f"to avoid a negative threshold for the cluster {aa}_{codon}.") + top_genes_check = set( + usage_table[ + (usage_table[(aa, codon)] > thresh)].index.get_level_values( + alt_tag_kw).values) + if set(top_genes) != top_genes_check: + assert set(top_genes).issubset(top_genes_check) + extra_genes_not_in_cluster = top_genes_check - set(top_genes) + print( + f"{len(extra_genes_not_in_cluster)} genes were found " + f"with a high bias for {codon} but not belonging " + f"to the cluster {aa}_{codon}.") + # For visual verification purpose, + # display where the threshold is situated on a violin plot + axis = violin_with_thresh( + data, [(thresh, "red"), (q3, "blue")], + f"standardized usage bias for {codon} ({aa})", + facecolor=aa2colour[aa], axis=axis) + # Save the figure and close it. + plt.tight_layout() + if savefig: + top_dir.mkdir(parents=True, exist_ok=True) + plt.savefig(path_to_fig, metadata={'creationDate': None}) + plt.close() + # Save the list of the top genes + with open(path_to_top25, "w") as gene_list_fh: + gene_list_fh.write("\n".join(top25)) + with open(path_to_top, "w") as gene_list_fh: + gene_list_fh.write("\n".join(top_genes)) + relpath_to_top25 = str(path_to_top25.relative_to('.')) + relpath_to_top = str(path_to_top.relative_to('.')) + relpath_to_fig = str(path_to_fig.relative_to('.')) + md_report += "\n\n".join([ + f" * [top 25% genes]({relpath_to_top25})", + f" * [top genes]({relpath_to_top})", + f" * [violin plot with thresholds ({q3:.3}, " + f"{thresh:.3})]({relpath_to_fig})"]) + return md_report + + +# TODO: +def plot_codon_usage_for_gene_list( + gene_list_path, + usage_table, + aa, # pylint: disable=C0103 + codon, + figs_dir=None, + axis=None, savefig=True, alt_tag_kw="old_locus_tag"): + """ + Make a violin plot of codon usage biases for genes in file *gene_list_path* + using data in *usage_table*. + The plot will represent the distribution of usage bias + for codon *codon* of amino-acid *aa*. + The plot will be saved inside *figs_dir* if *savefig* is set to True. + Gene identifiers are expected to be *alt_tag_kw* identifiers. + """ + md_report = f" - Genes from {gene_list_path}:\n\n" + with open(gene_list_path) as gene_list_fh: + genes = [ + line.strip() for line in gene_list_fh + if line[0] != "#"] + # Extract the usage bias data for these genes and for this codon + idx_level = usage_table.index.names.index(alt_tag_kw) + try: + # data = usage_table.loc[genes][(aa, codon)] + data = usage_table.loc[ + (*(slice(None) for _ in range(idx_level)), genes), + (aa, codon)] + except KeyError: + print( + f"Genes are supposed to be provided as {alt_tag_kw} identifiers.") + print(genes[0]) + print(usage_table.index.names) + raise + q3 = data.quantile(0.75) # pylint: disable=C0103 + # top25 = data[(data > q3)].index.get_level_values( + # alt_tag_kw).values + (thresh, top_genes, retries) = find_most_biased_genes( + data, do_retries=True, alt_tag_kw=alt_tag_kw) + if retries != 0: + print( + f"Bottom-most {retries} values (out of {len(data)}) " + "had to be discarded in order to avoid a negative threshold " + f"for the cluster {aa}_{codon}.") + top_genes_check = set( + usage_table[ + (usage_table[(aa, codon)] > thresh)].index.get_level_values( + alt_tag_kw).values) + if set(top_genes) != top_genes_check: + assert set(top_genes).issubset(top_genes_check) + extra_genes_not_in_cluster = top_genes_check - set(top_genes) + print( + f"{len(extra_genes_not_in_cluster)} genes were found " + f"with a high bias for {codon} but not belonging to " + f"the cluster {aa}_{codon}.") + # For visual verification purpose, + # display where the threshold is situated on a violin plot + axis = violin_with_thresh( + data, [(thresh, "red"), (q3, "blue")], + f"standardized usage bias for {codon} ({aa})", + facecolor=aa2colour[aa], axis=axis) + # Save the figure and close it. + plt.tight_layout() + if not savefig: + return md_report + if figs_dir is None: + raise ValueError("No *figs_dir* directory specified.\n") + aa_dir = figs_dir.joinpath(star2stop(aa)) + # Files where to save the results + aa_dir.mkdir(parents=True, exist_ok=True) + path_to_fig = aa_dir.joinpath(star2stop( + f"usage_biases_violin_plot_{aa}_{codon}.png")) + plt.savefig(path_to_fig, metadata=fmt_metadata["png"]) + plt.close() + # Save the list of the top genes + # path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) + # path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) + # with open(path_to_top25, "w") as gene_list_fh: + # gene_list_fh.write("\n".join(top25)) + # with open(path_to_top, "w") as gene_list_fh: + # gene_list_fh.write("\n".join(top_genes)) + # relpath_to_top25 = str(path_to_top25.relative_to('.')) + # relpath_to_top = str(path_to_top.relative_to('.')) + relpath_to_fig = str(path_to_fig.relative_to('.')) + md_report += "\n\n".join([ + # f" * [top 25% genes]({relpath_to_top25})", + # f" * [top genes]({relpath_to_top})", + f" * [violin plot with thresholds ({q3:.3}, " + f"{thresh:.3})]({relpath_to_fig})"]) + return md_report + + +def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"): + """ + Plot a boxplot from pandas DataFrame *usage_table*. + + Use *y_label* as y-axis label. + """ + nb_genes = usage_table.shape[0] + if whiskers == "1.5 IQR": + axis = usage_table.plot.box(figsize=(18, 6), rot=90, sym=".") + axis.set_title( + f"Distributions of codon usage biases among {nb_genes} " + "genes (whiskers at 1.5 IQR).") + elif whiskers == "5 percent tails": + axis = usage_table.plot.box( + figsize=(18, 6), rot=90, sym=".", whis=(5, 95)) + axis.set_title( + f"Distributions of codon usage biases among {nb_genes} " + "genes (whiskers at the 5th and 95th percentiles).") + axis.set_ylabel(ylabel) + + +def to_long_form(usage_table, ylabel, others=None): + """ + Transform data in *usage_table* into "long form". + + The goal is to be able to use aa as hue in violin plots. + + See https://pandas.pydata.org/docs/user_guide/10min.html#stack + and https://seaborn.pydata.org/generated/seaborn.violinplot.html + + *y_label* is the column header you want to set for the usage values. + + *others* should be a list of extra information you want to extract + from the index (which is expected to be a pandas MultiIndex). + """ + col_nb_levels = usage_table.columns.nlevels + row_nb_levels = usage_table.index.nlevels + long_form = usage_table.stack( + level=list(range(col_nb_levels))).to_frame().reset_index( + level=list(range( + row_nb_levels, row_nb_levels + col_nb_levels))) + if others is None: + others = [] + if col_nb_levels == 1: + long_form.columns = ["aa", ylabel] + others = [other for other in others if other != "aa"] + elif col_nb_levels == 2: + long_form.columns = ["aa", "codon", ylabel] + others = [other for other in others if other not in {"aa", "codon"}] + else: + raise NotImplementedError( + "Don't know how to deal with column headers " + "with more than 2 levels ('aa' and 'codon')\n") + long_form = long_form.reset_index(level=[ + lvl_name for lvl_name in usage_table.index.names + if lvl_name.startswith("cluster")]) + if not set(others).issubset(set(long_form.index.names)): + print(set(others)) + print(set(long_form.index.names)) + raise ValueError( + "All elements in *others* should be in the index " + "of *usage_table*") + return long_form.assign(**{ + other: long_form.index.get_level_values(other) for other in others}) + + +def variable2order(variable): + """ + Define the order of amino-acids or codons. + """ + if variable == "aa": + return list(columns_by_aa.keys()) + if variable == "codon": + return [codon for (_, codon) in concat(columns_by_aa.values())] + raise ValueError("variable can only be 'aa' or 'codon'.\n") + + +def format_codon_labels(codons): + """ + Add amino-acid information to codons meant to be used as tick labels. + """ + return [ + plt.Text(x, y, f"{codon} ({codon2aa[codon]})") + for (x, y, codon) in map(attrgetter("_x", "_y", "_text"), codons)] + + +def violin_usage( + usage_table, variable, ylabel, + hue="aa", axis=None, **violin_kwargs): + """ + Plot violin plots of codon usage biases. + + *variable* should be either "codon" or "aa". + """ + if hue in {"aa", "codon"}: + dodge = False + nb_violins = 1 + else: + dodge = True + # dodge = True implies there will be one violin + # per possible value of *hue*, side by side + nb_violins = len(usage_table.index.get_level_values(hue).unique()) + long_form = to_long_form(usage_table, ylabel, others=[hue]) + if axis is None: + _, axis = plt.subplots(figsize=(18 * nb_violins, 6)) + do_legend = True + else: + do_legend = False + if hue == "aa": + palette = aa2colour + else: + palette = None + kwargs = { + "x": variable, "y": ylabel, "order": variable2order(variable), + "hue": hue, "palette": palette, "dodge": dodge, + "data": long_form, "ax": axis, "orient": "v", "scale": "count"} + kwargs.update(violin_kwargs) + axis = sns.violinplot(**kwargs) + if do_legend: + plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0) + if variable == "codon": + ticklabels = format_codon_labels(axis.get_xticklabels()) + else: + ticklabels = axis.get_xticklabels() + # TODO: Expect issues when variable == "aa" + axis.set_xticklabels(ticklabels, rotation=90) + return axis + + +def violin_usage_vertical( + usage_table, variable, ylabel, + hue="aa", axis=None, **violin_kwargs): + """ + Plot vertically stacked violin plots of codon usage biases. + + *variable* should be either "codon" or "aa". + """ + if hue in {"aa", "codon"}: + dodge = False + nb_violins = 1 + else: + dodge = True + # dodge = True implies there will be one violin + # per possible value of *hue*, side by side + nb_violins = len(usage_table.index.get_level_values(hue).unique()) + long_form = to_long_form(usage_table, ylabel, others=[hue]) + if axis is None: + if variable == "codon": + base_height = 44 + elif variable == "aa": + base_height = 18 + else: + raise NotImplementedError(f"Unsupported variable: {variable}.") + _, axis = plt.subplots(figsize=(6, base_height * nb_violins)) + if hue == "aa": + palette = aa2colour + else: + palette = None + kwargs = { + "y": variable, "x": ylabel, + "order": variable2order(variable), + "hue": hue, "palette": palette, "dodge": dodge, + "data": long_form, "ax": axis, "orient": "h", "scale": "count"} + kwargs.update(violin_kwargs) + axis = sns.violinplot(**kwargs) + if variable == "codon": + ticklabels = format_codon_labels(axis.get_yticklabels()) + axis.set_yticklabels(ticklabels) + return axis + + +def violin_usage_by_clusters(usage_with_clusters, aa, # pylint: disable=C0103 + ylabel_template, + cluster_level_template="cluster_{aa}", + vertical=False, **violin_kwargs): + """ + Plot a series of violin plots for each cluster of genes. + + The clusters are defined in usage table *usage_with_clusters* + based on usage biases for the codons coding for amino-acid *aa*. + + *usage_with_clusters* should have clustering indications as + an index level named according to *cluster_level_template* + (default "cluster_{aa}"), where {aa} is to be replaced by *aa*. + """ + clusters = usage_with_clusters.groupby( + level=cluster_level_template.format(aa=aa)) + # To adjust figure size + if "hue" in violin_kwargs and violin_kwargs["hue"] not in {"aa", "codon"}: + # dodge will be set to True in later plotting call. + # This implies that there will be one violin + # per possible value of *hue*, side by side + nb_violins = len(usage_with_clusters.index.get_level_values( + violin_kwargs["hue"]).unique()) + else: + nb_violins = 1 + if vertical: + fig, axes = plt.subplots( + ncols=clusters.ngroups, + constrained_layout=True, + figsize=(6 * clusters.ngroups, 44 * nb_violins)) + else: + fig, axes = plt.subplots( + nrows=clusters.ngroups, + constrained_layout=True, + figsize=(18 * nb_violins, 6 * clusters.ngroups)) + for ((cluster, usage_table), axis) in zip(clusters, axes): + kwargs = {"axis": axis} + kwargs.update(violin_kwargs) + if vertical: + violin_usage_vertical( + usage_table, "codon", + ylabel_template.format(aa=aa, cluster=cluster), + **kwargs) + else: + violin_usage( + usage_table, "codon", + ylabel_template.format(aa=aa, cluster=cluster), + **kwargs) + return fig + + +def violin_usage_by_clusters_splitby( + usage_with_clusters, aa, # pylint: disable=C0103 + ylabel_template, + cluster_level_template="cluster_{aa}", + vertical=False, by_lvl="chrom", + **violin_kwargs): + """ + Plot a series of violin plots for each cluster of genes, + splitting the figures according to groups defined by the + content of the index level *by_lvl* of *usage_with_clusters*. + + The clusters are defined in usage table *usage_with_clusters* + based on usage biases for the codons coding for amino-acid *aa*. + + *usage_with_clusters* should have clustering indications as + an index level named according to *cluster_level_template* + (default "cluster_{aa}"), where {aa} is to be replaced by *aa*. + """ + classes = sorted(set( + usage_with_clusters.index.get_level_values(by_lvl))) + idx_level = usage_with_clusters.index.names.index(by_lvl) + for clss in classes: + fig = violin_usage_by_clusters( + usage_with_clusters.loc[ + (*(slice(None) for _ in range(idx_level)), clss), ], + aa, ylabel_template + f" for {clss}", + cluster_level_template=cluster_level_template, + vertical=vertical, **violin_kwargs) + fig.suptitle(f"Violin plots for {clss}") + + +# Based on +# https://matplotlib.org/stable/gallery/statistics/customized_violin.html +def adjacent_values(q0, q1, q3, q4): # pylint: disable=C0103 + """ + Define whiskers positions using the 1.5 * IQR criterion. + + Use the values of quartiles *q0* (min), *q1*, *q3* and *q4* (max). + """ + upper_adjacent_value = q3 + (q3 - q1) * 1.5 + upper_adjacent_value = np.clip(upper_adjacent_value, q3, q4) + lower_adjacent_value = q1 - (q3 - q1) * 1.5 + lower_adjacent_value = np.clip(lower_adjacent_value, q0, q1) + return (lower_adjacent_value, upper_adjacent_value) + + +def violin_with_thresh(data, thresh, ylabel, + facecolor="grey", axis=None, **violin_kwargs): + """ + Draw a violin plot from *data* with a horizontal line at value *thresh*. + + If *thresh* is a single value, the line will be red. + If *thresh* contains pairs, it will be assumed that these + are (threshold, colour) pairs, to be used to plot various lines. + Y-axis will be labeled using *ylabel*. + """ + if axis is None: + _, axis = plt.subplots(figsize=(2, 3.5)) + kwargs = {"showextrema": False} + kwargs.update(violin_kwargs) + violin_parts = axis.violinplot(data, **kwargs) + # https://stackoverflow.com/a/26291582/1878788 + [body] = violin_parts["bodies"] + body.set_facecolor(facecolor) + body.set_edgecolor("black") + try: + (val, col) = thresh[0] + except TypeError: + thresh = [(thresh, "red")] + for (val, col) in thresh: + axis.axhline(val, color=col) + axis.text( + 1.01, val, f"{val:.3}", + horizontalalignment="left", verticalalignment="bottom") + (q0, q1, median, q3, q4) = np.percentile( # pylint: disable=C0103 + data, [0, 25, 50, 75, 100]) + (w_min, w_max) = adjacent_values(q0, q1, q3, q4) + axis.scatter(1, median, marker="o", s=30, color="white", zorder=3) + axis.vlines(1, q1, q3, linestyle="-", linewidth=5, color="black") + axis.vlines(1, w_min, w_max, linestyle="-", linewidth=1, color="black") + axis.set_xticks([]) + axis.set_ylabel(ylabel) + return axis + + +if __name__ == "__main__": + import doctest + doctest.testmod() diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 3cbb666..603a8bd 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -463,15 +463,25 @@ def sum_codon_counts(row, codons): return sum -def group_codons_by_sum(codon_counts, class_name, dict_classes, filter): - +def group_codons_by_sum(codon_counts, group_name, dict_classes, filter): + """ + Group codons by sum given specific classes in *codon_counts* table. + + *group_name* contains the name of the grouping, and plays the role of aa names in the original + codon counts table. + *dict_classes* contains the different classes under this grouping as keys and the associated + list of codons as values. + *filter* is a boolean set to True if you want to filter out other codons than the ones specified in + dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for + the grouped_classes. + """ list_classes = list(dict_classes.items()) list_classes_names = [] for key, value in dict_classes.items(): - codon_counts[class_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1) + codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1) list_classes_names.append(key) if filter: - return codon_counts.loc[:, ([class_name], list_classes_names)] + return codon_counts.loc[:, ([group_name], list_classes_names)] else: return codon_counts -- GitLab