From 5660ae18776fe93b75f81d821b9c61e8d941012d Mon Sep 17 00:00:00 2001 From: manselme <marie.anselmet@pasteur.fr> Date: Thu, 19 Oct 2023 17:21:23 +0200 Subject: [PATCH] Delete libcodonusage.py --- build/lib/libcodonusage/libcodonusage.py | 1759 ---------------------- 1 file changed, 1759 deletions(-) delete mode 100644 build/lib/libcodonusage/libcodonusage.py diff --git a/build/lib/libcodonusage/libcodonusage.py b/build/lib/libcodonusage/libcodonusage.py deleted file mode 100644 index 706b30e..0000000 --- a/build/lib/libcodonusage/libcodonusage.py +++ /dev/null @@ -1,1759 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (C) 2021-2023 Blaise Li -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -"""Functions used in Jupyter notebooks.""" -from http.cookies import Morsel -from itertools import combinations -import json -from operator import attrgetter, itemgetter -from pathlib import Path -from unittest import mock -# python3 -m pip install cytoolz -from cytoolz import concat, groupby, unique, valmap -# To render mardown in a Jupyter notebook on gitlab -from IPython.display import display -from IPython.core.display import HTML -# python3 -m pip install markdown -from markdown import markdown -# Basic plotting library for Python -# python3 -m pip install matplotlib -# See https://matplotlib.org/ -import matplotlib.pyplot as plt -# Python library to make some nicer plots -# python3 -m pip install seaborn -# See https://seaborn.pydata.org/ -import seaborn as sns -# Useful Python package for bioinformatics. -# python3 -m pip install biotite -# See https://www.biotite-python.org/install.html -import biotite.sequence.graphics as bgraphs -from biotite.sequence import CodonTable, NucleotideSequence -# python3 -m pip install numpy -import numpy as np -# Python module to handle tabular data -# python3 -m pip install pandas -# See https://pandas.pydata.org/pandas-docs/stable/index.html -import pandas as pd -# Python module that facilitates the exploration of tbular data -# python3 -m pip install plydata -# from plydata import define, pull, query -# python3 -m pip install scipy -from scipy.linalg import LinAlgError -from scipy.spatial.distance import sqeuclidean as sqdist -from scipy.stats import gaussian_kde -# Python library with useful data-processing features -# python3 -m pip install scikit-learn -# https://scikit-learn.org/stable/install.html -from sklearn.decomposition import PCA -from sklearn.preprocessing import normalize -# Python module to vizualize set intersections -# python3 -m pip install upsetplot -from upsetplot import from_indicators, UpSet - - -fmt_metadata = { - "png": {"creationDate": None}, - "svg": { - "Creator": None, "Date": None, - "Title": ("Distribution of standardized codon usages biases" - " (by aa), by chromosome")} -} - - -def render_md(md_str): - """ - Render a markdown string *md_str* in a Jupyter notebook. - - More direct rendering is not possible on gitlab for security reasons. 
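-
- Illustrative call (the markdown string here is only an example):
-
-     render_md("Loaded the counts table; see the **notes** below.")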
- """ - display(HTML(markdown(md_str))) - - -codon2aa = CodonTable.load(11).codon_dict() -# Column headers made from (aa, codon) pairs -codon_headers = pd.MultiIndex.from_tuples( - ((aa, codon) for (codon, aa) in codon2aa.items()), - names=("aa", "codon")) -# Groups of column headers for each amino-acid -# key: aa, value: list of (aa, codon) pairs -columns_by_aa = groupby(itemgetter(0), codon_headers) - - -###################################### -# Associating colours to amino-acids # -###################################### -# We load amino-acid colouring information to use in graphical representations. -# We'll use colour schemes provided by the biotite package: -# https://www.biotite-python.org/examples/gallery/sequence/color_schemes_protein.html -# We look directly at the json file -# instead of using the "native" biotite mechanisms -with Path(bgraphs.colorschemes._scheme_dir).joinpath( - "jalview_zappo.json").open() as fh: - colscheme = json.load(fh) -# We add the black colour for the stop "amino-acid" and associated codons -# key: amino-acid -# value: hexadecimal html colour code -aa2colour = {**colscheme["colors"], "*": '#000000'} -###################################### -# Associating colours to nucleotides # -###################################### -nuc_alphabet = NucleotideSequence.alphabet_unamb -nuc_colours = bgraphs.get_color_scheme("rainbow", nuc_alphabet) -nuc2colour = dict(zip(nuc_alphabet, nuc_colours)) - - -def load_counts_table( - table_path, index_col="old_locus_tag", index_unique=True): - """ - Load a table of pre-computed codon counts at *table_path*. - - The lines correspond to genes. - Besides the columns containing the counts for each codon, - there are other columns containing various pieces of information - regarding those genes. - - If *index_unique* is True, a ValueError error will be raised if - some elements in the the column *index_col* are not unique. - """ - render_md(f"Loading data from [{table_path}]({table_path})...\n") - codon_counts = pd.read_table(table_path, index_col=index_col) - if index_unique and not codon_counts.index.is_unique: - raise ValueError(f"Index {index_col} contains repeated values.\n") - nb_genes = len(codon_counts) - render_md( - f""" - There are {nb_genes} genes in the table. - - The table looks as follows (first 3 lines): -""") - display(codon_counts.head(3)) - return codon_counts - -#def filter_codon_counts_table(codon_counts, ) - -def compute_criteria(codon_counts): - """ - Prepare a Boolean table and corresponging gene sets. - - There are one column and one gene set for each criterion. 
- """ - wrong_start_col = codon_counts["expected_start_aa"] == "-" - start_stop_col = codon_counts["expected_start_aa"] == "*" - no_met_start_col = codon_counts["expected_start_aa"] != "M" - start_upstream_col = codon_counts["start_upstream"] - has_stops_col = codon_counts["nb_stops"] > 0 - start_upstream_met_start_col = start_upstream_col & ~no_met_start_col - start_upstream_met_start_nostops_col = ( - start_upstream_met_start_col & ~has_stops_col) - good_met_start_col = ~start_upstream_col & ~no_met_start_col - has_stops_good_met_start_col = ( - has_stops_col & ~no_met_start_col - & ~start_upstream_met_start_col) - no_stop_good_met_start_col = ( - ~has_stops_col & ~no_met_start_col - & ~start_upstream_met_start_col) - criteria = pd.DataFrame({ - "wrong_start": wrong_start_col, - "start_stop": start_stop_col, - "no_met_start": no_met_start_col, - "start_upstream": start_upstream_col, - "has_stops": has_stops_col, - "start_upstream_met_start": start_upstream_met_start_col, - "start_upstream_met_start_nostops": ( - start_upstream_met_start_nostops_col), - "good_met_start": good_met_start_col, - "has_stops_good_met_start": has_stops_good_met_start_col}) - render_md("Number of genes fitting various selection criteria:\n\n") - display(criteria.agg(sum)) - render_md("Upset plot of the non-empty categories:\n\n") - fig = plt.figure() - try: - UpSet( - from_indicators(*[criteria.columns], data=criteria), - show_counts=True).plot(fig=fig) - except AttributeError: - pass - display(fig) - plt.close(fig) - gene_sets = { - "wrong_start": set(criteria[wrong_start_col].index), - "start_stop": set(criteria[start_stop_col].index), - "no_met_start": set(criteria[no_met_start_col].index), - "start_upstream": set(criteria[start_upstream_col].index), - # Not used - # "has_stops": set(criteria[has_stops_col].index), - "start_upstream_met_start": set( - criteria[start_upstream_met_start_col].index), - "start_upstream_met_start_nostops": set( - criteria[start_upstream_met_start_nostops_col].index), - "good_met_start": set(criteria[good_met_start_col].index), - "has_stops_good_met_start": set( - criteria[has_stops_good_met_start_col].index), - "no_stop_good_met_start": set( - criteria[no_stop_good_met_start_col].index)} - return (criteria, gene_sets) - - -def detect_fishy_genes(codon_counts): - """ - Run diagnostics about genes included in table *codon_counts*. - - These should help decide whether to exclude some of the genes. - - A table of boolean criteria is returned, with one line per gene. - """ - - def display_gene_set(gene_set, max_size=10): - """ - Print out genes in a gene set, depending on their number. - """ - if gene_set and len(gene_set) <= max_size: - print(" ", ", ".join(gene_set)) - - render_md("### Searching for mis-annotated genes") - (criteria, gene_sets) = compute_criteria(codon_counts) - render_md( - """ -We should avoid genes that are not in-frame. We can likely exclude -those that do not start with a valid start codon. -Here, this will be seen by looking at the translation of the first -codon. -If it is "*" (`start_stop`) or "-" (`wrong_start`), -the codon is not a valid start codon. 
-""") - wrong_start = gene_sets["wrong_start"] - render_md(f"There are {len(wrong_start)} genes that start with a `-`.") - display_gene_set(wrong_start) - - start_stop = gene_sets["start_stop"] - render_md( - f"There are {len(start_stop)} genes that start with a stop codon.") - display_gene_set(start_stop) - - no_met_start = gene_sets["no_met_start"] - if no_met_start == wrong_start | start_stop: - render_md( - """ -Genes not belonging to the `start_stop` or `wrong_start` -category all start with a methionine. -""") - else: - raise NotImplementedError( - "There are genes that start with something else than 'M', " - "'-' or '*'.") - - start_upstream = gene_sets["start_upstream"] - render_md( - f"{len(start_upstream)} genes have a possibly ill-defined" - " start position.") - display_gene_set(start_upstream) - - start_upstream_met_start = gene_sets["start_upstream_met_start"] - if start_upstream_met_start: - render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'" - " start.") - display_gene_set(start_upstream_met_start) - - start_upstream_met_start_nostops = gene_sets[ - "start_upstream_met_start_nostops"] - if start_upstream_met_start_nostops: - render_md(f"{len(start_upstream_met_start_nostops)} of them" - " also do not contain any stop.") - display_gene_set(start_upstream_met_start_nostops) - render_md("Those genes could probably be kept.") - - has_stops_good_met_start = gene_sets["has_stops_good_met_start"] - render_md( - f"{len(has_stops_good_met_start)} genes contain stops" - " but have a well defined start position with 'M'.") - display_gene_set(has_stops_good_met_start) - - good_met_start = gene_sets["good_met_start"] - render_md( - f"{len(good_met_start)} genes have a well defined " - "start position with 'M'.") - render_md( - """ -If genes that have stop readthrough are a known phenomenon, -only those among them that do not have a valid start codon -might be excluded. -""") - - no_stop_good_met_start = gene_sets["no_stop_good_met_start"] - render_md( - f"{len(no_stop_good_met_start)} genes have a well defined" - " start position with 'M' and contain no stop codon.") - display_gene_set(no_stop_good_met_start) - return criteria - - -def make_counts_only( - counts_table, - seq_id_kw="locus_tag", alt_tag_kw="old_locus_tag", - ref_info_cols=None): - """ - Integrate "informative" columns of *counts_table* into the index. - """ - # To ensure a stable order: - if ref_info_cols is None: - ref_info_cols = [ - alt_tag_kw, seq_id_kw, - "start", "end", "length", - "start_codon", "expected_start_aa", - "first_stop", "nb_stops", "start_upstream", "end_downstream"] - # To ensure no info columns are lost: - info_cols = [ - counts_table.index.name, - *counts_table.columns.difference(codon2aa)] - assert set(info_cols) == set(ref_info_cols) - return counts_table.reset_index().set_index(ref_info_cols) - - -def make_aa_codon_columns(counts_table): - """Transform column headers into a (aa, codon) MultiIndex.""" - counts_table.columns = pd.MultiIndex.from_tuples( - ((codon2aa[codon], codon) for codon in counts_table.columns), - names=("aa", "codon")) - render_md( - "We associated amino-acid information to codons, " - "as an extra level in the table columns.") - return counts_table - - -def sort_counts_by_aa(counts_table): - """ - Sort columns so that codons are grouped by corresponding amino-acid. 
- """ - aacodons_order = list(aa2colour.keys()) - # We need to include both aas and codons in the list - # because it will be used for both the aa and the codon levels - # when sorting the (aa, codon) MultiIndex - aacodons_order.extend(codon2aa.keys()) - - def aa_sortkey(col): - """ - Key function to use when sorting a MultiIndex consisting in - (aa, codon) pairs. - """ - return col.map(aacodons_order.index) - sorted_counts = counts_table.sort_index( - axis=1, level=(0, 1), ascending=(True, True), key=aa_sortkey) - render_md( - "The columns were sorted in order to group codons" - " by associated amino-acid.") - return sorted_counts - - -def save_counts_table(counts_table, table_path): - """ - Save table *counts_table* at *table_path*. - """ - table_path.parent.mkdir(parents=True, exist_ok=True) - counts_table.to_csv(table_path, sep="\t") - render_md(f"The table was saved at [{table_path}]({table_path}).") - - -def load_table_with_info_index(table_path, nb_info_cols, nb_cluster_series=0): - """ - Load a table containing by-amino-acid codon counts or usage biases. - - The table, located at *table_path*, should have tab-separated columns. - It is expected to contain a series of informative columns - preceding the columns containing the codon usage bias values. - - The first *nb_info_cols* columns contain information about gene - identifiers and features extracted from CDS annotation data. - - Then, there can be series of gene-clustering information, where each - column of the series corresponds to clusters defined based on - codon-usage biases among the codons coding an amino-acid. - There are no columns for amino-acids coded by only one codon (M and W). - *nb_cluster_series* specifies the number of such series. - - The result is a pandas DataFrame object, where all the above information - is encoded as a MultiIndex, the codon counts or usage biases being in the - remaining columns. - """ - return pd.read_csv( - table_path, - sep="\t", - # *nb_info_cols* starting levels from the initial table, - # plus levels corresponging to clustering information. - # By default, from 2 methods (nb_cluster_series=2): - # * `cluster_{aa}_kmeans` for each amino-acid - # having more than one codon - # * `cluster_{aa}_full_bias` for each amino-acid - # having more than one codon - index_col=list( - range(nb_info_cols + nb_cluster_series * (len(aa2colour) - 2))), - header=[0, 1]) - - -def split_info_index(table, keep_index_cols): - """ - Split table *table* into info and data. - - Info is supposed to be contained in the index of DataFrame *table*, - data in its columns. - - Return a pair of tables, one for the data, one for the info, - where the index contains only the levels listed in *keep_index_cols* - """ - drop_info_cols = [ - colname for colname in table.index.names - if colname not in keep_index_cols] - info_table = table.reset_index()[ - [*keep_index_cols, *drop_info_cols]].set_index(keep_index_cols) - # To avoid loss of index levels in input by side effect: - out_table = table.copy() - out_table.index = out_table.index.droplevel(drop_info_cols) - # To ensure indices have their levels in the same order - # in out_table and in infoinfo_table: - out_table = out_table.reset_index().set_index(keep_index_cols) - return (out_table, info_table) - - -def filter_on_idx_levels(counts_table, filter_dict): - """ - Filter a table *counts_table* based on values of certain index levels. 
- - *filter_dict* should contain index level names as keys and values that - rows should have at this level to be included in the output - filtered table. - """ - row_filter = np.all([ - counts_table.index.get_level_values(idx_lvl) == idx_val - for (idx_lvl, idx_val) in filter_dict.items()], axis=0) - return counts_table[row_filter] - - -# Codon usage calculations can be done in various ways. -# Some examples are given at page 6500 (2 of the pdf) of -# [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032) -SUZUKI_DOI = "10.1016/j.febslet.2005.10.032" -SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})" - - -def gene_wide_codon_usage( - codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): - """ - Compute codon usage biases "gene-wide" as the standardized - difference between a gene's codon proportions and global - codon proportions (default). - - If *ref_filter_dict* is not None, it should be a dict of - (index_level, index_value) pairs and the global codon proportions - will actually be proportions computed on the part of the data - restricted to the genes where the *index_level* has the *index_value* - for all those pairs. - """ - render_md(f""" -We will compute codon usage "gene-wide" (i.e. not by amino-acid), -by looking at the proportion of codons within a gene's CDS. -(This corresponds to R1 in {SUZUKI_LINK}) -""") - render_md(""" -To compute codon proportions, we can divide each line by its sum, -or, equivalently (and hopefully more efficiently), normalize the data -using the "l1" norm (which, for positive-only values amounts to the sum). -""") - print("codon_proportions1") - print(codon_counts) - codon_proportions = pd.DataFrame( - normalize(codon_counts, norm="l1"), - index=codon_counts.index, - columns=codon_counts.columns) - print("codon_proportions2") - print(codon_proportions) - # Doesn't seem to be working: - # codon_proportions.style.hide(axis="index") - if verbose: - display(codon_proportions.head(3)) - # Check that the sum of proportions (columns) for a gene is 1 - colsums = codon_proportions.sum(axis=1).values - # Due to imprecision in float arithmetics, - # we can only check that the sums are close to 1 - assert np.allclose(colsums, np.full(len(colsums), 1)) - if ref_filter_dict is None: - counts_for_global = codon_counts - else: - counts_for_global = filter_on_idx_levels( - codon_counts, ref_filter_dict) - render_md(""" -To compute the usage biases, we also need to compute codon proportions -in the global usage. -""") - render_md("We compute the global usage, as the sum on columns.") - global_usage = counts_for_global.sum(axis=0) - render_md("Then we normalize it the same way as for the individual genes.") - # The values.reshape(1, -1) turns the Series data into a 2D array - # with one line. flatten() turns the data back to 1D - global_proportions = pd.Series( - normalize(global_usage.values.reshape(1, -1), norm="l1").flatten(), - index=global_usage.index) - # Can be 0.999999999999... - # if global_proportions.sum() != 1.: - # display(global_proportions) - # display(global_proportions.sum()) - assert np.isclose(global_proportions.sum(), 1.) - # assert global_proportions.sum() == 1. - render_md(""" -Codon usage biases in genes can then be computed by subtracting -the corresponding global proportion to a codon's proportion. 
-""") - codon_usage_biases = codon_proportions.sub(global_proportions) - render_md(""" -We can standardize these values (by dividing by their standard deviation -across genes) so that they are more comparable between codons. -""") - standardized_codon_usage_biases = codon_usage_biases.div( - codon_usage_biases.std()) - # Doesn't seem to be working: - # standardized_codon_usage_biases.style.hide(axis="index") - if verbose: - display(standardized_codon_usage_biases.head(3)) - if return_more: - return { - "biases": standardized_codon_usage_biases, - "proportions": codon_proportions, - "global_proportions": global_proportions} - return standardized_codon_usage_biases - - -def check_aa_codon_columns(table): - """ - Check that the columns of *table* correspond to (aa, codon) pairs. - """ - msg = "Codon proportions table should have two levels: 'aa' and 'codon'" - if table.columns.nlevels != 2: - raise ValueError(msg) - if table.columns.names[0] != "aa": - raise ValueError(msg) - if table.columns.names[1] != "codon": - raise ValueError(msg) - - -def compute_rscu(codon_proportions_by_aa): - """ - Compute Relative Synonymous Codon Usage from proportions in genes. - - *codon_proportions_by_aa* should be a pandas DataFrame where - rows correspond to genes, and columns to codons. It contains - the proportions in the gene of the codon among those coding the - same amino-acid. The column index should be a pandas MultiIndex - where the first level is the amino-acid name, and the second level - the codon. - - Return a pair where the first element is the rscu table, and - the second is the degeneracy table. - """ - check_aa_codon_columns(codon_proportions_by_aa) - degeneracy = pd.Series( - # concat "flattens" the list of iterables given as arguments - # (list of tuples of repeated degeneracy values) - list(concat([ - # we want stretches of the same degeneracy values - # of as many elements as there are codons in this degeneracy group, - # hence the tuple multiplication - ((degen),) * degen - # valmap transforms values in a dict given as second argument - # using the function given as first argument - # We extract only the value from the (key, value) pairs - # generated by the .items() methods of the resulting dict - # (Actually, we could probably have just used the .values() method) - for (_, degen) in valmap( - # len gives the length of each group, - # where a group contains the (aa, codon) pairs for a given aa, - # that is, the degeneracy for this group of codons - len, - # groupby creates a dict where keys are obtained from - # the function given as first argument and values are - # groups made from the iterable given as second argument - # (here, column names, which are (aa, codon) pairs) - # itemgetter(0) extracts the amino-acid from a pair (aa, codon) - groupby( - itemgetter(0), - codon_proportions_by_aa.columns)).items()])), - index=codon_proportions_by_aa.columns) - return ( - codon_proportions_by_aa.mul(degeneracy), - degeneracy) - - -def by_aa_codon_usage( - codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): - """ - Compute codon usage biases "by amino-acid" as the standardized - difference between a gene's codon proportions and global - codon proportions, where proportions are computed within - groups of same-amino-acid-coding codons instead of gene-wide. 
- - If *ref_filter_dict* is not None, it should be a dict of - (index_level, index_value) pairs and the global codon proportions - will actually be proportions computed on the part of the data - restricted to the genes where the *index_level* has the *index_value* - for all those pairs. - """ - check_aa_codon_columns(codon_counts) - render_md(f""" -We will compute codon usage "by amino-acid", by looking at the -proportion of codons for each amino-acid within a gene's CDS. -(This corresponds to R2 in {SUZUKI_LINK}) -""") - render_md(""" -We first need to compute, for a given gene, the total number of codons -corresponding to each amino-acid. -""") - summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() - render_md(""" -To compute codon proportions, we can then divide each codon counts by -the total for the corresponding amino-acid. -""") - codon_proportions = codon_counts.div( - summed_by_aa) - # Doesn't seem to be working: - # codon_proportions.style.hide(axis="index") - if verbose: - display(codon_proportions.head(3)) - # There are some (genes, aa) pairs where the sum is zero. - # This is normal if that gene does not have that amino-acid. - sums = codon_proportions.groupby(level=0, axis=1).sum() - # Values in sums should be either zero or one. - all_ones = np.full(sums.shape, 1.) - all_zeroes = np.full(sums.shape, 0.) - # Due to imprecision in float arithmetics, - # we can only check that the sums are close to 1 or to 0 - assert np.array_equal( - np.isclose( - sums.values, all_ones), - np.logical_not( - np.isclose( - sums.values, all_zeroes))) - if return_more: - render_md(f""" -We will also compute RSCU (Relative Synonymous Codon Usage) by multiplying -codon proportions by the degeneracy of the corresponding amino-acid, that is, -by the number of different codons existing for that amino-acid. -This amounts to computing codon counts relative to the average codon counts -for that amino-acid. -(This corresponds to R3 in {SUZUKI_LINK}) -""") - (rscu, degeneracy) = compute_rscu(codon_proportions) - render_md(f""" -We will also compute R4 ({SUZUKI_LINK}) by computing codon counts relative -to the maxium across synonymous codons of the counts for that amino-acid. -""") - max_counts = codon_counts.T.groupby("aa").max().T - r4_table = codon_counts.div( - max_counts) - if ref_filter_dict is None: - counts_for_global = codon_counts - else: - counts_for_global = filter_on_idx_levels( - codon_counts, ref_filter_dict) - render_md(""" -To compute the usage biases, we also need to compute codon proportions -in the global usage. -""") - render_md(""" -We compute the global usage, as the sum of the counts for a given codon, -across genes. -""") - global_usage = counts_for_global.sum(axis=0) - if return_more: - # Same computation as for individual genes, - # but based on the total counts. - global_max_counts = global_usage.T.groupby("aa").max().T - global_r4 = global_usage.div( - global_max_counts) - render_md("Then we sum over codons corresponding to the same amino-acid.") - global_summed_by_aa = global_usage.groupby(level=0).sum() - render_md(""" -Dividing the global usage for each codon by the total for the corresponding -amino-acid, we can now compute the global proportions the same way as for -the individual genes. -""") - global_proportions = global_usage.div( - global_summed_by_aa) - # The proportions of codons for a given amino-acid should sum to 1 - assert all( - val == 1. 
- for val in global_proportions.groupby(level=0).sum()) - render_md(""" -Codon usage biases in genes can then be computed by subtracting -the corresponding global proportion to a codon's proportion. -""") - codon_usage_biases = codon_proportions.sub(global_proportions) - render_md(""" -We can standardize these values (by dividing by their standard deviation -across genes) so that they are more comparable between codons. -""") - standardized_codon_usage_biases = codon_usage_biases.div( - codon_usage_biases.std()) - # Doesn't seem to be working: - # standardized_codon_usage_biases.style.hide(axis="index") - if verbose: - display(standardized_codon_usage_biases.head(3)) - if return_more: - return { - "biases": standardized_codon_usage_biases, - "proportions": codon_proportions, - "rscu": rscu, - "r4_table": r4_table, - "global_proportions": global_proportions, - "global_rscu": global_proportions.mul(degeneracy), - "global_r4": global_r4} - return standardized_codon_usage_biases - - -def aa_usage( - codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): - """ - Compute amino-acid usage biases as the standardized - difference between a gene's amino-acid proportions - and global amino-acid proportions. - - If *ref_filter_dict* is not None, it should be a dict of - (index_level, index_value) pairs and the global codon proportions - will actually be proportions computed on the part of the data - restricted to the genes where the *index_level* has the *index_value* - for all those pairs. - """ - check_aa_codon_columns(codon_counts) - render_md(""" -We will compute amino-acid usage, by looking at the -proportions of amino-acids within a gene's CDS. -""") - render_md(""" -We first need to compute, for a given gene, the total number of codons -corresponding to each amino-acid. -""") - summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() - render_md(""" -To compute amino-acid proportions, we can divide each line by its sum, -or, equivalently (and hopefully more efficiently), normalize the data -using the "l1" norm (which, for positive-only values amounts to the sum). -""") - aa_proportions = pd.DataFrame( - normalize(summed_by_aa, norm="l1"), - index=summed_by_aa.index, - columns=summed_by_aa.columns) - # Doesn't seem to be working: - # aa_proportions.style.hide(axis="index") - if verbose: - display(aa_proportions.head(3)) - # Checking that proportions sum to 1 - colsums = aa_proportions.sum(axis=1) - assert np.allclose(colsums, np.full(len(colsums), 1)) - # Then, computing the global amino-acid proportions - if ref_filter_dict is None: - counts_for_global = summed_by_aa - else: - counts_for_global = filter_on_idx_levels( - summed_by_aa, ref_filter_dict) - render_md(""" -To compute the usage biases, we also need to compute amino-acid proportions -in the global usage. -""") -# render_md(""" -# We compute the global usage, as the sum of the counts for a given codon, -# across genes. -# """) -# global_usage = codon_counts.sum(axis=0) - render_md("Then we sum over codons corresponding to the same amino-acid.") - # global_summed_by_aa = global_usage.groupby(level=0).sum() - global_summed_by_aa = counts_for_global.sum() - render_md("Then we normalize it the same way as for the individual genes.") - # The values.reshape(1, -1) turns the Series data into a 2D array - # with one line. 
flatten() turns the data back to 1D - global_aa_proportions = pd.Series( - normalize( - global_summed_by_aa.values.reshape(1, -1), - norm="l1").flatten(), - index=global_summed_by_aa.index) - render_md(""" -Amino-acid usage biases can be computed by subtracting the corresponding -global proportion to an amino-acid's proportion. -""") - aa_usage_biases = aa_proportions.sub(global_aa_proportions) - render_md(""" -We can standardize these values (by dividing by their standard deviation -across genes) so that they are more comparable between amino-acids. -""") - standardized_aa_usage_biases = aa_usage_biases.div(aa_usage_biases.std()) - # Doesn't seem to be working: - # standardized_aa_usage_biases.style.hide(axis="index") - if verbose: - display(standardized_aa_usage_biases.head(3)) - if return_more: - return { - "biases": standardized_aa_usage_biases, - "proportions": aa_proportions, - "global_proportions": global_aa_proportions} - return standardized_aa_usage_biases - - -def exclude_all_nan_cols(usage_table, fill_other_nas=np.nan): - """ - Detect columns in *usage_table* that contain only NaNs - and remove them from the table. Other NaN values are replaced - with *fill_other_nas*. - """ - render_md(""" -Standardization may result in division by zero for usage data -that have a zero standard deviation. -This is expected to be the case for "by amino-acid" usage biases -for codons corresponding to amino-acids having only one codon: -methionine (M) and tryptophan (W). -""") - all_nan_cols = usage_table.columns[ - usage_table.isna().all()] - if len(all_nan_cols): - render_md("The following columns contain only NaNs:") - display(all_nan_cols) - render_md("This likely resulted from a division by zero.") - render_md("These columns will be excluded.") - return ( - usage_table.drop(columns=all_nan_cols).fillna(fill_other_nas), - all_nan_cols) - - -def codon_influence_in_components( - components, colnames, - figs_dir=None, more_components=False, formats=None): - """ - Plot the influence of the columns in the first 4 principal axes of a PCA. - - Each column should correspond to a codon, and will be represented as a - colour bar whose coulour is based on the last letter of the codon. - - *components* should be a numpy array representing the principal - axes of a PCA, such as the `components_` attribute of a fitted - `sklearn.decomposition.PCA` object. - - *colnames* should be the names of the columns in the *components* - array and are expected to match the following pattern: - <aa>_<codon>, where <aa> is a single-letter code for an amino-acid, - and <codon> is one of the 3-letter codons for this amino-acid, - in capital letters among A, T, G and C (i.e. in the DNA alphabet). - The last letter will be used to set the colour of a bar in the plots - - *figs_dir* should be a path to a directory that will be used - to save graphics representing the influence of each data column - on the first four principal components. - - *formats* should be a list of formats in which the figures should - be saved, such as "svg" or "png". 
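-
- Hypothetical example, for a bias table *scub* whose column names follow
- the "<aa>_<codon>" pattern and an already-created *figs_dir*:
-
-     pca = PCA().fit(scub)
-     codon_influence_in_components(
-         pca.components_, scub.columns,
-         figs_dir=Path("figures"), formats=["svg"])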
- """ - render_md( - "Vizualizing the influence of codons in the first components\n") - # TODO: *figsize* could be adapted depending on the number of columns - if more_components: - (fig, axes) = plt.subplots(12, 1, figsize=(16, 60)) - else: - (fig, axes) = plt.subplots(4, 1, figsize=(16, 16)) - for (component, axis) in enumerate(axes): - pd.Series( - components[component], - index=colnames).plot.bar( - ax=axis, - # colname is supposed to end with the 3-letters codon - color=[ - nuc2colour[colname[-1]] - for colname in colnames]) - axis.set_ylabel(f"weight in component {component}") - # axis.set_xticklabels(axis.get_xticklabels(), rotation=90) - fig.subplots_adjust(hspace=.5) - if figs_dir is not None and formats is not None: - for ext in formats: - plt.savefig( - figs_dir.joinpath(f"PCA_components.{ext}"), - metadata=fmt_metadata[ext]) - display(fig) - plt.close(fig) - - -def codon_usage_pca( - usage_data, - figs_dir=None, hue="chrom", exclude_cols=None, plot_more_components=False, - formats=None, cols_are_codons=True): - """ - Perform Principal Component Analysis on *usage_data*. - - DataFrame *usage_data* is expected to contain observations as lines - and contain numerical values in columns, without NaNs, corresponding - to codons, where the columns headers are supposed to match the following - pattern: <aa>_<codon>, where <aa> is a single-letter code for - an amino-acid, and <codon> is one of the 3-letter codons for this - amino-acid, in capital letters among A, T, G and C - (i.e. in the DNA alphabet). - - *hue* is expected to be one of the levels of the MultiIndex of - *usage_data*, and will be used to assign colours to the observations - in PCA plots. By default, colour is based on the content of a "chrom" - level in the index. - - If *figs_dir* is not None, this path to a directory will be used - to save graphics representing the projection of the observations - in the first four principal components (0 vs. 1 and 2 vs. 3). - - Unless *cols_are_codons* is set to False, there will also be - graphics representing the influence of each data column - on the first four principal components. - - *formats* should be a list of formats in which the figures should - be saved, such as "svg" or "png". - - If *exclude_cols* is not None, the columns with the names contained - in the iterable *exclude_cols* will not be included in the PCA analysis. 
- """ - if figs_dir is not None: - figs_dir = Path(figs_dir) - figs_dir.mkdir(parents=True, exist_ok=True) - if exclude_cols is not None: - usage_data = usage_data.drop(columns=exclude_cols) - pca = PCA().fit(usage_data) - transformed_data = pd.DataFrame( - pca.transform(usage_data), - index=usage_data.index).reset_index(level=hue) - render_md( - "Plotting genes on the first components\n") - if plot_more_components: - (fig, axes) = plt.subplots(3, 2, figsize=(16, 25)) - sns.scatterplot( - data=transformed_data, - x=0, y=1, hue=hue, marker=".", ax=axes[0,0]) - sns.scatterplot( - data=transformed_data, - x=2, y=3, hue=hue, marker=".", ax=axes[0,1]) - sns.scatterplot( - data=transformed_data, - x=4, y=5, hue=hue, marker=".", ax=axes[1,0]) - sns.scatterplot( - data=transformed_data, - x=6, y=7, hue=hue, marker=".", ax=axes[1,1]) - sns.scatterplot( - data=transformed_data, - x=8, y=9, hue=hue, marker=".", ax=axes[2,0]) - sns.scatterplot( - data=transformed_data, - x=10, y=11, hue=hue, marker=".", ax=axes[2,1]) - else: - (fig, axes) = plt.subplots(1, 2, figsize=(16, 8)) - sns.scatterplot( - data=transformed_data, - x=0, y=1, hue=hue, marker=".", ax=axes[0]) - sns.scatterplot( - data=transformed_data, - x=2, y=3, hue=hue, marker=".", ax=axes[1]) - - if figs_dir is not None and formats is not None: - for ext in formats: - plt.savefig( - figs_dir.joinpath(f"PCA_projections.{ext}"), - metadata=fmt_metadata[ext]) - display(fig) - plt.close(fig) - if cols_are_codons: - codon_influence_in_components( - pca.components_, usage_data.columns, - figs_dir=figs_dir, more_components=plot_more_components, formats=formats) - return (pca, transformed_data) - - -def centroid_usage(codon_counts, all_nan_cols): - """ - Define "centroids" for gene clusters, one per codon in *codon_counts*. - - The standardized usage biases of these centroids are computed so that - the codon proportions for a given amino-acid are the same as the global - proportions, except for the codons corresponding to a certain amino-acid. - For each amino-acid, there is one centroid per codon, where the - proportion for this codon is set to 1.0, and 0.0 for the other codons. - """ - check_aa_codon_columns(codon_counts) - summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() - global_usage = codon_counts.sum(axis=0) - global_summed_by_aa = global_usage.groupby(level=0).sum() - global_proportions_by_aa = global_usage.div( - global_summed_by_aa) - codon_proportions_by_aa = codon_counts.div(summed_by_aa) - codon_usage_biases_by_aa = codon_proportions_by_aa.sub( - global_proportions_by_aa) - # _columns_by_aa may be different from columns_by_aa - # Groups of column headers for each amino-acid - # key: aa, value: list of (aa, codon) pairs - _columns_by_aa = groupby(itemgetter(0), codon_proportions_by_aa.columns) - - def generate_centroid_proportions(): - for (aa1, columns) in _columns_by_aa.items(): - for (_, codon1) in columns: - proportions = global_proportions_by_aa.copy() - for (aa2, codon2) in columns: - proportions[(aa2, codon2)] = 0.0 - proportions[(aa1, codon1)] = 1.0 - proportions.name = ( - f"{aa1}_{codon1}", f"{aa1}_{codon1}", - *np.full( - codon_proportions_by_aa.index.nlevels - 2, - np.nan)) - yield proportions - - render_md(""" -For each codon, a "centroid" (future center of a cluster of genes) -is defined where the proportion of this codon is set to 1.0. -For the other codons corresponding to the same amino-acid, the proportion -is therefore set to 0.0. 
-For codons corresponding to other amino-acids, the proportions are set to -the proportions observed in the global data. -""") - centroids_proportions_by_aa = pd.DataFrame( - generate_centroid_proportions()) - centroids_proportions_by_aa.index.rename( - codon_proportions_by_aa.index.names, - inplace=True) - - render_md(""" -The biases for the centroids are then computed by subtracting to those -proportions the proportions in the global data, and the biases are -standardized by dividing by the standard deviation of codon usage biases -in the data. -""") - # Compute biases using the same global proportions as the real data - # and standardize by the same standard deviations as the real data - centroids_scub_by_aa = centroids_proportions_by_aa.sub( - global_proportions_by_aa).div(codon_usage_biases_by_aa.std()).drop( - columns=all_nan_cols) - # centroids_scub_by_aa.head(3) - return centroids_scub_by_aa - - -def make_centroids_cluster_finder( - centroids_table, - aa): # pylint: disable=C0103 - """ - Make a function that, when applied to a row in the standardized - codon bias table, determines to what centroid among those - corresponding to *aa* it is the closest. - The *centroids_table* should contain standardized codon usage - biases for the centroids. - """ - # The columns that contain data pertaining to codons coding aa: - cols_for_aa = centroids_table.columns.get_level_values(0) == aa - cluster_names = [ - f"{aa}_{codon}" - for (aa, codon) in centroids_table.columns[cols_for_aa]] - - def closest_centroid(gene): - dists_to_centroids = {} - for cluster_name in cluster_names: - dists_to_centroids[cluster_name] = sqdist( - gene[cols_for_aa].values, - centroids_table.loc[cluster_name].iloc[:, cols_for_aa].values) - # We return the cluster_name (`min(...))[1]`) - # associated with the minimum distance - return min( - (dist, cluster_name) - for (cluster_name, dist) - in dists_to_centroids.items())[1] - return closest_centroid - - -def make_cluster_table(scub_table, centroids_scub_table): - """ - Make a table for the genes in standardized codon usage bias - table *scub_table* where each column tells, for a given - amino-acid to which centroid in *centroids_scub_table* - it is the closest. - """ - return pd.DataFrame( - { - (f"cluster_{aa}_full_bias", ""): scub_table.apply( - make_centroids_cluster_finder(centroids_scub_table, aa), - axis=1).values - for aa # pylint: disable=C0103 - in unique(centroids_scub_table.columns.get_level_values(0))}, - index=scub_table.index) - - -def compare_clusterings( - usage_table, - clust1_template="cluster_{aa}_full_bias", - clust2_template="cluster_{aa}_highest_bias"): - """ - Compare lists of genes in *usage_table* for the clusters - found in the columns whose name conform to - *clust1_template* and *clust2_template*. 
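-
- Illustrative call, assuming the index of *scub_with_clusters* carries both
- clustering series (the "kmeans" series name is an assumption):
-
-     compare_clusterings(
-         scub_with_clusters,
-         clust1_template="cluster_{aa}_full_bias",
-         clust2_template="cluster_{aa}_kmeans")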
- """ - crosstabs = {} - for aa in aa2colour.keys(): # pylint: disable=C0103 - if aa in {"M", "W"}: - continue - clust1_level = clust1_template.format(aa=aa) - clust2_level = clust2_template.format(aa=aa) - crosstabs[aa] = pd.crosstab( - usage_table.reset_index()[clust1_level], - usage_table.reset_index()[clust2_level]) - cols_for_aa = columns_by_aa[aa] - cluster_names = [ - f"{aa}_{codon}" - for (aa, codon) in usage_table[cols_for_aa].columns] - # Checking that the clusters are the same - same_clusters = True - for (clust_1, clust_2) in combinations(cluster_names, 2): - # assert crosstabs[aa][clust_1][clust_2] == 0 - try: - # if crosstabs[aa][clust_1][clust_2] != 0: - if (crosstabs[aa][clust_1][clust_2] != 0) \ - or (crosstabs[aa][clust_2][clust_1] != 0): - same_clusters = False - except KeyError: - # At least one cluster is absent. - # Is the absence symmetric? - miss_1_in_1 = clust_1 in usage_table.reset_index()[ - clust1_level] - miss_2_in_2 = clust_2 not in usage_table.reset_index()[ - clust2_level] - miss_1_in_2 = clust_1 in usage_table.reset_index()[ - clust2_level] - miss_2_in_1 = clust_2 not in usage_table.reset_index()[ - clust1_level] - if ( - (miss_1_in_1 != miss_1_in_2) - or (miss_2_in_1 != miss_2_in_2)): - same_clusters = False - if not same_clusters: - render_md(f"\nClusters for {aa} differ:\n") - else: - render_md(f"\nClusters for {aa} are identical:\n") - display(crosstabs[aa]) - - -def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2): - """ - Load a table containing by-amino-acid codon usage biases. - - The table, located at *table_path*, should have tab-separated columns. - It is expected to contain a series of informative columns - preceding the columns containing the codon usage bias values. - - The first *nb_info_cols* columns contain information about gene - identifiers and features extracted from CDS annotation data. - - Then, there can be series of gene-clustering information, where each - column of the series corresponds to clusters defined based on - codon-usage biases among the codons coding an amino-acid. - There are no columns for amino-acids coded by only one codon (M and W). - *nb_cluster_series* specifies the number of such series. - - The result is a pandas DataFrame object, where all the above information - is encoded as a MultiIndex, the codon usage biases being in the - remaining columns. - """ - return load_table_with_info_index( - table_path, nb_info_cols, nb_cluster_series) - - -def star2stop(text): - """ - Replace stars with "stop", for use in file paths. - """ - return text.replace("*", "stop") - - -def write_cluster_lists( - usage_table, - aa, # pylint: disable=C0103 - clusters_dir, - cluster_level_template, - y_label_template, - alt_tag_kw="old_locus_tag"): - """ - Extract lists of genes in *usage_table** for the clusters - associated with codons for amino-acid *aa* and write the - resulting lists in a sub-folder (named based on *aa*) - of the directory *clusters_dir*. - The clusters are found in the column whose name conforms to - *cluster_level_template* - Also generate violin plots for each cluster. 
- """ - md_report = f"* Clusters for {aa}:\n\n" - aa_dir = clusters_dir.joinpath(star2stop(aa)) - aa_dir.mkdir(parents=True, exist_ok=True) - # key: cluster - # value: relative path to a file containing old locus tags - # for genes belonging to this cluster - relpaths_to_cluster_lists = {} - for (cluster, gene_list) in groupby(itemgetter(0), zip( - usage_table.index.get_level_values( - cluster_level_template.format(aa=aa)), - usage_table.index.get_level_values(alt_tag_kw))).items(): - path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt")) - with path_to_clusterfile.open("w") as clust_fh: - clust_fh.write("\n".join(map(itemgetter(1), gene_list))) - relpaths_to_cluster_lists[cluster] = str( - path_to_clusterfile.relative_to('.')) - md_report += "\n\n".join([ - f" - [{cluster}]({relpath_to_list})" - for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()]) - violin_usage_by_clusters( - usage_table, - aa, - y_label_template, - cluster_level_template=cluster_level_template, - vertical=True) - path_to_fig = aa_dir.joinpath(star2stop( - f"usage_biases_violin_plots_by_cluster_for_{aa}.png")) - plt.savefig(path_to_fig, metadata=fmt_metadata["png"]) - plt.close() - relpath_to_fig = str(path_to_fig.relative_to('.')) - md_report += ( - f"\n\n - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n") - return md_report - - -def find_valley(values): - """ - Try to find a cutting threshold for the *values*, - where the corresponding density distribution is lowest. - """ - if len(set(values)) == 1: - (single_val,) = set(values) - return single_val - min_val = min(values) - max_val = max(values) - val_range = max_val - min_val - # We use 100 bins - try: - val_density = gaussian_kde(values).evaluate( - np.linspace(min_val, max_val, 100)) - except LinAlgError as err: - print(f"min: {min_val}, max: {max_val}, N: {len(values)}.") - print(str(err)) - raise - # Where is the density lowest? (excluding first and last) - min_density_idx = val_density[1:-1].argmin() - # Finding the value corresponding to the index in the `val_density` array: - return min_val + (min_density_idx * val_range / 100) - - -def find_most_biased_genes(data, do_retries=True, alt_tag_kw="old_locus_tag"): - """ - Compute a usage bias threshold hopefully separating the most biased genes - from the remainder of the cluster where *data* contains codon usage biases - for the genes in the cluster and has an *alt_tag_kw* level in its index. - Return the threshold in terms of usage bias and the list of genes above it. - - /!\ This is likely very sensitive to the shape - of the usage bias distribution. - """ - thresh = find_valley(data.values) - # Try avoiding cases with threshold "misplaced" at the bottom - # of the distribution - retries = 0 - while do_retries and thresh < 0: - retries += 1 - thresh = find_valley(data.sort_values().iloc[retries:].values) - top_genes = data[(data > thresh)].index.get_level_values(alt_tag_kw).values - return (thresh, top_genes, retries) - - -def extract_top_genes_from_cluster( - usage_table, - aa, # pylint: disable=C0103 - codon, - clusters_dir, - axis=None, savefig=True, alt_tag_kw="old_locus_tag"): - """ - Return a few lines of markdown with links pointing to - where the saved results are expected to be found. 
- """ - md_report = f" - Most biased genes for {codon} ({aa}):\n\n" - aa_dir = clusters_dir.joinpath(star2stop(aa)) - # Files where to save the results - top_dir = aa_dir.joinpath("top_genes") - path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) - path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) - path_to_fig = top_dir.joinpath(star2stop( - f"usage_biases_violin_plot_{aa}_{codon}.png")) - # Load the list of genes belonging to the cluster - # associated with a high usage bias for *codon* - path_to_clusterfile = aa_dir.joinpath(star2stop(f"{aa}_{codon}.txt")) - with open(path_to_clusterfile) as gene_list_fh: - genes = [line.strip() for line in gene_list_fh] - # Extract the usage bias data for these genes and for this codon - idx_level = usage_table.index.names.index(alt_tag_kw) - try: - # data = usage_table.loc[genes][(aa, codon)] - data = usage_table.loc[ - (*(slice(None) for _ in range(idx_level)), genes), - (aa, codon)] - except KeyError: - print(genes[0]) - print(usage_table.index.names) - raise - q3 = data.quantile(0.75) # pylint: disable=C0103 - top25 = data[(data > q3)].index.get_level_values( - alt_tag_kw).values - (thresh, top_genes, retries) = find_most_biased_genes( - data, do_retries=True, alt_tag_kw=alt_tag_kw) - if retries != 0: - print( - f"Bottom-most {retries} values had to be discarded in order " - f"to avoid a negative threshold for the cluster {aa}_{codon}.") - top_genes_check = set( - usage_table[ - (usage_table[(aa, codon)] > thresh)].index.get_level_values( - alt_tag_kw).values) - if set(top_genes) != top_genes_check: - assert set(top_genes).issubset(top_genes_check) - extra_genes_not_in_cluster = top_genes_check - set(top_genes) - print( - f"{len(extra_genes_not_in_cluster)} genes were found " - f"with a high bias for {codon} but not belonging " - f"to the cluster {aa}_{codon}.") - # For visual verification purpose, - # display where the threshold is situated on a violin plot - axis = violin_with_thresh( - data, [(thresh, "red"), (q3, "blue")], - f"standardized usage bias for {codon} ({aa})", - facecolor=aa2colour[aa], axis=axis) - # Save the figure and close it. - plt.tight_layout() - if savefig: - top_dir.mkdir(parents=True, exist_ok=True) - plt.savefig(path_to_fig, metadata={'creationDate': None}) - plt.close() - # Save the list of the top genes - with open(path_to_top25, "w") as gene_list_fh: - gene_list_fh.write("\n".join(top25)) - with open(path_to_top, "w") as gene_list_fh: - gene_list_fh.write("\n".join(top_genes)) - relpath_to_top25 = str(path_to_top25.relative_to('.')) - relpath_to_top = str(path_to_top.relative_to('.')) - relpath_to_fig = str(path_to_fig.relative_to('.')) - md_report += "\n\n".join([ - f" * [top 25% genes]({relpath_to_top25})", - f" * [top genes]({relpath_to_top})", - f" * [violin plot with thresholds ({q3:.3}, " - f"{thresh:.3})]({relpath_to_fig})"]) - return md_report - - -# TODO: -def plot_codon_usage_for_gene_list( - gene_list_path, - usage_table, - aa, # pylint: disable=C0103 - codon, - figs_dir=None, - axis=None, savefig=True, alt_tag_kw="old_locus_tag"): - """ - Make a violin plot of codon usage biases for genes in file *gene_list_path* - using data in *usage_table*. - The plot will represent the distribution of usage bias - for codon *codon* of amino-acid *aa*. - The plot will be saved inside *figs_dir* if *savefig* is set to True. - Gene identifiers are expected to be *alt_tag_kw* identifiers. 
- """ - md_report = f" - Genes from {gene_list_path}:\n\n" - with open(gene_list_path) as gene_list_fh: - genes = [ - line.strip() for line in gene_list_fh - if line[0] != "#"] - # Extract the usage bias data for these genes and for this codon - idx_level = usage_table.index.names.index(alt_tag_kw) - try: - # data = usage_table.loc[genes][(aa, codon)] - data = usage_table.loc[ - (*(slice(None) for _ in range(idx_level)), genes), - (aa, codon)] - except KeyError: - print( - f"Genes are supposed to be provided as {alt_tag_kw} identifiers.") - print(genes[0]) - print(usage_table.index.names) - raise - q3 = data.quantile(0.75) # pylint: disable=C0103 - # top25 = data[(data > q3)].index.get_level_values( - # alt_tag_kw).values - (thresh, top_genes, retries) = find_most_biased_genes( - data, do_retries=True, alt_tag_kw=alt_tag_kw) - if retries != 0: - print( - f"Bottom-most {retries} values (out of {len(data)}) " - "had to be discarded in order to avoid a negative threshold " - f"for the cluster {aa}_{codon}.") - top_genes_check = set( - usage_table[ - (usage_table[(aa, codon)] > thresh)].index.get_level_values( - alt_tag_kw).values) - if set(top_genes) != top_genes_check: - assert set(top_genes).issubset(top_genes_check) - extra_genes_not_in_cluster = top_genes_check - set(top_genes) - print( - f"{len(extra_genes_not_in_cluster)} genes were found " - f"with a high bias for {codon} but not belonging to " - f"the cluster {aa}_{codon}.") - # For visual verification purpose, - # display where the threshold is situated on a violin plot - axis = violin_with_thresh( - data, [(thresh, "red"), (q3, "blue")], - f"standardized usage bias for {codon} ({aa})", - facecolor=aa2colour[aa], axis=axis) - # Save the figure and close it. - plt.tight_layout() - if not savefig: - return md_report - if figs_dir is None: - raise ValueError("No *figs_dir* directory specified.\n") - aa_dir = figs_dir.joinpath(star2stop(aa)) - # Files where to save the results - aa_dir.mkdir(parents=True, exist_ok=True) - path_to_fig = aa_dir.joinpath(star2stop( - f"usage_biases_violin_plot_{aa}_{codon}.png")) - plt.savefig(path_to_fig, metadata=fmt_metadata["png"]) - plt.close() - # Save the list of the top genes - # path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) - # path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) - # with open(path_to_top25, "w") as gene_list_fh: - # gene_list_fh.write("\n".join(top25)) - # with open(path_to_top, "w") as gene_list_fh: - # gene_list_fh.write("\n".join(top_genes)) - # relpath_to_top25 = str(path_to_top25.relative_to('.')) - # relpath_to_top = str(path_to_top.relative_to('.')) - relpath_to_fig = str(path_to_fig.relative_to('.')) - md_report += "\n\n".join([ - # f" * [top 25% genes]({relpath_to_top25})", - # f" * [top genes]({relpath_to_top})", - f" * [violin plot with thresholds ({q3:.3}, " - f"{thresh:.3})]({relpath_to_fig})"]) - return md_report - - -def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"): - """ - Plot a boxplot from pandas DataFrame *usage_table*. - - Use *y_label* as y-axis label. 
- """ - nb_genes = usage_table.shape[0] - if whiskers == "1.5 IQR": - axis = usage_table.plot.box(figsize=(18, 6), rot=90, sym=".") - axis.set_title( - f"Distributions of codon usage biases among {nb_genes} " - "genes (whiskers at 1.5 IQR).") - elif whiskers == "5 percent tails": - axis = usage_table.plot.box( - figsize=(18, 6), rot=90, sym=".", whis=(5, 95)) - axis.set_title( - f"Distributions of codon usage biases among {nb_genes} " - "genes (whiskers at the 5th and 95th percentiles).") - axis.set_ylabel(ylabel) - - -def to_long_form(usage_table, ylabel, others=None): - """ - Transform data in *usage_table* into "long form". - - The goal is to be able to use aa as hue in violin plots. - - See https://pandas.pydata.org/docs/user_guide/10min.html#stack - and https://seaborn.pydata.org/generated/seaborn.violinplot.html - - *y_label* is the column header you want to set for the usage values. - - *others* should be a list of extra information you want to extract - from the index (which is expected to be a pandas MultiIndex). - """ - col_nb_levels = usage_table.columns.nlevels - row_nb_levels = usage_table.index.nlevels - long_form = usage_table.stack( - level=list(range(col_nb_levels))).to_frame().reset_index( - level=list(range( - row_nb_levels, row_nb_levels + col_nb_levels))) - if others is None: - others = [] - if col_nb_levels == 1: - long_form.columns = ["aa", ylabel] - others = [other for other in others if other != "aa"] - elif col_nb_levels == 2: - long_form.columns = ["aa", "codon", ylabel] - others = [other for other in others if other not in {"aa", "codon"}] - else: - raise NotImplementedError( - "Don't know how to deal with column headers " - "with more than 2 levels ('aa' and 'codon')\n") - long_form = long_form.reset_index(level=[ - lvl_name for lvl_name in usage_table.index.names - if lvl_name.startswith("cluster")]) - if not set(others).issubset(set(long_form.index.names)): - print(set(others)) - print(set(long_form.index.names)) - raise ValueError( - "All elements in *others* should be in the index " - "of *usage_table*") - return long_form.assign(**{ - other: long_form.index.get_level_values(other) for other in others}) - - -def variable2order(variable): - """ - Define the order of amino-acids or codons. - """ - if variable == "aa": - return list(columns_by_aa.keys()) - if variable == "codon": - return [codon for (_, codon) in concat(columns_by_aa.values())] - raise ValueError("variable can only be 'aa' or 'codon'.\n") - - -def format_codon_labels(codons): - """ - Add amino-acid information to codons meant to be used as tick labels. - """ - return [ - plt.Text(x, y, f"{codon} ({codon2aa[codon]})") - for (x, y, codon) in map(attrgetter("_x", "_y", "_text"), codons)] - - -def violin_usage( - usage_table, variable, ylabel, - hue="aa", axis=None, **violin_kwargs): - """ - Plot violin plots of codon usage biases. - - *variable* should be either "codon" or "aa". 
- """ - if hue in {"aa", "codon"}: - dodge = False - nb_violins = 1 - else: - dodge = True - # dodge = True implies there will be one violin - # per possible value of *hue*, side by side - nb_violins = len(usage_table.index.get_level_values(hue).unique()) - long_form = to_long_form(usage_table, ylabel, others=[hue]) - if axis is None: - _, axis = plt.subplots(figsize=(18 * nb_violins, 6)) - do_legend = True - else: - do_legend = False - if hue == "aa": - palette = aa2colour - else: - palette = None - kwargs = { - "x": variable, "y": ylabel, "order": variable2order(variable), - "hue": hue, "palette": palette, "dodge": dodge, - "data": long_form, "ax": axis, "orient": "v", "scale": "count"} - kwargs.update(violin_kwargs) - axis = sns.violinplot(**kwargs) - if do_legend: - plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0) - if variable == "codon": - ticklabels = format_codon_labels(axis.get_xticklabels()) - else: - ticklabels = axis.get_xticklabels() - # TODO: Expect issues when variable == "aa" - axis.set_xticklabels(ticklabels, rotation=90) - return axis - - -def violin_usage_vertical( - usage_table, variable, ylabel, - hue="aa", axis=None, **violin_kwargs): - """ - Plot vertically stacked violin plots of codon usage biases. - - *variable* should be either "codon" or "aa". - """ - if hue in {"aa", "codon"}: - dodge = False - nb_violins = 1 - else: - dodge = True - # dodge = True implies there will be one violin - # per possible value of *hue*, side by side - nb_violins = len(usage_table.index.get_level_values(hue).unique()) - long_form = to_long_form(usage_table, ylabel, others=[hue]) - if axis is None: - if variable == "codon": - base_height = 44 - elif variable == "aa": - base_height = 18 - else: - raise NotImplementedError(f"Unsupported variable: {variable}.") - _, axis = plt.subplots(figsize=(6, base_height * nb_violins)) - if hue == "aa": - palette = aa2colour - else: - palette = None - kwargs = { - "y": variable, "x": ylabel, - "order": variable2order(variable), - "hue": hue, "palette": palette, "dodge": dodge, - "data": long_form, "ax": axis, "orient": "h", "scale": "count"} - kwargs.update(violin_kwargs) - axis = sns.violinplot(**kwargs) - if variable == "codon": - ticklabels = format_codon_labels(axis.get_yticklabels()) - axis.set_yticklabels(ticklabels) - return axis - - -def violin_usage_by_clusters(usage_with_clusters, aa, # pylint: disable=C0103 - ylabel_template, - cluster_level_template="cluster_{aa}", - vertical=False, **violin_kwargs): - """ - Plot a series of violin plots for each cluster of genes. - - The clusters are defined in usage table *usage_with_clusters* - based on usage biases for the codons coding for amino-acid *aa*. - - *usage_with_clusters* should have clustering indications as - an index level named according to *cluster_level_template* - (default "cluster_{aa}"), where {aa} is to be replaced by *aa*. - """ - clusters = usage_with_clusters.groupby( - level=cluster_level_template.format(aa=aa)) - # To adjust figure size - if "hue" in violin_kwargs and violin_kwargs["hue"] not in {"aa", "codon"}: - # dodge will be set to True in later plotting call. 
-        # This implies that there will be one violin
-        # per possible value of *hue*, side by side
-        nb_violins = len(usage_with_clusters.index.get_level_values(
-            violin_kwargs["hue"]).unique())
-    else:
-        nb_violins = 1
-    if vertical:
-        fig, axes = plt.subplots(
-            ncols=clusters.ngroups,
-            constrained_layout=True,
-            figsize=(6 * clusters.ngroups, 44 * nb_violins))
-    else:
-        fig, axes = plt.subplots(
-            nrows=clusters.ngroups,
-            constrained_layout=True,
-            figsize=(18 * nb_violins, 6 * clusters.ngroups))
-    # plt.subplots returns a bare Axes (not an array) when there is only
-    # one cluster; wrap it so that the zip below always works.
-    axes = np.atleast_1d(axes)
-    for ((cluster, usage_table), axis) in zip(clusters, axes):
-        kwargs = {"axis": axis}
-        kwargs.update(violin_kwargs)
-        if vertical:
-            violin_usage_vertical(
-                usage_table, "codon",
-                ylabel_template.format(aa=aa, cluster=cluster),
-                **kwargs)
-        else:
-            violin_usage(
-                usage_table, "codon",
-                ylabel_template.format(aa=aa, cluster=cluster),
-                **kwargs)
-    return fig
-
-
-def violin_usage_by_clusters_splitby(
-        usage_with_clusters, aa,  # pylint: disable=C0103
-        ylabel_template,
-        cluster_level_template="cluster_{aa}",
-        vertical=False, by_lvl="chrom",
-        **violin_kwargs):
-    """
-    Plot a series of violin plots for each cluster of genes,
-    splitting the figures according to groups defined by the
-    content of the index level *by_lvl* of *usage_with_clusters*.
-
-    The clusters are defined in usage table *usage_with_clusters*
-    based on usage biases for the codons coding for amino-acid *aa*.
-
-    *usage_with_clusters* should have clustering indications as
-    an index level named according to *cluster_level_template*
-    (default "cluster_{aa}"), where {aa} is to be replaced by *aa*.
-    """
-    classes = sorted(set(
-        usage_with_clusters.index.get_level_values(by_lvl)))
-    idx_level = usage_with_clusters.index.names.index(by_lvl)
-    for clss in classes:
-        fig = violin_usage_by_clusters(
-            usage_with_clusters.loc[
-                (*(slice(None) for _ in range(idx_level)), clss), ],
-            aa, ylabel_template + f" for {clss}",
-            cluster_level_template=cluster_level_template,
-            vertical=vertical, **violin_kwargs)
-        fig.suptitle(f"Violin plots for {clss}")
-
-
-# Based on
-# https://matplotlib.org/stable/gallery/statistics/customized_violin.html
-def adjacent_values(q0, q1, q3, q4):  # pylint: disable=C0103
-    """
-    Define whisker positions using the 1.5 * IQR criterion.
-
-    Use the values of quartiles *q0* (min), *q1*, *q3* and *q4* (max).
-    """
-    upper_adjacent_value = q3 + (q3 - q1) * 1.5
-    upper_adjacent_value = np.clip(upper_adjacent_value, q3, q4)
-    lower_adjacent_value = q1 - (q3 - q1) * 1.5
-    lower_adjacent_value = np.clip(lower_adjacent_value, q0, q1)
-    return (lower_adjacent_value, upper_adjacent_value)
-
-
-def violin_with_thresh(data, thresh, ylabel,
-                       facecolor="grey", axis=None, **violin_kwargs):
-    """
-    Draw a violin plot from *data* with a horizontal line at value *thresh*.
-
-    If *thresh* is a single value, the line will be red.
-    If *thresh* contains pairs, it will be assumed that these
-    are (threshold, colour) pairs, to be used to plot various lines.
-    Y-axis will be labeled using *ylabel*.
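-
-    Hypothetical usage sketch (added for illustration, not part of the
-    original module; *data* is assumed to be a pandas Series of usage
-    biases for a single codon)::
-
-        axis = violin_with_thresh(
-            data, [(1.5, "red"), (data.quantile(0.75), "blue")],
-            "standardized usage bias for GCT (A)")
-        plt.tight_layout()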
- """ - if axis is None: - _, axis = plt.subplots(figsize=(2, 3.5)) - kwargs = {"showextrema": False} - kwargs.update(violin_kwargs) - violin_parts = axis.violinplot(data, **kwargs) - # https://stackoverflow.com/a/26291582/1878788 - [body] = violin_parts["bodies"] - body.set_facecolor(facecolor) - body.set_edgecolor("black") - try: - (val, col) = thresh[0] - except TypeError: - thresh = [(thresh, "red")] - for (val, col) in thresh: - axis.axhline(val, color=col) - axis.text( - 1.01, val, f"{val:.3}", - horizontalalignment="left", verticalalignment="bottom") - (q0, q1, median, q3, q4) = np.percentile( # pylint: disable=C0103 - data, [0, 25, 50, 75, 100]) - (w_min, w_max) = adjacent_values(q0, q1, q3, q4) - axis.scatter(1, median, marker="o", s=30, color="white", zorder=3) - axis.vlines(1, q1, q3, linestyle="-", linewidth=5, color="black") - axis.vlines(1, w_min, w_max, linestyle="-", linewidth=1, color="black") - axis.set_xticks([]) - axis.set_ylabel(ylabel) - return axis - - -if __name__ == "__main__": - import doctest - doctest.testmod() -- GitLab