diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index c45672d8a14f37072d6d1dd6f5f7b99fc9b464f4..4241c507bd3127ebcfcaed45519d8030c91371e3 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,19 +1,22 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.22" +__version__ = "0.23" from .libcodonusage import ( aa2colour, aa_usage, by_aa_codon_usage, centroid_usage, codon2aa, + codon_usage_pca, columns_by_aa, compare_clusterings, detect_fishy_genes, exclude_all_nan_cols, extract_top_genes_from_cluster, + filter_on_idx_levels, find_most_biased_genes, find_valley, + format_codon_labels, gene_wide_codon_usage, load_bias_table, load_counts_table, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index d839d896102ec157857575c0095afcd0e95ff9a6..c675e92d77b3953861902607b2bf89ba658dd632 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -36,7 +36,7 @@ import seaborn as sns # python3 -m pip install biotite # See https://www.biotite-python.org/install.html import biotite.sequence.graphics as bgraphs -from biotite.sequence import CodonTable +from biotite.sequence import CodonTable, NucleotideSequence # python3 -m pip install numpy import numpy as np # Python module to handle tabular data @@ -53,6 +53,7 @@ from scipy.stats import gaussian_kde # Python library with useful data-processing features # python3 -m pip install scikit-learn # https://scikit-learn.org/stable/install.html +from sklearn.decomposition import PCA from sklearn.preprocessing import normalize # Python module to vizualize set intersections # python3 -m pip install upsetplot @@ -93,6 +94,12 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath( # key: amino-acid # value: hexadecimal html colour code aa2colour = {**colscheme["colors"], "*": '#000000'} +###################################### +# Associating colours to nucleotides # +###################################### +nuc_alphabet = NucleotideSequence.alphabet_unamb +nuc_colours = bgraphs.get_color_scheme("rainbow", nuc_alphabet) +nuc2colour = dict(zip(nuc_alphabet, nuc_colours)) def load_counts_table( @@ -345,6 +352,20 @@ def save_counts_table(counts_table, table_path): render_md(f"The table was saved at [{table_path}]({table_path}).") +def filter_on_idx_levels(counts_table, filter_dict): + """ + Filter a table *counts_table* based on values of certain index levels. + + *filter_dict* should contain index level names as keys and values that + rows should have at this level to be included in the output + filtered table. + """ + row_filter = np.all([ + counts_table.index.get_level_values(idx_lvl) == idx_val + for (idx_lvl, idx_val) in filter_dict.items()], axis=0) + return counts_table[row_filter] + + # Codon usage calculations can be done in various ways. # Some examples are given at page 6500 (2 of the pdf) of # [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032) @@ -352,11 +373,19 @@ SUZUKI_DOI = "10.1016/j.febslet.2005.10.032" SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})" -def gene_wide_codon_usage(codon_counts, verbose=False, return_more=False): +def gene_wide_codon_usage( + codon_counts, + verbose=False, return_more=False, ref_filter_dict=None): """ Compute codon usage biases "gene-wide" as the standardized difference between a gene's codon proportions and global - codon proportions. + codon proportions (default). + + If *ref_filter_dict* is not None, it should be a dict of + (index_level, index_value) pairs and the global codon proportions + will actually be proportions computed on the part of the data + restricted to the genes where the *index_level* has the *index_value* + for all those pairs. """ render_md(f""" We will compute codon usage "gene-wide" (i.e. not by amino-acid), @@ -381,12 +410,17 @@ using the "l1" norm (which, for positive-only values amounts to the sum). # Due to imprecision in float arithmetics, # we can only check that the sums are close to 1 assert np.allclose(colsums, np.full(len(colsums), 1)) + if ref_filter_dict is None: + counts_for_global = codon_counts + else: + counts_for_global = filter_on_idx_levels( + codon_counts, ref_filter_dict) render_md(""" To compute the usage biases, we also need to compute codon proportions in the global usage. """) render_md("We compute the global usage, as the sum on columns.") - global_usage = codon_counts.sum(axis=0) + global_usage = counts_for_global.sum(axis=0) render_md("Then we normalize it the same way as for the individual genes.") # The values.reshape(1, -1) turns the Series data into a 2D array # with one line. flatten() turns the data back to 1D @@ -423,12 +457,20 @@ across genes) so that they are more comparable between codons. # TODO: add option to output RSCU instead of / besides proportions -def by_aa_codon_usage(codon_counts, verbose=False, return_more=False): +def by_aa_codon_usage( + codon_counts, + verbose=False, return_more=False, ref_filter_dict=None): """ Compute codon usage biases "by amino-acid" as the standardized difference between a gene's codon proportions and global codon proportions, where proportions are computed within groups of same-amino-acid-coding codons instead of gene-wide. + + If *ref_filter_dict* is not None, it should be a dict of + (index_level, index_value) pairs and the global codon proportions + will actually be proportions computed on the part of the data + restricted to the genes where the *index_level* has the *index_value* + for all those pairs. """ render_md(f""" We will compute codon usage "by amino-acid", by looking at the @@ -464,6 +506,11 @@ the total for the corresponding amino-acid. np.logical_not( np.isclose( sums.values, all_zeroes))) + if ref_filter_dict is None: + counts_for_global = codon_counts + else: + counts_for_global = filter_on_idx_levels( + codon_counts, ref_filter_dict) render_md(""" To compute the usage biases, we also need to compute codon proportions in the global usage. @@ -472,7 +519,7 @@ in the global usage. We compute the global usage, as the sum of the counts for a given codon, across genes. """) - global_usage = codon_counts.sum(axis=0) + global_usage = counts_for_global.sum(axis=0) render_md("Then we sum over codons corresponding to the same amino-acid.") global_summed_by_aa = global_usage.groupby(level=0).sum() render_md(""" @@ -509,11 +556,19 @@ across genes) so that they are more comparable between codons. return standardized_codon_usage_biases -def aa_usage(codon_counts, verbose=False, return_more=False): +def aa_usage( + codon_counts, + verbose=False, return_more=False, ref_filter_dict=None): """ Compute amino-acid usage biases as the standardized difference between a gene's amino-acid proportions and global amino-acid proportions. + + If *ref_filter_dict* is not None, it should be a dict of + (index_level, index_value) pairs and the global codon proportions + will actually be proportions computed on the part of the data + restricted to the genes where the *index_level* has the *index_value* + for all those pairs. """ render_md(""" We will compute amino-acid usage, by looking at the @@ -541,17 +596,23 @@ using the "l1" norm (which, for positive-only values amounts to the sum). colsums = aa_proportions.sum(axis=1) assert np.allclose(colsums, np.full(len(colsums), 1)) # Then, computing the global amino-acid proportions + if ref_filter_dict is None: + counts_for_global = summed_by_aa + else: + counts_for_global = filter_on_idx_levels( + summed_by_aa, ref_filter_dict) render_md(""" To compute the usage biases, we also need to compute amino-acid proportions in the global usage. """) - render_md(""" -We compute the global usage, as the sum of the counts for a given codon, -across genes. -""") - global_usage = codon_counts.sum(axis=0) +# render_md(""" +# We compute the global usage, as the sum of the counts for a given codon, +# across genes. +# """) +# global_usage = codon_counts.sum(axis=0) render_md("Then we sum over codons corresponding to the same amino-acid.") - global_summed_by_aa = global_usage.groupby(level=0).sum() + # global_summed_by_aa = global_usage.groupby(level=0).sum() + global_summed_by_aa = counts_for_global.sum() render_md("Then we normalize it the same way as for the individual genes.") # The values.reshape(1, -1) turns the Series data into a 2D array # with one line. flatten() turns the data back to 1D @@ -606,6 +667,75 @@ methionine (M) and tryptophan (W). all_nan_cols) +def codon_usage_pca(usage_data, figs_dir=None, hue="chrom"): + """ + Perform Principal Component Analysis on *usage_data*. + + DataFrame *usage_data* is expected to contain observations as lines + and contain numerical values in columns, without NaNs, corresponding + to codons, where the columns headers are supposed to match the following + pattern: <aa>_<codon>, where <aa> is a single-letter code for + an amino-acid, and <codon> is one of the 3-letter codons for this + amino-acid, in capital letters among A, T, G and C + (i.e. in the DNA alphabet). + + *hue* is expected to be one of the levels of the MultiIndex of + *usage_data*, and will be used to assign colours to the observations + in PCA plots. By default, colour is based on the content of a "chrom" + level in the index. + + If *figs_dir* is not None, this path to a directory will be used + to save graphics representing the projection of the observations + in the first four principal components (0 vs. 1 and 2 vs. 3) + as well as graphics representing the influence of each data column + on the first four principal components. + """ + if figs_dir is not None: + figs_dir = Path(figs_dir) + figs_dir.mkdir(parents=True, exist_ok=True) + pca = PCA().fit(usage_data) + transformed_data = pd.DataFrame( + pca.transform(usage_data), + index=usage_data.index).reset_index(level=hue) + render_md( + "Plotting genes on the first 4 components\n") + (fig, axes) = plt.subplots(1, 2, figsize=(16, 8)) + sns.scatterplot( + data=transformed_data, + x=0, y=1, hue=hue, marker=".", ax=axes[0]) + sns.scatterplot( + data=transformed_data, + x=2, y=3, hue=hue, marker=".", ax=axes[1]) + if figs_dir is not None: + plt.savefig( + figs_dir.joinpath("PCA_projections.png"), + metadata={'creationDate': None}) + display(fig) + plt.close(fig) + render_md( + "Vizualizing the influence of codons in the first 4 components\n") + (fig, axes) = plt.subplots(4, 1, figsize=(16, 16)) + for (component, axis) in enumerate(axes): + pd.Series( + pca.components_[component], + index=usage_data.columns).plot.bar( + ax=axes[component], + # colname is supposed to end with the 3-letters codon + color=[ + nuc2colour[colname[-1]] + for colname in usage_data.columns]) + axis.set_ylabel(f"weight in component {component}") + # axis.set_xticklabels(axis.get_xticklabels(), rotation=90) + fig.subplots_adjust(hspace=.5) + if figs_dir is not None: + plt.savefig( + figs_dir.joinpath("PCA_components.png"), + metadata={'creationDate': None}) + display(fig) + plt.close(fig) + return (pca, transformed_data) + + def centroid_usage(codon_counts, all_nan_cols): """ Define "centroids" for gene clusters, one per codon in *codon_counts*. @@ -1202,7 +1332,7 @@ def violin_usage( "hue": hue, "palette": palette, "dodge": dodge, "data": long_form, "ax": axis, "orient": "v", "scale": "count"} kwargs.update(violin_kwargs) - sns.violinplot(**kwargs) + axis = sns.violinplot(**kwargs) # sns.violinplot(x=variable, y=ylabel, order=variable2order(variable), # hue=hue, palette=palette, dodge=dodge, # data=long_form, ax=axis, orient="v", scale="count", @@ -1246,7 +1376,7 @@ def violin_usage_vertical( "hue": hue, "palette": palette, "dodge": dodge, "data": long_form, "ax": axis, "orient": "h", "scale": "count"} kwargs.update(violin_kwargs) - sns.violinplot(**kwargs) + axis = sns.violinplot(**kwargs) # sns.violinplot( # y=variable, x=ylabel, order=variable2order(variable), # hue=hue, palette=palette, dodge=dodge,