diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 4fcad87316df480991f3e8fabe88abfeb3522f6b..a79d9a56401f7f9bc3565091ebbac3f643411fc6 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.25" +__version__ = "0.26" from .libcodonusage import ( aa2colour, aa_usage, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 49d5c6b46ca620a30ac2b1c1910a384619614a6c..3bb2a5ede3ec557e0f3a201b40bdbbd0c6b09d6b 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -674,10 +674,62 @@ methionine (M) and tryptophan (W). all_nan_cols) +def codon_influence_in_components( + components, colnames, + figs_dir=None, formats=None): + """ + Plot the influence of the columns in the first 4 principal axes of a PCA. + + Each column should correspond to a codon, and will be represented as a + colour bar whose colour is based on the last letter of the codon. + + *components* should be a numpy array representing the principal + axes of a PCA, such as the `components_` attribute of a fitted + `sklearn.decomposition.PCA` object. + + *colnames* should be the names of the columns in the *components* + array and are expected to match the following pattern: + <aa>_<codon>, where <aa> is a single-letter code for an amino-acid, + and <codon> is one of the 3-letter codons for this amino-acid, + in capital letters among A, T, G and C (i.e. in the DNA alphabet). + The last letter will be used to set the colour of a bar in the plots. + + *figs_dir* should be a path to a directory that will be used + to save graphics representing the influence of each data column + on the first four principal components. + + *formats* should be a list of formats in which the figures should + be saved, such as "svg" or "png". 
+ """ + render_md( + "Vizualizing the influence of codons in the first 4 components\n") + # TODO: *figsize* could be adapted depending on the number of columns + (fig, axes) = plt.subplots(4, 1, figsize=(16, 16)) + for (component, axis) in enumerate(axes): + pd.Series( + components[component], + index=colnames).plot.bar( + ax=axes[component], + # colname is supposed to end with the 3-letter codon + color=[ + nuc2colour[colname[-1]] + for colname in colnames]) + axis.set_ylabel(f"weight in component {component}") + # axis.set_xticklabels(axis.get_xticklabels(), rotation=90) + fig.subplots_adjust(hspace=.5) + if figs_dir is not None and formats is not None: + for ext in formats: + plt.savefig( + figs_dir.joinpath(f"PCA_components.{ext}"), + metadata=fmt_metadata[ext]) + display(fig) + plt.close(fig) + + def codon_usage_pca( usage_data, figs_dir=None, hue="chrom", exclude_cols=None, - formats=None): + formats=None, cols_are_codons=True): """ Perform Principal Component Analysis on *usage_data*. @@ -696,9 +748,12 @@ def codon_usage_pca( If *figs_dir* is not None, this path to a directory will be used to save graphics representing the projection of the observations - in the first four principal components (0 vs. 1 and 2 vs. 3) - as well as graphics representing the influence of each data column + in the first four principal components (0 vs. 1 and 2 vs. 3). + + Unless *cols_are_codons* is set to False, there will also be + graphics representing the influence of each data column on the first four principal components. + *formats* should be a list of formats in which the figures should be saved, such as "svg" or "png". 
@@ -730,28 +785,10 @@ def codon_usage_pca( metadata=fmt_metadata[ext]) display(fig) plt.close(fig) - render_md( - "Vizualizing the influence of codons in the first 4 components\n") - (fig, axes) = plt.subplots(4, 1, figsize=(16, 16)) - for (component, axis) in enumerate(axes): - pd.Series( - pca.components_[component], - index=usage_data.columns).plot.bar( - ax=axes[component], - # colname is supposed to end with the 3-letters codon - color=[ - nuc2colour[colname[-1]] - for colname in usage_data.columns]) - axis.set_ylabel(f"weight in component {component}") - # axis.set_xticklabels(axis.get_xticklabels(), rotation=90) - fig.subplots_adjust(hspace=.5) - if figs_dir is not None and formats is not None: - for ext in formats: - plt.savefig( - figs_dir.joinpath(f"PCA_components.{ext}"), - metadata=fmt_metadata[ext]) - display(fig) - plt.close(fig) + if cols_are_codons: + codon_influence_in_components( + pca.components_, usage_data.columns, + figs_dir=figs_dir, formats=formats) return (pca, transformed_data)