Skip to content
Snippets Groups Projects
Commit 7b63a9e2 authored by Blaise Li's avatar Blaise Li
Browse files

Optional codon influence in PCA.

parent e6f55702
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = "0.25"
__version__ = "0.26"
from .libcodonusage import (
aa2colour,
aa_usage,
......
......@@ -674,10 +674,62 @@ methionine (M) and tryptophan (W).
all_nan_cols)
def codon_influence_in_components(
        components, colnames,
        figs_dir=None, formats=None):
    """
    Plot the influence of the columns in the first principal axes of a PCA.

    At most the first 4 principal axes are plotted, one bar plot per axis,
    stacked vertically in a single figure. If *components* has fewer than
    4 rows, only the available axes are plotted.

    Each column should correspond to a codon, and will be represented as a
    colour bar whose colour is based on the last letter of the codon.

    *components* should be a numpy array representing the principal
    axes of a PCA, such as the `components_` attribute of a fitted
    `sklearn.decomposition.PCA` object.

    *colnames* should be the names of the columns in the *components*
    array and are expected to match the following pattern:
    <aa>_<codon>, where <aa> is a single-letter code for an amino-acid,
    and <codon> is one of the 3-letter codons for this amino-acid,
    in capital letters among A, T, G and C (i.e. in the DNA alphabet).
    The last letter will be used to set the colour of a bar in the plots
    (it is looked up in `nuc2colour`).

    *figs_dir* should be a path to a directory (an object providing
    a `joinpath` method, such as a `pathlib.Path`) that will be used
    to save graphics representing the influence of each data column
    on the plotted principal components.

    *formats* should be a list of formats in which the figures should
    be saved, such as "svg" or "png". Each format is also used as a key
    in `fmt_metadata` to get the metadata passed to `savefig`.

    Nothing is saved unless both *figs_dir* and *formats* are provided.
    The figure is displayed, then closed.
    """
    render_md(
        "Vizualizing the influence of codons in the first 4 components\n")
    # Never ask for more axes than there are components,
    # otherwise indexing *components* below would fail.
    nb_plotted = min(4, len(components))
    # colname is supposed to end with the 3-letters codon.
    # The colours are the same for every component: compute them once.
    bar_colours = [nuc2colour[colname[-1]] for colname in colnames]
    # TODO: figure width could be adapted depending on the number of columns
    # squeeze=False: always get a 2D array of axes, even for a single row.
    (fig, axes) = plt.subplots(
        nb_plotted, 1, figsize=(16, 4 * nb_plotted), squeeze=False)
    try:
        for (component, axis) in enumerate(axes[:, 0]):
            pd.Series(components[component], index=colnames).plot.bar(
                ax=axis, color=bar_colours)
            axis.set_ylabel(f"weight in component {component}")
        fig.subplots_adjust(hspace=.5)
        if figs_dir is not None and formats is not None:
            for ext in formats:
                # Save this very figure, not whatever figure
                # happens to be the "current" one for pyplot.
                fig.savefig(
                    figs_dir.joinpath(f"PCA_components.{ext}"),
                    metadata=fmt_metadata[ext])
        display(fig)
    finally:
        # Release the figure even if plotting or saving failed.
        plt.close(fig)
def codon_usage_pca(
usage_data,
figs_dir=None, hue="chrom", exclude_cols=None,
formats=None):
formats=None, cols_are_codons=True):
"""
Perform Principal Component Analysis on *usage_data*.
......@@ -696,9 +748,12 @@ def codon_usage_pca(
If *figs_dir* is not None, this path to a directory will be used
to save graphics representing the projection of the observations
in the first four principal components (0 vs. 1 and 2 vs. 3)
as well as graphics representing the influence of each data column
in the first four principal components (0 vs. 1 and 2 vs. 3).
Unless *cols_are_codons* is set to False, there will also be
graphics representing the influence of each data column
on the first four principal components.
*formats* should be a list of formats in which the figures should
be saved, such as "svg" or "png".
......@@ -730,28 +785,10 @@ def codon_usage_pca(
metadata=fmt_metadata[ext])
display(fig)
plt.close(fig)
render_md(
"Vizualizing the influence of codons in the first 4 components\n")
(fig, axes) = plt.subplots(4, 1, figsize=(16, 16))
for (component, axis) in enumerate(axes):
pd.Series(
pca.components_[component],
index=usage_data.columns).plot.bar(
ax=axes[component],
# colname is supposed to end with the 3-letters codon
color=[
nuc2colour[colname[-1]]
for colname in usage_data.columns])
axis.set_ylabel(f"weight in component {component}")
# axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
fig.subplots_adjust(hspace=.5)
if figs_dir is not None and formats is not None:
for ext in formats:
plt.savefig(
figs_dir.joinpath(f"PCA_components.{ext}"),
metadata=fmt_metadata[ext])
display(fig)
plt.close(fig)
if cols_are_codons:
codon_influence_in_components(
pca.components_, usage_data.columns,
figs_dir=figs_dir, formats=formats)
return (pca, transformed_data)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment