Skip to content
Snippets Groups Projects
Commit b37e7ce5 authored by Blaise Li's avatar Blaise Li
Browse files

PCA, filtering by chromosome.

parent efd54b2d
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li" __copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3" __licence__ = "GNU GPLv3"
__version__ = "0.22" __version__ = "0.23"
from .libcodonusage import ( from .libcodonusage import (
aa2colour, aa2colour,
aa_usage, aa_usage,
by_aa_codon_usage, by_aa_codon_usage,
centroid_usage, centroid_usage,
codon2aa, codon2aa,
codon_usage_pca,
columns_by_aa, columns_by_aa,
compare_clusterings, compare_clusterings,
detect_fishy_genes, detect_fishy_genes,
exclude_all_nan_cols, exclude_all_nan_cols,
extract_top_genes_from_cluster, extract_top_genes_from_cluster,
filter_on_idx_levels,
find_most_biased_genes, find_most_biased_genes,
find_valley, find_valley,
format_codon_labels,
gene_wide_codon_usage, gene_wide_codon_usage,
load_bias_table, load_bias_table,
load_counts_table, load_counts_table,
......
...@@ -36,7 +36,7 @@ import seaborn as sns ...@@ -36,7 +36,7 @@ import seaborn as sns
# python3 -m pip install biotite # python3 -m pip install biotite
# See https://www.biotite-python.org/install.html # See https://www.biotite-python.org/install.html
import biotite.sequence.graphics as bgraphs import biotite.sequence.graphics as bgraphs
from biotite.sequence import CodonTable from biotite.sequence import CodonTable, NucleotideSequence
# python3 -m pip install numpy # python3 -m pip install numpy
import numpy as np import numpy as np
# Python module to handle tabular data # Python module to handle tabular data
...@@ -53,6 +53,7 @@ from scipy.stats import gaussian_kde ...@@ -53,6 +53,7 @@ from scipy.stats import gaussian_kde
# Python library with useful data-processing features # Python library with useful data-processing features
# python3 -m pip install scikit-learn # python3 -m pip install scikit-learn
# https://scikit-learn.org/stable/install.html # https://scikit-learn.org/stable/install.html
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize from sklearn.preprocessing import normalize
# Python module to vizualize set intersections # Python module to vizualize set intersections
# python3 -m pip install upsetplot # python3 -m pip install upsetplot
...@@ -93,6 +94,12 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath( ...@@ -93,6 +94,12 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath(
# key: amino-acid # key: amino-acid
# value: hexadecimal html colour code # value: hexadecimal html colour code
aa2colour = {**colscheme["colors"], "*": '#000000'} aa2colour = {**colscheme["colors"], "*": '#000000'}
######################################
# Associating colours to nucleotides #
######################################
nuc_alphabet = NucleotideSequence.alphabet_unamb
nuc_colours = bgraphs.get_color_scheme("rainbow", nuc_alphabet)
nuc2colour = dict(zip(nuc_alphabet, nuc_colours))
def load_counts_table( def load_counts_table(
...@@ -345,6 +352,20 @@ def save_counts_table(counts_table, table_path): ...@@ -345,6 +352,20 @@ def save_counts_table(counts_table, table_path):
render_md(f"The table was saved at [{table_path}]({table_path}).") render_md(f"The table was saved at [{table_path}]({table_path}).")
def filter_on_idx_levels(counts_table, filter_dict):
"""
Filter a table *counts_table* based on values of certain index levels.
*filter_dict* should contain index level names as keys and values that
rows should have at this level to be included in the output
filtered table.
"""
row_filter = np.all([
counts_table.index.get_level_values(idx_lvl) == idx_val
for (idx_lvl, idx_val) in filter_dict.items()], axis=0)
return counts_table[row_filter]
# Codon usage calculations can be done in various ways. # Codon usage calculations can be done in various ways.
# Some examples are given at page 6500 (2 of the pdf) of # Some examples are given at page 6500 (2 of the pdf) of
# [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032) # [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032)
...@@ -352,11 +373,19 @@ SUZUKI_DOI = "10.1016/j.febslet.2005.10.032" ...@@ -352,11 +373,19 @@ SUZUKI_DOI = "10.1016/j.febslet.2005.10.032"
SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})" SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})"
def gene_wide_codon_usage(codon_counts, verbose=False, return_more=False): def gene_wide_codon_usage(
codon_counts,
verbose=False, return_more=False, ref_filter_dict=None):
""" """
Compute codon usage biases "gene-wide" as the standardized Compute codon usage biases "gene-wide" as the standardized
difference between a gene's codon proportions and global difference between a gene's codon proportions and global
codon proportions. codon proportions (default).
If *ref_filter_dict* is not None, it should be a dict of
(index_level, index_value) pairs and the global codon proportions
will actually be proportions computed on the part of the data
restricted to the genes where the *index_level* has the *index_value*
for all those pairs.
""" """
render_md(f""" render_md(f"""
We will compute codon usage "gene-wide" (i.e. not by amino-acid), We will compute codon usage "gene-wide" (i.e. not by amino-acid),
...@@ -381,12 +410,17 @@ using the "l1" norm (which, for positive-only values amounts to the sum). ...@@ -381,12 +410,17 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
# Due to imprecision in float arithmetics, # Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1 # we can only check that the sums are close to 1
assert np.allclose(colsums, np.full(len(colsums), 1)) assert np.allclose(colsums, np.full(len(colsums), 1))
if ref_filter_dict is None:
counts_for_global = codon_counts
else:
counts_for_global = filter_on_idx_levels(
codon_counts, ref_filter_dict)
render_md(""" render_md("""
To compute the usage biases, we also need to compute codon proportions To compute the usage biases, we also need to compute codon proportions
in the global usage. in the global usage.
""") """)
render_md("We compute the global usage, as the sum on columns.") render_md("We compute the global usage, as the sum on columns.")
global_usage = codon_counts.sum(axis=0) global_usage = counts_for_global.sum(axis=0)
render_md("Then we normalize it the same way as for the individual genes.") render_md("Then we normalize it the same way as for the individual genes.")
# The values.reshape(1, -1) turns the Series data into a 2D array # The values.reshape(1, -1) turns the Series data into a 2D array
# with one line. flatten() turns the data back to 1D # with one line. flatten() turns the data back to 1D
...@@ -423,12 +457,20 @@ across genes) so that they are more comparable between codons. ...@@ -423,12 +457,20 @@ across genes) so that they are more comparable between codons.
# TODO: add option to output RSCU instead of / besides proportions # TODO: add option to output RSCU instead of / besides proportions
def by_aa_codon_usage(codon_counts, verbose=False, return_more=False): def by_aa_codon_usage(
codon_counts,
verbose=False, return_more=False, ref_filter_dict=None):
""" """
Compute codon usage biases "by amino-acid" as the standardized Compute codon usage biases "by amino-acid" as the standardized
difference between a gene's codon proportions and global difference between a gene's codon proportions and global
codon proportions, where proportions are computed within codon proportions, where proportions are computed within
groups of same-amino-acid-coding codons instead of gene-wide. groups of same-amino-acid-coding codons instead of gene-wide.
If *ref_filter_dict* is not None, it should be a dict of
(index_level, index_value) pairs and the global codon proportions
will actually be proportions computed on the part of the data
restricted to the genes where the *index_level* has the *index_value*
for all those pairs.
""" """
render_md(f""" render_md(f"""
We will compute codon usage "by amino-acid", by looking at the We will compute codon usage "by amino-acid", by looking at the
...@@ -464,6 +506,11 @@ the total for the corresponding amino-acid. ...@@ -464,6 +506,11 @@ the total for the corresponding amino-acid.
np.logical_not( np.logical_not(
np.isclose( np.isclose(
sums.values, all_zeroes))) sums.values, all_zeroes)))
if ref_filter_dict is None:
counts_for_global = codon_counts
else:
counts_for_global = filter_on_idx_levels(
codon_counts, ref_filter_dict)
render_md(""" render_md("""
To compute the usage biases, we also need to compute codon proportions To compute the usage biases, we also need to compute codon proportions
in the global usage. in the global usage.
...@@ -472,7 +519,7 @@ in the global usage. ...@@ -472,7 +519,7 @@ in the global usage.
We compute the global usage, as the sum of the counts for a given codon, We compute the global usage, as the sum of the counts for a given codon,
across genes. across genes.
""") """)
global_usage = codon_counts.sum(axis=0) global_usage = counts_for_global.sum(axis=0)
render_md("Then we sum over codons corresponding to the same amino-acid.") render_md("Then we sum over codons corresponding to the same amino-acid.")
global_summed_by_aa = global_usage.groupby(level=0).sum() global_summed_by_aa = global_usage.groupby(level=0).sum()
render_md(""" render_md("""
...@@ -509,11 +556,19 @@ across genes) so that they are more comparable between codons. ...@@ -509,11 +556,19 @@ across genes) so that they are more comparable between codons.
return standardized_codon_usage_biases return standardized_codon_usage_biases
def aa_usage(codon_counts, verbose=False, return_more=False): def aa_usage(
codon_counts,
verbose=False, return_more=False, ref_filter_dict=None):
""" """
Compute amino-acid usage biases as the standardized Compute amino-acid usage biases as the standardized
difference between a gene's amino-acid proportions difference between a gene's amino-acid proportions
and global amino-acid proportions. and global amino-acid proportions.
If *ref_filter_dict* is not None, it should be a dict of
(index_level, index_value) pairs and the global codon proportions
will actually be proportions computed on the part of the data
restricted to the genes where the *index_level* has the *index_value*
for all those pairs.
""" """
render_md(""" render_md("""
We will compute amino-acid usage, by looking at the We will compute amino-acid usage, by looking at the
...@@ -541,17 +596,23 @@ using the "l1" norm (which, for positive-only values amounts to the sum). ...@@ -541,17 +596,23 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
colsums = aa_proportions.sum(axis=1) colsums = aa_proportions.sum(axis=1)
assert np.allclose(colsums, np.full(len(colsums), 1)) assert np.allclose(colsums, np.full(len(colsums), 1))
# Then, computing the global amino-acid proportions # Then, computing the global amino-acid proportions
if ref_filter_dict is None:
counts_for_global = summed_by_aa
else:
counts_for_global = filter_on_idx_levels(
summed_by_aa, ref_filter_dict)
render_md(""" render_md("""
To compute the usage biases, we also need to compute amino-acid proportions To compute the usage biases, we also need to compute amino-acid proportions
in the global usage. in the global usage.
""") """)
render_md(""" # render_md("""
We compute the global usage, as the sum of the counts for a given codon, # We compute the global usage, as the sum of the counts for a given codon,
across genes. # across genes.
""") # """)
global_usage = codon_counts.sum(axis=0) # global_usage = codon_counts.sum(axis=0)
render_md("Then we sum over codons corresponding to the same amino-acid.") render_md("Then we sum over codons corresponding to the same amino-acid.")
global_summed_by_aa = global_usage.groupby(level=0).sum() # global_summed_by_aa = global_usage.groupby(level=0).sum()
global_summed_by_aa = counts_for_global.sum()
render_md("Then we normalize it the same way as for the individual genes.") render_md("Then we normalize it the same way as for the individual genes.")
# The values.reshape(1, -1) turns the Series data into a 2D array # The values.reshape(1, -1) turns the Series data into a 2D array
# with one line. flatten() turns the data back to 1D # with one line. flatten() turns the data back to 1D
...@@ -606,6 +667,75 @@ methionine (M) and tryptophan (W). ...@@ -606,6 +667,75 @@ methionine (M) and tryptophan (W).
all_nan_cols) all_nan_cols)
def codon_usage_pca(usage_data, figs_dir=None, hue="chrom"):
"""
Perform Principal Component Analysis on *usage_data*.
DataFrame *usage_data* is expected to contain observations as lines
and contain numerical values in columns, without NaNs, corresponding
to codons, where the columns headers are supposed to match the following
pattern: <aa>_<codon>, where <aa> is a single-letter code for
an amino-acid, and <codon> is one of the 3-letter codons for this
amino-acid, in capital letters among A, T, G and C
(i.e. in the DNA alphabet).
*hue* is expected to be one of the levels of the MultiIndex of
*usage_data*, and will be used to assign colours to the observations
in PCA plots. By default, colour is based on the content of a "chrom"
level in the index.
If *figs_dir* is not None, this path to a directory will be used
to save graphics representing the projection of the observations
in the first four principal components (0 vs. 1 and 2 vs. 3)
as well as graphics representing the influence of each data column
on the first four principal components.
"""
if figs_dir is not None:
figs_dir = Path(figs_dir)
figs_dir.mkdir(parents=True, exist_ok=True)
pca = PCA().fit(usage_data)
transformed_data = pd.DataFrame(
pca.transform(usage_data),
index=usage_data.index).reset_index(level=hue)
render_md(
"Plotting genes on the first 4 components\n")
(fig, axes) = plt.subplots(1, 2, figsize=(16, 8))
sns.scatterplot(
data=transformed_data,
x=0, y=1, hue=hue, marker=".", ax=axes[0])
sns.scatterplot(
data=transformed_data,
x=2, y=3, hue=hue, marker=".", ax=axes[1])
if figs_dir is not None:
plt.savefig(
figs_dir.joinpath("PCA_projections.png"),
metadata={'creationDate': None})
display(fig)
plt.close(fig)
render_md(
"Vizualizing the influence of codons in the first 4 components\n")
(fig, axes) = plt.subplots(4, 1, figsize=(16, 16))
for (component, axis) in enumerate(axes):
pd.Series(
pca.components_[component],
index=usage_data.columns).plot.bar(
ax=axes[component],
# colname is supposed to end with the 3-letters codon
color=[
nuc2colour[colname[-1]]
for colname in usage_data.columns])
axis.set_ylabel(f"weight in component {component}")
# axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
fig.subplots_adjust(hspace=.5)
if figs_dir is not None:
plt.savefig(
figs_dir.joinpath("PCA_components.png"),
metadata={'creationDate': None})
display(fig)
plt.close(fig)
return (pca, transformed_data)
def centroid_usage(codon_counts, all_nan_cols): def centroid_usage(codon_counts, all_nan_cols):
""" """
Define "centroids" for gene clusters, one per codon in *codon_counts*. Define "centroids" for gene clusters, one per codon in *codon_counts*.
...@@ -1202,7 +1332,7 @@ def violin_usage( ...@@ -1202,7 +1332,7 @@ def violin_usage(
"hue": hue, "palette": palette, "dodge": dodge, "hue": hue, "palette": palette, "dodge": dodge,
"data": long_form, "ax": axis, "orient": "v", "scale": "count"} "data": long_form, "ax": axis, "orient": "v", "scale": "count"}
kwargs.update(violin_kwargs) kwargs.update(violin_kwargs)
sns.violinplot(**kwargs) axis = sns.violinplot(**kwargs)
# sns.violinplot(x=variable, y=ylabel, order=variable2order(variable), # sns.violinplot(x=variable, y=ylabel, order=variable2order(variable),
# hue=hue, palette=palette, dodge=dodge, # hue=hue, palette=palette, dodge=dodge,
# data=long_form, ax=axis, orient="v", scale="count", # data=long_form, ax=axis, orient="v", scale="count",
...@@ -1246,7 +1376,7 @@ def violin_usage_vertical( ...@@ -1246,7 +1376,7 @@ def violin_usage_vertical(
"hue": hue, "palette": palette, "dodge": dodge, "hue": hue, "palette": palette, "dodge": dodge,
"data": long_form, "ax": axis, "orient": "h", "scale": "count"} "data": long_form, "ax": axis, "orient": "h", "scale": "count"}
kwargs.update(violin_kwargs) kwargs.update(violin_kwargs)
sns.violinplot(**kwargs) axis = sns.violinplot(**kwargs)
# sns.violinplot( # sns.violinplot(
# y=variable, x=ylabel, order=variable2order(variable), # y=variable, x=ylabel, order=variable2order(variable),
# hue=hue, palette=palette, dodge=dodge, # hue=hue, palette=palette, dodge=dodge,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment