PCA, filtering by chromosome.

b37e7ce5 · Blaise Li · efd54b2d · b37e7ce5 · b37e7ce5
Commit b37e7ce5 authored 3 years ago by Blaise Li
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.22"
+__version__ = "0.23"
 from .libcodonusage import (
    aa2colour,
    aa_usage,
    by_aa_codon_usage,
    centroid_usage,
    codon2aa,
+    codon_usage_pca,
    columns_by_aa,
    compare_clusterings,
    detect_fishy_genes,
    exclude_all_nan_cols,
    extract_top_genes_from_cluster,
+    filter_on_idx_levels,
    find_most_biased_genes,
    find_valley,
+    format_codon_labels,
    gene_wide_codon_usage,
    load_bias_table,
    load_counts_table,

--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -36,7 +36,7 @@ import seaborn as sns
 # python3 -m pip install biotite
 # See https://www.biotite-python.org/install.html
 import biotite.sequence.graphics as bgraphs
-from biotite.sequence import CodonTable
+from biotite.sequence import CodonTable, NucleotideSequence
 # python3 -m pip install numpy
 import numpy as np
 # Python module to handle tabular data
@@ -53,6 +53,7 @@ from scipy.stats import gaussian_kde
 # Python library with useful data-processing features
 # python3 -m pip install scikit-learn
 # https://scikit-learn.org/stable/install.html
+from sklearn.decomposition import PCA
 from sklearn.preprocessing import normalize
 # Python module to vizualize set intersections
 # python3 -m pip install upsetplot
@@ -93,6 +94,12 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath(
 # key: amino-acid
 # value: hexadecimal html colour code
 aa2colour = {**colscheme["colors"], "*": '#000000'}
+######################################
+# Associating colours to nucleotides #
+######################################
+# Alphabet of the unambiguous nucleotide symbols, as provided by biotite
+nuc_alphabet = NucleotideSequence.alphabet_unamb
+# Colours of biotite's "rainbow" scheme for this alphabet
+nuc_colours = bgraphs.get_color_scheme("rainbow", nuc_alphabet)
+# key: symbol of the nucleotide alphabet
+# value: colour paired with it in the "rainbow" scheme
+# NOTE(review): this assumes the colours are returned in the same order
+# as the symbols when iterating over the alphabet -- to be confirmed.
+nuc2colour = dict(zip(nuc_alphabet, nuc_colours))
 def load_counts_table(
@@ -345,6 +352,20 @@ def save_counts_table(counts_table, table_path):
    render_md(f"The table was saved at [{table_path}]({table_path}).")
+def filter_on_idx_levels(counts_table, filter_dict):
+    """
+    Filter a table *counts_table* based on values of certain index levels.
+
+    *filter_dict* should contain index level names as keys and values that
+    rows should have at this level to be included in the output
+    filtered table. A row is kept only if it satisfies all the
+    (level, value) pairs.
+
+    If *filter_dict* is empty, there is no condition to satisfy
+    and all the rows are kept.
+    """
+    # Start from an all-True mask, with one element per row.
+    # This way, an empty *filter_dict* still gives a valid 1D boolean
+    # mask (np.all on an empty list would give a scalar, which
+    # cannot be used to select rows).
+    row_filter = np.ones(len(counts_table), dtype=bool)
+    for (idx_lvl, idx_val) in filter_dict.items():
+        level_matches = np.asarray(
+            counts_table.index.get_level_values(idx_lvl) == idx_val)
+        row_filter = row_filter & level_matches
+    return counts_table[row_filter]
 # Codon usage calculations can be done in various ways.
 # Some examples are given at page 6500 (2 of the pdf) of
 # [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032)
@@ -352,11 +373,19 @@ SUZUKI_DOI = "10.1016/j.febslet.2005.10.032"
 SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})"
-def gene_wide_codon_usage(codon_counts, verbose=False, return_more=False):
+def gene_wide_codon_usage(
+        codon_counts,
+        verbose=False, return_more=False, ref_filter_dict=None):
    """
    Compute codon usage biases "gene-wide" as the standardized
    difference between a gene's codon proportions and global
-    codon proportions.
+    codon proportions (default).
+    If *ref_filter_dict* is not None, it should be a dict of
+    (index_level, index_value) pairs and the global codon proportions
+    will actually be proportions computed on the part of the data
+    restricted to the genes where the *index_level* has the *index_value*
+    for all those pairs.
    """
    render_md(f"""
 We will compute codon usage "gene-wide" (i.e. not by amino-acid),
@@ -381,12 +410,17 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
    # Due to imprecision in float arithmetics,
    # we can only check that the sums are close to 1
    assert np.allclose(colsums, np.full(len(colsums), 1))
+    if ref_filter_dict is None:
+        counts_for_global = codon_counts
+    else:
+        counts_for_global = filter_on_idx_levels(
+            codon_counts, ref_filter_dict)
    render_md("""
 To compute the usage biases, we also need to compute codon proportions
 in the global usage.
 """)
    render_md("We compute the global usage, as the sum on columns.")
-    global_usage = codon_counts.sum(axis=0)
+    global_usage = counts_for_global.sum(axis=0)
    render_md("Then we normalize it the same way as for the individual genes.")
    # The values.reshape(1, -1) turns the Series data into a 2D array
    # with one line. flatten() turns the data back to 1D
@@ -423,12 +457,20 @@ across genes) so that they are more comparable between codons.
 # TODO: add option to output RSCU instead of / besides proportions
-def by_aa_codon_usage(codon_counts, verbose=False, return_more=False):
+def by_aa_codon_usage(
+        codon_counts,
+        verbose=False, return_more=False, ref_filter_dict=None):
    """
    Compute codon usage biases "by amino-acid" as the standardized
    difference between a gene's codon proportions and global
    codon proportions, where proportions are computed within
    groups of same-amino-acid-coding codons instead of gene-wide.
+    If *ref_filter_dict* is not None, it should be a dict of
+    (index_level, index_value) pairs and the global codon proportions
+    will actually be proportions computed on the part of the data
+    restricted to the genes where the *index_level* has the *index_value*
+    for all those pairs.
    """
    render_md(f"""
 We will compute codon usage "by amino-acid", by looking at the
@@ -464,6 +506,11 @@ the total for the corresponding amino-acid.
        np.logical_not(
            np.isclose(
                sums.values, all_zeroes)))
+    if ref_filter_dict is None:
+        counts_for_global = codon_counts
+    else:
+        counts_for_global = filter_on_idx_levels(
+            codon_counts, ref_filter_dict)
    render_md("""
 To compute the usage biases, we also need to compute codon proportions
 in the global usage.
@@ -472,7 +519,7 @@ in the global usage.
 We compute the global usage, as the sum of the counts for a given codon,
 across genes.
 """)
-    global_usage = codon_counts.sum(axis=0)
+    global_usage = counts_for_global.sum(axis=0)
    render_md("Then we sum over codons corresponding to the same amino-acid.")
    global_summed_by_aa = global_usage.groupby(level=0).sum()
    render_md("""
@@ -509,11 +556,19 @@ across genes) so that they are more comparable between codons.
    return standardized_codon_usage_biases
-def aa_usage(codon_counts, verbose=False, return_more=False):
+def aa_usage(
+        codon_counts,
+        verbose=False, return_more=False, ref_filter_dict=None):
    """
    Compute amino-acid usage biases as the standardized
    difference between a gene's amino-acid proportions
    and global amino-acid proportions.
+    If *ref_filter_dict* is not None, it should be a dict of
+    (index_level, index_value) pairs and the global amino-acid proportions
+    will actually be proportions computed on the part of the data
+    restricted to the genes where the *index_level* has the *index_value*
+    for all those pairs.
    """
    render_md("""
 We will compute amino-acid usage, by looking at the
@@ -541,17 +596,23 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
    colsums = aa_proportions.sum(axis=1)
    assert np.allclose(colsums, np.full(len(colsums), 1))
    # Then, computing the global amino-acid proportions
+    if ref_filter_dict is None:
+        counts_for_global = summed_by_aa
+    else:
+        counts_for_global = filter_on_idx_levels(
+            summed_by_aa, ref_filter_dict)
    render_md("""
 To compute the usage biases, we also need to compute amino-acid proportions
 in the global usage.
 """)
-    render_md("""
+#     render_md("""
-We compute the global usage, as the sum of the counts for a given codon,
+# We compute the global usage, as the sum of the counts for a given codon,
-across genes.
+# across genes.
-""")
+# """)
-    global_usage = codon_counts.sum(axis=0)
+#     global_usage = codon_counts.sum(axis=0)
    render_md("Then we sum over codons corresponding to the same amino-acid.")
-    global_summed_by_aa = global_usage.groupby(level=0).sum()
+    # global_summed_by_aa = global_usage.groupby(level=0).sum()
+    global_summed_by_aa = counts_for_global.sum()
    render_md("Then we normalize it the same way as for the individual genes.")
    # The values.reshape(1, -1) turns the Series data into a 2D array
    # with one line. flatten() turns the data back to 1D
@@ -606,6 +667,75 @@ methionine (M) and tryptophan (W).
        all_nan_cols)
+def codon_usage_pca(usage_data, figs_dir=None, hue="chrom"):
+    """
+    Run a Principal Component Analysis on *usage_data* and plot the results.
+
+    *usage_data* should be a DataFrame with one observation per line and
+    NaN-free numerical values in columns corresponding to codons.
+    The column headers are expected to follow the pattern <aa>_<codon>,
+    where <aa> is the single-letter code of an amino-acid and <codon>
+    is one of the 3-letter codons for this amino-acid, written in capital
+    letters in the DNA alphabet (A, T, G and C).
+
+    *hue* should be the name of one of the levels of the MultiIndex of
+    *usage_data*. It determines the colours of the observations in the
+    PCA plots. The default is to use a level named "chrom".
+
+    Two figures are displayed:
+    * the projections of the observations on the first four principal
+      components (0 vs. 1 and 2 vs. 3);
+    * the weight of each data column in each of the first four
+      principal components.
+    If *figs_dir* is not None, it is taken as the path to a directory
+    (created if necessary) where those figures are saved
+    as "PCA_projections.png" and "PCA_components.png".
+
+    Return a pair containing the fitted PCA object and a DataFrame of
+    the transformed data where the *hue* index level has been turned
+    into a column.
+    """
+    save_figs = figs_dir is not None
+    if save_figs:
+        figs_dir = Path(figs_dir)
+        figs_dir.mkdir(parents=True, exist_ok=True)
+    pca = PCA().fit(usage_data)
+    projections = pd.DataFrame(
+        pca.transform(usage_data),
+        index=usage_data.index)
+    # The hue level has to be a column to be usable by seaborn
+    transformed_data = projections.reset_index(level=hue)
+    render_md(
+        "Plotting genes on the first 4 components\n")
+    (fig, (left_axis, right_axis)) = plt.subplots(1, 2, figsize=(16, 8))
+    # One scatterplot per pair of principal components
+    for (scatter_axis, (x_comp, y_comp)) in (
+            (left_axis, (0, 1)),
+            (right_axis, (2, 3))):
+        sns.scatterplot(
+            data=transformed_data,
+            x=x_comp, y=y_comp, hue=hue, marker=".", ax=scatter_axis)
+    if save_figs:
+        plt.savefig(
+            figs_dir.joinpath("PCA_projections.png"),
+            metadata={'creationDate': None})
+    display(fig)
+    plt.close(fig)
+    render_md(
+        "Vizualizing the influence of codons in the first 4 components\n")
+    (fig, axes) = plt.subplots(4, 1, figsize=(16, 16))
+    codon_columns = usage_data.columns
+    # Column names are supposed to end with the 3-letters codon:
+    # the bar colour is the one associated with the last letter.
+    bar_colours = [nuc2colour[colname[-1]] for colname in codon_columns]
+    for (component, axis) in enumerate(axes):
+        weights = pd.Series(pca.components_[component], index=codon_columns)
+        weights.plot.bar(ax=axis, color=bar_colours)
+        axis.set_ylabel(f"weight in component {component}")
+    fig.subplots_adjust(hspace=.5)
+    if save_figs:
+        plt.savefig(
+            figs_dir.joinpath("PCA_components.png"),
+            metadata={'creationDate': None})
+    display(fig)
+    plt.close(fig)
+    return (pca, transformed_data)
 def centroid_usage(codon_counts, all_nan_cols):
    """
    Define "centroids" for gene clusters, one per codon in *codon_counts*.
@@ -1202,7 +1332,7 @@ def violin_usage(
        "hue": hue, "palette": palette, "dodge": dodge,
        "data": long_form, "ax": axis, "orient": "v", "scale": "count"}
    kwargs.update(violin_kwargs)
-    sns.violinplot(**kwargs)
+    axis = sns.violinplot(**kwargs)
    # sns.violinplot(x=variable, y=ylabel, order=variable2order(variable),
    #                hue=hue, palette=palette, dodge=dodge,
    #                data=long_form, ax=axis, orient="v", scale="count",
@@ -1246,7 +1376,7 @@ def violin_usage_vertical(
        "hue": hue, "palette": palette, "dodge": dodge,
        "data": long_form, "ax": axis, "orient": "h", "scale": "count"}
    kwargs.update(violin_kwargs)
-    sns.violinplot(**kwargs)
+    axis = sns.violinplot(**kwargs)
    # sns.violinplot(
    #     y=variable, x=ylabel, order=variable2order(variable),
    #     hue=hue, palette=palette, dodge=dodge,