From 4f5b9211c2a646c554a664a856c1a56c98a1ef3f Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Wed, 23 Mar 2022 14:13:25 +0100
Subject: [PATCH] Gene extraction and plotting functions.

---
 libcodonusage/__init__.py      |   5 +-
 libcodonusage/libcodonusage.py | 205 ++++++++++++++++++++++++++++++++-
 2 files changed, 206 insertions(+), 4 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 1d922c3..3fa340f 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,6 +1,6 @@
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.16"
+__version__ = "0.17"
 from .libcodonusage import (
     aa2colour,
     aa_usage,
@@ -10,6 +10,8 @@ from .libcodonusage import (
     columns_by_aa,
     detect_fishy_genes,
     exclude_all_nan_cols,
+    extract_top_genes_from_cluster,
+    find_most_biased_genes,
     find_valley,
     gene_wide_codon_usage,
     load_bias_table,
@@ -18,6 +20,7 @@ from .libcodonusage import (
     make_cluster_table,
     make_centroids_cluster_finder,
     make_counts_only,
+    plot_codon_usage_for_gene_list,
     render_md,
     save_counts_table,
     sort_counts_by_aa,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index f27b205..7721658 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -639,7 +639,9 @@ in the data.
     return centroids_scub_by_aa
 
 
-def make_centroids_cluster_finder(centroids_table, aa):
+def make_centroids_cluster_finder(
+        centroids_table,
+        aa):  # pylint: disable=C0103
     """
     Make a function that, when applied to a row in the standardized
     codon bias table, determines to what centroid among those
@@ -680,7 +682,7 @@ def make_cluster_table(scub_table, centroids_scub_table):
             (f"cluster_{aa}_full_bias", ""): scub_table.apply(
                 make_centroids_cluster_finder(centroids_scub_table, aa),
                 axis=1).values
-            for aa
+            for aa  # pylint: disable=C0103
             in unique(centroids_scub_table.columns.get_level_values(0))},
         index=scub_table.index)
 
@@ -729,7 +731,9 @@ def star2stop(text):
 
 
 def write_cluster_lists(
-        usage_table, aa, clusters_dir,
+        usage_table,
+        aa,  # pylint: disable=C0103
+        clusters_dir,
         cluster_level_template,
         y_label_template,
         alt_tag_kw="old_locus_tag"):
@@ -802,6 +806,201 @@ def find_valley(values):
     return min_val + (min_density_idx * val_range / 100)
 
 
+def find_most_biased_genes(data, do_retries=True, alt_tag_kw="old_locus_tag"):
+    """
+    Compute a usage bias threshold hopefully separating the most biased genes
+    from the remainder of the cluster where *data* contains codon usage biases
+    for the genes in the cluster and has an *alt_tag_kw* level in its index.
+    Return the threshold in terms of usage bias and the list of genes above it.
+
+    /!\ This is likely very sensitive to the shape
+    of the usage bias distribution.
+    """
+    thresh = find_valley(data.values)
+    # Try avoiding cases with threshold "misplaced" at the bottom
+    # of the distribution
+    retries = 0
+    while do_retries and thresh < 0:
+        retries += 1
+        thresh = find_valley(data.sort_values().iloc[retries:].values)
+    top_genes = data[(data > thresh)].index.get_level_values(alt_tag_kw).values
+    return (thresh, top_genes, retries)
+
+
+def extract_top_genes_from_cluster(
+        usage_table,
+        aa,  # pylint: disable=C0103
+        codon,
+        clusters_dir,
+        axis=None, savefig=True, alt_tag_kw="old_locus_tag"):
+    """
+    Return a few lines of markdown with links pointing to
+    where the saved results are expected to be found.
+    """
+    md_report = f"    - Most biased genes for {codon} ({aa}):\n\n"
+    aa_dir = clusters_dir.joinpath(star2stop(aa))
+    # Files where to save the results
+    top_dir = aa_dir.joinpath("top_genes")
+    top_dir.mkdir(parents=True, exist_ok=True)
+    path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
+    path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
+    path_to_fig = top_dir.joinpath(star2stop(
+        f"usage_biases_violin_plot_{aa}_{codon}.png"))
+    # Load the list of genes belonging to the cluster
+    # associated with a high usage bias for *codon*
+    path_to_clusterfile = aa_dir.joinpath(star2stop(f"{aa}_{codon}.txt"))
+    with open(path_to_clusterfile) as gene_list_fh:
+        genes = [line.strip() for line in gene_list_fh]
+    # Extract the usage bias data for these genes and for this codon
+    idx_level = usage_table.index.names.index(alt_tag_kw)
+    try:
+        # data = usage_table.loc[genes][(aa, codon)]
+        data = usage_table.loc[
+            (*(slice(None) for _ in range(idx_level)), genes),
+            (aa, codon)]
+    except KeyError:
+        print(genes[0])
+        print(usage_table.index.names)
+        raise
+    q3 = data.quantile(0.75)  # pylint: disable=C0103
+    top25 = data[(data > q3)].index.get_level_values(
+        alt_tag_kw).values
+    (thresh, top_genes, retries) = find_most_biased_genes(
+        data, do_retries=True, alt_tag_kw=alt_tag_kw)
+    if retries != 0:
+        print(
+            f"Bottom-most {retries} values had to be discarded in order "
+            f"to avoid a negative threshold for the cluster {aa}_{codon}.")
+    top_genes_check = set(
+        usage_table[
+            (usage_table[(aa, codon)] > thresh)].index.get_level_values(
+                alt_tag_kw).values)
+    if set(top_genes) != top_genes_check:
+        assert set(top_genes).issubset(top_genes_check)
+        extra_genes_not_in_cluster = top_genes_check - set(top_genes)
+        print(
+            f"{len(extra_genes_not_in_cluster)} genes were found "
+            f"with a high bias for {codon} but not belonging "
+            f"to the cluster {aa}_{codon}.")
+    # For visual verification purpose,
+    # display where the threshold is situated on a violin plot
+    axis = violin_with_thresh(
+        data, [(thresh, "red"), (q3, "blue")],
+        f"standardized usage bias for {codon} ({aa})",
+        facecolor=aa2colour[aa], axis=axis)
+    # Save the figure and close it.
+    plt.tight_layout()
+    if savefig:
+        plt.savefig(path_to_fig, metadata={'creationDate': None})
+        plt.close()
+        # Save the list of the top genes
+        with open(path_to_top25, "w") as gene_list_fh:
+            gene_list_fh.write("\n".join(top25))
+        with open(path_to_top, "w") as gene_list_fh:
+            gene_list_fh.write("\n".join(top_genes))
+    relpath_to_top25 = str(path_to_top25.relative_to('.'))
+    relpath_to_top = str(path_to_top.relative_to('.'))
+    relpath_to_fig = str(path_to_fig.relative_to('.'))
+    md_report += "\n\n".join([
+        f"        * [top 25% genes]({relpath_to_top25})",
+        f"        * [top genes]({relpath_to_top})",
+        f"        * [violin plot with thresholds ({q3:.3}, "
+        f"{thresh:.3})]({relpath_to_fig})"])
+    return md_report
+
+
+# TODO:
+def plot_codon_usage_for_gene_list(
+        gene_list_path,
+        usage_table,
+        aa,  # pylint: disable=C0103
+        codon,
+        figs_dir=None,
+        axis=None, savefig=True, alt_tag_kw="old_locus_tag"):
+    """
+    Make a violin plot of codon usage biases for genes in file *gene_list_path*
+    using data in *usage_table*.
+    The plot will represent the distribution of usage bias
+    for codon *codon* of amino-acid *aa*.
+    The plot will be saved inside *figs_dir* if *savefig* is set to True.
+    Gene identifiers are expected to be *alt_tag_kw* identifiers.
+    """
+    md_report = f"    - Genes from {gene_list_path}:\n\n"
+    with open(gene_list_path) as gene_list_fh:
+        genes = [
+            line.strip() for line in gene_list_fh
+            if line[0] != "#"]
+    # Extract the usage bias data for these genes and for this codon
+    idx_level = usage_table.index.names.index(alt_tag_kw)
+    try:
+        # data = usage_table.loc[genes][(aa, codon)]
+        data = usage_table.loc[
+            (*(slice(None) for _ in range(idx_level)), genes),
+            (aa, codon)]
+    except KeyError:
+        print(
+            f"Genes are supposed to be provided as {alt_tag_kw} identifiers.")
+        print(genes[0])
+        print(usage_table.index.names)
+        raise
+    q3 = data.quantile(0.75)  # pylint: disable=C0103
+    # top25 = data[(data > q3)].index.get_level_values(
+    #     alt_tag_kw).values
+    (thresh, top_genes, retries) = find_most_biased_genes(
+        data, do_retries=True, alt_tag_kw=alt_tag_kw)
+    if retries != 0:
+        print(
+            f"Bottom-most {retries} values (out of {len(data)}) "
+            "had to be discarded in order to avoid a negative threshold "
+            f"for the cluster {aa}_{codon}.")
+    top_genes_check = set(
+        usage_table[
+            (usage_table[(aa, codon)] > thresh)].index.get_level_values(
+                alt_tag_kw).values)
+    if set(top_genes) != top_genes_check:
+        assert set(top_genes).issubset(top_genes_check)
+        extra_genes_not_in_cluster = top_genes_check - set(top_genes)
+        print(
+            f"{len(extra_genes_not_in_cluster)} genes were found "
+            f"with a high bias for {codon} but not belonging to "
+            f"the cluster {aa}_{codon}.")
+    # For visual verification purpose,
+    # display where the threshold is situated on a violin plot
+    axis = violin_with_thresh(
+        data, [(thresh, "red"), (q3, "blue")],
+        f"standardized usage bias for {codon} ({aa})",
+        facecolor=aa2colour[aa], axis=axis)
+    # Save the figure and close it.
+    plt.tight_layout()
+    if not savefig:
+        return md_report
+    if figs_dir is None:
+        raise ValueError("No *figs_dir* directory specified.\n")
+    aa_dir = figs_dir.joinpath(star2stop(aa))
+    # Files where to save the results
+    aa_dir.mkdir(parents=True, exist_ok=True)
+    path_to_fig = aa_dir.joinpath(star2stop(
+        f"usage_biases_violin_plot_{aa}_{codon}.png"))
+    plt.savefig(path_to_fig, metadata={'creationDate': None})
+    plt.close()
+    # Save the list of the top genes
+    # path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
+    # path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
+    # with open(path_to_top25, "w") as gene_list_fh:
+    #     gene_list_fh.write("\n".join(top25))
+    # with open(path_to_top, "w") as gene_list_fh:
+    #     gene_list_fh.write("\n".join(top_genes))
+    # relpath_to_top25 = str(path_to_top25.relative_to('.'))
+    # relpath_to_top = str(path_to_top.relative_to('.'))
+    relpath_to_fig = str(path_to_fig.relative_to('.'))
+    md_report += "\n\n".join([
+        # f"        * [top 25% genes]({relpath_to_top25})",
+        # f"        * [top genes]({relpath_to_top})",
+        f"        * [violin plot with thresholds ({q3:.3}, "
+        f"{thresh:.3})]({relpath_to_fig})"])
+    return md_report
+
+
 def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):
     """
     Plot a boxplot from pandas DataFrame *usage_table*.
-- 
GitLab