From ae4349e451fc942410bc699dc19b8923e9d22f3b Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 18 Mar 2022 23:03:10 +0100
Subject: [PATCH] Threshold finding function.

---
 libcodonusage/__init__.py      |  4 ++-
 libcodonusage/libcodonusage.py | 48 ++++++++++++++++++++++++++++++----
 2 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 45910b8..1d922c3 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,6 +1,6 @@
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.15"
+__version__ = "0.16"
 from .libcodonusage import (
     aa2colour,
     aa_usage,
@@ -10,6 +10,7 @@ from .libcodonusage import (
     columns_by_aa,
     detect_fishy_genes,
     exclude_all_nan_cols,
+    find_valley,
     gene_wide_codon_usage,
     load_bias_table,
     load_counts_table,
@@ -20,6 +21,7 @@ from .libcodonusage import (
     render_md,
     save_counts_table,
     sort_counts_by_aa,
+    star2stop,
     violin_usage,
     violin_usage_vertical,
     violin_usage_by_clusters,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 1889d81..f27b205 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -46,7 +46,9 @@ import pandas as pd
 # python3 -m pip install plydata
 # from plydata import define, pull, query
 # python3 -m pip install scipy
+from scipy.linalg import LinAlgError
 from scipy.spatial.distance import sqeuclidean as sqdist
+from scipy.stats import gaussian_kde
 # Python library with useful data-processing features
 # python3 -m pip install scikit-learn
 # https://scikit-learn.org/stable/install.html
@@ -729,8 +731,16 @@ def star2stop(text):
 def write_cluster_lists(
         usage_table, aa, clusters_dir,
         cluster_level_template,
-        y_label_template):
+        y_label_template,
+        alt_tag_kw="old_locus_tag"):
     """
+    Extract lists of genes in *usage_table** for the clusters
+    associated with codons for amino-acid *aa* and write the
+    resulting lists in a sub-folder (named based on *aa*)
+    of the directory *clusters_dir*.
+    The clusters are found in the column whose name conforms to
+    *cluster_level_template*
+    Also generate violin plots for each cluster.
     """
     md_report = f"* Clusters for {aa}:\n\n"
     aa_dir = clusters_dir.joinpath(star2stop(aa))
@@ -740,12 +750,14 @@ def write_cluster_lists(
     #        for genes belonging to this cluster
     relpaths_to_cluster_lists = {}
     for (cluster, gene_list) in groupby(itemgetter(0), zip(
-        usage_table.index.get_level_values(cluster_level_template.format(aa=aa)),
-        usage_table.index.get_level_values("old_locus_tag"))).items():
+            usage_table.index.get_level_values(
+                cluster_level_template.format(aa=aa)),
+            usage_table.index.get_level_values(alt_tag_kw))).items():
         path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt"))
         with path_to_clusterfile.open("w") as clust_fh:
             clust_fh.write("\n".join(map(itemgetter(1), gene_list)))
-        relpaths_to_cluster_lists[cluster] = str(path_to_clusterfile.relative_to('.'))
+        relpaths_to_cluster_lists[cluster] = str(
+            path_to_clusterfile.relative_to('.'))
     md_report += "\n\n".join([
         f"    - [{cluster}]({relpath_to_list})"
         for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()])
@@ -760,10 +772,36 @@ def write_cluster_lists(
     plt.savefig(path_to_fig, metadata={'creationDate': None})
     plt.close()
     relpath_to_fig = str(path_to_fig.relative_to('.'))
-    md_report += f"\n\n    - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n"
+    md_report += (
+        f"\n\n    - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n")
     return md_report
 
 
+def find_valley(values):
+    """
+    Try to find a cutting threshold for the *values*,
+    where the corresponding density distribution is lowest.
+    """
+    if len(set(values)) == 1:
+        (single_val,) = set(values)
+        return single_val
+    min_val = min(values)
+    max_val = max(values)
+    val_range = max_val - min_val
+    # We use 100 bins
+    try:
+        val_density = gaussian_kde(values).evaluate(
+            np.linspace(min_val, max_val, 100))
+    except LinAlgError as err:
+        print(f"min: {min_val}, max: {max_val}, N: {len(values)}.")
+        print(str(err))
+        raise
+    # Where is the density lowest? (excluding first and last)
+    min_density_idx = val_density[1:-1].argmin()
+    # Finding the value corresponding to the index in the `val_density` array:
+    return min_val + (min_density_idx * val_range / 100)
+
+
 def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):
     """
     Plot a boxplot from pandas DataFrame *usage_table*.
-- 
GitLab