Threshold finding function.

ae4349e4 · Blaise Li · 598c606d · ae4349e4 · ae4349e4
Commit ae4349e4 authored 3 years ago by Blaise Li
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = "0.15"
+__version__ = "0.16"
 from .libcodonusage import (
    aa2colour,
    aa_usage,
@@ -10,6 +10,7 @@ from .libcodonusage import (
    columns_by_aa,
    detect_fishy_genes,
    exclude_all_nan_cols,
+    find_valley,
    gene_wide_codon_usage,
    load_bias_table,
    load_counts_table,
@@ -20,6 +21,7 @@ from .libcodonusage import (
    render_md,
    save_counts_table,
    sort_counts_by_aa,
+    star2stop,
    violin_usage,
    violin_usage_vertical,
    violin_usage_by_clusters,

--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -46,7 +46,9 @@ import pandas as pd
 # python3 -m pip install plydata
 # from plydata import define, pull, query
 # python3 -m pip install scipy
+from scipy.linalg import LinAlgError
 from scipy.spatial.distance import sqeuclidean as sqdist
+from scipy.stats import gaussian_kde
 # Python library with useful data-processing features
 # python3 -m pip install scikit-learn
 # https://scikit-learn.org/stable/install.html
@@ -729,8 +731,16 @@ def star2stop(text):
 def write_cluster_lists(
        usage_table, aa, clusters_dir,
        cluster_level_template,
-        y_label_template):
+        y_label_template,
+        alt_tag_kw="old_locus_tag"):
    """
+    Extract lists of genes in *usage_table** for the clusters
+    associated with codons for amino-acid *aa* and write the
+    resulting lists in a sub-folder (named based on *aa*)
+    of the directory *clusters_dir*.
+    The clusters are found in the column whose name conforms to
+    *cluster_level_template*
+    Also generate violin plots for each cluster.
    """
    md_report = f"* Clusters for {aa}:\n\n"
    aa_dir = clusters_dir.joinpath(star2stop(aa))
@@ -740,12 +750,14 @@ def write_cluster_lists(
    #        for genes belonging to this cluster
    relpaths_to_cluster_lists = {}
    for (cluster, gene_list) in groupby(itemgetter(0), zip(
-        usage_table.index.get_level_values(cluster_level_template.format(aa=aa)),
-        usage_table.index.get_level_values("old_locus_tag"))).items():
+            usage_table.index.get_level_values(
+                cluster_level_template.format(aa=aa)),
+            usage_table.index.get_level_values(alt_tag_kw))).items():
        path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt"))
        with path_to_clusterfile.open("w") as clust_fh:
            clust_fh.write("\n".join(map(itemgetter(1), gene_list)))
-        relpaths_to_cluster_lists[cluster] = str(path_to_clusterfile.relative_to('.'))
+        relpaths_to_cluster_lists[cluster] = str(
+            path_to_clusterfile.relative_to('.'))
    md_report += "\n\n".join([
        f"    - [{cluster}]({relpath_to_list})"
        for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()])
@@ -760,10 +772,36 @@ def write_cluster_lists(
    plt.savefig(path_to_fig, metadata={'creationDate': None})
    plt.close()
    relpath_to_fig = str(path_to_fig.relative_to('.'))
-    md_report += f"\n\n    - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n"
+    md_report += (
+        f"\n\n    - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n")
    return md_report


+def find_valley(values):
+    """
+    Try to find a cutting threshold for the *values*,
+    where the corresponding density distribution is lowest.
+    """
+    if len(set(values)) == 1:
+        (single_val,) = set(values)
+        return single_val
+    min_val = min(values)
+    max_val = max(values)
+    val_range = max_val - min_val
+    # We use 100 bins
+    try:
+        val_density = gaussian_kde(values).evaluate(
+            np.linspace(min_val, max_val, 100))
+    except LinAlgError as err:
+        print(f"min: {min_val}, max: {max_val}, N: {len(values)}.")
+        print(str(err))
+        raise
+    # Where is the density lowest? (excluding first and last)
+    min_density_idx = val_density[1:-1].argmin()
+    # Finding the value corresponding to the index in the `val_density` array:
+    return min_val + (min_density_idx * val_range / 100)
+
+
 def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):
    """
    Plot a boxplot from pandas DataFrame *usage_table*.