Skip to content
Snippets Groups Projects
Commit ae4349e4 authored by Blaise Li's avatar Blaise Li
Browse files

Threshold finding function.

parent 598c606d
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = "0.15"
__version__ = "0.16"
from .libcodonusage import (
aa2colour,
aa_usage,
......@@ -10,6 +10,7 @@ from .libcodonusage import (
columns_by_aa,
detect_fishy_genes,
exclude_all_nan_cols,
find_valley,
gene_wide_codon_usage,
load_bias_table,
load_counts_table,
......@@ -20,6 +21,7 @@ from .libcodonusage import (
render_md,
save_counts_table,
sort_counts_by_aa,
star2stop,
violin_usage,
violin_usage_vertical,
violin_usage_by_clusters,
......
......@@ -46,7 +46,9 @@ import pandas as pd
# python3 -m pip install plydata
# from plydata import define, pull, query
# python3 -m pip install scipy
from scipy.linalg import LinAlgError
from scipy.spatial.distance import sqeuclidean as sqdist
from scipy.stats import gaussian_kde
# Python library with useful data-processing features
# python3 -m pip install scikit-learn
# https://scikit-learn.org/stable/install.html
......@@ -729,8 +731,16 @@ def star2stop(text):
def write_cluster_lists(
usage_table, aa, clusters_dir,
cluster_level_template,
y_label_template):
y_label_template,
alt_tag_kw="old_locus_tag"):
"""
Extract lists of genes in *usage_table** for the clusters
associated with codons for amino-acid *aa* and write the
resulting lists in a sub-folder (named based on *aa*)
of the directory *clusters_dir*.
The clusters are found in the column whose name conforms to
*cluster_level_template*
Also generate violin plots for each cluster.
"""
md_report = f"* Clusters for {aa}:\n\n"
aa_dir = clusters_dir.joinpath(star2stop(aa))
......@@ -740,12 +750,14 @@ def write_cluster_lists(
# for genes belonging to this cluster
relpaths_to_cluster_lists = {}
for (cluster, gene_list) in groupby(itemgetter(0), zip(
usage_table.index.get_level_values(cluster_level_template.format(aa=aa)),
usage_table.index.get_level_values("old_locus_tag"))).items():
usage_table.index.get_level_values(
cluster_level_template.format(aa=aa)),
usage_table.index.get_level_values(alt_tag_kw))).items():
path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt"))
with path_to_clusterfile.open("w") as clust_fh:
clust_fh.write("\n".join(map(itemgetter(1), gene_list)))
relpaths_to_cluster_lists[cluster] = str(path_to_clusterfile.relative_to('.'))
relpaths_to_cluster_lists[cluster] = str(
path_to_clusterfile.relative_to('.'))
md_report += "\n\n".join([
f" - [{cluster}]({relpath_to_list})"
for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()])
......@@ -760,10 +772,36 @@ def write_cluster_lists(
plt.savefig(path_to_fig, metadata={'creationDate': None})
plt.close()
relpath_to_fig = str(path_to_fig.relative_to('.'))
md_report += f"\n\n - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n"
md_report += (
f"\n\n - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n")
return md_report
def find_valley(values):
"""
Try to find a cutting threshold for the *values*,
where the corresponding density distribution is lowest.
"""
if len(set(values)) == 1:
(single_val,) = set(values)
return single_val
min_val = min(values)
max_val = max(values)
val_range = max_val - min_val
# We use 100 bins
try:
val_density = gaussian_kde(values).evaluate(
np.linspace(min_val, max_val, 100))
except LinAlgError as err:
print(f"min: {min_val}, max: {max_val}, N: {len(values)}.")
print(str(err))
raise
# Where is the density lowest? (excluding first and last)
min_density_idx = val_density[1:-1].argmin()
# Finding the value corresponding to the index in the `val_density` array:
return min_val + (min_density_idx * val_range / 100)
def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):
"""
Plot a boxplot from pandas DataFrame *usage_table*.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment