diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 45910b8f337c1890f6ef98a6c1e752747cac8cf0..1d922c3163cd8e404de4badc64d39223e85d6cec 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.15" +__version__ = "0.16" from .libcodonusage import ( aa2colour, aa_usage, @@ -10,6 +10,7 @@ from .libcodonusage import ( columns_by_aa, detect_fishy_genes, exclude_all_nan_cols, + find_valley, gene_wide_codon_usage, load_bias_table, load_counts_table, @@ -20,6 +21,7 @@ from .libcodonusage import ( render_md, save_counts_table, sort_counts_by_aa, + star2stop, violin_usage, violin_usage_vertical, violin_usage_by_clusters, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 1889d810a7ebfff493446c0750afd5c9cb1969a5..f27b205415a13b2c467e76f7ebc33a55cfc8630f 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -46,7 +46,9 @@ import pandas as pd # python3 -m pip install plydata # from plydata import define, pull, query # python3 -m pip install scipy +from scipy.linalg import LinAlgError from scipy.spatial.distance import sqeuclidean as sqdist +from scipy.stats import gaussian_kde # Python library with useful data-processing features # python3 -m pip install scikit-learn # https://scikit-learn.org/stable/install.html @@ -729,8 +731,16 @@ def star2stop(text): def write_cluster_lists( usage_table, aa, clusters_dir, cluster_level_template, - y_label_template): + y_label_template, + alt_tag_kw="old_locus_tag"): """ + Extract lists of genes in *usage_table** for the clusters + associated with codons for amino-acid *aa* and write the + resulting lists in a sub-folder (named based on *aa*) + of the directory *clusters_dir*. + The clusters are found in the column whose name conforms to + *cluster_level_template* + Also generate violin plots for each cluster. """ md_report = f"* Clusters for {aa}:\n\n" aa_dir = clusters_dir.joinpath(star2stop(aa)) @@ -740,12 +750,14 @@ def write_cluster_lists( # for genes belonging to this cluster relpaths_to_cluster_lists = {} for (cluster, gene_list) in groupby(itemgetter(0), zip( - usage_table.index.get_level_values(cluster_level_template.format(aa=aa)), - usage_table.index.get_level_values("old_locus_tag"))).items(): + usage_table.index.get_level_values( + cluster_level_template.format(aa=aa)), + usage_table.index.get_level_values(alt_tag_kw))).items(): path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt")) with path_to_clusterfile.open("w") as clust_fh: clust_fh.write("\n".join(map(itemgetter(1), gene_list))) - relpaths_to_cluster_lists[cluster] = str(path_to_clusterfile.relative_to('.')) + relpaths_to_cluster_lists[cluster] = str( + path_to_clusterfile.relative_to('.')) md_report += "\n\n".join([ f" - [{cluster}]({relpath_to_list})" for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()]) @@ -760,10 +772,36 @@ def write_cluster_lists( plt.savefig(path_to_fig, metadata={'creationDate': None}) plt.close() relpath_to_fig = str(path_to_fig.relative_to('.')) - md_report += f"\n\n - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n" + md_report += ( + f"\n\n - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n") return md_report +def find_valley(values): + """ + Try to find a cutting threshold for the *values*, + where the corresponding density distribution is lowest. + """ + if len(set(values)) == 1: + (single_val,) = set(values) + return single_val + min_val = min(values) + max_val = max(values) + val_range = max_val - min_val + # We use 100 bins + try: + val_density = gaussian_kde(values).evaluate( + np.linspace(min_val, max_val, 100)) + except LinAlgError as err: + print(f"min: {min_val}, max: {max_val}, N: {len(values)}.") + print(str(err)) + raise + # Where is the density lowest? (excluding first and last) + min_density_idx = val_density[1:-1].argmin() + # Finding the value corresponding to the index in the `val_density` array: + return min_val + (min_density_idx * val_range / 100) + + def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"): """ Plot a boxplot from pandas DataFrame *usage_table*.