From 4f5b9211c2a646c554a664a856c1a56c98a1ef3f Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Wed, 23 Mar 2022 14:13:25 +0100 Subject: [PATCH] Gene extraction and plotting functions. --- libcodonusage/__init__.py | 5 +- libcodonusage/libcodonusage.py | 205 ++++++++++++++++++++++++++++++++- 2 files changed, 206 insertions(+), 4 deletions(-) diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 1d922c3..3fa340f 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.16" +__version__ = "0.17" from .libcodonusage import ( aa2colour, aa_usage, @@ -10,6 +10,8 @@ from .libcodonusage import ( columns_by_aa, detect_fishy_genes, exclude_all_nan_cols, + extract_top_genes_from_cluster, + find_most_biased_genes, find_valley, gene_wide_codon_usage, load_bias_table, @@ -18,6 +20,7 @@ from .libcodonusage import ( make_cluster_table, make_centroids_cluster_finder, make_counts_only, + plot_codon_usage_for_gene_list, render_md, save_counts_table, sort_counts_by_aa, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index f27b205..7721658 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -639,7 +639,9 @@ in the data. return centroids_scub_by_aa -def make_centroids_cluster_finder(centroids_table, aa): +def make_centroids_cluster_finder( + centroids_table, + aa): # pylint: disable=C0103 """ Make a function that, when applied to a row in the standardized codon bias table, determines to what centroid among those @@ -680,7 +682,7 @@ def make_cluster_table(scub_table, centroids_scub_table): (f"cluster_{aa}_full_bias", ""): scub_table.apply( make_centroids_cluster_finder(centroids_scub_table, aa), axis=1).values - for aa + for aa # pylint: disable=C0103 in unique(centroids_scub_table.columns.get_level_values(0))}, index=scub_table.index) @@ -729,7 +731,9 @@ def star2stop(text): def write_cluster_lists( - usage_table, aa, clusters_dir, + usage_table, + aa, # pylint: disable=C0103 + clusters_dir, cluster_level_template, y_label_template, alt_tag_kw="old_locus_tag"): @@ -802,6 +806,201 @@ def find_valley(values): return min_val + (min_density_idx * val_range / 100) +def find_most_biased_genes(data, do_retries=True, alt_tag_kw="old_locus_tag"): + """ + Compute a usage bias threshold hopefully separating the most biased genes + from the remainder of the cluster where *data* contains codon usage biases + for the genes in the cluster and has an *alt_tag_kw* level in its index. + Return the threshold in terms of usage bias and the list of genes above it. + + /!\ This is likely very sensitive to the shape + of the usage bias distribution. + """ + thresh = find_valley(data.values) + # Try avoiding cases with threshold "misplaced" at the bottom + # of the distribution + retries = 0 + while do_retries and thresh < 0: + retries += 1 + thresh = find_valley(data.sort_values().iloc[retries:].values) + top_genes = data[(data > thresh)].index.get_level_values(alt_tag_kw).values + return (thresh, top_genes, retries) + + +def extract_top_genes_from_cluster( + usage_table, + aa, # pylint: disable=C0103 + codon, + clusters_dir, + axis=None, savefig=True, alt_tag_kw="old_locus_tag"): + """ + Return a few lines of markdown with links pointing to + where the saved results are expected to be found. + """ + md_report = f" - Most biased genes for {codon} ({aa}):\n\n" + aa_dir = clusters_dir.joinpath(star2stop(aa)) + # Files where to save the results + top_dir = aa_dir.joinpath("top_genes") + top_dir.mkdir(parents=True, exist_ok=True) + path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) + path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) + path_to_fig = top_dir.joinpath(star2stop( + f"usage_biases_violin_plot_{aa}_{codon}.png")) + # Load the list of genes belonging to the cluster + # associated with a high usage bias for *codon* + path_to_clusterfile = aa_dir.joinpath(star2stop(f"{aa}_{codon}.txt")) + with open(path_to_clusterfile) as gene_list_fh: + genes = [line.strip() for line in gene_list_fh] + # Extract the usage bias data for these genes and for this codon + idx_level = usage_table.index.names.index(alt_tag_kw) + try: + # data = usage_table.loc[genes][(aa, codon)] + data = usage_table.loc[ + (*(slice(None) for _ in range(idx_level)), genes), + (aa, codon)] + except KeyError: + print(genes[0]) + print(usage_table.index.names) + raise + q3 = data.quantile(0.75) # pylint: disable=C0103 + top25 = data[(data > q3)].index.get_level_values( + alt_tag_kw).values + (thresh, top_genes, retries) = find_most_biased_genes( + data, do_retries=True, alt_tag_kw=alt_tag_kw) + if retries != 0: + print( + f"Bottom-most {retries} values had to be discarded in order " + f"to avoid a negative threshold for the cluster {aa}_{codon}.") + top_genes_check = set( + usage_table[ + (usage_table[(aa, codon)] > thresh)].index.get_level_values( + alt_tag_kw).values) + if set(top_genes) != top_genes_check: + assert set(top_genes).issubset(top_genes_check) + extra_genes_not_in_cluster = top_genes_check - set(top_genes) + print( + f"{len(extra_genes_not_in_cluster)} genes were found " + f"with a high bias for {codon} but not belonging " + f"to the cluster {aa}_{codon}.") + # For visual verification purpose, + # display where the threshold is situated on a violin plot + axis = violin_with_thresh( + data, [(thresh, "red"), (q3, "blue")], + f"standardized usage bias for {codon} ({aa})", + facecolor=aa2colour[aa], axis=axis) + # Save the figure and close it. + plt.tight_layout() + if savefig: + plt.savefig(path_to_fig, metadata={'creationDate': None}) + plt.close() + # Save the list of the top genes + with open(path_to_top25, "w") as gene_list_fh: + gene_list_fh.write("\n".join(top25)) + with open(path_to_top, "w") as gene_list_fh: + gene_list_fh.write("\n".join(top_genes)) + relpath_to_top25 = str(path_to_top25.relative_to('.')) + relpath_to_top = str(path_to_top.relative_to('.')) + relpath_to_fig = str(path_to_fig.relative_to('.')) + md_report += "\n\n".join([ + f" * [top 25% genes]({relpath_to_top25})", + f" * [top genes]({relpath_to_top})", + f" * [violin plot with thresholds ({q3:.3}, " + f"{thresh:.3})]({relpath_to_fig})"]) + return md_report + + +# TODO: +def plot_codon_usage_for_gene_list( + gene_list_path, + usage_table, + aa, # pylint: disable=C0103 + codon, + figs_dir=None, + axis=None, savefig=True, alt_tag_kw="old_locus_tag"): + """ + Make a violin plot of codon usage biases for genes in file *gene_list_path* + using data in *usage_table*. + The plot will represent the distribution of usage bias + for codon *codon* of amino-acid *aa*. + The plot will be saved inside *figs_dir* if *savefig* is set to True. + Gene identifiers are expected to be *alt_tag_kw* identifiers. + """ + md_report = f" - Genes from {gene_list_path}:\n\n" + with open(gene_list_path) as gene_list_fh: + genes = [ + line.strip() for line in gene_list_fh + if line[0] != "#"] + # Extract the usage bias data for these genes and for this codon + idx_level = usage_table.index.names.index(alt_tag_kw) + try: + # data = usage_table.loc[genes][(aa, codon)] + data = usage_table.loc[ + (*(slice(None) for _ in range(idx_level)), genes), + (aa, codon)] + except KeyError: + print( + f"Genes are supposed to be provided as {alt_tag_kw} identifiers.") + print(genes[0]) + print(usage_table.index.names) + raise + q3 = data.quantile(0.75) # pylint: disable=C0103 + # top25 = data[(data > q3)].index.get_level_values( + # alt_tag_kw).values + (thresh, top_genes, retries) = find_most_biased_genes( + data, do_retries=True, alt_tag_kw=alt_tag_kw) + if retries != 0: + print( + f"Bottom-most {retries} values (out of {len(data)}) " + "had to be discarded in order to avoid a negative threshold " + f"for the cluster {aa}_{codon}.") + top_genes_check = set( + usage_table[ + (usage_table[(aa, codon)] > thresh)].index.get_level_values( + alt_tag_kw).values) + if set(top_genes) != top_genes_check: + assert set(top_genes).issubset(top_genes_check) + extra_genes_not_in_cluster = top_genes_check - set(top_genes) + print( + f"{len(extra_genes_not_in_cluster)} genes were found " + f"with a high bias for {codon} but not belonging to " + f"the cluster {aa}_{codon}.") + # For visual verification purpose, + # display where the threshold is situated on a violin plot + axis = violin_with_thresh( + data, [(thresh, "red"), (q3, "blue")], + f"standardized usage bias for {codon} ({aa})", + facecolor=aa2colour[aa], axis=axis) + # Save the figure and close it. + plt.tight_layout() + if not savefig: + return md_report + if figs_dir is None: + raise ValueError("No *figs_dir* directory specified.\n") + aa_dir = figs_dir.joinpath(star2stop(aa)) + # Files where to save the results + aa_dir.mkdir(parents=True, exist_ok=True) + path_to_fig = aa_dir.joinpath(star2stop( + f"usage_biases_violin_plot_{aa}_{codon}.png")) + plt.savefig(path_to_fig, metadata={'creationDate': None}) + plt.close() + # Save the list of the top genes + # path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) + # path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) + # with open(path_to_top25, "w") as gene_list_fh: + # gene_list_fh.write("\n".join(top25)) + # with open(path_to_top, "w") as gene_list_fh: + # gene_list_fh.write("\n".join(top_genes)) + # relpath_to_top25 = str(path_to_top25.relative_to('.')) + # relpath_to_top = str(path_to_top.relative_to('.')) + relpath_to_fig = str(path_to_fig.relative_to('.')) + md_report += "\n\n".join([ + # f" * [top 25% genes]({relpath_to_top25})", + # f" * [top genes]({relpath_to_top})", + f" * [violin plot with thresholds ({q3:.3}, " + f"{thresh:.3})]({relpath_to_fig})"]) + return md_report + + def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"): """ Plot a boxplot from pandas DataFrame *usage_table*. -- GitLab