Skip to content
Snippets Groups Projects
Commit 4f5b9211 authored by Blaise Li's avatar Blaise Li
Browse files

Gene extraction and plotting functions.

parent ae4349e4
No related branches found
No related tags found
No related merge requests found
__copyright__ = "Copyright (C) 2022 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = "0.16"
__version__ = "0.17"
from .libcodonusage import (
aa2colour,
aa_usage,
......@@ -10,6 +10,8 @@ from .libcodonusage import (
columns_by_aa,
detect_fishy_genes,
exclude_all_nan_cols,
extract_top_genes_from_cluster,
find_most_biased_genes,
find_valley,
gene_wide_codon_usage,
load_bias_table,
......@@ -18,6 +20,7 @@ from .libcodonusage import (
make_cluster_table,
make_centroids_cluster_finder,
make_counts_only,
plot_codon_usage_for_gene_list,
render_md,
save_counts_table,
sort_counts_by_aa,
......
......@@ -639,7 +639,9 @@ in the data.
return centroids_scub_by_aa
def make_centroids_cluster_finder(centroids_table, aa):
def make_centroids_cluster_finder(
centroids_table,
aa): # pylint: disable=C0103
"""
Make a function that, when applied to a row in the standardized
codon bias table, determines to what centroid among those
......@@ -680,7 +682,7 @@ def make_cluster_table(scub_table, centroids_scub_table):
(f"cluster_{aa}_full_bias", ""): scub_table.apply(
make_centroids_cluster_finder(centroids_scub_table, aa),
axis=1).values
for aa
for aa # pylint: disable=C0103
in unique(centroids_scub_table.columns.get_level_values(0))},
index=scub_table.index)
......@@ -729,7 +731,9 @@ def star2stop(text):
def write_cluster_lists(
usage_table, aa, clusters_dir,
usage_table,
aa, # pylint: disable=C0103
clusters_dir,
cluster_level_template,
y_label_template,
alt_tag_kw="old_locus_tag"):
......@@ -802,6 +806,201 @@ def find_valley(values):
return min_val + (min_density_idx * val_range / 100)
def find_most_biased_genes(data, do_retries=True, alt_tag_kw="old_locus_tag"):
"""
Compute a usage bias threshold hopefully separating the most biased genes
from the remainder of the cluster where *data* contains codon usage biases
for the genes in the cluster and has an *alt_tag_kw* level in its index.
Return the threshold in terms of usage bias and the list of genes above it.
/!\ This is likely very sensitive to the shape
of the usage bias distribution.
"""
thresh = find_valley(data.values)
# Try avoiding cases with threshold "misplaced" at the bottom
# of the distribution
retries = 0
while do_retries and thresh < 0:
retries += 1
thresh = find_valley(data.sort_values().iloc[retries:].values)
top_genes = data[(data > thresh)].index.get_level_values(alt_tag_kw).values
return (thresh, top_genes, retries)
def extract_top_genes_from_cluster(
usage_table,
aa, # pylint: disable=C0103
codon,
clusters_dir,
axis=None, savefig=True, alt_tag_kw="old_locus_tag"):
"""
Return a few lines of markdown with links pointing to
where the saved results are expected to be found.
"""
md_report = f" - Most biased genes for {codon} ({aa}):\n\n"
aa_dir = clusters_dir.joinpath(star2stop(aa))
# Files where to save the results
top_dir = aa_dir.joinpath("top_genes")
top_dir.mkdir(parents=True, exist_ok=True)
path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
path_to_fig = top_dir.joinpath(star2stop(
f"usage_biases_violin_plot_{aa}_{codon}.png"))
# Load the list of genes belonging to the cluster
# associated with a high usage bias for *codon*
path_to_clusterfile = aa_dir.joinpath(star2stop(f"{aa}_{codon}.txt"))
with open(path_to_clusterfile) as gene_list_fh:
genes = [line.strip() for line in gene_list_fh]
# Extract the usage bias data for these genes and for this codon
idx_level = usage_table.index.names.index(alt_tag_kw)
try:
# data = usage_table.loc[genes][(aa, codon)]
data = usage_table.loc[
(*(slice(None) for _ in range(idx_level)), genes),
(aa, codon)]
except KeyError:
print(genes[0])
print(usage_table.index.names)
raise
q3 = data.quantile(0.75) # pylint: disable=C0103
top25 = data[(data > q3)].index.get_level_values(
alt_tag_kw).values
(thresh, top_genes, retries) = find_most_biased_genes(
data, do_retries=True, alt_tag_kw=alt_tag_kw)
if retries != 0:
print(
f"Bottom-most {retries} values had to be discarded in order "
f"to avoid a negative threshold for the cluster {aa}_{codon}.")
top_genes_check = set(
usage_table[
(usage_table[(aa, codon)] > thresh)].index.get_level_values(
alt_tag_kw).values)
if set(top_genes) != top_genes_check:
assert set(top_genes).issubset(top_genes_check)
extra_genes_not_in_cluster = top_genes_check - set(top_genes)
print(
f"{len(extra_genes_not_in_cluster)} genes were found "
f"with a high bias for {codon} but not belonging "
f"to the cluster {aa}_{codon}.")
# For visual verification purpose,
# display where the threshold is situated on a violin plot
axis = violin_with_thresh(
data, [(thresh, "red"), (q3, "blue")],
f"standardized usage bias for {codon} ({aa})",
facecolor=aa2colour[aa], axis=axis)
# Save the figure and close it.
plt.tight_layout()
if savefig:
plt.savefig(path_to_fig, metadata={'creationDate': None})
plt.close()
# Save the list of the top genes
with open(path_to_top25, "w") as gene_list_fh:
gene_list_fh.write("\n".join(top25))
with open(path_to_top, "w") as gene_list_fh:
gene_list_fh.write("\n".join(top_genes))
relpath_to_top25 = str(path_to_top25.relative_to('.'))
relpath_to_top = str(path_to_top.relative_to('.'))
relpath_to_fig = str(path_to_fig.relative_to('.'))
md_report += "\n\n".join([
f" * [top 25% genes]({relpath_to_top25})",
f" * [top genes]({relpath_to_top})",
f" * [violin plot with thresholds ({q3:.3}, "
f"{thresh:.3})]({relpath_to_fig})"])
return md_report
# TODO:
def plot_codon_usage_for_gene_list(
gene_list_path,
usage_table,
aa, # pylint: disable=C0103
codon,
figs_dir=None,
axis=None, savefig=True, alt_tag_kw="old_locus_tag"):
"""
Make a violin plot of codon usage biases for genes in file *gene_list_path*
using data in *usage_table*.
The plot will represent the distribution of usage bias
for codon *codon* of amino-acid *aa*.
The plot will be saved inside *figs_dir* if *savefig* is set to True.
Gene identifiers are expected to be *alt_tag_kw* identifiers.
"""
md_report = f" - Genes from {gene_list_path}:\n\n"
with open(gene_list_path) as gene_list_fh:
genes = [
line.strip() for line in gene_list_fh
if line[0] != "#"]
# Extract the usage bias data for these genes and for this codon
idx_level = usage_table.index.names.index(alt_tag_kw)
try:
# data = usage_table.loc[genes][(aa, codon)]
data = usage_table.loc[
(*(slice(None) for _ in range(idx_level)), genes),
(aa, codon)]
except KeyError:
print(
f"Genes are supposed to be provided as {alt_tag_kw} identifiers.")
print(genes[0])
print(usage_table.index.names)
raise
q3 = data.quantile(0.75) # pylint: disable=C0103
# top25 = data[(data > q3)].index.get_level_values(
# alt_tag_kw).values
(thresh, top_genes, retries) = find_most_biased_genes(
data, do_retries=True, alt_tag_kw=alt_tag_kw)
if retries != 0:
print(
f"Bottom-most {retries} values (out of {len(data)}) "
"had to be discarded in order to avoid a negative threshold "
f"for the cluster {aa}_{codon}.")
top_genes_check = set(
usage_table[
(usage_table[(aa, codon)] > thresh)].index.get_level_values(
alt_tag_kw).values)
if set(top_genes) != top_genes_check:
assert set(top_genes).issubset(top_genes_check)
extra_genes_not_in_cluster = top_genes_check - set(top_genes)
print(
f"{len(extra_genes_not_in_cluster)} genes were found "
f"with a high bias for {codon} but not belonging to "
f"the cluster {aa}_{codon}.")
# For visual verification purpose,
# display where the threshold is situated on a violin plot
axis = violin_with_thresh(
data, [(thresh, "red"), (q3, "blue")],
f"standardized usage bias for {codon} ({aa})",
facecolor=aa2colour[aa], axis=axis)
# Save the figure and close it.
plt.tight_layout()
if not savefig:
return md_report
if figs_dir is None:
raise ValueError("No *figs_dir* directory specified.\n")
aa_dir = figs_dir.joinpath(star2stop(aa))
# Files where to save the results
aa_dir.mkdir(parents=True, exist_ok=True)
path_to_fig = aa_dir.joinpath(star2stop(
f"usage_biases_violin_plot_{aa}_{codon}.png"))
plt.savefig(path_to_fig, metadata={'creationDate': None})
plt.close()
# Save the list of the top genes
# path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
# path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
# with open(path_to_top25, "w") as gene_list_fh:
# gene_list_fh.write("\n".join(top25))
# with open(path_to_top, "w") as gene_list_fh:
# gene_list_fh.write("\n".join(top_genes))
# relpath_to_top25 = str(path_to_top25.relative_to('.'))
# relpath_to_top = str(path_to_top.relative_to('.'))
relpath_to_fig = str(path_to_fig.relative_to('.'))
md_report += "\n\n".join([
# f" * [top 25% genes]({relpath_to_top25})",
# f" * [top genes]({relpath_to_top})",
f" * [violin plot with thresholds ({q3:.3}, "
f"{thresh:.3})]({relpath_to_fig})"])
return md_report
def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):
"""
Plot a boxplot from pandas DataFrame *usage_table*.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment