Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
libcodonusage
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
libcodonusage
Commits
4f5b9211
Commit
4f5b9211
authored
3 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Gene extraction and plotting functions.
parent
ae4349e4
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
libcodonusage/__init__.py
+4
-1
4 additions, 1 deletion
libcodonusage/__init__.py
libcodonusage/libcodonusage.py
+202
-3
202 additions, 3 deletions
libcodonusage/libcodonusage.py
with
206 additions
and
4 deletions
libcodonusage/__init__.py
+
4
−
1
View file @
4f5b9211
__copyright__
=
"
Copyright (C) 2022 Blaise Li
"
__licence__
=
"
GNU GPLv3
"
__version__
=
"
0.1
6
"
__version__
=
"
0.1
7
"
from
.libcodonusage
import
(
aa2colour
,
aa_usage
,
...
...
@@ -10,6 +10,8 @@ from .libcodonusage import (
columns_by_aa
,
detect_fishy_genes
,
exclude_all_nan_cols
,
extract_top_genes_from_cluster
,
find_most_biased_genes
,
find_valley
,
gene_wide_codon_usage
,
load_bias_table
,
...
...
@@ -18,6 +20,7 @@ from .libcodonusage import (
make_cluster_table
,
make_centroids_cluster_finder
,
make_counts_only
,
plot_codon_usage_for_gene_list
,
render_md
,
save_counts_table
,
sort_counts_by_aa
,
...
...
This diff is collapsed.
Click to expand it.
libcodonusage/libcodonusage.py
+
202
−
3
View file @
4f5b9211
...
...
@@ -639,7 +639,9 @@ in the data.
return
centroids_scub_by_aa
def
make_centroids_cluster_finder
(
centroids_table
,
aa
):
def
make_centroids_cluster_finder
(
centroids_table
,
aa
):
# pylint: disable=C0103
"""
Make a function that, when applied to a row in the standardized
codon bias table, determines to what centroid among those
...
...
@@ -680,7 +682,7 @@ def make_cluster_table(scub_table, centroids_scub_table):
(
f
"
cluster_
{
aa
}
_full_bias
"
,
""
):
scub_table
.
apply
(
make_centroids_cluster_finder
(
centroids_scub_table
,
aa
),
axis
=
1
).
values
for
aa
for
aa
# pylint: disable=C0103
in
unique
(
centroids_scub_table
.
columns
.
get_level_values
(
0
))},
index
=
scub_table
.
index
)
...
...
@@ -729,7 +731,9 @@ def star2stop(text):
def
write_cluster_lists
(
usage_table
,
aa
,
clusters_dir
,
usage_table
,
aa
,
# pylint: disable=C0103
clusters_dir
,
cluster_level_template
,
y_label_template
,
alt_tag_kw
=
"
old_locus_tag
"
):
...
...
@@ -802,6 +806,201 @@ def find_valley(values):
return
min_val
+
(
min_density_idx
*
val_range
/
100
)
def
find_most_biased_genes
(
data
,
do_retries
=
True
,
alt_tag_kw
=
"
old_locus_tag
"
):
"""
Compute a usage bias threshold hopefully separating the most biased genes
from the remainder of the cluster where *data* contains codon usage biases
for the genes in the cluster and has an *alt_tag_kw* level in its index.
Return the threshold in terms of usage bias and the list of genes above it.
/!\ This is likely very sensitive to the shape
of the usage bias distribution.
"""
thresh
=
find_valley
(
data
.
values
)
# Try avoiding cases with threshold "misplaced" at the bottom
# of the distribution
retries
=
0
while
do_retries
and
thresh
<
0
:
retries
+=
1
thresh
=
find_valley
(
data
.
sort_values
().
iloc
[
retries
:].
values
)
top_genes
=
data
[(
data
>
thresh
)].
index
.
get_level_values
(
alt_tag_kw
).
values
return
(
thresh
,
top_genes
,
retries
)
def
extract_top_genes_from_cluster
(
usage_table
,
aa
,
# pylint: disable=C0103
codon
,
clusters_dir
,
axis
=
None
,
savefig
=
True
,
alt_tag_kw
=
"
old_locus_tag
"
):
"""
Return a few lines of markdown with links pointing to
where the saved results are expected to be found.
"""
md_report
=
f
"
- Most biased genes for
{
codon
}
(
{
aa
}
):
\n\n
"
aa_dir
=
clusters_dir
.
joinpath
(
star2stop
(
aa
))
# Files where to save the results
top_dir
=
aa_dir
.
joinpath
(
"
top_genes
"
)
top_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
path_to_top
=
top_dir
.
joinpath
(
star2stop
(
f
"
{
aa
}
_
{
codon
}
_top.txt
"
))
path_to_top25
=
top_dir
.
joinpath
(
star2stop
(
f
"
{
aa
}
_
{
codon
}
_top25.txt
"
))
path_to_fig
=
top_dir
.
joinpath
(
star2stop
(
f
"
usage_biases_violin_plot_
{
aa
}
_
{
codon
}
.png
"
))
# Load the list of genes belonging to the cluster
# associated with a high usage bias for *codon*
path_to_clusterfile
=
aa_dir
.
joinpath
(
star2stop
(
f
"
{
aa
}
_
{
codon
}
.txt
"
))
with
open
(
path_to_clusterfile
)
as
gene_list_fh
:
genes
=
[
line
.
strip
()
for
line
in
gene_list_fh
]
# Extract the usage bias data for these genes and for this codon
idx_level
=
usage_table
.
index
.
names
.
index
(
alt_tag_kw
)
try
:
# data = usage_table.loc[genes][(aa, codon)]
data
=
usage_table
.
loc
[
(
*
(
slice
(
None
)
for
_
in
range
(
idx_level
)),
genes
),
(
aa
,
codon
)]
except
KeyError
:
print
(
genes
[
0
])
print
(
usage_table
.
index
.
names
)
raise
q3
=
data
.
quantile
(
0.75
)
# pylint: disable=C0103
top25
=
data
[(
data
>
q3
)].
index
.
get_level_values
(
alt_tag_kw
).
values
(
thresh
,
top_genes
,
retries
)
=
find_most_biased_genes
(
data
,
do_retries
=
True
,
alt_tag_kw
=
alt_tag_kw
)
if
retries
!=
0
:
print
(
f
"
Bottom-most
{
retries
}
values had to be discarded in order
"
f
"
to avoid a negative threshold for the cluster
{
aa
}
_
{
codon
}
.
"
)
top_genes_check
=
set
(
usage_table
[
(
usage_table
[(
aa
,
codon
)]
>
thresh
)].
index
.
get_level_values
(
alt_tag_kw
).
values
)
if
set
(
top_genes
)
!=
top_genes_check
:
assert
set
(
top_genes
).
issubset
(
top_genes_check
)
extra_genes_not_in_cluster
=
top_genes_check
-
set
(
top_genes
)
print
(
f
"
{
len
(
extra_genes_not_in_cluster
)
}
genes were found
"
f
"
with a high bias for
{
codon
}
but not belonging
"
f
"
to the cluster
{
aa
}
_
{
codon
}
.
"
)
# For visual verification purpose,
# display where the threshold is situated on a violin plot
axis
=
violin_with_thresh
(
data
,
[(
thresh
,
"
red
"
),
(
q3
,
"
blue
"
)],
f
"
standardized usage bias for
{
codon
}
(
{
aa
}
)
"
,
facecolor
=
aa2colour
[
aa
],
axis
=
axis
)
# Save the figure and close it.
plt
.
tight_layout
()
if
savefig
:
plt
.
savefig
(
path_to_fig
,
metadata
=
{
'
creationDate
'
:
None
})
plt
.
close
()
# Save the list of the top genes
with
open
(
path_to_top25
,
"
w
"
)
as
gene_list_fh
:
gene_list_fh
.
write
(
"
\n
"
.
join
(
top25
))
with
open
(
path_to_top
,
"
w
"
)
as
gene_list_fh
:
gene_list_fh
.
write
(
"
\n
"
.
join
(
top_genes
))
relpath_to_top25
=
str
(
path_to_top25
.
relative_to
(
'
.
'
))
relpath_to_top
=
str
(
path_to_top
.
relative_to
(
'
.
'
))
relpath_to_fig
=
str
(
path_to_fig
.
relative_to
(
'
.
'
))
md_report
+=
"
\n\n
"
.
join
([
f
"
* [top 25% genes](
{
relpath_to_top25
}
)
"
,
f
"
* [top genes](
{
relpath_to_top
}
)
"
,
f
"
* [violin plot with thresholds (
{
q3
:
.
3
}
,
"
f
"
{
thresh
:
.
3
}
)](
{
relpath_to_fig
}
)
"
])
return
md_report
# TODO:
def
plot_codon_usage_for_gene_list
(
gene_list_path
,
usage_table
,
aa
,
# pylint: disable=C0103
codon
,
figs_dir
=
None
,
axis
=
None
,
savefig
=
True
,
alt_tag_kw
=
"
old_locus_tag
"
):
"""
Make a violin plot of codon usage biases for genes in file *gene_list_path*
using data in *usage_table*.
The plot will represent the distribution of usage bias
for codon *codon* of amino-acid *aa*.
The plot will be saved inside *figs_dir* if *savefig* is set to True.
Gene identifiers are expected to be *alt_tag_kw* identifiers.
"""
md_report
=
f
"
- Genes from
{
gene_list_path
}
:
\n\n
"
with
open
(
gene_list_path
)
as
gene_list_fh
:
genes
=
[
line
.
strip
()
for
line
in
gene_list_fh
if
line
[
0
]
!=
"
#
"
]
# Extract the usage bias data for these genes and for this codon
idx_level
=
usage_table
.
index
.
names
.
index
(
alt_tag_kw
)
try
:
# data = usage_table.loc[genes][(aa, codon)]
data
=
usage_table
.
loc
[
(
*
(
slice
(
None
)
for
_
in
range
(
idx_level
)),
genes
),
(
aa
,
codon
)]
except
KeyError
:
print
(
f
"
Genes are supposed to be provided as
{
alt_tag_kw
}
identifiers.
"
)
print
(
genes
[
0
])
print
(
usage_table
.
index
.
names
)
raise
q3
=
data
.
quantile
(
0.75
)
# pylint: disable=C0103
# top25 = data[(data > q3)].index.get_level_values(
# alt_tag_kw).values
(
thresh
,
top_genes
,
retries
)
=
find_most_biased_genes
(
data
,
do_retries
=
True
,
alt_tag_kw
=
alt_tag_kw
)
if
retries
!=
0
:
print
(
f
"
Bottom-most
{
retries
}
values (out of
{
len
(
data
)
}
)
"
"
had to be discarded in order to avoid a negative threshold
"
f
"
for the cluster
{
aa
}
_
{
codon
}
.
"
)
top_genes_check
=
set
(
usage_table
[
(
usage_table
[(
aa
,
codon
)]
>
thresh
)].
index
.
get_level_values
(
alt_tag_kw
).
values
)
if
set
(
top_genes
)
!=
top_genes_check
:
assert
set
(
top_genes
).
issubset
(
top_genes_check
)
extra_genes_not_in_cluster
=
top_genes_check
-
set
(
top_genes
)
print
(
f
"
{
len
(
extra_genes_not_in_cluster
)
}
genes were found
"
f
"
with a high bias for
{
codon
}
but not belonging to
"
f
"
the cluster
{
aa
}
_
{
codon
}
.
"
)
# For visual verification purpose,
# display where the threshold is situated on a violin plot
axis
=
violin_with_thresh
(
data
,
[(
thresh
,
"
red
"
),
(
q3
,
"
blue
"
)],
f
"
standardized usage bias for
{
codon
}
(
{
aa
}
)
"
,
facecolor
=
aa2colour
[
aa
],
axis
=
axis
)
# Save the figure and close it.
plt
.
tight_layout
()
if
not
savefig
:
return
md_report
if
figs_dir
is
None
:
raise
ValueError
(
"
No *figs_dir* directory specified.
\n
"
)
aa_dir
=
figs_dir
.
joinpath
(
star2stop
(
aa
))
# Files where to save the results
aa_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
path_to_fig
=
aa_dir
.
joinpath
(
star2stop
(
f
"
usage_biases_violin_plot_
{
aa
}
_
{
codon
}
.png
"
))
plt
.
savefig
(
path_to_fig
,
metadata
=
{
'
creationDate
'
:
None
})
plt
.
close
()
# Save the list of the top genes
# path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
# path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
# with open(path_to_top25, "w") as gene_list_fh:
# gene_list_fh.write("\n".join(top25))
# with open(path_to_top, "w") as gene_list_fh:
# gene_list_fh.write("\n".join(top_genes))
# relpath_to_top25 = str(path_to_top25.relative_to('.'))
# relpath_to_top = str(path_to_top.relative_to('.'))
relpath_to_fig
=
str
(
path_to_fig
.
relative_to
(
'
.
'
))
md_report
+=
"
\n\n
"
.
join
([
# f" * [top 25% genes]({relpath_to_top25})",
# f" * [top genes]({relpath_to_top})",
f
"
* [violin plot with thresholds (
{
q3
:
.
3
}
,
"
f
"
{
thresh
:
.
3
}
)](
{
relpath_to_fig
}
)
"
])
return
md_report
def
boxplot_usage
(
usage_table
,
ylabel
,
whiskers
=
"
1.5 IQR
"
):
"""
Plot a boxplot from pandas DataFrame *usage_table*.
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment