Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
libcodonusage
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
libcodonusage
Commits
ae4349e4
Commit
ae4349e4
authored
3 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Threshold finding function.
parent
598c606d
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
libcodonusage/__init__.py
+3
-1
3 additions, 1 deletion
libcodonusage/__init__.py
libcodonusage/libcodonusage.py
+43
-5
43 additions, 5 deletions
libcodonusage/libcodonusage.py
with
46 additions
and
6 deletions
libcodonusage/__init__.py
+
3
−
1
View file @
ae4349e4
__copyright__
=
"
Copyright (C) 2022 Blaise Li
"
__licence__
=
"
GNU GPLv3
"
__version__
=
"
0.1
5
"
__version__
=
"
0.1
6
"
from
.libcodonusage
import
(
aa2colour
,
aa_usage
,
...
...
@@ -10,6 +10,7 @@ from .libcodonusage import (
columns_by_aa
,
detect_fishy_genes
,
exclude_all_nan_cols
,
find_valley
,
gene_wide_codon_usage
,
load_bias_table
,
load_counts_table
,
...
...
@@ -20,6 +21,7 @@ from .libcodonusage import (
render_md
,
save_counts_table
,
sort_counts_by_aa
,
star2stop
,
violin_usage
,
violin_usage_vertical
,
violin_usage_by_clusters
,
...
...
This diff is collapsed.
Click to expand it.
libcodonusage/libcodonusage.py
+
43
−
5
View file @
ae4349e4
...
...
@@ -46,7 +46,9 @@ import pandas as pd
# python3 -m pip install plydata
# from plydata import define, pull, query
# python3 -m pip install scipy
from
scipy.linalg
import
LinAlgError
from
scipy.spatial.distance
import
sqeuclidean
as
sqdist
from
scipy.stats
import
gaussian_kde
# Python library with useful data-processing features
# python3 -m pip install scikit-learn
# https://scikit-learn.org/stable/install.html
...
...
@@ -729,8 +731,16 @@ def star2stop(text):
def
write_cluster_lists
(
usage_table
,
aa
,
clusters_dir
,
cluster_level_template
,
y_label_template
):
y_label_template
,
alt_tag_kw
=
"
old_locus_tag
"
):
"""
Extract lists of genes in *usage_table** for the clusters
associated with codons for amino-acid *aa* and write the
resulting lists in a sub-folder (named based on *aa*)
of the directory *clusters_dir*.
The clusters are found in the column whose name conforms to
*cluster_level_template*
Also generate violin plots for each cluster.
"""
md_report
=
f
"
* Clusters for
{
aa
}
:
\n\n
"
aa_dir
=
clusters_dir
.
joinpath
(
star2stop
(
aa
))
...
...
@@ -740,12 +750,14 @@ def write_cluster_lists(
# for genes belonging to this cluster
relpaths_to_cluster_lists
=
{}
for
(
cluster
,
gene_list
)
in
groupby
(
itemgetter
(
0
),
zip
(
usage_table
.
index
.
get_level_values
(
cluster_level_template
.
format
(
aa
=
aa
)),
usage_table
.
index
.
get_level_values
(
"
old_locus_tag
"
))).
items
():
usage_table
.
index
.
get_level_values
(
cluster_level_template
.
format
(
aa
=
aa
)),
usage_table
.
index
.
get_level_values
(
alt_tag_kw
))).
items
():
path_to_clusterfile
=
aa_dir
.
joinpath
(
star2stop
(
f
"
{
cluster
}
.txt
"
))
with
path_to_clusterfile
.
open
(
"
w
"
)
as
clust_fh
:
clust_fh
.
write
(
"
\n
"
.
join
(
map
(
itemgetter
(
1
),
gene_list
)))
relpaths_to_cluster_lists
[
cluster
]
=
str
(
path_to_clusterfile
.
relative_to
(
'
.
'
))
relpaths_to_cluster_lists
[
cluster
]
=
str
(
path_to_clusterfile
.
relative_to
(
'
.
'
))
md_report
+=
"
\n\n
"
.
join
([
f
"
- [
{
cluster
}
](
{
relpath_to_list
}
)
"
for
(
cluster
,
relpath_to_list
)
in
relpaths_to_cluster_lists
.
items
()])
...
...
@@ -760,10 +772,36 @@ def write_cluster_lists(
plt
.
savefig
(
path_to_fig
,
metadata
=
{
'
creationDate
'
:
None
})
plt
.
close
()
relpath_to_fig
=
str
(
path_to_fig
.
relative_to
(
'
.
'
))
md_report
+=
f
"
\n\n
- [Violin plots for
{
aa
}
clusters](
{
relpath_to_fig
}
)
\n\n
"
md_report
+=
(
f
"
\n\n
- [Violin plots for
{
aa
}
clusters](
{
relpath_to_fig
}
)
\n\n
"
)
return
md_report
def
find_valley
(
values
):
"""
Try to find a cutting threshold for the *values*,
where the corresponding density distribution is lowest.
"""
if
len
(
set
(
values
))
==
1
:
(
single_val
,)
=
set
(
values
)
return
single_val
min_val
=
min
(
values
)
max_val
=
max
(
values
)
val_range
=
max_val
-
min_val
# We use 100 bins
try
:
val_density
=
gaussian_kde
(
values
).
evaluate
(
np
.
linspace
(
min_val
,
max_val
,
100
))
except
LinAlgError
as
err
:
print
(
f
"
min:
{
min_val
}
, max:
{
max_val
}
, N:
{
len
(
values
)
}
.
"
)
print
(
str
(
err
))
raise
# Where is the density lowest? (excluding first and last)
min_density_idx
=
val_density
[
1
:
-
1
].
argmin
()
# Finding the value corresponding to the index in the `val_density` array:
return
min_val
+
(
min_density_idx
*
val_range
/
100
)
def
boxplot_usage
(
usage_table
,
ylabel
,
whiskers
=
"
1.5 IQR
"
):
"""
Plot a boxplot from pandas DataFrame *usage_table*.
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment