Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
libcodonusage
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container Registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
libcodonusage
Commits
b37e7ce5
Commit
b37e7ce5
authored
3 years ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
PCA, filtering by chromosome.
parent
efd54b2d
No related branches found
No related tags found
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
libcodonusage/__init__.py
+4
-1
4 additions, 1 deletion
libcodonusage/__init__.py
libcodonusage/libcodonusage.py
+145
-15
145 additions, 15 deletions
libcodonusage/libcodonusage.py
with
149 additions
and
16 deletions
libcodonusage/__init__.py
+
4
−
1
View file @
b37e7ce5
__copyright__
=
"
Copyright (C) 2022 Blaise Li
"
__copyright__
=
"
Copyright (C) 2022 Blaise Li
"
__licence__
=
"
GNU GPLv3
"
__licence__
=
"
GNU GPLv3
"
__version__
=
"
0.2
2
"
__version__
=
"
0.2
3
"
from
.libcodonusage
import
(
from
.libcodonusage
import
(
aa2colour
,
aa2colour
,
aa_usage
,
aa_usage
,
by_aa_codon_usage
,
by_aa_codon_usage
,
centroid_usage
,
centroid_usage
,
codon2aa
,
codon2aa
,
codon_usage_pca
,
columns_by_aa
,
columns_by_aa
,
compare_clusterings
,
compare_clusterings
,
detect_fishy_genes
,
detect_fishy_genes
,
exclude_all_nan_cols
,
exclude_all_nan_cols
,
extract_top_genes_from_cluster
,
extract_top_genes_from_cluster
,
filter_on_idx_levels
,
find_most_biased_genes
,
find_most_biased_genes
,
find_valley
,
find_valley
,
format_codon_labels
,
gene_wide_codon_usage
,
gene_wide_codon_usage
,
load_bias_table
,
load_bias_table
,
load_counts_table
,
load_counts_table
,
...
...
This diff is collapsed.
Click to expand it.
libcodonusage/libcodonusage.py
+
145
−
15
View file @
b37e7ce5
...
@@ -36,7 +36,7 @@ import seaborn as sns
...
@@ -36,7 +36,7 @@ import seaborn as sns
# python3 -m pip install biotite
# python3 -m pip install biotite
# See https://www.biotite-python.org/install.html
# See https://www.biotite-python.org/install.html
import
biotite.sequence.graphics
as
bgraphs
import
biotite.sequence.graphics
as
bgraphs
from
biotite.sequence
import
CodonTable
from
biotite.sequence
import
CodonTable
,
NucleotideSequence
# python3 -m pip install numpy
# python3 -m pip install numpy
import
numpy
as
np
import
numpy
as
np
# Python module to handle tabular data
# Python module to handle tabular data
...
@@ -53,6 +53,7 @@ from scipy.stats import gaussian_kde
...
@@ -53,6 +53,7 @@ from scipy.stats import gaussian_kde
# Python library with useful data-processing features
# Python library with useful data-processing features
# python3 -m pip install scikit-learn
# python3 -m pip install scikit-learn
# https://scikit-learn.org/stable/install.html
# https://scikit-learn.org/stable/install.html
from
sklearn.decomposition
import
PCA
from
sklearn.preprocessing
import
normalize
from
sklearn.preprocessing
import
normalize
# Python module to vizualize set intersections
# Python module to vizualize set intersections
# python3 -m pip install upsetplot
# python3 -m pip install upsetplot
...
@@ -93,6 +94,12 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath(
...
@@ -93,6 +94,12 @@ with Path(bgraphs.colorschemes._scheme_dir).joinpath(
# key: amino-acid
# key: amino-acid
# value: hexadecimal html colour code
# value: hexadecimal html colour code
aa2colour
=
{
**
colscheme
[
"
colors
"
],
"
*
"
:
'
#000000
'
}
aa2colour
=
{
**
colscheme
[
"
colors
"
],
"
*
"
:
'
#000000
'
}
######################################
# Associating colours to nucleotides #
######################################
nuc_alphabet
=
NucleotideSequence
.
alphabet_unamb
nuc_colours
=
bgraphs
.
get_color_scheme
(
"
rainbow
"
,
nuc_alphabet
)
nuc2colour
=
dict
(
zip
(
nuc_alphabet
,
nuc_colours
))
def
load_counts_table
(
def
load_counts_table
(
...
@@ -345,6 +352,20 @@ def save_counts_table(counts_table, table_path):
...
@@ -345,6 +352,20 @@ def save_counts_table(counts_table, table_path):
render_md
(
f
"
The table was saved at [
{
table_path
}
](
{
table_path
}
).
"
)
render_md
(
f
"
The table was saved at [
{
table_path
}
](
{
table_path
}
).
"
)
def
filter_on_idx_levels
(
counts_table
,
filter_dict
):
"""
Filter a table *counts_table* based on values of certain index levels.
*filter_dict* should contain index level names as keys and values that
rows should have at this level to be included in the output
filtered table.
"""
row_filter
=
np
.
all
([
counts_table
.
index
.
get_level_values
(
idx_lvl
)
==
idx_val
for
(
idx_lvl
,
idx_val
)
in
filter_dict
.
items
()],
axis
=
0
)
return
counts_table
[
row_filter
]
# Codon usage calculations can be done in various ways.
# Codon usage calculations can be done in various ways.
# Some examples are given at page 6500 (2 of the pdf) of
# Some examples are given at page 6500 (2 of the pdf) of
# [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032)
# [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032)
...
@@ -352,11 +373,19 @@ SUZUKI_DOI = "10.1016/j.febslet.2005.10.032"
...
@@ -352,11 +373,19 @@ SUZUKI_DOI = "10.1016/j.febslet.2005.10.032"
SUZUKI_LINK
=
f
"
[Suzuki et al (2005)](https://doi.org/
{
SUZUKI_DOI
}
)
"
SUZUKI_LINK
=
f
"
[Suzuki et al (2005)](https://doi.org/
{
SUZUKI_DOI
}
)
"
def
gene_wide_codon_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
):
def
gene_wide_codon_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
,
ref_filter_dict
=
None
):
"""
"""
Compute codon usage biases
"
gene-wide
"
as the standardized
Compute codon usage biases
"
gene-wide
"
as the standardized
difference between a gene
'
s codon proportions and global
difference between a gene
'
s codon proportions and global
codon proportions.
codon proportions (default).
If *ref_filter_dict* is not None, it should be a dict of
(index_level, index_value) pairs and the global codon proportions
will actually be proportions computed on the part of the data
restricted to the genes where the *index_level* has the *index_value*
for all those pairs.
"""
"""
render_md
(
f
"""
render_md
(
f
"""
We will compute codon usage
"
gene-wide
"
(i.e. not by amino-acid),
We will compute codon usage
"
gene-wide
"
(i.e. not by amino-acid),
...
@@ -381,12 +410,17 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
...
@@ -381,12 +410,17 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
# Due to imprecision in float arithmetics,
# Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1
# we can only check that the sums are close to 1
assert
np
.
allclose
(
colsums
,
np
.
full
(
len
(
colsums
),
1
))
assert
np
.
allclose
(
colsums
,
np
.
full
(
len
(
colsums
),
1
))
if
ref_filter_dict
is
None
:
counts_for_global
=
codon_counts
else
:
counts_for_global
=
filter_on_idx_levels
(
codon_counts
,
ref_filter_dict
)
render_md
(
"""
render_md
(
"""
To compute the usage biases, we also need to compute codon proportions
To compute the usage biases, we also need to compute codon proportions
in the global usage.
in the global usage.
"""
)
"""
)
render_md
(
"
We compute the global usage, as the sum on columns.
"
)
render_md
(
"
We compute the global usage, as the sum on columns.
"
)
global_usage
=
co
don_counts
.
sum
(
axis
=
0
)
global_usage
=
co
unts_for_global
.
sum
(
axis
=
0
)
render_md
(
"
Then we normalize it the same way as for the individual genes.
"
)
render_md
(
"
Then we normalize it the same way as for the individual genes.
"
)
# The values.reshape(1, -1) turns the Series data into a 2D array
# The values.reshape(1, -1) turns the Series data into a 2D array
# with one line. flatten() turns the data back to 1D
# with one line. flatten() turns the data back to 1D
...
@@ -423,12 +457,20 @@ across genes) so that they are more comparable between codons.
...
@@ -423,12 +457,20 @@ across genes) so that they are more comparable between codons.
# TODO: add option to output RSCU instead of / besides proportions
# TODO: add option to output RSCU instead of / besides proportions
def
by_aa_codon_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
):
def
by_aa_codon_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
,
ref_filter_dict
=
None
):
"""
"""
Compute codon usage biases
"
by amino-acid
"
as the standardized
Compute codon usage biases
"
by amino-acid
"
as the standardized
difference between a gene
'
s codon proportions and global
difference between a gene
'
s codon proportions and global
codon proportions, where proportions are computed within
codon proportions, where proportions are computed within
groups of same-amino-acid-coding codons instead of gene-wide.
groups of same-amino-acid-coding codons instead of gene-wide.
If *ref_filter_dict* is not None, it should be a dict of
(index_level, index_value) pairs and the global codon proportions
will actually be proportions computed on the part of the data
restricted to the genes where the *index_level* has the *index_value*
for all those pairs.
"""
"""
render_md
(
f
"""
render_md
(
f
"""
We will compute codon usage
"
by amino-acid
"
, by looking at the
We will compute codon usage
"
by amino-acid
"
, by looking at the
...
@@ -464,6 +506,11 @@ the total for the corresponding amino-acid.
...
@@ -464,6 +506,11 @@ the total for the corresponding amino-acid.
np
.
logical_not
(
np
.
logical_not
(
np
.
isclose
(
np
.
isclose
(
sums
.
values
,
all_zeroes
)))
sums
.
values
,
all_zeroes
)))
if
ref_filter_dict
is
None
:
counts_for_global
=
codon_counts
else
:
counts_for_global
=
filter_on_idx_levels
(
codon_counts
,
ref_filter_dict
)
render_md
(
"""
render_md
(
"""
To compute the usage biases, we also need to compute codon proportions
To compute the usage biases, we also need to compute codon proportions
in the global usage.
in the global usage.
...
@@ -472,7 +519,7 @@ in the global usage.
...
@@ -472,7 +519,7 @@ in the global usage.
We compute the global usage, as the sum of the counts for a given codon,
We compute the global usage, as the sum of the counts for a given codon,
across genes.
across genes.
"""
)
"""
)
global_usage
=
co
don_counts
.
sum
(
axis
=
0
)
global_usage
=
co
unts_for_global
.
sum
(
axis
=
0
)
render_md
(
"
Then we sum over codons corresponding to the same amino-acid.
"
)
render_md
(
"
Then we sum over codons corresponding to the same amino-acid.
"
)
global_summed_by_aa
=
global_usage
.
groupby
(
level
=
0
).
sum
()
global_summed_by_aa
=
global_usage
.
groupby
(
level
=
0
).
sum
()
render_md
(
"""
render_md
(
"""
...
@@ -509,11 +556,19 @@ across genes) so that they are more comparable between codons.
...
@@ -509,11 +556,19 @@ across genes) so that they are more comparable between codons.
return
standardized_codon_usage_biases
return
standardized_codon_usage_biases
def
aa_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
):
def
aa_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
,
ref_filter_dict
=
None
):
"""
"""
Compute amino-acid usage biases as the standardized
Compute amino-acid usage biases as the standardized
difference between a gene
'
s amino-acid proportions
difference between a gene
'
s amino-acid proportions
and global amino-acid proportions.
and global amino-acid proportions.
If *ref_filter_dict* is not None, it should be a dict of
(index_level, index_value) pairs and the global codon proportions
will actually be proportions computed on the part of the data
restricted to the genes where the *index_level* has the *index_value*
for all those pairs.
"""
"""
render_md
(
"""
render_md
(
"""
We will compute amino-acid usage, by looking at the
We will compute amino-acid usage, by looking at the
...
@@ -541,17 +596,23 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
...
@@ -541,17 +596,23 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
colsums
=
aa_proportions
.
sum
(
axis
=
1
)
colsums
=
aa_proportions
.
sum
(
axis
=
1
)
assert
np
.
allclose
(
colsums
,
np
.
full
(
len
(
colsums
),
1
))
assert
np
.
allclose
(
colsums
,
np
.
full
(
len
(
colsums
),
1
))
# Then, computing the global amino-acid proportions
# Then, computing the global amino-acid proportions
if
ref_filter_dict
is
None
:
counts_for_global
=
summed_by_aa
else
:
counts_for_global
=
filter_on_idx_levels
(
summed_by_aa
,
ref_filter_dict
)
render_md
(
"""
render_md
(
"""
To compute the usage biases, we also need to compute amino-acid proportions
To compute the usage biases, we also need to compute amino-acid proportions
in the global usage.
in the global usage.
"""
)
"""
)
render_md
(
"""
#
render_md("""
We compute the global usage, as the sum of the counts for a given codon,
#
We compute the global usage, as the sum of the counts for a given codon,
across genes.
#
across genes.
"""
)
#
""")
global_usage
=
codon_counts
.
sum
(
axis
=
0
)
#
global_usage = codon_counts.sum(axis=0)
render_md
(
"
Then we sum over codons corresponding to the same amino-acid.
"
)
render_md
(
"
Then we sum over codons corresponding to the same amino-acid.
"
)
global_summed_by_aa
=
global_usage
.
groupby
(
level
=
0
).
sum
()
# global_summed_by_aa = global_usage.groupby(level=0).sum()
global_summed_by_aa
=
counts_for_global
.
sum
()
render_md
(
"
Then we normalize it the same way as for the individual genes.
"
)
render_md
(
"
Then we normalize it the same way as for the individual genes.
"
)
# The values.reshape(1, -1) turns the Series data into a 2D array
# The values.reshape(1, -1) turns the Series data into a 2D array
# with one line. flatten() turns the data back to 1D
# with one line. flatten() turns the data back to 1D
...
@@ -606,6 +667,75 @@ methionine (M) and tryptophan (W).
...
@@ -606,6 +667,75 @@ methionine (M) and tryptophan (W).
all_nan_cols
)
all_nan_cols
)
def
codon_usage_pca
(
usage_data
,
figs_dir
=
None
,
hue
=
"
chrom
"
):
"""
Perform Principal Component Analysis on *usage_data*.
DataFrame *usage_data* is expected to contain observations as lines
and contain numerical values in columns, without NaNs, corresponding
to codons, where the columns headers are supposed to match the following
pattern: <aa>_<codon>, where <aa> is a single-letter code for
an amino-acid, and <codon> is one of the 3-letter codons for this
amino-acid, in capital letters among A, T, G and C
(i.e. in the DNA alphabet).
*hue* is expected to be one of the levels of the MultiIndex of
*usage_data*, and will be used to assign colours to the observations
in PCA plots. By default, colour is based on the content of a
"
chrom
"
level in the index.
If *figs_dir* is not None, this path to a directory will be used
to save graphics representing the projection of the observations
in the first four principal components (0 vs. 1 and 2 vs. 3)
as well as graphics representing the influence of each data column
on the first four principal components.
"""
if
figs_dir
is
not
None
:
figs_dir
=
Path
(
figs_dir
)
figs_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
pca
=
PCA
().
fit
(
usage_data
)
transformed_data
=
pd
.
DataFrame
(
pca
.
transform
(
usage_data
),
index
=
usage_data
.
index
).
reset_index
(
level
=
hue
)
render_md
(
"
Plotting genes on the first 4 components
\n
"
)
(
fig
,
axes
)
=
plt
.
subplots
(
1
,
2
,
figsize
=
(
16
,
8
))
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
0
,
y
=
1
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
0
])
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
2
,
y
=
3
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
1
])
if
figs_dir
is
not
None
:
plt
.
savefig
(
figs_dir
.
joinpath
(
"
PCA_projections.png
"
),
metadata
=
{
'
creationDate
'
:
None
})
display
(
fig
)
plt
.
close
(
fig
)
render_md
(
"
Vizualizing the influence of codons in the first 4 components
\n
"
)
(
fig
,
axes
)
=
plt
.
subplots
(
4
,
1
,
figsize
=
(
16
,
16
))
for
(
component
,
axis
)
in
enumerate
(
axes
):
pd
.
Series
(
pca
.
components_
[
component
],
index
=
usage_data
.
columns
).
plot
.
bar
(
ax
=
axes
[
component
],
# colname is supposed to end with the 3-letters codon
color
=
[
nuc2colour
[
colname
[
-
1
]]
for
colname
in
usage_data
.
columns
])
axis
.
set_ylabel
(
f
"
weight in component
{
component
}
"
)
# axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
fig
.
subplots_adjust
(
hspace
=
.
5
)
if
figs_dir
is
not
None
:
plt
.
savefig
(
figs_dir
.
joinpath
(
"
PCA_components.png
"
),
metadata
=
{
'
creationDate
'
:
None
})
display
(
fig
)
plt
.
close
(
fig
)
return
(
pca
,
transformed_data
)
def
centroid_usage
(
codon_counts
,
all_nan_cols
):
def
centroid_usage
(
codon_counts
,
all_nan_cols
):
"""
"""
Define
"
centroids
"
for gene clusters, one per codon in *codon_counts*.
Define
"
centroids
"
for gene clusters, one per codon in *codon_counts*.
...
@@ -1202,7 +1332,7 @@ def violin_usage(
...
@@ -1202,7 +1332,7 @@ def violin_usage(
"
hue
"
:
hue
,
"
palette
"
:
palette
,
"
dodge
"
:
dodge
,
"
hue
"
:
hue
,
"
palette
"
:
palette
,
"
dodge
"
:
dodge
,
"
data
"
:
long_form
,
"
ax
"
:
axis
,
"
orient
"
:
"
v
"
,
"
scale
"
:
"
count
"
}
"
data
"
:
long_form
,
"
ax
"
:
axis
,
"
orient
"
:
"
v
"
,
"
scale
"
:
"
count
"
}
kwargs
.
update
(
violin_kwargs
)
kwargs
.
update
(
violin_kwargs
)
sns
.
violinplot
(
**
kwargs
)
axis
=
sns
.
violinplot
(
**
kwargs
)
# sns.violinplot(x=variable, y=ylabel, order=variable2order(variable),
# sns.violinplot(x=variable, y=ylabel, order=variable2order(variable),
# hue=hue, palette=palette, dodge=dodge,
# hue=hue, palette=palette, dodge=dodge,
# data=long_form, ax=axis, orient="v", scale="count",
# data=long_form, ax=axis, orient="v", scale="count",
...
@@ -1246,7 +1376,7 @@ def violin_usage_vertical(
...
@@ -1246,7 +1376,7 @@ def violin_usage_vertical(
"
hue
"
:
hue
,
"
palette
"
:
palette
,
"
dodge
"
:
dodge
,
"
hue
"
:
hue
,
"
palette
"
:
palette
,
"
dodge
"
:
dodge
,
"
data
"
:
long_form
,
"
ax
"
:
axis
,
"
orient
"
:
"
h
"
,
"
scale
"
:
"
count
"
}
"
data
"
:
long_form
,
"
ax
"
:
axis
,
"
orient
"
:
"
h
"
,
"
scale
"
:
"
count
"
}
kwargs
.
update
(
violin_kwargs
)
kwargs
.
update
(
violin_kwargs
)
sns
.
violinplot
(
**
kwargs
)
axis
=
sns
.
violinplot
(
**
kwargs
)
# sns.violinplot(
# sns.violinplot(
# y=variable, x=ylabel, order=variable2order(variable),
# y=variable, x=ylabel, order=variable2order(variable),
# hue=hue, palette=palette, dodge=dodge,
# hue=hue, palette=palette, dodge=dodge,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment