Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
L
libcodonusage
Manage
Activity
Members
Labels
Plan
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Blaise LI
libcodonusage
Commits
6ef40430
Commit
6ef40430
authored
1 year ago
by
Blaise Li
Browse files
Options
Downloads
Patches
Plain Diff
Linting code.
parent
6ad1947d
No related branches found
No related tags found
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
libcodonusage/libcodonusage.py
+71
-47
71 additions, 47 deletions
libcodonusage/libcodonusage.py
with
71 additions
and
47 deletions
libcodonusage/libcodonusage.py
+
71
−
47
View file @
6ef40430
...
...
@@ -211,7 +211,6 @@ def detect_fishy_genes(codon_counts):
A table of boolean criteria is returned, with one line per gene.
"""
def
display_gene_set
(
gene_set
,
max_size
=
10
):
"""
Print out genes in a gene set, depending on their number.
...
...
@@ -445,7 +444,8 @@ SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})"
def
remove_codons
(
codon_counts
,
codon_list
):
"""
Filter out codons in a table *codon_counts* based on codons present in the list *codon_list* (like stop codons).
Filter out codons in a table *codon_counts* based on codons
present in the list *codon_list* (like stop codons).
"""
codon_counts
.
drop
(
columns
=
codon_list
,
inplace
=
True
)
return
codon_counts
...
...
@@ -453,17 +453,23 @@ def remove_codons(codon_counts, codon_list):
def
sum_codon_counts
(
row
,
codons
):
"""
Perform the row-wise sum of codon counts for the codons present in *codons* list given the row *row*.
Perform the row-wise sum of codon counts for the codons
present in *codons* list given the row *row*.
"""
sum
=
0
for
cod
in
codons
:
sum
+=
row
[
cod
]
return
sum
# TODO: try row[codons].sum()
# sum = 0
# for cod in codons:
# sum += row[cod]
# return sum
# Possibly more efficient, avoids the following pylint warning:
# W0622: Redefining built-in 'sum' (redefined-builtin)
return
sum
([
row
[
cod
]
for
cod
in
codons
])
def
max_codon_counts
(
row
,
codons
):
"""
Return the row-wise maximum of codon counts for the codons present in *codons* list given the row *row*.
Return the row-wise maximum of codon counts for the codons
present in *codons* list given the row *row*.
"""
counts_codons
=
[]
for
cod
in
codons
:
...
...
@@ -471,30 +477,41 @@ def max_codon_counts(row, codons):
return
max
(
counts_codons
)
def
group_codons_by_class
(
codon_counts
,
group_name
,
dict_classes
,
mode
=
'
max
'
,
filter
=
False
):
def
group_codons_by_class
(
codon_counts
,
group_name
,
dict_classes
,
mode
=
"
max
"
,
keep_only_groups
=
False
):
"""
Group codons given specific classes in *codon_counts* table.
*group_name* contains the name of the grouping, and plays the role
of aa names in the original
codon counts table.
*dict_classes* contains the different classes under this grouping
as keys and the associated
list of codons as values.
*mode* defines the way grouping is computed.
If mode is
'
max
'
, the maximum value of counts of codons belonging
to the same class is used for the grouped class. Otherwise, the sum of counts values for all
codons belonging
*group_name* contains the name of the grouping, and plays the role
of aa names in the original
codon counts table.
*dict_classes* contains the different classes under this grouping
as keys and the associated
list of codons as values.
*mode* defines the way grouping is computed.
If mode is
"
max
"
, the maximum value of counts of
codons belonging
to the same class is used for the grouped class.
*filter* is a boolean set to True if you want to filter out other codons than the ones specified in
dict_classes. If set to False (default), the original codon_counts table is returned with additional columns for
the grouped_classes.
Otherwise, the sum of counts values for all codons belonging to
the same class is used for the grouped class.
*keep_only_groups* is a boolean set to True if you want to filter out
other codons than the ones specified in dict_classes.
If set to False (default), the original codon_counts table
is returned with additional columns for the grouped_classes.
"""
list_classes
=
list
(
dict_classes
.
items
())
list_classes_names
=
[]
# pylint issues the following warning:
# "W0640: Cell variable value defined in loop (cell-var-from-loop)"
# Since the lambda function is used immediately,
# this should not be an actual issue
# (see https://stackoverflow.com/q/25314547/1878788 and answers)
for
key
,
value
in
dict_classes
.
items
():
if
mode
==
'
max
'
:
codon_counts
[
group_name
,
key
]
=
codon_counts
.
apply
(
lambda
row
:
max_codon_counts
(
row
,
value
),
axis
=
1
)
if
mode
==
"
max
"
:
codon_counts
[
group_name
,
key
]
=
codon_counts
.
apply
(
lambda
row
:
max_codon_counts
(
row
,
value
),
axis
=
1
)
else
:
codon_counts
[
group_name
,
key
]
=
codon_counts
.
apply
(
lambda
row
:
sum_codon_counts
(
row
,
value
),
axis
=
1
)
codon_counts
[
group_name
,
key
]
=
codon_counts
.
apply
(
lambda
row
:
sum_codon_counts
(
row
,
value
),
axis
=
1
)
list_classes_names
.
append
(
key
)
if
filter
:
if
keep_only_groups
:
return
codon_counts
.
loc
[:,
([
group_name
],
list_classes_names
)]
else
:
return
codon_counts
...
...
@@ -502,7 +519,8 @@ def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', fi
def
gene_wide_codon_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
,
ref_filter_dict
=
None
):
verbose
=
False
,
return_more
=
False
,
ref_filter_dict
=
None
,
check_colsums
=
False
):
"""
Compute codon usage biases
"
gene-wide
"
as the standardized
difference between a gene
'
s codon proportions and global
...
...
@@ -532,16 +550,16 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
# codon_proportions.style.hide(axis="index")
if
verbose
:
display
(
codon_proportions
.
head
(
3
))
# Check that the sum of proportions (columns) for a gene is 1
colsums
=
codon_proportions
.
sum
(
axis
=
1
).
values
# Due to imprecision in floating-point arithmetic,
# we can only check that the sums are close to 1
## I commented out this assert because after grouping (either by max or by sum),
## the distribution is too skewed to have an optimal normalization.
## I am not sure whether normalizing data as skewed as ours is meaningful.
#
assert np.allclose(colsums, np.full(len(colsums), 1))
# The assert has been made optional because after grouping
# (either by max or by sum), the distribution is too skewed
# to have an optimal normalization.
# I am not sure whether normalizing data as skewed as ours is meaningful.
if
check_colsums
:
# Check that the sum of proportions (columns) for a gene is 1
colsums
=
codon_proportions
.
sum
(
axis
=
1
).
values
# Due to imprecision in floating-point arithmetic,
# we can only check that the sums are close to 1
assert
np
.
allclose
(
colsums
,
np
.
full
(
len
(
colsums
),
1
))
if
ref_filter_dict
is
None
:
counts_for_global
=
codon_counts
else
:
...
...
@@ -777,7 +795,8 @@ across genes) so that they are more comparable between codons.
def
aa_usage
(
codon_counts
,
verbose
=
False
,
return_more
=
False
,
ref_filter_dict
=
None
):
verbose
=
False
,
return_more
=
False
,
ref_filter_dict
=
None
,
check_colsums
=
False
):
"""
Compute amino-acid usage biases as the standardized
difference between a gene
'
s amino-acid proportions
...
...
@@ -812,10 +831,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
# aa_proportions.style.hide(axis="index")
if
verbose
:
display
(
aa_proportions
.
head
(
3
))
# Checking that proportions sum to 1
colsums
=
aa_proportions
.
sum
(
axis
=
1
)
# Same here, since the normalization behaves the same way on skewed distributions
#assert np.allclose(colsums, np.full(len(colsums), 1))
# The assert has been made optional since the normalization behaves
# the same way on skewed distributions
if
check_colsums
:
# Checking that proportions sum to 1
colsums
=
aa_proportions
.
sum
(
axis
=
1
)
assert
np
.
allclose
(
colsums
,
np
.
full
(
len
(
colsums
),
1
))
# Then, computing the global amino-acid proportions
if
ref_filter_dict
is
None
:
counts_for_global
=
summed_by_aa
...
...
@@ -946,7 +967,8 @@ def codon_influence_in_components(
def
codon_usage_pca
(
usage_data
,
figs_dir
=
None
,
hue
=
"
chrom
"
,
exclude_cols
=
None
,
plot_more_components
=
False
,
figs_dir
=
None
,
hue
=
"
chrom
"
,
exclude_cols
=
None
,
plot_more_components
=
False
,
formats
=
None
,
cols_are_codons
=
True
):
"""
Perform Principal Component Analysis on *usage_data*.
...
...
@@ -993,22 +1015,22 @@ def codon_usage_pca(
(
fig
,
axes
)
=
plt
.
subplots
(
3
,
2
,
figsize
=
(
16
,
25
))
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
0
,
y
=
1
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
0
,
0
])
x
=
0
,
y
=
1
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
0
,
0
])
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
2
,
y
=
3
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
0
,
1
])
x
=
2
,
y
=
3
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
0
,
1
])
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
4
,
y
=
5
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
1
,
0
])
x
=
4
,
y
=
5
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
1
,
0
])
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
6
,
y
=
7
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
1
,
1
])
x
=
6
,
y
=
7
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
1
,
1
])
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
8
,
y
=
9
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
2
,
0
])
x
=
8
,
y
=
9
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
2
,
0
])
sns
.
scatterplot
(
data
=
transformed_data
,
x
=
10
,
y
=
11
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
2
,
1
])
x
=
10
,
y
=
11
,
hue
=
hue
,
marker
=
"
.
"
,
ax
=
axes
[
2
,
1
])
else
:
(
fig
,
axes
)
=
plt
.
subplots
(
1
,
2
,
figsize
=
(
16
,
8
))
sns
.
scatterplot
(
...
...
@@ -1028,7 +1050,9 @@ def codon_usage_pca(
if
cols_are_codons
:
codon_influence_in_components
(
pca
.
components_
,
usage_data
.
columns
,
figs_dir
=
figs_dir
,
more_components
=
plot_more_components
,
formats
=
formats
)
figs_dir
=
figs_dir
,
more_components
=
plot_more_components
,
formats
=
formats
)
return
(
pca
,
transformed_data
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment