Skip to content
Snippets Groups Projects
Commit 6ef40430 authored by Blaise Li's avatar Blaise Li
Browse files

Linting code.

parent 6ad1947d
No related branches found
No related tags found
No related merge requests found
......@@ -211,7 +211,6 @@ def detect_fishy_genes(codon_counts):
A table of boolean criteria is returned, with one line per gene.
"""
def display_gene_set(gene_set, max_size=10):
"""
Print out genes in a gene set, depending on their number.
......@@ -445,7 +444,8 @@ SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})"
def remove_codons(codon_counts, codon_list):
    """
    Filter out codons in a table *codon_counts* based on codons
    present in the list *codon_list* (like stop codons).

    Note: *codon_counts* is modified in place (the columns listed in
    *codon_list* are dropped from it); the same object is returned
    for convenience.
    """
    codon_counts.drop(columns=codon_list, inplace=True)
    return codon_counts
......@@ -453,17 +453,23 @@ def remove_codons(codon_counts, codon_list):
def sum_codon_counts(row, codons):
    """
    Perform the row-wise sum of codon counts for the codons
    present in *codons* list given the row *row*.

    Returns 0 when *codons* is empty.
    """
    # TODO: try row[codons].sum()
    # A generator expression avoids both the pylint warning
    # "W0622: Redefining built-in 'sum' (redefined-builtin)"
    # and the needless intermediate list a list comprehension would build.
    return sum(row[cod] for cod in codons)
def max_codon_counts(row, codons):
    """
    Return the row-wise maximum of codon counts for the codons
    present in *codons* list given the row *row*.

    Raises ValueError if *codons* is empty (behaviour of built-in max).
    """
    # Same single pass as the original accumulate-then-max loop,
    # without materializing an intermediate list.
    return max(row[cod] for cod in codons)
def group_codons_by_class(
        codon_counts, group_name, dict_classes,
        mode="max", keep_only_groups=False):
    """
    Group codons given specific classes in *codon_counts* table.

    *group_name* contains the name of the grouping, and plays the role
    of aa names in the original codon counts table.
    *dict_classes* contains the different classes under this grouping
    as keys and the associated list of codons as values.
    *mode* defines the way grouping is computed.
    If mode is "max", the maximum value of counts of codons belonging
    to the same class is used for the grouped class.
    Otherwise, the sum of counts values for all codons belonging to
    the same class is used for the grouped class.
    *keep_only_groups* is a boolean set to True if you want to filter out
    other codons than the ones specified in dict_classes.
    If set to False (default), the original codon_counts table
    is returned with additional columns for the grouped_classes.

    Note: the grouped columns are added to *codon_counts* in place.
    """
    # Select the row-wise aggregator once, instead of testing *mode*
    # at every loop iteration.
    aggregate = max_codon_counts if mode == "max" else sum_codon_counts
    class_names = []
    for class_name, codons in dict_classes.items():
        # Binding *codons* as a default argument makes the lambda capture
        # the current value instead of closing over the loop variable,
        # which fixes the pylint warning
        # "W0640: Cell variable value defined in loop (cell-var-from-loop)"
        # (see https://stackoverflow.com/q/25314547/1878788 and answers).
        codon_counts[group_name, class_name] = codon_counts.apply(
            lambda row, codons=codons: aggregate(row, codons), axis=1)
        class_names.append(class_name)
    if keep_only_groups:
        # Keep only the newly created grouped columns.
        return codon_counts.loc[:, ([group_name], class_names)]
    return codon_counts
......@@ -502,7 +519,8 @@ def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', fi
def gene_wide_codon_usage(
codon_counts,
verbose=False, return_more=False, ref_filter_dict=None):
verbose=False, return_more=False, ref_filter_dict=None,
check_colsums=False):
"""
Compute codon usage biases "gene-wide" as the standardized
difference between a gene's codon proportions and global
......@@ -532,16 +550,16 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
# codon_proportions.style.hide(axis="index")
if verbose:
display(codon_proportions.head(3))
# Check that the sum of proportions (columns) for a gene is 1
colsums = codon_proportions.sum(axis=1).values
# Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1
## I put this assert in comment because after grouping (either by max or by sum),
## the distribution is too skewed to have an optimal normalization
## I am not sure about the meaning of normalizing as skewed data as we have
#assert np.allclose(colsums, np.full(len(colsums), 1))
# The assert has been made optional because after grouping
# (either by max or by sum), the distribution is too skewed
# to have an optimal normalization
# I am not sure about the meaning of normalizing as skewed data as we have
if check_colsums:
# Check that the sum of proportions (columns) for a gene is 1
colsums = codon_proportions.sum(axis=1).values
# Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1
assert np.allclose(colsums, np.full(len(colsums), 1))
if ref_filter_dict is None:
counts_for_global = codon_counts
else:
......@@ -777,7 +795,8 @@ across genes) so that they are more comparable between codons.
def aa_usage(
codon_counts,
verbose=False, return_more=False, ref_filter_dict=None):
verbose=False, return_more=False, ref_filter_dict=None,
check_colsums=False):
"""
Compute amino-acid usage biases as the standardized
difference between a gene's amino-acid proportions
......@@ -812,10 +831,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
# aa_proportions.style.hide(axis="index")
if verbose:
display(aa_proportions.head(3))
# Checking that proportions sum to 1
colsums = aa_proportions.sum(axis=1)
# Same here since the normalization is working as good on skewed distribution
#assert np.allclose(colsums, np.full(len(colsums), 1))
# The assert has been made optional since the normalization is working
# as good on skewed distribution
if check_colsums:
# Checking that proportions sum to 1
colsums = aa_proportions.sum(axis=1)
assert np.allclose(colsums, np.full(len(colsums), 1))
# Then, computing the global amino-acid proportions
if ref_filter_dict is None:
counts_for_global = summed_by_aa
......@@ -946,7 +967,8 @@ def codon_influence_in_components(
def codon_usage_pca(
usage_data,
figs_dir=None, hue="chrom", exclude_cols=None, plot_more_components=False,
figs_dir=None, hue="chrom", exclude_cols=None,
plot_more_components=False,
formats=None, cols_are_codons=True):
"""
Perform Principal Component Analysis on *usage_data*.
......@@ -993,22 +1015,22 @@ def codon_usage_pca(
(fig, axes) = plt.subplots(3, 2, figsize=(16, 25))
sns.scatterplot(
data=transformed_data,
x=0, y=1, hue=hue, marker=".", ax=axes[0,0])
x=0, y=1, hue=hue, marker=".", ax=axes[0, 0])
sns.scatterplot(
data=transformed_data,
x=2, y=3, hue=hue, marker=".", ax=axes[0,1])
x=2, y=3, hue=hue, marker=".", ax=axes[0, 1])
sns.scatterplot(
data=transformed_data,
x=4, y=5, hue=hue, marker=".", ax=axes[1,0])
x=4, y=5, hue=hue, marker=".", ax=axes[1, 0])
sns.scatterplot(
data=transformed_data,
x=6, y=7, hue=hue, marker=".", ax=axes[1,1])
x=6, y=7, hue=hue, marker=".", ax=axes[1, 1])
sns.scatterplot(
data=transformed_data,
x=8, y=9, hue=hue, marker=".", ax=axes[2,0])
x=8, y=9, hue=hue, marker=".", ax=axes[2, 0])
sns.scatterplot(
data=transformed_data,
x=10, y=11, hue=hue, marker=".", ax=axes[2,1])
x=10, y=11, hue=hue, marker=".", ax=axes[2, 1])
else:
(fig, axes) = plt.subplots(1, 2, figsize=(16, 8))
sns.scatterplot(
......@@ -1028,7 +1050,9 @@ def codon_usage_pca(
if cols_are_codons:
codon_influence_in_components(
pca.components_, usage_data.columns,
figs_dir=figs_dir, more_components=plot_more_components, formats=formats)
figs_dir=figs_dir,
more_components=plot_more_components,
formats=formats)
return (pca, transformed_data)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment