Skip to content
Snippets Groups Projects
Commit b3fc4bf6 authored by manselme's avatar manselme
Browse files

Comment asserts since normalization is not really working on skewed distributions now

parent b9011ed2
No related branches found
No related tags found
No related merge requests found
......@@ -19,7 +19,7 @@ from .libcodonusage import (
find_valley,
format_codon_labels,
gene_wide_codon_usage,
group_codons_by_sum,
group_codons_by_class,
load_bias_table,
load_counts_table,
load_table_with_info_index,
......@@ -27,6 +27,7 @@ from .libcodonusage import (
make_cluster_table,
make_centroids_cluster_finder,
make_counts_only,
max_codon_counts,
plot_codon_usage_for_gene_list,
remove_codons,
render_md,
......
......@@ -463,21 +463,37 @@ def sum_codon_counts(row, codons):
return sum
def group_codons_by_sum(codon_counts, group_name, dict_classes, filter):
def max_codon_counts(row, codons):
"""
Group codons by sum given specific classes in *codon_counts* table.
Return the row-wise maximum of codon counts for the codons present in *codons* list given the row *row*.
"""
counts_codons = []
for cod in codons:
counts_codons.append(row[cod])
return max(counts_codons)
def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', filter=False):
"""
Group codons given specific classes in *codon_counts* table.
*group_name* contains the name of the grouping, and plays the role of aa names in the original
codon counts table.
*dict_classes* contains the different classes under this grouping as keys and the associated
list of codons as values.
*mode* defines the way grouping is computed. If mode is 'max', the maximum value of counts of codons belonging
to the same class is used for the grouped class. Otherwise, the sum of counts values for all codons belonging
to the same class is used for the grouped class.
*filter* is a boolean set to True if you want to filter out other codons than the ones specified in
dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for
dict_classes. If set to False (default), the original codon_counts table is returned with additionnal columns for
the grouped_classes.
"""
list_classes = list(dict_classes.items())
list_classes_names = []
for key, value in dict_classes.items():
if mode == 'max':
codon_counts[group_name, key] = codon_counts.apply(lambda row: max_codon_counts(row, value), axis=1)
else:
codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
list_classes_names.append(key)
if filter:
......@@ -522,9 +538,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
colsums = codon_proportions.sum(axis=1).values
# Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1
## I put this assert in comment because after grouping (either by max or by sum),
## the distribution is too skewed to have an optimal normalization
## I am not sure about the meaning of normalizing as skewed data as we have
#assert np.allclose(colsums, np.full(len(colsums), 1))
print("mean", np.mean(colsums))
assert np.isclose(np.mean(colsums), 1)
if ref_filter_dict is None:
counts_for_global = codon_counts
else:
......@@ -797,7 +816,8 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
display(aa_proportions.head(3))
# Checking that proportions sum to 1
colsums = aa_proportions.sum(axis=1)
assert np.allclose(colsums, np.full(len(colsums), 1))
# Same here since the normalization is working as good on skewed distribution
#assert np.allclose(colsums, np.full(len(colsums), 1))
# Then, computing the global amino-acid proportions
if ref_filter_dict is None:
counts_for_global = summed_by_aa
......
......@@ -19,7 +19,7 @@ from .libcodonusage import (
find_valley,
format_codon_labels,
gene_wide_codon_usage,
group_codons_by_sum,
group_codons_by_class,
load_bias_table,
load_counts_table,
load_table_with_info_index,
......@@ -27,6 +27,7 @@ from .libcodonusage import (
make_cluster_table,
make_centroids_cluster_finder,
make_counts_only,
max_codon_counts,
plot_codon_usage_for_gene_list,
remove_codons,
render_md,
......
......@@ -463,21 +463,37 @@ def sum_codon_counts(row, codons):
return sum
def group_codons_by_sum(codon_counts, group_name, dict_classes, filter):
def max_codon_counts(row, codons):
"""
Group codons by sum given specific classes in *codon_counts* table.
Return the row-wise maximum of codon counts for the codons present in *codons* list given the row *row*.
"""
counts_codons = []
for cod in codons:
counts_codons.append(row[cod])
return max(counts_codons)
def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', filter=False):
"""
Group codons given specific classes in *codon_counts* table.
*group_name* contains the name of the grouping, and plays the role of aa names in the original
codon counts table.
*dict_classes* contains the different classes under this grouping as keys and the associated
list of codons as values.
*mode* defines the way grouping is computed. If mode is 'max', the maximum value of counts of codons belonging
to the same class is used for the grouped class. Otherwise, the sum of counts values for all codons belonging
to the same class is used for the grouped class.
*filter* is a boolean set to True if you want to filter out other codons than the ones specified in
dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for
dict_classes. If set to False (default), the original codon_counts table is returned with additionnal columns for
the grouped_classes.
"""
list_classes = list(dict_classes.items())
list_classes_names = []
for key, value in dict_classes.items():
if mode == 'max':
codon_counts[group_name, key] = codon_counts.apply(lambda row: max_codon_counts(row, value), axis=1)
else:
codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
list_classes_names.append(key)
if filter:
......@@ -522,9 +538,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
colsums = codon_proportions.sum(axis=1).values
# Due to imprecision in float arithmetics,
# we can only check that the sums are close to 1
## I put this assert in comment because after grouping (either by max or by sum),
## the distribution is too skewed to have an optimal normalization
## I am not sure about the meaning of normalizing as skewed data as we have
#assert np.allclose(colsums, np.full(len(colsums), 1))
print("mean", np.mean(colsums))
assert np.isclose(np.mean(colsums), 1)
if ref_filter_dict is None:
counts_for_global = codon_counts
else:
......@@ -797,7 +816,8 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
display(aa_proportions.head(3))
# Checking that proportions sum to 1
colsums = aa_proportions.sum(axis=1)
assert np.allclose(colsums, np.full(len(colsums), 1))
# Same here since the normalization is working as good on skewed distribution
#assert np.allclose(colsums, np.full(len(colsums), 1))
# Then, computing the global amino-acid proportions
if ref_filter_dict is None:
counts_for_global = summed_by_aa
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment