From b3fc4bf6d8d87c15e02e0bb281ec9d6b36c8648f Mon Sep 17 00:00:00 2001
From: Marie Anselmet <marie.anselmet@pasteur.fr>
Date: Fri, 20 Oct 2023 14:20:10 +0200
Subject: [PATCH] Comment asserts since normalization is not really working on
 skewed distributions now

---
 build/lib/libcodonusage/__init__.py      |  3 ++-
 build/lib/libcodonusage/libcodonusage.py | 34 +++++++++++++++++++-----
 libcodonusage/__init__.py                |  3 ++-
 libcodonusage/libcodonusage.py           | 34 +++++++++++++++++++-----
 4 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/build/lib/libcodonusage/__init__.py b/build/lib/libcodonusage/__init__.py
index 45862d8..2b0bec4 100644
--- a/build/lib/libcodonusage/__init__.py
+++ b/build/lib/libcodonusage/__init__.py
@@ -19,7 +19,7 @@ from .libcodonusage import (
     find_valley,
     format_codon_labels,
     gene_wide_codon_usage,
-    group_codons_by_sum,
+    group_codons_by_class,
     load_bias_table,
     load_counts_table,
     load_table_with_info_index,
@@ -27,6 +27,7 @@ from .libcodonusage import (
     make_cluster_table,
     make_centroids_cluster_finder,
     make_counts_only,
+    max_codon_counts,
     plot_codon_usage_for_gene_list,
     remove_codons,
     render_md,
diff --git a/build/lib/libcodonusage/libcodonusage.py b/build/lib/libcodonusage/libcodonusage.py
index 603a8bd..fc66f73 100644
--- a/build/lib/libcodonusage/libcodonusage.py
+++ b/build/lib/libcodonusage/libcodonusage.py
@@ -463,22 +463,38 @@ def sum_codon_counts(row, codons):
     return sum
 
 
-def group_codons_by_sum(codon_counts, group_name, dict_classes, filter):
+def max_codon_counts(row, codons):
     """
-    Group codons by sum given specific classes in *codon_counts* table.
+    Return the row-wise maximum of codon counts for the codons present in *codons* list given the row *row*.
+    """
+    counts_codons = []
+    for cod in codons:
+        counts_codons.append(row[cod])
+    return max(counts_codons)
+
+
+def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', filter=False):
+    """
+    Group codons given specific classes in *codon_counts* table.
 
     *group_name* contains the name of the grouping, and plays the role of aa names in the original 
     codon counts table.
     *dict_classes* contains the different classes under this grouping as keys and the associated 
     list of codons as values.
+    *mode* defines the way grouping is computed. If mode is 'max', the maximum value of counts of codons belonging
+    to the same class is used for the grouped class. Otherwise, the sum of counts values for all codons belonging 
+    to the same class is used for the grouped class.
     *filter* is a boolean set to True if you want to filter out other codons than the ones specified in
-    dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for
+    dict_classes. If set to False (default), the original codon_counts table is returned with additionnal columns for
     the grouped_classes.
     """
     list_classes = list(dict_classes.items())
     list_classes_names = []
     for key, value in dict_classes.items():
-        codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
+        if mode == 'max':
+            codon_counts[group_name, key] = codon_counts.apply(lambda row: max_codon_counts(row, value), axis=1)
+        else:
+            codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
         list_classes_names.append(key)
     if filter:
         return codon_counts.loc[:, ([group_name], list_classes_names)]
@@ -522,9 +538,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
     colsums = codon_proportions.sum(axis=1).values
     # Due to imprecision in float arithmetics,
     # we can only check that the sums are close to 1
+
+
+    ## I put this assert in comment because after grouping (either by max or by sum),
+    ## the distribution is too skewed to have an optimal normalization
+    ## I am not sure about the meaning of normalizing as skewed data as we have
     #assert np.allclose(colsums, np.full(len(colsums), 1))
-    print("mean", np.mean(colsums))
-    assert np.isclose(np.mean(colsums), 1)
     if ref_filter_dict is None:
         counts_for_global = codon_counts
     else:
@@ -797,7 +816,8 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
         display(aa_proportions.head(3))
     # Checking that proportions sum to 1
     colsums = aa_proportions.sum(axis=1)
-    assert np.allclose(colsums, np.full(len(colsums), 1))
+    # Same here since the normalization is working as good on skewed distribution
+    #assert np.allclose(colsums, np.full(len(colsums), 1))
     # Then, computing the global amino-acid proportions
     if ref_filter_dict is None:
         counts_for_global = summed_by_aa
diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 45862d8..2b0bec4 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -19,7 +19,7 @@ from .libcodonusage import (
     find_valley,
     format_codon_labels,
     gene_wide_codon_usage,
-    group_codons_by_sum,
+    group_codons_by_class,
     load_bias_table,
     load_counts_table,
     load_table_with_info_index,
@@ -27,6 +27,7 @@ from .libcodonusage import (
     make_cluster_table,
     make_centroids_cluster_finder,
     make_counts_only,
+    max_codon_counts,
     plot_codon_usage_for_gene_list,
     remove_codons,
     render_md,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 603a8bd..fc66f73 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -463,22 +463,38 @@ def sum_codon_counts(row, codons):
     return sum
 
 
-def group_codons_by_sum(codon_counts, group_name, dict_classes, filter):
+def max_codon_counts(row, codons):
     """
-    Group codons by sum given specific classes in *codon_counts* table.
+    Return the row-wise maximum of codon counts for the codons present in *codons* list given the row *row*.
+    """
+    counts_codons = []
+    for cod in codons:
+        counts_codons.append(row[cod])
+    return max(counts_codons)
+
+
+def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', filter=False):
+    """
+    Group codons given specific classes in *codon_counts* table.
 
     *group_name* contains the name of the grouping, and plays the role of aa names in the original 
     codon counts table.
     *dict_classes* contains the different classes under this grouping as keys and the associated 
     list of codons as values.
+    *mode* defines the way grouping is computed. If mode is 'max', the maximum value of counts of codons belonging
+    to the same class is used for the grouped class. Otherwise, the sum of counts values for all codons belonging 
+    to the same class is used for the grouped class.
     *filter* is a boolean set to True if you want to filter out other codons than the ones specified in
-    dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for
+    dict_classes. If set to False (default), the original codon_counts table is returned with additionnal columns for
     the grouped_classes.
     """
     list_classes = list(dict_classes.items())
     list_classes_names = []
     for key, value in dict_classes.items():
-        codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
+        if mode == 'max':
+            codon_counts[group_name, key] = codon_counts.apply(lambda row: max_codon_counts(row, value), axis=1)
+        else:
+            codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
         list_classes_names.append(key)
     if filter:
         return codon_counts.loc[:, ([group_name], list_classes_names)]
@@ -522,9 +538,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
     colsums = codon_proportions.sum(axis=1).values
     # Due to imprecision in float arithmetics,
     # we can only check that the sums are close to 1
+
+
+    ## I put this assert in comment because after grouping (either by max or by sum),
+    ## the distribution is too skewed to have an optimal normalization
+    ## I am not sure about the meaning of normalizing as skewed data as we have
     #assert np.allclose(colsums, np.full(len(colsums), 1))
-    print("mean", np.mean(colsums))
-    assert np.isclose(np.mean(colsums), 1)
     if ref_filter_dict is None:
         counts_for_global = codon_counts
     else:
@@ -797,7 +816,8 @@ using the "l1" norm (which, for positive-only values amounts to the sum).
         display(aa_proportions.head(3))
     # Checking that proportions sum to 1
     colsums = aa_proportions.sum(axis=1)
-    assert np.allclose(colsums, np.full(len(colsums), 1))
+    # Same here since the normalization is working as good on skewed distribution
+    #assert np.allclose(colsums, np.full(len(colsums), 1))
     # Then, computing the global amino-acid proportions
     if ref_filter_dict is None:
         counts_for_global = summed_by_aa
-- 
GitLab