From 6ef404308b8111488fa183dea9538b2ec8484900 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Mon, 6 Nov 2023 12:04:01 +0100 Subject: [PATCH] Linting code. --- libcodonusage/libcodonusage.py | 118 ++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 47 deletions(-) diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index d16d43a..0bc6a6d 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -211,7 +211,6 @@ def detect_fishy_genes(codon_counts): A table of boolean criteria is returned, with one line per gene. """ - def display_gene_set(gene_set, max_size=10): """ Print out genes in a gene set, depending on their number. @@ -445,7 +444,8 @@ SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})" def remove_codons(codon_counts, codon_list): """ - Filter out codons in a table *codon_counts* based on codons present in the list *codon_list* (like stop codons). + Filter out codons in a table *codon_counts* based on codons + present in the list *codon_list* (like stop codons). """ codon_counts.drop(columns=codon_list, inplace=True) return codon_counts @@ -453,17 +453,23 @@ def remove_codons(codon_counts, codon_list): def sum_codon_counts(row, codons): """ - Perform the row-wise sum of codon counts for the codons present in *codons* list given the row *row*. + Perform the row-wise sum of codon counts for the codons + present in *codons* list given the row *row*. """ - sum = 0 - for cod in codons: - sum += row[cod] - return sum + # TODO: try row[codons].sum() + # sum = 0 + # for cod in codons: + # sum += row[cod] + # return sum + # Possibly more efficient, avoids the following pylint warning: + # W0622: Redefining built-in 'sum' (redefined-builtin) + return sum([row[cod] for cod in codons]) def max_codon_counts(row, codons): """ - Return the row-wise maximum of codon counts for the codons present in *codons* list given the row *row*. + Return the row-wise maximum of codon counts for the codons + present in *codons* list given the row *row*. """ counts_codons = [] for cod in codons: @@ -471,30 +477,41 @@ def max_codon_counts(row, codons): return max(counts_codons) -def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', filter=False): +def group_codons_by_class( + codon_counts, group_name, dict_classes, + mode="max", keep_only_groups=False): """ Group codons given specific classes in *codon_counts* table. - *group_name* contains the name of the grouping, and plays the role of aa names in the original - codon counts table. - *dict_classes* contains the different classes under this grouping as keys and the associated - list of codons as values. - *mode* defines the way grouping is computed. If mode is 'max', the maximum value of counts of codons belonging - to the same class is used for the grouped class. Otherwise, the sum of counts values for all codons belonging + *group_name* contains the name of the grouping, and plays the role + of aa names in the original codon counts table. + *dict_classes* contains the different classes under this grouping + as keys and the associated list of codons as values. + *mode* defines the way grouping is computed. + If mode is "max", the maximum value of counts of codons belonging to the same class is used for the grouped class. - *filter* is a boolean set to True if you want to filter out other codons than the ones specified in - dict_classes. 
If set to False (default), the original codon_counts table is returned with additionnal columns for - the grouped_classes. + Otherwise, the sum of counts values for all codons belonging to + the same class is used for the grouped class. + *keep_only_groups* is a boolean set to True if you want to filter out + other codons than the ones specified in dict_classes. + If set to False (default), the original codon_counts table + is returned with additional columns for the grouped_classes. """ - list_classes = list(dict_classes.items()) list_classes_names = [] + # pylint issues the following warning: + # "W0640: Cell variable value defined in loop (cell-var-from-loop)" + # Since the lambda function is used immediately, + # this should not be an actual issue + # (see https://stackoverflow.com/q/25314547/1878788 and answers) for key, value in dict_classes.items(): - if mode == 'max': - codon_counts[group_name, key] = codon_counts.apply(lambda row: max_codon_counts(row, value), axis=1) + if mode == "max": + codon_counts[group_name, key] = codon_counts.apply( + lambda row: max_codon_counts(row, value), axis=1) else: - codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1) + codon_counts[group_name, key] = codon_counts.apply( + lambda row: sum_codon_counts(row, value), axis=1) list_classes_names.append(key) - if filter: + if keep_only_groups: return codon_counts.loc[:, ([group_name], list_classes_names)] else: return codon_counts @@ -502,7 +519,8 @@ def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', fi def gene_wide_codon_usage( codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): + verbose=False, return_more=False, ref_filter_dict=None, + check_colsums=False): """ Compute codon usage biases "gene-wide" as the standardized difference between a gene's codon proportions and global @@ -532,16 +550,16 @@ using the "l1" norm (which, for positive-only values amounts to the sum). # codon_proportions.style.hide(axis="index") if verbose: display(codon_proportions.head(3)) - # Check that the sum of proportions (columns) for a gene is 1 - colsums = codon_proportions.sum(axis=1).values - # Due to imprecision in float arithmetics, - # we can only check that the sums are close to 1 - - - ## I put this assert in comment because after grouping (either by max or by sum), - ## the distribution is too skewed to have an optimal normalization - ## I am not sure about the meaning of normalizing as skewed data as we have - #assert np.allclose(colsums, np.full(len(colsums), 1)) + # The assert has been made optional because after grouping + # (either by max or by sum), the distribution is too skewed + # to have an optimal normalization + # I am not sure about the meaning of normalizing as skewed data as we have + if check_colsums: + # Check that the sum of proportions (columns) for a gene is 1 + colsums = codon_proportions.sum(axis=1).values + # Due to imprecision in float arithmetics, + # we can only check that the sums are close to 1 + assert np.allclose(colsums, np.full(len(colsums), 1)) if ref_filter_dict is None: counts_for_global = codon_counts else: @@ -777,7 +795,8 @@ across genes) so that they are more comparable between codons. 
def aa_usage( codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): + verbose=False, return_more=False, ref_filter_dict=None, + check_colsums=False): """ Compute amino-acid usage biases as the standardized difference between a gene's amino-acid proportions @@ -812,10 +831,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum). # aa_proportions.style.hide(axis="index") if verbose: display(aa_proportions.head(3)) - # Checking that proportions sum to 1 - colsums = aa_proportions.sum(axis=1) - # Same here since the normalization is working as good on skewed distribution - #assert np.allclose(colsums, np.full(len(colsums), 1)) + # The assert has been made optional since the normalization is working + # as good on skewed distribution + if check_colsums: + # Checking that proportions sum to 1 + colsums = aa_proportions.sum(axis=1) + assert np.allclose(colsums, np.full(len(colsums), 1)) # Then, computing the global amino-acid proportions if ref_filter_dict is None: counts_for_global = summed_by_aa @@ -946,7 +967,8 @@ def codon_influence_in_components( def codon_usage_pca( usage_data, - figs_dir=None, hue="chrom", exclude_cols=None, plot_more_components=False, + figs_dir=None, hue="chrom", exclude_cols=None, + plot_more_components=False, formats=None, cols_are_codons=True): """ Perform Principal Component Analysis on *usage_data*. @@ -993,22 +1015,22 @@ def codon_usage_pca( (fig, axes) = plt.subplots(3, 2, figsize=(16, 25)) sns.scatterplot( data=transformed_data, - x=0, y=1, hue=hue, marker=".", ax=axes[0,0]) + x=0, y=1, hue=hue, marker=".", ax=axes[0, 0]) sns.scatterplot( data=transformed_data, - x=2, y=3, hue=hue, marker=".", ax=axes[0,1]) + x=2, y=3, hue=hue, marker=".", ax=axes[0, 1]) sns.scatterplot( data=transformed_data, - x=4, y=5, hue=hue, marker=".", ax=axes[1,0]) + x=4, y=5, hue=hue, marker=".", ax=axes[1, 0]) sns.scatterplot( data=transformed_data, - x=6, y=7, hue=hue, marker=".", ax=axes[1,1]) + x=6, y=7, hue=hue, marker=".", ax=axes[1, 1]) sns.scatterplot( data=transformed_data, - x=8, y=9, hue=hue, marker=".", ax=axes[2,0]) + x=8, y=9, hue=hue, marker=".", ax=axes[2, 0]) sns.scatterplot( data=transformed_data, - x=10, y=11, hue=hue, marker=".", ax=axes[2,1]) + x=10, y=11, hue=hue, marker=".", ax=axes[2, 1]) else: (fig, axes) = plt.subplots(1, 2, figsize=(16, 8)) sns.scatterplot( @@ -1028,7 +1050,9 @@ def codon_usage_pca( if cols_are_codons: codon_influence_in_components( pca.components_, usage_data.columns, - figs_dir=figs_dir, more_components=plot_more_components, formats=formats) + figs_dir=figs_dir, + more_components=plot_more_components, + formats=formats) return (pca, transformed_data) -- GitLab
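
A note on the TODO left in sum_codon_counts(): the pandas-native row[codons].sum() mentioned there should give the same result as summing a list comprehension, and row[codons].max() could likewise replace the explicit loop in max_codon_counts(). Below is a minimal sketch of that equivalence; the table and column names are made up for illustration and are not taken from the module (whose real tables appear to use two-level (aa, codon) column labels, judging from codon_counts[group_name, key]).

    import pandas as pd

    # Hypothetical codon-count table: one row per gene, one column per codon.
    codon_counts = pd.DataFrame(
        {"AAA": [3, 0], "AAG": [1, 5], "TGA": [1, 1]},
        index=["gene1", "gene2"],
    )
    codons = ["AAA", "AAG"]

    # Row-wise sum of the selected codons, applied per row
    # as in the patched sum_codon_counts().
    explicit = codon_counts.apply(
        lambda row: sum(row[cod] for cod in codons), axis=1)

    # pandas-native alternative suggested by the TODO comment.
    native = codon_counts[codons].sum(axis=1)

    assert (explicit == native).all()

Besides sidestepping the W0622 question about the built-in sum, the vectorized form avoids a per-row Python loop, which may matter on genome-sized tables.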
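
On the W0640 (cell-var-from-loop) remark in group_codons_by_class(): as the added comment notes (with a Stack Overflow reference), the late-binding pitfall only bites when a closure created in a loop is called after the loop has moved on; here each lambda is consumed immediately by apply(), so the warning is a false alarm. If one ever wants to silence it without a pragma, binding the loop variable through a default argument is a common idiom. A standalone sketch of the difference (not code from the module):

    # Late binding: every closure sees the final value of i.
    late = [lambda: i for i in range(3)]
    print([f() for f in late])   # [2, 2, 2]

    # A default argument freezes the value of i at definition time.
    bound = [lambda i=i: i for i in range(3)]
    print([f() for f in bound])  # [0, 1, 2]

In the patched loop this would amount to something like lambda row, codons=value: max_codon_counts(row, codons), which keeps the behaviour identical while making the binding explicit.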