From 6ef404308b8111488fa183dea9538b2ec8484900 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Mon, 6 Nov 2023 12:04:01 +0100 Subject: [PATCH] Linting code. --- libcodonusage/libcodonusage.py | 118 ++++++++++++++++++++------------- 1 file changed, 71 insertions(+), 47 deletions(-) diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index d16d43a..0bc6a6d 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -211,7 +211,6 @@ def detect_fishy_genes(codon_counts): A table of boolean criteria is returned, with one line per gene. """ - def display_gene_set(gene_set, max_size=10): """ Print out genes in a gene set, depending on their number. @@ -445,7 +444,8 @@ SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})" def remove_codons(codon_counts, codon_list): """ - Filter out codons in a table *codon_counts* based on codons present in the list *codon_list* (like stop codons). + Filter out codons in a table *codon_counts* based on codons + present in the list *codon_list* (like stop codons). """ codon_counts.drop(columns=codon_list, inplace=True) return codon_counts @@ -453,17 +453,23 @@ def remove_codons(codon_counts, codon_list): def sum_codon_counts(row, codons): """ - Perform the row-wise sum of codon counts for the codons present in *codons* list given the row *row*. + Perform the row-wise sum of codon counts for the codons + present in *codons* list given the row *row*. """ - sum = 0 - for cod in codons: - sum += row[cod] - return sum + # TODO: try row[codons].sum() + # sum = 0 + # for cod in codons: + # sum += row[cod] + # return sum + # Possibly more efficient, avoids the following pylint warning: + # W0622: Redefining built-in 'sum' (redefined-builtin) + return sum([row[cod] for cod in codons]) def max_codon_counts(row, codons): """ - Return the row-wise maximum of codon counts for the codons present in *codons* list given the row *row*. + Return the row-wise maximum of codon counts for the codons + present in *codons* list given the row *row*. """ counts_codons = [] for cod in codons: @@ -471,30 +477,41 @@ def max_codon_counts(row, codons): return max(counts_codons) -def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', filter=False): +def group_codons_by_class( + codon_counts, group_name, dict_classes, + mode="max", keep_only_groups=False): """ Group codons given specific classes in *codon_counts* table. - *group_name* contains the name of the grouping, and plays the role of aa names in the original - codon counts table. - *dict_classes* contains the different classes under this grouping as keys and the associated - list of codons as values. - *mode* defines the way grouping is computed. If mode is 'max', the maximum value of counts of codons belonging - to the same class is used for the grouped class. Otherwise, the sum of counts values for all codons belonging + *group_name* contains the name of the grouping, and plays the role + of aa names in the original codon counts table. + *dict_classes* contains the different classes under this grouping + as keys and the associated list of codons as values. + *mode* defines the way grouping is computed. + If mode is "max", the maximum value of counts of codons belonging to the same class is used for the grouped class. - *filter* is a boolean set to True if you want to filter out other codons than the ones specified in - dict_classes. 
If set to False (default), the original codon_counts table is returned with additionnal columns for - the grouped_classes. + Otherwise, the sum of counts values for all codons belonging to + the same class is used for the grouped class. + *keep_only_groups* is a boolean set to True if you want to filter out + other codons than the ones specified in dict_classes. + If set to False (default), the original codon_counts table + is returned with additional columns for the grouped_classes. """ - list_classes = list(dict_classes.items()) list_classes_names = [] + # pylint issues the following warning: + # "W0640: Cell variable value defined in loop (cell-var-from-loop)" + # Since the lambda function is used immediately, + # this should not be an actual issue + # (see https://stackoverflow.com/q/25314547/1878788 and answers) for key, value in dict_classes.items(): - if mode == 'max': - codon_counts[group_name, key] = codon_counts.apply(lambda row: max_codon_counts(row, value), axis=1) + if mode == "max": + codon_counts[group_name, key] = codon_counts.apply( + lambda row: max_codon_counts(row, value), axis=1) else: - codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1) + codon_counts[group_name, key] = codon_counts.apply( + lambda row: sum_codon_counts(row, value), axis=1) list_classes_names.append(key) - if filter: + if keep_only_groups: return codon_counts.loc[:, ([group_name], list_classes_names)] else: return codon_counts @@ -502,7 +519,8 @@ def group_codons_by_class(codon_counts, group_name, dict_classes, mode='max', fi def gene_wide_codon_usage( codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): + verbose=False, return_more=False, ref_filter_dict=None, + check_colsums=False): """ Compute codon usage biases "gene-wide" as the standardized difference between a gene's codon proportions and global @@ -532,16 +550,16 @@ using the "l1" norm (which, for positive-only values amounts to the sum). # codon_proportions.style.hide(axis="index") if verbose: display(codon_proportions.head(3)) - # Check that the sum of proportions (columns) for a gene is 1 - colsums = codon_proportions.sum(axis=1).values - # Due to imprecision in float arithmetics, - # we can only check that the sums are close to 1 - - - ## I put this assert in comment because after grouping (either by max or by sum), - ## the distribution is too skewed to have an optimal normalization - ## I am not sure about the meaning of normalizing as skewed data as we have - #assert np.allclose(colsums, np.full(len(colsums), 1)) + # The assert has been made optional because after grouping + # (either by max or by sum), the distribution is too skewed + # to have an optimal normalization + # I am not sure about the meaning of normalizing as skewed data as we have + if check_colsums: + # Check that the sum of proportions (columns) for a gene is 1 + colsums = codon_proportions.sum(axis=1).values + # Due to imprecision in float arithmetics, + # we can only check that the sums are close to 1 + assert np.allclose(colsums, np.full(len(colsums), 1)) if ref_filter_dict is None: counts_for_global = codon_counts else: @@ -777,7 +795,8 @@ across genes) so that they are more comparable between codons. 
def aa_usage( codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): + verbose=False, return_more=False, ref_filter_dict=None, + check_colsums=False): """ Compute amino-acid usage biases as the standardized difference between a gene's amino-acid proportions @@ -812,10 +831,12 @@ using the "l1" norm (which, for positive-only values amounts to the sum). # aa_proportions.style.hide(axis="index") if verbose: display(aa_proportions.head(3)) - # Checking that proportions sum to 1 - colsums = aa_proportions.sum(axis=1) - # Same here since the normalization is working as good on skewed distribution - #assert np.allclose(colsums, np.full(len(colsums), 1)) + # The assert has been made optional since the normalization is working + # as good on skewed distribution + if check_colsums: + # Checking that proportions sum to 1 + colsums = aa_proportions.sum(axis=1) + assert np.allclose(colsums, np.full(len(colsums), 1)) # Then, computing the global amino-acid proportions if ref_filter_dict is None: counts_for_global = summed_by_aa @@ -946,7 +967,8 @@ def codon_influence_in_components( def codon_usage_pca( usage_data, - figs_dir=None, hue="chrom", exclude_cols=None, plot_more_components=False, + figs_dir=None, hue="chrom", exclude_cols=None, + plot_more_components=False, formats=None, cols_are_codons=True): """ Perform Principal Component Analysis on *usage_data*. @@ -993,22 +1015,22 @@ def codon_usage_pca( (fig, axes) = plt.subplots(3, 2, figsize=(16, 25)) sns.scatterplot( data=transformed_data, - x=0, y=1, hue=hue, marker=".", ax=axes[0,0]) + x=0, y=1, hue=hue, marker=".", ax=axes[0, 0]) sns.scatterplot( data=transformed_data, - x=2, y=3, hue=hue, marker=".", ax=axes[0,1]) + x=2, y=3, hue=hue, marker=".", ax=axes[0, 1]) sns.scatterplot( data=transformed_data, - x=4, y=5, hue=hue, marker=".", ax=axes[1,0]) + x=4, y=5, hue=hue, marker=".", ax=axes[1, 0]) sns.scatterplot( data=transformed_data, - x=6, y=7, hue=hue, marker=".", ax=axes[1,1]) + x=6, y=7, hue=hue, marker=".", ax=axes[1, 1]) sns.scatterplot( data=transformed_data, - x=8, y=9, hue=hue, marker=".", ax=axes[2,0]) + x=8, y=9, hue=hue, marker=".", ax=axes[2, 0]) sns.scatterplot( data=transformed_data, - x=10, y=11, hue=hue, marker=".", ax=axes[2,1]) + x=10, y=11, hue=hue, marker=".", ax=axes[2, 1]) else: (fig, axes) = plt.subplots(1, 2, figsize=(16, 8)) sns.scatterplot( @@ -1028,7 +1050,9 @@ def codon_usage_pca( if cols_are_codons: codon_influence_in_components( pca.components_, usage_data.columns, - figs_dir=figs_dir, more_components=plot_more_components, formats=formats) + figs_dir=figs_dir, + more_components=plot_more_components, + formats=formats) return (pca, transformed_data) -- GitLab
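
A note on the TODO left in sum_codon_counts(): the pandas-native row[codons].sum() mentioned there should give the same result as summing a list comprehension, and row[codons].max() could likewise replace the explicit loop in max_codon_counts(). Below is a minimal sketch of that equivalence; the table and column names are made up for illustration and are not taken from the module (whose real tables appear to use two-level (aa, codon) column labels, judging from codon_counts[group_name, key]).

    import pandas as pd

    # Hypothetical codon-count table: one row per gene, one column per codon.
    codon_counts = pd.DataFrame(
        {"AAA": [3, 0], "AAG": [1, 5], "TGA": [1, 1]},
        index=["gene1", "gene2"],
    )
    codons = ["AAA", "AAG"]

    # Row-wise sum of the selected codons, applied per row
    # as in the patched sum_codon_counts().
    explicit = codon_counts.apply(
        lambda row: sum(row[cod] for cod in codons), axis=1)

    # pandas-native alternative suggested by the TODO comment.
    native = codon_counts[codons].sum(axis=1)

    assert (explicit == native).all()

Besides sidestepping the W0622 question about the built-in sum, the vectorized form avoids a per-row Python loop, which may matter on genome-sized tables.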
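
On the W0640 (cell-var-from-loop) remark in group_codons_by_class(): as the added comment notes (with a Stack Overflow reference), the late-binding pitfall only bites when a closure created in a loop is called after the loop has moved on; here each lambda is consumed immediately by apply(), so the warning is a false alarm. If one ever wants to silence it without a pragma, binding the loop variable through a default argument is a common idiom. A standalone sketch of the difference (not code from the module):

    # Late binding: every closure sees the final value of i.
    late = [lambda: i for i in range(3)]
    print([f() for f in late])   # [2, 2, 2]

    # A default argument freezes the value of i at definition time.
    bound = [lambda i=i: i for i in range(3)]
    print([f() for f in bound])  # [0, 1, 2]

In the patched loop this would amount to something like lambda row, codons=value: max_codon_counts(row, codons), which keeps the behaviour identical while making the binding explicit.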