From 9b4e17e3894d3280ad6b202eef549aeb02a806a7 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Wed, 4 Oct 2023 11:41:33 +0200 Subject: [PATCH] Optional fillna in exclude_all_nan_cols. The default behaviour has been changed to be of more general use. The previous default was to fill the NaNs not belonging to all-NaN columns with 0, which was suitable for standardized usage biases (which are expected to be centered on 0), but we might prefer to keep NaNs if we want to later replace those values with something else when dealing with usage metrics other than biases. If a single filling value is desired, it can be set using the argument fill_other_nas. Implementation detail: The fillna still happens, but with a default value of np.nan, which should be OK because filling NaN with NaN leaves the values unchanged. --- libcodonusage/__init__.py | 2 +- libcodonusage/libcodonusage.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py index 2065ff6..e82dd57 100644 --- a/libcodonusage/__init__.py +++ b/libcodonusage/__init__.py @@ -1,6 +1,6 @@ __copyright__ = "Copyright (C) 2022-2023 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = "0.27.5" +__version__ = "0.28.0" from .libcodonusage import ( aa2colour, aa_usage, diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py index 2be726c..f63e15b 100644 --- a/libcodonusage/libcodonusage.py +++ b/libcodonusage/libcodonusage.py @@ -794,27 +794,28 @@ across genes) so that they are more comparable between amino-acids. return standardized_aa_usage_biases -def exclude_all_nan_cols(standardized_usage_biases): +def exclude_all_nan_cols(usage_table, fill_other_nas=np.nan): """ - Detect columns in *standardized_usage_biases* that contain only NaNs - and remove them from the table. + Detect columns in *usage_table* that contain only NaNs + and remove them from the table. Other NaN values are replaced + with *fill_other_nas*.
""" render_md(""" -Standardization may result in division by zero for usage biases +Standardization may result in division by zero for usage data that have a zero standard deviation. This is expected to be the case for "by amino-acid" usage biases for codons corresponding to amino-acids having only one codon: methionine (M) and tryptophan (W). """) - all_nan_cols = standardized_usage_biases.columns[ - standardized_usage_biases.isna().all()] + all_nan_cols = usage_table.columns[ + usage_table.isna().all()] if len(all_nan_cols): render_md("The following columns contain only NaNs:") display(all_nan_cols) render_md("This likely resulted from a division by zero.") render_md("These columns will be excluded.") return ( - standardized_usage_biases.drop(columns=all_nan_cols).fillna(0), + usage_table.drop(columns=all_nan_cols).fillna(fill_other_nas), all_nan_cols) -- GitLab