From faf04a848c03ff7c3a11d34ef8b634fc2d68370e Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Thu, 14 Sep 2023 13:18:58 +0200
Subject: [PATCH] Add checks on column formats.

---
 libcodonusage/libcodonusage.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 62c1259..4c2e6a6 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -502,6 +502,19 @@ across genes) so that they are more comparable between codons.
     return standardized_codon_usage_biases
 
 
+def check_aa_codon_columns(table):
+    """
+    Check that the columns of *table* correspond to (aa, codon) pairs.
+
+    Raise ValueError unless *table.columns* has exactly two levels,
+    named "aa" and "codon", in that order.
+    """
+    if list(table.columns.names) != ["aa", "codon"]:
+        raise ValueError(
+            "Codon proportions table should have two levels: "
+            "'aa' and 'codon'")
+
+
 def compute_rscu(codon_proportions_by_aa):
     """
     Compute Relative Syninymous Codon Usage from proportions in genes.
@@ -513,6 +526,7 @@ def compute_rscu(codon_proportions_by_aa):
     where the first level is the amino-acid name, and the second level
     the codon.
     """
+    check_aa_codon_columns(codon_proportions_by_aa)
     degeneracy = pd.Series(
         # concat "flattens" the list of iterables given as arguments
         # (list of tuples of repeated degeneracy values)
@@ -558,6 +572,7 @@ def by_aa_codon_usage(
     restricted to the genes where the *index_level* has the *index_value*
     for all those pairs.
     """
+    check_aa_codon_columns(codon_counts)
     render_md(f"""
 We will compute codon usage "by amino-acid", by looking at the
 proportion of codons for each amino-acid within a gene's CDS.
@@ -667,6 +682,7 @@ def aa_usage(
     restricted to the genes where the *index_level* has the *index_value*
     for all those pairs.
     """
+    check_aa_codon_columns(codon_counts)
     render_md("""
 We will compute amino-acid usage, by looking at the
 proportions of amino-acids within a gene's CDS.
@@ -892,6 +908,7 @@ def centroid_usage(codon_counts, all_nan_cols):
     For each amino-acid, there is one centroid per codon, where the
     proportion for this codon is set to 1.0, and 0.0 for the other codons.
     """
+    check_aa_codon_columns(codon_counts)
     summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
     global_usage = codon_counts.sum(axis=0)
     global_summed_by_aa = global_usage.groupby(level=0).sum()
-- 
GitLab