From 0390d53dcd76f8e324ed0ff1e882d92c66340d15 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 4 Mar 2022 15:54:15 +0100
Subject: [PATCH] Fixed column order, nicer output.

---
 libcodonusage/__init__.py      |  3 +-
 libcodonusage/libcodonusage.py | 92 +++++++++++++++++++++-------------
 2 files changed, 58 insertions(+), 37 deletions(-)

diff --git a/libcodonusage/__init__.py b/libcodonusage/__init__.py
index 6f7ce6d..cd647f6 100644
--- a/libcodonusage/__init__.py
+++ b/libcodonusage/__init__.py
@@ -1,6 +1,6 @@
 __copyright__ = "Copyright (C) 2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = 0.5
+__version__ = 0.6
 from .libcodonusage import (
     aa2colour,
     codon2aa,
@@ -11,6 +11,7 @@ from .libcodonusage import (
     make_aa_codon_columns,
     make_counts_only,
     render_md,
+    save_counts_table,
     sort_counts_by_aa,
     violin_usage,
     violin_usage_vertical,
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 9bbda1f..e8c8eea 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -95,15 +95,15 @@ def load_counts_table(table_path, index_col="old_locus_tag"):
     there are other columns containing various pieces of information
     regarding those genes.
     """
-    render_md(f"Loading data from {table_path}...\n")
+    render_md(f"Loading data from [{table_path}]({table_path})...\n")
     codon_counts = pd.read_table(table_path, index_col=index_col)
     nb_genes = len(codon_counts)
     render_md(
         f"""
-        There are {nb_genes} genes in the table.
+    There are {nb_genes} genes in the table.
 
-        The table looks as follows (first 3 lines):
-        """)
+    The table looks as follows (first 3 lines):
+""")
     display(codon_counts.head(3))
     return codon_counts
 
@@ -176,33 +176,41 @@ def detect_fishy_genes(codon_counts):
 
     A table of boolean criteria is returned, with one line per gene.
     """
+
+    def display_gene_set(gene_set, max_size=10):
+        """
+        Print out genes in a gene set, depending on their number.
+        """
+        if gene_set and len(gene_set) <= max_size:
+            print("   ", ", ".join(gene_set))
+
     render_md("### Searching for mis-annotated genes")
     (criteria, gene_sets) = compute_criteria(codon_counts)
     render_md(
         """
-        We should avoid genes that are not in-frame. We can likely exclude
-        those that do not start with a valid start codon.
-        Here, this will be seen by looking at the translation of the first
-        codon.
-        If it is "*" (`start_stop`) or "-" (`wrong_start`),
-        the codon is not a valid start codon.
-        """)
+We should avoid genes that are not in-frame. We can likely exclude
+those that do not start with a valid start codon.
+Here, this will be seen by looking at the translation of the first
+codon.
+If it is "*" (`start_stop`) or "-" (`wrong_start`),
+the codon is not a valid start codon.
+""")
     wrong_start = gene_sets["wrong_start"]
-    render_md(f"There are {len(wrong_start)} genes that start with a `-`:")
-    display(wrong_start)
+    render_md(f"There are {len(wrong_start)} genes that start with a `-`.")
+    display_gene_set(wrong_start)
 
     start_stop = gene_sets["start_stop"]
     render_md(
-        f"There are {len(start_stop)} genes that start with a stop codon:")
-    display(start_stop)
+        f"There are {len(start_stop)} genes that start with a stop codon.")
+    display_gene_set(start_stop)
 
     no_met_start = gene_sets["no_met_start"]
     if no_met_start == wrong_start | start_stop:
         render_md(
             """
-            Genes not belonging to the `start_stop` or `wrong_start`
-            category all start with a methionine.
-            """)
+Genes not belonging to the `start_stop` or `wrong_start`
+category all start with a methionine.
+""")
     else:
         raise NotImplementedError(
             "There are genes that start with something else than 'M', "
@@ -211,45 +219,45 @@ def detect_fishy_genes(codon_counts):
     start_upstream = gene_sets["start_upstream"]
     render_md(
         f"{len(start_upstream)} genes have a possibly ill-defined"
-        " start position:")
-    display(start_upstream)
+        " start position.")
+    display_gene_set(start_upstream)
 
     start_upstream_met_start = gene_sets["start_upstream_met_start"]
     if start_upstream_met_start:
         render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'"
-                  " start:")
-        display(start_upstream_met_start)
+                  " start.")
+        display_gene_set(start_upstream_met_start)
 
         start_upstream_met_start_nostops = gene_sets[
             "start_upstream_met_start_nostops"]
         if start_upstream_met_start_nostops:
             render_md(f"{len(start_upstream_met_start_nostops)} of them"
-                      " also do not contain any stop:")
-            display(start_upstream_met_start_nostops)
+                      " also do not contain any stop.")
+            display_gene_set(start_upstream_met_start_nostops)
             render_md("Those genes could probably be kept.")
 
     has_stops_good_met_start = gene_sets["has_stops_good_met_start"]
     render_md(
         f"{len(has_stops_good_met_start)} genes contain stops"
         " but have a well defined start position with 'M'.")
-    if len(has_stops_good_met_start) <= 10:
-        display(has_stops_good_met_start)
+    display_gene_set(has_stops_good_met_start)
 
     good_met_start = gene_sets["good_met_start"]
     render_md(
         f"{len(good_met_start)} genes have a well defined "
         "start position with 'M'.")
     render_md(
-        """If genes that have stop readthrough are a known phenomenon,
-        only those among them that do not have a valid start codon
-        might be excluded.""")
+        """
+If genes that have stop readthrough are a known phenomenon,
+only those among them that do not have a valid start codon
+might be excluded.
+""")
 
     no_stop_good_met_start = gene_sets["no_stop_good_met_start"]
     render_md(
         f"{len(no_stop_good_met_start)} genes have a well defined"
         " start position with 'M' and contain no stop codon.")
-    if len(no_stop_good_met_start) <= 10:
-        display(no_stop_good_met_start)
+    display_gene_set(no_stop_good_met_start)
     return criteria
 
 
@@ -257,14 +265,17 @@ def make_counts_only(counts_table):
     """
     Integrate "informative" columns of *counts_table* into the index.
     """
+    # To ensure a stable order:
+    ref_info_cols = [
+        "old_locus_tag", "locus_tag", "length",
+        "start_codon", "expected_start_aa",
+        "first_stop", "nb_stops", "start_upstream", "end_downstream"]
+    # To ensure no info columns are lost:
     info_cols = [
         counts_table.index.name,
         *counts_table.columns.difference(codon2aa)]
-    assert set(info_cols) == {
-        "old_locus_tag", "locus_tag", "length",
-        "start_codon", "expected_start_aa",
-        "first_stop", "nb_stops", "start_upstream", "end_downstream"}
-    return counts_table.reset_index().set_index(info_cols)
+    assert set(info_cols) == set(ref_info_cols)
+    return counts_table.reset_index().set_index(ref_info_cols)
 
 
 def make_aa_codon_columns(counts_table):
@@ -302,6 +313,15 @@ def sort_counts_by_aa(counts_table):
     return sorted_counts
 
 
+def save_counts_table(counts_table, table_path):
+    """
+    Save table *counts_table* at *table_path*.
+    """
+    table_path.parent.mkdir(parents=True, exist_ok=True)
+    counts_table.to_csv(table_path, sep="\t")
+    render_md(f"The table was saved at [{table_path}]({table_path}).")
+
+
 def load_bias_table(table_path, nb_cluster_series=2):
     """
     Load a table containing by-amino-acid codon usage biases.
-- 
GitLab