From b9011ed25e3de307bdbca326c1b72e629cb58317 Mon Sep 17 00:00:00 2001
From: Marie Anselmet <marie.anselmet@pasteur.fr>
Date: Fri, 20 Oct 2023 11:26:02 +0200
Subject: [PATCH] Add little docu to new fonctions

---
 build/lib/libcodonusage/__init__.py      |   45 +
 build/lib/libcodonusage/libcodonusage.py | 1797 ++++++++++++++++++++++
 libcodonusage/libcodonusage.py           |   18 +-
 3 files changed, 1856 insertions(+), 4 deletions(-)
 create mode 100644 build/lib/libcodonusage/__init__.py
 create mode 100644 build/lib/libcodonusage/libcodonusage.py

diff --git a/build/lib/libcodonusage/__init__.py b/build/lib/libcodonusage/__init__.py
new file mode 100644
index 0000000..45862d8
--- /dev/null
+++ b/build/lib/libcodonusage/__init__.py
@@ -0,0 +1,45 @@
+__copyright__ = "Copyright (C) 2022-2023 Blaise Li"
+__licence__ = "GNU GPLv3"
+__version__ = "0.28.1"
+from .libcodonusage import (
+    aa2colour,
+    aa_usage,
+    by_aa_codon_usage,
+    centroid_usage,
+    codon2aa,
+    codon_usage_pca,
+    columns_by_aa,
+    compare_clusterings,
+    compute_rscu,
+    detect_fishy_genes,
+    exclude_all_nan_cols,
+    extract_top_genes_from_cluster,
+    filter_on_idx_levels,
+    find_most_biased_genes,
+    find_valley,
+    format_codon_labels,
+    gene_wide_codon_usage,
+    group_codons_by_sum,
+    load_bias_table,
+    load_counts_table,
+    load_table_with_info_index,
+    make_aa_codon_columns,
+    make_cluster_table,
+    make_centroids_cluster_finder,
+    make_counts_only,
+    plot_codon_usage_for_gene_list,
+    remove_codons,
+    render_md,
+    save_counts_table,
+    sort_counts_by_aa,
+    split_info_index,
+    star2stop,
+    sum_codon_counts,
+    to_long_form,
+    violin_usage,
+    violin_usage_vertical,
+    violin_usage_by_clusters,
+    violin_usage_by_clusters_splitby,
+    violin_with_thresh,
+    write_cluster_lists,
+    )
diff --git a/build/lib/libcodonusage/libcodonusage.py b/build/lib/libcodonusage/libcodonusage.py
new file mode 100644
index 0000000..603a8bd
--- /dev/null
+++ b/build/lib/libcodonusage/libcodonusage.py
@@ -0,0 +1,1797 @@
+#!/usr/bin/env python3
+# Copyright (C) 2021-2023 Blaise Li
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""Functions used in Jupyter notebooks."""
+from http.cookies import Morsel
+from itertools import combinations
+import json
+from operator import attrgetter, itemgetter
+from pathlib import Path
+from unittest import mock
+# python3 -m pip install cytoolz
+from cytoolz import concat, groupby, unique, valmap
+# To render mardown in a Jupyter notebook on gitlab
+from IPython.display import display
+from IPython.core.display import HTML
+# python3 -m pip install markdown
+from markdown import markdown
+# Basic plotting library for Python
+# python3 -m pip install matplotlib
+# See https://matplotlib.org/
+import matplotlib.pyplot as plt
+# Python library to make some nicer plots
+# python3 -m pip install seaborn
+# See https://seaborn.pydata.org/
+import seaborn as sns
+# Useful Python package for bioinformatics.
+# python3 -m pip install biotite
+# See https://www.biotite-python.org/install.html
+import biotite.sequence.graphics as bgraphs
+from biotite.sequence import CodonTable, NucleotideSequence
+# python3 -m pip install numpy
+import numpy as np
+# Python module to handle tabular data
+# python3 -m pip install pandas
+# See https://pandas.pydata.org/pandas-docs/stable/index.html
+import pandas as pd
+# Python module that facilitates the exploration of tbular data
+# python3 -m pip install plydata
+# from plydata import define, pull, query
+# python3 -m pip install scipy
+from scipy.linalg import LinAlgError
+from scipy.spatial.distance import sqeuclidean as sqdist
+from scipy.stats import gaussian_kde
+# Python library with useful data-processing features
+# python3 -m pip install scikit-learn
+# https://scikit-learn.org/stable/install.html
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import normalize
+# Python module to vizualize set intersections
+# python3 -m pip install upsetplot
+from upsetplot import from_indicators, UpSet
+
+
+fmt_metadata = {
+    "png": {"creationDate": None},
+    "svg": {
+        "Creator": None, "Date": None,
+        "Title": ("Distribution of standardized codon usages biases"
+                  " (by aa), by chromosome")}
+}
+
+
+def render_md(md_str):
+    """
+    Render a markdown string *md_str* in a Jupyter notebook.
+
+    More direct rendering is not possible on gitlab for security reasons.
+    """
+    display(HTML(markdown(md_str)))
+
+
+codon2aa = CodonTable.load(11).codon_dict()
+# Column headers made from (aa, codon) pairs
+codon_headers = pd.MultiIndex.from_tuples(
+    ((aa, codon) for (codon, aa) in codon2aa.items()),
+    names=("aa", "codon"))
+# Groups of column headers for each amino-acid
+# key: aa, value: list of (aa, codon) pairs
+columns_by_aa = groupby(itemgetter(0), codon_headers)
+
+
+######################################
+# Associating colours to amino-acids #
+######################################
+# We load amino-acid colouring information to use in graphical representations.
+# We'll use colour schemes provided by the biotite package:
+# https://www.biotite-python.org/examples/gallery/sequence/color_schemes_protein.html
+# We look directly at the json file
+# instead of using the "native" biotite mechanisms
+with Path(bgraphs.colorschemes._scheme_dir).joinpath(
+        "jalview_zappo.json").open() as fh:
+    colscheme = json.load(fh)
+# We add the black colour for the stop "amino-acid" and associated codons
+# key: amino-acid
+# value: hexadecimal html colour code
+aa2colour = {**colscheme["colors"], "*": '#000000'}
+######################################
+# Associating colours to nucleotides #
+######################################
+nuc_alphabet = NucleotideSequence.alphabet_unamb
+nuc_colours = bgraphs.get_color_scheme("rainbow", nuc_alphabet)
+nuc2colour = dict(zip(nuc_alphabet, nuc_colours))
+
+
+def load_counts_table(
+        table_path, index_col="old_locus_tag", index_unique=True):
+    """
+    Load a table of pre-computed codon counts at *table_path*.
+
+    The lines correspond to genes.
+    Besides the columns containing the counts for each codon,
+    there are other columns containing various pieces of information
+    regarding those genes.
+
+    If *index_unique* is True, a ValueError error will be raised if
+    some elements in the the column *index_col* are not unique.
+    """
+    render_md(f"Loading data from [{table_path}]({table_path})...\n")
+    codon_counts = pd.read_table(table_path, index_col=index_col)
+    if index_unique and not codon_counts.index.is_unique:
+        raise ValueError(f"Index {index_col} contains repeated values.\n")
+    nb_genes = len(codon_counts)
+    render_md(
+        f"""
+    There are {nb_genes} genes in the table.
+
+    The table looks as follows (first 3 lines):
+""")
+    display(codon_counts.head(3))
+    return codon_counts
+
+
+def compute_criteria(codon_counts):
+    """
+    Prepare a Boolean table and corresponging gene sets.
+
+    There are one column and one gene set for each criterion.
+    """
+    wrong_start_col = codon_counts["expected_start_aa"] == "-"
+    start_stop_col = codon_counts["expected_start_aa"] == "*"
+    no_met_start_col = codon_counts["expected_start_aa"] != "M"
+    start_upstream_col = codon_counts["start_upstream"]
+    has_stops_col = codon_counts["nb_stops"] > 0
+    start_upstream_met_start_col = start_upstream_col & ~no_met_start_col
+    start_upstream_met_start_nostops_col = (
+        start_upstream_met_start_col & ~has_stops_col)
+    good_met_start_col = ~start_upstream_col & ~no_met_start_col
+    has_stops_good_met_start_col = (
+        has_stops_col & ~no_met_start_col
+        & ~start_upstream_met_start_col)
+    no_stop_good_met_start_col = (
+        ~has_stops_col & ~no_met_start_col
+        & ~start_upstream_met_start_col)
+    criteria = pd.DataFrame({
+        "wrong_start": wrong_start_col,
+        "start_stop": start_stop_col,
+        "no_met_start": no_met_start_col,
+        "start_upstream": start_upstream_col,
+        "has_stops": has_stops_col,
+        "start_upstream_met_start": start_upstream_met_start_col,
+        "start_upstream_met_start_nostops": (
+            start_upstream_met_start_nostops_col),
+        "good_met_start": good_met_start_col,
+        "has_stops_good_met_start": has_stops_good_met_start_col})
+    render_md("Number of genes fitting various selection criteria:\n\n")
+    display(criteria.agg(sum))
+    render_md("Upset plot of the non-empty categories:\n\n")
+    fig = plt.figure()
+    try:
+        UpSet(
+            from_indicators(*[criteria.columns], data=criteria),
+            show_counts=True).plot(fig=fig)
+    except AttributeError:
+        pass
+    display(fig)
+    plt.close(fig)
+    gene_sets = {
+        "wrong_start": set(criteria[wrong_start_col].index),
+        "start_stop": set(criteria[start_stop_col].index),
+        "no_met_start": set(criteria[no_met_start_col].index),
+        "start_upstream": set(criteria[start_upstream_col].index),
+        # Not used
+        # "has_stops": set(criteria[has_stops_col].index),
+        "start_upstream_met_start": set(
+            criteria[start_upstream_met_start_col].index),
+        "start_upstream_met_start_nostops": set(
+            criteria[start_upstream_met_start_nostops_col].index),
+        "good_met_start": set(criteria[good_met_start_col].index),
+        "has_stops_good_met_start": set(
+            criteria[has_stops_good_met_start_col].index),
+        "no_stop_good_met_start": set(
+            criteria[no_stop_good_met_start_col].index)}
+    return (criteria, gene_sets)
+
+
+def detect_fishy_genes(codon_counts):
+    """
+    Run diagnostics about genes included in table *codon_counts*.
+
+    These should help decide whether to exclude some of the genes.
+
+    A table of boolean criteria is returned, with one line per gene.
+    """
+
+    def display_gene_set(gene_set, max_size=10):
+        """
+        Print out genes in a gene set, depending on their number.
+        """
+        if gene_set and len(gene_set) <= max_size:
+            print("   ", ", ".join(gene_set))
+
+    render_md("### Searching for mis-annotated genes")
+    (criteria, gene_sets) = compute_criteria(codon_counts)
+    render_md(
+        """
+We should avoid genes that are not in-frame. We can likely exclude
+those that do not start with a valid start codon.
+Here, this will be seen by looking at the translation of the first
+codon.
+If it is "*" (`start_stop`) or "-" (`wrong_start`),
+the codon is not a valid start codon.
+""")
+    wrong_start = gene_sets["wrong_start"]
+    render_md(f"There are {len(wrong_start)} genes that start with a `-`.")
+    display_gene_set(wrong_start)
+
+    start_stop = gene_sets["start_stop"]
+    render_md(
+        f"There are {len(start_stop)} genes that start with a stop codon.")
+    display_gene_set(start_stop)
+
+    no_met_start = gene_sets["no_met_start"]
+    if no_met_start == wrong_start | start_stop:
+        render_md(
+            """
+Genes not belonging to the `start_stop` or `wrong_start`
+category all start with a methionine.
+""")
+    else:
+        raise NotImplementedError(
+            "There are genes that start with something else than 'M', "
+            "'-' or '*'.")
+
+    start_upstream = gene_sets["start_upstream"]
+    render_md(
+        f"{len(start_upstream)} genes have a possibly ill-defined"
+        " start position.")
+    display_gene_set(start_upstream)
+
+    start_upstream_met_start = gene_sets["start_upstream_met_start"]
+    if start_upstream_met_start:
+        render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'"
+                  " start.")
+        display_gene_set(start_upstream_met_start)
+
+        start_upstream_met_start_nostops = gene_sets[
+            "start_upstream_met_start_nostops"]
+        if start_upstream_met_start_nostops:
+            render_md(f"{len(start_upstream_met_start_nostops)} of them"
+                      " also do not contain any stop.")
+            display_gene_set(start_upstream_met_start_nostops)
+            render_md("Those genes could probably be kept.")
+
+    has_stops_good_met_start = gene_sets["has_stops_good_met_start"]
+    render_md(
+        f"{len(has_stops_good_met_start)} genes contain stops"
+        " but have a well defined start position with 'M'.")
+    display_gene_set(has_stops_good_met_start)
+
+    good_met_start = gene_sets["good_met_start"]
+    render_md(
+        f"{len(good_met_start)} genes have a well defined "
+        "start position with 'M'.")
+    render_md(
+        """
+If genes that have stop readthrough are a known phenomenon,
+only those among them that do not have a valid start codon
+might be excluded.
+""")
+
+    no_stop_good_met_start = gene_sets["no_stop_good_met_start"]
+    render_md(
+        f"{len(no_stop_good_met_start)} genes have a well defined"
+        " start position with 'M' and contain no stop codon.")
+    display_gene_set(no_stop_good_met_start)
+    return criteria
+
+
+def make_counts_only(
+        counts_table,
+        seq_id_kw="locus_tag", alt_tag_kw="old_locus_tag",
+        ref_info_cols=None):
+    """
+    Integrate "informative" columns of *counts_table* into the index.
+    """
+    # To ensure a stable order:
+    if ref_info_cols is None:
+        ref_info_cols = [
+            alt_tag_kw, seq_id_kw,
+            "start", "end", "length",
+            "start_codon", "expected_start_aa",
+            "first_stop", "nb_stops", "start_upstream", "end_downstream"]
+    # To ensure no info columns are lost:
+    info_cols = [
+        counts_table.index.name,
+        *counts_table.columns.difference(codon2aa)]
+    assert set(info_cols) == set(ref_info_cols)
+    return counts_table.reset_index().set_index(ref_info_cols)
+
+
+def make_aa_codon_columns(counts_table):
+    """Transform column headers into a (aa, codon) MultiIndex."""
+    counts_table.columns = pd.MultiIndex.from_tuples(
+        ((codon2aa[codon], codon) for codon in counts_table.columns),
+        names=("aa", "codon"))
+    render_md(
+        "We associated amino-acid information to codons, "
+        "as an extra level in the table columns.")
+    return counts_table
+
+
+def sort_counts_by_aa(counts_table):
+    """
+    Sort columns so that codons are grouped by corresponding amino-acid.
+    """
+    aacodons_order = list(aa2colour.keys())
+    # We need to include both aas and codons in the list
+    # because it will be used for both the aa and the codon levels
+    # when sorting the (aa, codon) MultiIndex
+    aacodons_order.extend(codon2aa.keys())
+
+    def aa_sortkey(col):
+        """
+        Key function to use when sorting a MultiIndex consisting in
+        (aa, codon) pairs.
+        """
+        return col.map(aacodons_order.index)
+    sorted_counts = counts_table.sort_index(
+        axis=1, level=(0, 1), ascending=(True, True), key=aa_sortkey)
+    render_md(
+        "The columns were sorted in order to group codons"
+        " by associated amino-acid.")
+    return sorted_counts
+
+
+def save_counts_table(counts_table, table_path):
+    """
+    Save table *counts_table* at *table_path*.
+    """
+    table_path.parent.mkdir(parents=True, exist_ok=True)
+    counts_table.to_csv(table_path, sep="\t")
+    render_md(f"The table was saved at [{table_path}]({table_path}).")
+
+
+def load_table_with_info_index(table_path, nb_info_cols, nb_cluster_series=0):
+    """
+    Load a table containing by-amino-acid codon counts or usage biases.
+
+    The table, located at *table_path*, should have tab-separated columns.
+    It is expected to contain a series of informative columns
+    preceding the columns containing the codon usage bias values.
+
+    The first *nb_info_cols* columns contain information about gene
+    identifiers and features extracted from CDS annotation data.
+
+    Then, there can be series of gene-clustering information, where each
+    column of the series corresponds to clusters defined based on
+    codon-usage biases among the codons coding an amino-acid.
+    There are no columns for amino-acids coded by only one codon (M and W).
+    *nb_cluster_series* specifies the number of such series.
+
+    The result is a pandas DataFrame object, where all the above information
+    is encoded as a MultiIndex, the codon counts or usage biases being in the
+    remaining columns.
+    """
+    return pd.read_csv(
+        table_path,
+        sep="\t",
+        # *nb_info_cols* starting levels from the initial table,
+        # plus levels corresponging to clustering information.
+        # By default, from 2 methods (nb_cluster_series=2):
+        # * `cluster_{aa}_kmeans` for each amino-acid
+        #   having more than one codon
+        # * `cluster_{aa}_full_bias` for each amino-acid
+        #   having more than one codon
+        index_col=list(
+            range(nb_info_cols + nb_cluster_series * (len(aa2colour) - 2))),
+        header=[0, 1])
+
+
+def split_info_index(table, keep_index_cols):
+    """
+    Split table *table* into info and data.
+
+    Info is supposed to be contained in the index of DataFrame *table*,
+    data in its columns.
+
+    Return a pair of tables, one for the data, one for the info,
+    where the index contains only the levels listed in *keep_index_cols*
+    """
+    drop_info_cols = [
+        colname for colname in table.index.names
+        if colname not in keep_index_cols]
+    info_table = table.reset_index()[
+        [*keep_index_cols, *drop_info_cols]].set_index(keep_index_cols)
+    # To avoid loss of index levels in input by side effect:
+    out_table = table.copy()
+    out_table.index = out_table.index.droplevel(drop_info_cols)
+    # To ensure indices have their levels in the same order
+    # in out_table and in infoinfo_table:
+    out_table = out_table.reset_index().set_index(keep_index_cols)
+    return (out_table, info_table)
+
+
+def filter_on_idx_levels(counts_table, filter_dict):
+    """
+    Filter a table *counts_table* based on values of certain index levels.
+
+    *filter_dict* should contain index level names as keys and values that
+    rows should have at this level to be included in the output
+    filtered table.
+    """
+    row_filter = np.all([
+        counts_table.index.get_level_values(idx_lvl) == idx_val
+        for (idx_lvl, idx_val) in filter_dict.items()], axis=0)
+    return counts_table[row_filter]
+
+
+# Codon usage calculations can be done in various ways.
+# Some examples are given at page 6500 (2 of the pdf) of
+# [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032)
+SUZUKI_DOI = "10.1016/j.febslet.2005.10.032"
+SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})"
+
+
+def remove_codons(codon_counts, codon_list):
+    """
+    Filter out codons in a table *codon_counts* based on codons present in the list *codon_list* (like stop codons).
+    """
+    codon_counts.drop(columns=codon_list, inplace=True)
+    return codon_counts
+
+
+def sum_codon_counts(row, codons):
+    """
+    Perform the row-wise sum of codon counts for the codons present in *codons* list given the row *row*.
+    """
+    sum = 0
+    for cod in codons:
+        sum += row[cod]
+    return sum
+
+
+def group_codons_by_sum(codon_counts, group_name, dict_classes, filter):
+    """
+    Group codons by sum given specific classes in *codon_counts* table.
+
+    *group_name* contains the name of the grouping, and plays the role of aa names in the original 
+    codon counts table.
+    *dict_classes* contains the different classes under this grouping as keys and the associated 
+    list of codons as values.
+    *filter* is a boolean set to True if you want to filter out other codons than the ones specified in
+    dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for
+    the grouped_classes.
+    """
+    list_classes = list(dict_classes.items())
+    list_classes_names = []
+    for key, value in dict_classes.items():
+        codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
+        list_classes_names.append(key)
+    if filter:
+        return codon_counts.loc[:, ([group_name], list_classes_names)]
+    else:
+        return codon_counts
+
+
+def gene_wide_codon_usage(
+        codon_counts,
+        verbose=False, return_more=False, ref_filter_dict=None):
+    """
+    Compute codon usage biases "gene-wide" as the standardized
+    difference between a gene's codon proportions and global
+    codon proportions (default).
+
+    If *ref_filter_dict* is not None, it should be a dict of
+    (index_level, index_value) pairs and the global codon proportions
+    will actually be proportions computed on the part of the data
+    restricted to the genes where the *index_level* has the *index_value*
+    for all those pairs.
+    """
+    render_md(f"""
+We will compute codon usage "gene-wide" (i.e. not by amino-acid),
+by looking at the proportion of codons within a gene's CDS.
+(This corresponds to R1 in {SUZUKI_LINK})
+""")
+    render_md("""
+To compute codon proportions, we can divide each line by its sum,
+or, equivalently (and hopefully more efficiently), normalize the data
+using the "l1" norm (which, for positive-only values amounts to the sum).
+""")
+    codon_proportions = pd.DataFrame(
+        normalize(codon_counts, norm="l1"),
+        index=codon_counts.index,
+        columns=codon_counts.columns)
+    # Doesn't seem to be working:
+    # codon_proportions.style.hide(axis="index")
+    if verbose:
+        display(codon_proportions.head(3))
+    # Check that the sum of proportions (columns) for a gene is 1
+    colsums = codon_proportions.sum(axis=1).values
+    # Due to imprecision in float arithmetics,
+    # we can only check that the sums are close to 1
+    #assert np.allclose(colsums, np.full(len(colsums), 1))
+    print("mean", np.mean(colsums))
+    assert np.isclose(np.mean(colsums), 1)
+    if ref_filter_dict is None:
+        counts_for_global = codon_counts
+    else:
+        counts_for_global = filter_on_idx_levels(
+            codon_counts, ref_filter_dict)
+    render_md("""
+To compute the usage biases, we also need to compute codon proportions
+in the global usage.
+""")
+    render_md("We compute the global usage, as the sum on columns.")
+    global_usage = counts_for_global.sum(axis=0)
+    render_md("Then we normalize it the same way as for the individual genes.")
+    # The values.reshape(1, -1) turns the Series data into a 2D array
+    # with one line. flatten() turns the data back to 1D
+    global_proportions = pd.Series(
+        normalize(global_usage.values.reshape(1, -1), norm="l1").flatten(),
+        index=global_usage.index)
+    # Can be 0.999999999999...
+    # if global_proportions.sum() != 1.:
+    #     display(global_proportions)
+    #     display(global_proportions.sum())
+    assert np.isclose(global_proportions.sum(), 1.)
+    # assert global_proportions.sum() == 1.
+    render_md("""
+Codon usage biases in genes can then be computed by subtracting
+the corresponding global proportion to a codon's proportion.
+""")
+    codon_usage_biases = codon_proportions.sub(global_proportions)
+    render_md("""
+We can standardize these values (by dividing by their standard deviation
+across genes) so that they are more comparable between codons.
+""")
+    standardized_codon_usage_biases = codon_usage_biases.div(
+        codon_usage_biases.std())
+    # Doesn't seem to be working:
+    # standardized_codon_usage_biases.style.hide(axis="index")
+    if verbose:
+        display(standardized_codon_usage_biases.head(3))
+    if return_more:
+        return {
+            "biases": standardized_codon_usage_biases,
+            "proportions": codon_proportions,
+            "global_proportions": global_proportions}
+    return standardized_codon_usage_biases
+
+
+def check_aa_codon_columns(table):
+    """
+    Check that the columns of *table* correspond to (aa, codon) pairs.
+    """
+    msg = "Codon proportions table should have two levels: 'aa' and 'codon'"
+    if table.columns.nlevels != 2:
+        raise ValueError(msg)
+    if table.columns.names[0] != "aa":
+        raise ValueError(msg)
+    if table.columns.names[1] != "codon":
+        raise ValueError(msg)
+
+
+def compute_rscu(codon_proportions_by_aa):
+    """
+    Compute Relative Synonymous Codon Usage from proportions in genes.
+
+    *codon_proportions_by_aa* should be a pandas DataFrame where
+    rows correspond to genes, and columns to codons. It contains
+    the proportions in the gene of the codon among those coding the
+    same amino-acid. The column index should be a pandas MultiIndex
+    where the first level is the amino-acid name, and the second level
+    the codon.
+
+    Return a pair where the first element is the rscu table, and
+    the second is the degeneracy table.
+    """
+    check_aa_codon_columns(codon_proportions_by_aa)
+    degeneracy = pd.Series(
+        # concat "flattens" the list of iterables given as arguments
+        # (list of tuples of repeated degeneracy values)
+        list(concat([
+            # we want stretches of the same degeneracy values
+            # of as many elements as there are codons in this degeneracy group,
+            # hence the tuple multiplication
+            ((degen),) * degen
+            # valmap transforms values in a dict given as second argument
+            # using the function given as first argument
+            # We extract only the value from the (key, value) pairs
+            # generated by the .items() methods of the resulting dict
+            # (Actually, we could probably have just used the .values() method)
+            for (_, degen) in valmap(
+                # len gives the length of each group,
+                # where a group contains the (aa, codon) pairs for a given aa,
+                # that is, the degeneracy for this group of codons
+                len,
+                # groupby creates a dict where keys are obtained from
+                # the function given as first argument and values are
+                # groups made from the iterable given as second argument
+                # (here, column names, which are (aa, codon) pairs)
+                # itemgetter(0) extracts the amino-acid from a pair (aa, codon)
+                groupby(
+                    itemgetter(0),
+                    codon_proportions_by_aa.columns)).items()])),
+        index=codon_proportions_by_aa.columns)
+    return (
+        codon_proportions_by_aa.mul(degeneracy),
+        degeneracy)
+
+
+def by_aa_codon_usage(
+        codon_counts,
+        verbose=False, return_more=False, ref_filter_dict=None):
+    """
+    Compute codon usage biases "by amino-acid" as the standardized
+    difference between a gene's codon proportions and global
+    codon proportions, where proportions are computed within
+    groups of same-amino-acid-coding codons instead of gene-wide.
+
+    If *ref_filter_dict* is not None, it should be a dict of
+    (index_level, index_value) pairs and the global codon proportions
+    will actually be proportions computed on the part of the data
+    restricted to the genes where the *index_level* has the *index_value*
+    for all those pairs.
+    """
+    check_aa_codon_columns(codon_counts)
+    render_md(f"""
+We will compute codon usage "by amino-acid", by looking at the
+proportion of codons for each amino-acid within a gene's CDS.
+(This corresponds to R2 in {SUZUKI_LINK})
+""")
+    render_md("""
+We first need to compute, for a given gene, the total number of codons
+corresponding to each amino-acid.
+""")
+    summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
+    render_md("""
+To compute codon proportions, we can then divide each codon counts by
+the total for the corresponding amino-acid.
+""")
+    codon_proportions = codon_counts.div(
+        summed_by_aa)
+    # Doesn't seem to be working:
+    # codon_proportions.style.hide(axis="index")
+    if verbose:
+        display(codon_proportions.head(3))
+    # There are some (genes, aa) pairs where the sum is zero.
+    # This is normal if that gene does not have that amino-acid.
+    sums = codon_proportions.groupby(level=0, axis=1).sum()
+    # Values in sums should be either zero or one.
+    all_ones = np.full(sums.shape, 1.)
+    all_zeroes = np.full(sums.shape, 0.)
+    # Due to imprecision in float arithmetics,
+    # we can only check that the sums are close to 1 or to 0
+    assert np.array_equal(
+        np.isclose(
+            sums.values, all_ones),
+        np.logical_not(
+            np.isclose(
+                sums.values, all_zeroes)))
+    if return_more:
+        render_md(f"""
+We will also compute RSCU (Relative Synonymous Codon Usage) by multiplying
+codon proportions by the degeneracy of the corresponding amino-acid, that is,
+by the number of different codons existing for that amino-acid.
+This amounts to computing codon counts relative to the average codon counts
+for that amino-acid.
+(This corresponds to R3 in {SUZUKI_LINK})
+""")
+        (rscu, degeneracy) = compute_rscu(codon_proportions)
+        render_md(f"""
+We will also compute R4 ({SUZUKI_LINK}) by computing codon counts relative
+to the maxium across synonymous codons of the counts for that amino-acid.
+""")
+        max_counts = codon_counts.T.groupby("aa").max().T
+        r4_table = codon_counts.div(
+            max_counts)
+    if ref_filter_dict is None:
+        counts_for_global = codon_counts
+    else:
+        counts_for_global = filter_on_idx_levels(
+            codon_counts, ref_filter_dict)
+    render_md("""
+To compute the usage biases, we also need to compute codon proportions
+in the global usage.
+""")
+    render_md("""
+We compute the global usage, as the sum of the counts for a given codon,
+across genes.
+""")
+    global_usage = counts_for_global.sum(axis=0)
+    if return_more:
+        # Same computation as for individual genes,
+        # but based on the total counts.
+        global_max_counts = global_usage.T.groupby("aa").max().T
+        global_r4 = global_usage.div(
+            global_max_counts)
+    render_md("Then we sum over codons corresponding to the same amino-acid.")
+    global_summed_by_aa = global_usage.groupby(level=0).sum()
+    render_md("""
+Dividing the global usage for each codon by the total for the corresponding
+amino-acid, we can now compute the global proportions the same way as for
+the individual genes.
+""")
+    global_proportions = global_usage.div(
+        global_summed_by_aa)
+    # The proportions of codons for a given amino-acid should sum to 1
+    assert all(
+        val == 1.
+        for val in global_proportions.groupby(level=0).sum())
+    render_md("""
+Codon usage biases in genes can then be computed by subtracting
+the corresponding global proportion to a codon's proportion.
+""")
+    codon_usage_biases = codon_proportions.sub(global_proportions)
+    render_md("""
+We can standardize these values (by dividing by their standard deviation
+across genes) so that they are more comparable between codons.
+""")
+    standardized_codon_usage_biases = codon_usage_biases.div(
+        codon_usage_biases.std())
+    # Doesn't seem to be working:
+    # standardized_codon_usage_biases.style.hide(axis="index")
+    if verbose:
+        display(standardized_codon_usage_biases.head(3))
+    if return_more:
+        return {
+            "biases": standardized_codon_usage_biases,
+            "proportions": codon_proportions,
+            "rscu": rscu,
+            "r4_table": r4_table,
+            "global_proportions": global_proportions,
+            "global_rscu": global_proportions.mul(degeneracy),
+            "global_r4": global_r4}
+    return standardized_codon_usage_biases
+
+
+def aa_usage(
+        codon_counts,
+        verbose=False, return_more=False, ref_filter_dict=None):
+    """
+    Compute amino-acid usage biases as the standardized
+    difference between a gene's amino-acid proportions
+    and global amino-acid proportions.
+
+    If *ref_filter_dict* is not None, it should be a dict of
+    (index_level, index_value) pairs and the global codon proportions
+    will actually be proportions computed on the part of the data
+    restricted to the genes where the *index_level* has the *index_value*
+    for all those pairs.
+    """
+    check_aa_codon_columns(codon_counts)
+    render_md("""
+We will compute amino-acid usage, by looking at the
+proportions of amino-acids within a gene's CDS.
+""")
+    render_md("""
+We first need to compute, for a given gene, the total number of codons
+corresponding to each amino-acid.
+""")
+    summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
+    render_md("""
+To compute amino-acid proportions, we can divide each line by its sum,
+or, equivalently (and hopefully more efficiently), normalize the data
+using the "l1" norm (which, for positive-only values amounts to the sum).
+""")
+    aa_proportions = pd.DataFrame(
+        normalize(summed_by_aa, norm="l1"),
+        index=summed_by_aa.index,
+        columns=summed_by_aa.columns)
+    # Doesn't seem to be working:
+    # aa_proportions.style.hide(axis="index")
+    if verbose:
+        display(aa_proportions.head(3))
+    # Checking that proportions sum to 1
+    colsums = aa_proportions.sum(axis=1)
+    assert np.allclose(colsums, np.full(len(colsums), 1))
+    # Then, computing the global amino-acid proportions
+    if ref_filter_dict is None:
+        counts_for_global = summed_by_aa
+    else:
+        counts_for_global = filter_on_idx_levels(
+            summed_by_aa, ref_filter_dict)
+    render_md("""
+To compute the usage biases, we also need to compute amino-acid proportions
+in the global usage.
+""")
+#     render_md("""
+# We compute the global usage, as the sum of the counts for a given codon,
+# across genes.
+# """)
+#     global_usage = codon_counts.sum(axis=0)
+    render_md("Then we sum over codons corresponding to the same amino-acid.")
+    # global_summed_by_aa = global_usage.groupby(level=0).sum()
+    global_summed_by_aa = counts_for_global.sum()
+    render_md("Then we normalize it the same way as for the individual genes.")
+    # The values.reshape(1, -1) turns the Series data into a 2D array
+    # with one line. flatten() turns the data back to 1D
+    global_aa_proportions = pd.Series(
+        normalize(
+            global_summed_by_aa.values.reshape(1, -1),
+            norm="l1").flatten(),
+        index=global_summed_by_aa.index)
+    render_md("""
+Amino-acid usage biases can be computed by subtracting the corresponding
+global proportion to an amino-acid's proportion.
+""")
+    aa_usage_biases = aa_proportions.sub(global_aa_proportions)
+    render_md("""
+We can standardize these values (by dividing by their standard deviation
+across genes) so that they are more comparable between amino-acids.
+""")
+    standardized_aa_usage_biases = aa_usage_biases.div(aa_usage_biases.std())
+    # Doesn't seem to be working:
+    # standardized_aa_usage_biases.style.hide(axis="index")
+    if verbose:
+        display(standardized_aa_usage_biases.head(3))
+    if return_more:
+        return {
+            "biases": standardized_aa_usage_biases,
+            "proportions": aa_proportions,
+            "global_proportions": global_aa_proportions}
+    return standardized_aa_usage_biases
+
+
+def exclude_all_nan_cols(usage_table, fill_other_nas=np.nan):
+    """
+    Detect columns in *usage_table* that contain only NaNs
+    and remove them from the table. Other NaN values are replaced
+    with *fill_other_nas*.
+    """
+    render_md("""
+Standardization may result in division by zero for usage data
+that have a zero standard deviation.
+This is expected to be the case for "by amino-acid" usage biases
+for codons corresponding to amino-acids having only one codon:
+methionine (M) and tryptophan (W).
+""")
+    all_nan_cols = usage_table.columns[
+        usage_table.isna().all()]
+    if len(all_nan_cols):
+        render_md("The following columns contain only NaNs:")
+        display(all_nan_cols)
+        render_md("This likely resulted from a division by zero.")
+        render_md("These columns will be excluded.")
+    return (
+        usage_table.drop(columns=all_nan_cols).fillna(fill_other_nas),
+        all_nan_cols)
+
+
+def codon_influence_in_components(
+        components, colnames,
+        figs_dir=None, more_components=False, formats=None):
+    """
+    Plot the influence of the columns in the first 4 principal axes of a PCA.
+
+    Each column should correspond to a codon, and will be represented as a
+    colour bar whose coulour is based on the last letter of the codon.
+
+    *components* should be a numpy array representing the principal
+    axes of a PCA, such as the `components_` attribute of a fitted
+    `sklearn.decomposition.PCA` object.
+
+    *colnames* should be the names of the columns in the *components*
+    array and  are expected to match the following pattern:
+    <aa>_<codon>, where <aa> is a single-letter code for an amino-acid,
+    and <codon> is one of the 3-letter codons for this amino-acid,
+    in capital letters among A, T, G and C (i.e. in the DNA alphabet).
+    The last letter will be used to set the colour of a bar in the plots
+
+    *figs_dir* should be a path to a directory that will be used
+    to save graphics representing the influence of each data column
+    on the first four principal components.
+
+    *formats* should be a list of formats in which the figures should
+    be saved, such as "svg" or "png".
+    """
+    render_md(
+        "Vizualizing the influence of codons in the first components\n")
+    # TODO: *figsize* could be adapted depending on the number of columns
+    if more_components:
+        (fig, axes) = plt.subplots(12, 1, figsize=(16, 60))
+    else:
+        (fig, axes) = plt.subplots(4, 1, figsize=(16, 16))
+    for (component, axis) in enumerate(axes):
+        pd.Series(
+            components[component],
+            index=colnames).plot.bar(
+                ax=axis,
+                # colname is supposed to end with the 3-letters codon
+                color=[
+                    nuc2colour[colname[-1]]
+                    for colname in colnames])
+        axis.set_ylabel(f"weight in component {component}")
+        # axis.set_xticklabels(axis.get_xticklabels(), rotation=90)
+    fig.subplots_adjust(hspace=.5)
+    if figs_dir is not None and formats is not None:
+        for ext in formats:
+            plt.savefig(
+                figs_dir.joinpath(f"PCA_components.{ext}"),
+                metadata=fmt_metadata[ext])
+    display(fig)
+    plt.close(fig)
+
+
+def codon_usage_pca(
+        usage_data,
+        figs_dir=None, hue="chrom", exclude_cols=None, plot_more_components=False,
+        formats=None, cols_are_codons=True):
+    """
+    Perform Principal Component Analysis on *usage_data*.
+
+    DataFrame *usage_data* is expected to contain observations as lines
+    and contain numerical values in columns, without NaNs, corresponding
+    to codons, where the columns headers are supposed to match the following
+    pattern: <aa>_<codon>, where <aa> is a single-letter code for
+    an amino-acid, and <codon> is one of the 3-letter codons for this
+    amino-acid, in capital letters among A, T, G and C
+    (i.e. in the DNA alphabet).
+
+    *hue* is expected to be one of the levels of the MultiIndex of
+    *usage_data*, and will be used to assign colours to the observations
+    in PCA plots. By default, colour is based on the content of a "chrom"
+    level in the index.
+
+    If *figs_dir* is not None, this path to a directory will be used
+    to save graphics representing the projection of the observations
+    in the first four principal components (0 vs. 1 and 2 vs. 3).
+
+    Unless *cols_are_codons* is set to False, there will also be
+    graphics representing the influence of each data column
+    on the first four principal components.
+
+    *formats* should be a list of formats in which the figures should
+    be saved, such as "svg" or "png".
+
+    If *exclude_cols* is not None, the columns with the names contained
+    in the iterable *exclude_cols* will not be included in the PCA analysis.
+    """
+    if figs_dir is not None:
+        figs_dir = Path(figs_dir)
+        figs_dir.mkdir(parents=True, exist_ok=True)
+    if exclude_cols is not None:
+        usage_data = usage_data.drop(columns=exclude_cols)
+    pca = PCA().fit(usage_data)
+    transformed_data = pd.DataFrame(
+        pca.transform(usage_data),
+        index=usage_data.index).reset_index(level=hue)
+    render_md(
+        "Plotting genes on the first components\n")
+    if plot_more_components:
+        (fig, axes) = plt.subplots(3, 2, figsize=(16, 25))
+        sns.scatterplot(
+            data=transformed_data,
+            x=0, y=1, hue=hue, marker=".", ax=axes[0,0])
+        sns.scatterplot(
+            data=transformed_data,
+            x=2, y=3, hue=hue, marker=".", ax=axes[0,1])
+        sns.scatterplot(
+            data=transformed_data,
+            x=4, y=5, hue=hue, marker=".", ax=axes[1,0])
+        sns.scatterplot(
+            data=transformed_data,
+            x=6, y=7, hue=hue, marker=".", ax=axes[1,1])
+        sns.scatterplot(
+            data=transformed_data,
+            x=8, y=9, hue=hue, marker=".", ax=axes[2,0])
+        sns.scatterplot(
+            data=transformed_data,
+            x=10, y=11, hue=hue, marker=".", ax=axes[2,1])
+    else:
+        (fig, axes) = plt.subplots(1, 2, figsize=(16, 8))
+        sns.scatterplot(
+            data=transformed_data,
+            x=0, y=1, hue=hue, marker=".", ax=axes[0])
+        sns.scatterplot(
+            data=transformed_data,
+            x=2, y=3, hue=hue, marker=".", ax=axes[1])
+
+    if figs_dir is not None and formats is not None:
+        for ext in formats:
+            plt.savefig(
+                figs_dir.joinpath(f"PCA_projections.{ext}"),
+                metadata=fmt_metadata[ext])
+    display(fig)
+    plt.close(fig)
+    if cols_are_codons:
+        codon_influence_in_components(
+            pca.components_, usage_data.columns,
+            figs_dir=figs_dir, more_components=plot_more_components, formats=formats)
+    return (pca, transformed_data)
+
+
+def centroid_usage(codon_counts, all_nan_cols):
+    """
+    Define "centroids" for gene clusters, one per codon in *codon_counts*.
+
+    The standardized usage biases of these centroids are computed so that
+    the codon proportions for a given amino-acid are the same as the global
+    proportions, except for the codons corresponding to a certain amino-acid.
+    For each amino-acid, there is one centroid per codon, where the
+    proportion for this codon is set to 1.0, and 0.0 for the other codons.
+    """
+    check_aa_codon_columns(codon_counts)
+    summed_by_aa = codon_counts.groupby(level=0, axis=1).sum()
+    global_usage = codon_counts.sum(axis=0)
+    global_summed_by_aa = global_usage.groupby(level=0).sum()
+    global_proportions_by_aa = global_usage.div(
+        global_summed_by_aa)
+    codon_proportions_by_aa = codon_counts.div(summed_by_aa)
+    codon_usage_biases_by_aa = codon_proportions_by_aa.sub(
+        global_proportions_by_aa)
+    # _columns_by_aa may be different from columns_by_aa
+    # Groups of column headers for each amino-acid
+    # key: aa, value: list of (aa, codon) pairs
+    _columns_by_aa = groupby(itemgetter(0), codon_proportions_by_aa.columns)
+
+    def generate_centroid_proportions():
+        for (aa1, columns) in _columns_by_aa.items():
+            for (_, codon1) in columns:
+                proportions = global_proportions_by_aa.copy()
+                for (aa2, codon2) in columns:
+                    proportions[(aa2, codon2)] = 0.0
+                proportions[(aa1, codon1)] = 1.0
+                proportions.name = (
+                    f"{aa1}_{codon1}", f"{aa1}_{codon1}",
+                    *np.full(
+                        codon_proportions_by_aa.index.nlevels - 2,
+                        np.nan))
+                yield proportions
+
+    render_md("""
+For each codon, a "centroid" (future center of a cluster of genes)
+is defined where the proportion of this codon is set to 1.0.
+For the other codons corresponding to the same amino-acid, the proportion
+is therefore set to 0.0.
+For codons corresponding to other amino-acids, the proportions are set to
+the proportions observed in the global data.
+""")
+    centroids_proportions_by_aa = pd.DataFrame(
+        generate_centroid_proportions())
+    centroids_proportions_by_aa.index.rename(
+        codon_proportions_by_aa.index.names,
+        inplace=True)
+
+    render_md("""
+The biases for the centroids are then computed by subtracting to those
+proportions the proportions in the global data, and the biases are
+standardized by dividing by the standard deviation of codon usage biases
+in the data.
+""")
+    # Compute biases using the same global proportions as the real data
+    # and standardize by the same standard deviations as the real data
+    centroids_scub_by_aa = centroids_proportions_by_aa.sub(
+        global_proportions_by_aa).div(codon_usage_biases_by_aa.std()).drop(
+            columns=all_nan_cols)
+    # centroids_scub_by_aa.head(3)
+    return centroids_scub_by_aa
+
+
+def make_centroids_cluster_finder(
+        centroids_table,
+        aa):  # pylint: disable=C0103
+    """
+    Make a function that, when applied to a row in the standardized
+    codon bias table, determines to what centroid among those
+    corresponding to *aa* it is the closest.
+    The *centroids_table* should contain standardized codon usage
+    biases for the centroids.
+    """
+    # The columns that contain data pertaining to codons coding aa:
+    cols_for_aa = centroids_table.columns.get_level_values(0) == aa
+    cluster_names = [
+        f"{aa}_{codon}"
+        for (aa, codon) in centroids_table.columns[cols_for_aa]]
+
+    def closest_centroid(gene):
+        dists_to_centroids = {}
+        for cluster_name in cluster_names:
+            dists_to_centroids[cluster_name] = sqdist(
+                gene[cols_for_aa].values,
+                centroids_table.loc[cluster_name].iloc[:, cols_for_aa].values)
+        # We return the cluster_name (`min(...))[1]`)
+        # associated with the minimum distance
+        return min(
+            (dist, cluster_name)
+            for (cluster_name, dist)
+            in dists_to_centroids.items())[1]
+    return closest_centroid
+
+
+def make_cluster_table(scub_table, centroids_scub_table):
+    """
+    Make a table for the genes in standardized codon usage bias
+    table *scub_table* where each column tells, for a given
+    amino-acid to which centroid in *centroids_scub_table*
+    it is the closest.
+    """
+    return pd.DataFrame(
+        {
+            (f"cluster_{aa}_full_bias", ""): scub_table.apply(
+                make_centroids_cluster_finder(centroids_scub_table, aa),
+                axis=1).values
+            for aa  # pylint: disable=C0103
+            in unique(centroids_scub_table.columns.get_level_values(0))},
+        index=scub_table.index)
+
+
+def compare_clusterings(
+        usage_table,
+        clust1_template="cluster_{aa}_full_bias",
+        clust2_template="cluster_{aa}_highest_bias"):
+    """
+    Compare lists of genes in *usage_table* for the clusters
+    found in the columns whose name conform to
+    *clust1_template* and *clust2_template*.
+    """
+    crosstabs = {}
+    for aa in aa2colour.keys():  # pylint: disable=C0103
+        if aa in {"M", "W"}:
+            continue
+        clust1_level = clust1_template.format(aa=aa)
+        clust2_level = clust2_template.format(aa=aa)
+        crosstabs[aa] = pd.crosstab(
+            usage_table.reset_index()[clust1_level],
+            usage_table.reset_index()[clust2_level])
+        cols_for_aa = columns_by_aa[aa]
+        cluster_names = [
+            f"{aa}_{codon}"
+            for (aa, codon) in usage_table[cols_for_aa].columns]
+        # Checking that the clusters are the same
+        same_clusters = True
+        for (clust_1, clust_2) in combinations(cluster_names, 2):
+            # assert crosstabs[aa][clust_1][clust_2] == 0
+            try:
+                # if crosstabs[aa][clust_1][clust_2] != 0:
+                if (crosstabs[aa][clust_1][clust_2] != 0) \
+                        or (crosstabs[aa][clust_2][clust_1] != 0):
+                    same_clusters = False
+            except KeyError:
+                # At least one cluster is absent.
+                # Is the absence symmetric?
+                miss_1_in_1 = clust_1 in usage_table.reset_index()[
+                    clust1_level]
+                miss_2_in_2 = clust_2 not in usage_table.reset_index()[
+                    clust2_level]
+                miss_1_in_2 = clust_1 in usage_table.reset_index()[
+                    clust2_level]
+                miss_2_in_1 = clust_2 not in usage_table.reset_index()[
+                    clust1_level]
+                if (
+                        (miss_1_in_1 != miss_1_in_2)
+                        or (miss_2_in_1 != miss_2_in_2)):
+                    same_clusters = False
+        if not same_clusters:
+            render_md(f"\nClusters for {aa} differ:\n")
+        else:
+            render_md(f"\nClusters for {aa} are identical:\n")
+        display(crosstabs[aa])
+
+
+def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2):
+    """
+    Load a table containing by-amino-acid codon usage biases.
+
+    The table, located at *table_path*, should have tab-separated columns.
+    It is expected to contain a series of informative columns
+    preceding the columns containing the codon usage bias values.
+
+    The first *nb_info_cols* columns contain information about gene
+    identifiers and features extracted from CDS annotation data.
+
+    Then, there can be series of gene-clustering information, where each
+    column of the series corresponds to clusters defined based on
+    codon-usage biases among the codons coding an amino-acid.
+    There are no columns for amino-acids coded by only one codon (M and W).
+    *nb_cluster_series* specifies the number of such series.
+
+    The result is a pandas DataFrame object, where all the above information
+    is encoded as a MultiIndex, the codon usage biases being in the
+    remaining columns.
+    """
+    return load_table_with_info_index(
+        table_path, nb_info_cols, nb_cluster_series)
+
+
+def star2stop(text):
+    """
+    Replace stars with "stop", for use in file paths.
+    """
+    return text.replace("*", "stop")
+
+
+def write_cluster_lists(
+        usage_table,
+        aa,  # pylint: disable=C0103
+        clusters_dir,
+        cluster_level_template,
+        y_label_template,
+        alt_tag_kw="old_locus_tag"):
+    """
+    Extract lists of genes in *usage_table** for the clusters
+    associated with codons for amino-acid *aa* and write the
+    resulting lists in a sub-folder (named based on *aa*)
+    of the directory *clusters_dir*.
+    The clusters are found in the column whose name conforms to
+    *cluster_level_template*
+    Also generate violin plots for each cluster.
+    """
+    md_report = f"* Clusters for {aa}:\n\n"
+    aa_dir = clusters_dir.joinpath(star2stop(aa))
+    aa_dir.mkdir(parents=True, exist_ok=True)
+    # key: cluster
+    # value: relative path to a file containing old locus tags
+    #        for genes belonging to this cluster
+    relpaths_to_cluster_lists = {}
+    for (cluster, gene_list) in groupby(itemgetter(0), zip(
+            usage_table.index.get_level_values(
+                cluster_level_template.format(aa=aa)),
+            usage_table.index.get_level_values(alt_tag_kw))).items():
+        path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt"))
+        with path_to_clusterfile.open("w") as clust_fh:
+            clust_fh.write("\n".join(map(itemgetter(1), gene_list)))
+        relpaths_to_cluster_lists[cluster] = str(
+            path_to_clusterfile.relative_to('.'))
+    md_report += "\n\n".join([
+        f"    - [{cluster}]({relpath_to_list})"
+        for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()])
+    violin_usage_by_clusters(
+        usage_table,
+        aa,
+        y_label_template,
+        cluster_level_template=cluster_level_template,
+        vertical=True)
+    path_to_fig = aa_dir.joinpath(star2stop(
+        f"usage_biases_violin_plots_by_cluster_for_{aa}.png"))
+    plt.savefig(path_to_fig, metadata=fmt_metadata["png"])
+    plt.close()
+    relpath_to_fig = str(path_to_fig.relative_to('.'))
+    md_report += (
+        f"\n\n    - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n")
+    return md_report
+
+
+def find_valley(values):
+    """
+    Try to find a cutting threshold for the *values*,
+    where the corresponding density distribution is lowest.
+    """
+    if len(set(values)) == 1:
+        (single_val,) = set(values)
+        return single_val
+    min_val = min(values)
+    max_val = max(values)
+    val_range = max_val - min_val
+    # We use 100 bins
+    try:
+        val_density = gaussian_kde(values).evaluate(
+            np.linspace(min_val, max_val, 100))
+    except LinAlgError as err:
+        print(f"min: {min_val}, max: {max_val}, N: {len(values)}.")
+        print(str(err))
+        raise
+    # Where is the density lowest? (excluding first and last)
+    min_density_idx = val_density[1:-1].argmin()
+    # Finding the value corresponding to the index in the `val_density` array:
+    return min_val + (min_density_idx * val_range / 100)
+
+
+def find_most_biased_genes(data, do_retries=True, alt_tag_kw="old_locus_tag"):
+    """
+    Compute a usage bias threshold hopefully separating the most biased genes
+    from the remainder of the cluster where *data* contains codon usage biases
+    for the genes in the cluster and has an *alt_tag_kw* level in its index.
+    Return the threshold in terms of usage bias and the list of genes above it.
+
+    /!\ This is likely very sensitive to the shape
+    of the usage bias distribution.
+    """
+    thresh = find_valley(data.values)
+    # Try avoiding cases with threshold "misplaced" at the bottom
+    # of the distribution
+    retries = 0
+    while do_retries and thresh < 0:
+        retries += 1
+        thresh = find_valley(data.sort_values().iloc[retries:].values)
+    top_genes = data[(data > thresh)].index.get_level_values(alt_tag_kw).values
+    return (thresh, top_genes, retries)
+
+
+def extract_top_genes_from_cluster(
+        usage_table,
+        aa,  # pylint: disable=C0103
+        codon,
+        clusters_dir,
+        axis=None, savefig=True, alt_tag_kw="old_locus_tag"):
+    """
+    Return a few lines of markdown with links pointing to
+    where the saved results are expected to be found.
+    """
+    md_report = f"    - Most biased genes for {codon} ({aa}):\n\n"
+    aa_dir = clusters_dir.joinpath(star2stop(aa))
+    # Files where to save the results
+    top_dir = aa_dir.joinpath("top_genes")
+    path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
+    path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
+    path_to_fig = top_dir.joinpath(star2stop(
+        f"usage_biases_violin_plot_{aa}_{codon}.png"))
+    # Load the list of genes belonging to the cluster
+    # associated with a high usage bias for *codon*
+    path_to_clusterfile = aa_dir.joinpath(star2stop(f"{aa}_{codon}.txt"))
+    with open(path_to_clusterfile) as gene_list_fh:
+        genes = [line.strip() for line in gene_list_fh]
+    # Extract the usage bias data for these genes and for this codon
+    idx_level = usage_table.index.names.index(alt_tag_kw)
+    try:
+        # data = usage_table.loc[genes][(aa, codon)]
+        data = usage_table.loc[
+            (*(slice(None) for _ in range(idx_level)), genes),
+            (aa, codon)]
+    except KeyError:
+        print(genes[0])
+        print(usage_table.index.names)
+        raise
+    q3 = data.quantile(0.75)  # pylint: disable=C0103
+    top25 = data[(data > q3)].index.get_level_values(
+        alt_tag_kw).values
+    (thresh, top_genes, retries) = find_most_biased_genes(
+        data, do_retries=True, alt_tag_kw=alt_tag_kw)
+    if retries != 0:
+        print(
+            f"Bottom-most {retries} values had to be discarded in order "
+            f"to avoid a negative threshold for the cluster {aa}_{codon}.")
+    top_genes_check = set(
+        usage_table[
+            (usage_table[(aa, codon)] > thresh)].index.get_level_values(
+                alt_tag_kw).values)
+    if set(top_genes) != top_genes_check:
+        assert set(top_genes).issubset(top_genes_check)
+        extra_genes_not_in_cluster = top_genes_check - set(top_genes)
+        print(
+            f"{len(extra_genes_not_in_cluster)} genes were found "
+            f"with a high bias for {codon} but not belonging "
+            f"to the cluster {aa}_{codon}.")
+    # For visual verification purpose,
+    # display where the threshold is situated on a violin plot
+    axis = violin_with_thresh(
+        data, [(thresh, "red"), (q3, "blue")],
+        f"standardized usage bias for {codon} ({aa})",
+        facecolor=aa2colour[aa], axis=axis)
+    # Save the figure and close it.
+    plt.tight_layout()
+    if savefig:
+        top_dir.mkdir(parents=True, exist_ok=True)
+        plt.savefig(path_to_fig, metadata={'creationDate': None})
+        plt.close()
+        # Save the list of the top genes
+        with open(path_to_top25, "w") as gene_list_fh:
+            gene_list_fh.write("\n".join(top25))
+        with open(path_to_top, "w") as gene_list_fh:
+            gene_list_fh.write("\n".join(top_genes))
+    relpath_to_top25 = str(path_to_top25.relative_to('.'))
+    relpath_to_top = str(path_to_top.relative_to('.'))
+    relpath_to_fig = str(path_to_fig.relative_to('.'))
+    md_report += "\n\n".join([
+        f"        * [top 25% genes]({relpath_to_top25})",
+        f"        * [top genes]({relpath_to_top})",
+        f"        * [violin plot with thresholds ({q3:.3}, "
+        f"{thresh:.3})]({relpath_to_fig})"])
+    return md_report
+
+
+# TODO:
+def plot_codon_usage_for_gene_list(
+        gene_list_path,
+        usage_table,
+        aa,  # pylint: disable=C0103
+        codon,
+        figs_dir=None,
+        axis=None, savefig=True, alt_tag_kw="old_locus_tag"):
+    """
+    Make a violin plot of codon usage biases for genes in file *gene_list_path*
+    using data in *usage_table*.
+    The plot will represent the distribution of usage bias
+    for codon *codon* of amino-acid *aa*.
+    The plot will be saved inside *figs_dir* if *savefig* is set to True.
+    Gene identifiers are expected to be *alt_tag_kw* identifiers.
+    """
+    md_report = f"    - Genes from {gene_list_path}:\n\n"
+    with open(gene_list_path) as gene_list_fh:
+        genes = [
+            line.strip() for line in gene_list_fh
+            if line[0] != "#"]
+    # Extract the usage bias data for these genes and for this codon
+    idx_level = usage_table.index.names.index(alt_tag_kw)
+    try:
+        # data = usage_table.loc[genes][(aa, codon)]
+        data = usage_table.loc[
+            (*(slice(None) for _ in range(idx_level)), genes),
+            (aa, codon)]
+    except KeyError:
+        print(
+            f"Genes are supposed to be provided as {alt_tag_kw} identifiers.")
+        print(genes[0])
+        print(usage_table.index.names)
+        raise
+    q3 = data.quantile(0.75)  # pylint: disable=C0103
+    # top25 = data[(data > q3)].index.get_level_values(
+    #     alt_tag_kw).values
+    (thresh, top_genes, retries) = find_most_biased_genes(
+        data, do_retries=True, alt_tag_kw=alt_tag_kw)
+    if retries != 0:
+        print(
+            f"Bottom-most {retries} values (out of {len(data)}) "
+            "had to be discarded in order to avoid a negative threshold "
+            f"for the cluster {aa}_{codon}.")
+    top_genes_check = set(
+        usage_table[
+            (usage_table[(aa, codon)] > thresh)].index.get_level_values(
+                alt_tag_kw).values)
+    if set(top_genes) != top_genes_check:
+        assert set(top_genes).issubset(top_genes_check)
+        extra_genes_not_in_cluster = top_genes_check - set(top_genes)
+        print(
+            f"{len(extra_genes_not_in_cluster)} genes were found "
+            f"with a high bias for {codon} but not belonging to "
+            f"the cluster {aa}_{codon}.")
+    # For visual verification purpose,
+    # display where the threshold is situated on a violin plot
+    axis = violin_with_thresh(
+        data, [(thresh, "red"), (q3, "blue")],
+        f"standardized usage bias for {codon} ({aa})",
+        facecolor=aa2colour[aa], axis=axis)
+    # Save the figure and close it.
+    plt.tight_layout()
+    if not savefig:
+        return md_report
+    if figs_dir is None:
+        raise ValueError("No *figs_dir* directory specified.\n")
+    aa_dir = figs_dir.joinpath(star2stop(aa))
+    # Files where to save the results
+    aa_dir.mkdir(parents=True, exist_ok=True)
+    path_to_fig = aa_dir.joinpath(star2stop(
+        f"usage_biases_violin_plot_{aa}_{codon}.png"))
+    plt.savefig(path_to_fig, metadata=fmt_metadata["png"])
+    plt.close()
+    # Save the list of the top genes
+    # path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt"))
+    # path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt"))
+    # with open(path_to_top25, "w") as gene_list_fh:
+    #     gene_list_fh.write("\n".join(top25))
+    # with open(path_to_top, "w") as gene_list_fh:
+    #     gene_list_fh.write("\n".join(top_genes))
+    # relpath_to_top25 = str(path_to_top25.relative_to('.'))
+    # relpath_to_top = str(path_to_top.relative_to('.'))
+    relpath_to_fig = str(path_to_fig.relative_to('.'))
+    md_report += "\n\n".join([
+        # f"        * [top 25% genes]({relpath_to_top25})",
+        # f"        * [top genes]({relpath_to_top})",
+        f"        * [violin plot with thresholds ({q3:.3}, "
+        f"{thresh:.3})]({relpath_to_fig})"])
+    return md_report
+
+
+def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"):
+    """
+    Plot a boxplot from pandas DataFrame *usage_table*.
+
+    Use *y_label* as y-axis label.
+    """
+    nb_genes = usage_table.shape[0]
+    if whiskers == "1.5 IQR":
+        axis = usage_table.plot.box(figsize=(18, 6), rot=90, sym=".")
+        axis.set_title(
+            f"Distributions of codon usage biases among {nb_genes} "
+            "genes (whiskers at 1.5 IQR).")
+    elif whiskers == "5 percent tails":
+        axis = usage_table.plot.box(
+            figsize=(18, 6), rot=90, sym=".", whis=(5, 95))
+        axis.set_title(
+            f"Distributions of codon usage biases among {nb_genes} "
+            "genes (whiskers at the 5th and 95th percentiles).")
+    axis.set_ylabel(ylabel)
+
+
+def to_long_form(usage_table, ylabel, others=None):
+    """
+    Transform data in *usage_table* into "long form".
+
+    The goal is to be able to use aa as hue in violin plots.
+
+    See https://pandas.pydata.org/docs/user_guide/10min.html#stack
+    and https://seaborn.pydata.org/generated/seaborn.violinplot.html
+
+    *y_label* is the column header you want to set for the usage values.
+
+    *others* should be a list of extra information you want to extract
+    from the index (which is expected to be a pandas MultiIndex).
+    """
+    col_nb_levels = usage_table.columns.nlevels
+    row_nb_levels = usage_table.index.nlevels
+    long_form = usage_table.stack(
+        level=list(range(col_nb_levels))).to_frame().reset_index(
+            level=list(range(
+                row_nb_levels, row_nb_levels + col_nb_levels)))
+    if others is None:
+        others = []
+    if col_nb_levels == 1:
+        long_form.columns = ["aa", ylabel]
+        others = [other for other in others if other != "aa"]
+    elif col_nb_levels == 2:
+        long_form.columns = ["aa", "codon", ylabel]
+        others = [other for other in others if other not in {"aa", "codon"}]
+    else:
+        raise NotImplementedError(
+            "Don't know how to deal with column headers "
+            "with more than 2 levels ('aa' and 'codon')\n")
+    long_form = long_form.reset_index(level=[
+        lvl_name for lvl_name in usage_table.index.names
+        if lvl_name.startswith("cluster")])
+    if not set(others).issubset(set(long_form.index.names)):
+        print(set(others))
+        print(set(long_form.index.names))
+        raise ValueError(
+            "All elements in *others* should be in the index "
+            "of *usage_table*")
+    return long_form.assign(**{
+        other: long_form.index.get_level_values(other) for other in others})
+
+
+def variable2order(variable):
+    """
+    Define the order of amino-acids or codons.
+    """
+    if variable == "aa":
+        return list(columns_by_aa.keys())
+    if variable == "codon":
+        return [codon for (_, codon) in concat(columns_by_aa.values())]
+    raise ValueError("variable can only be 'aa' or 'codon'.\n")
+
+
+def format_codon_labels(codons):
+    """
+    Add amino-acid information to codons meant to be used as tick labels.
+    """
+    return [
+        plt.Text(x, y, f"{codon} ({codon2aa[codon]})")
+        for (x, y, codon) in map(attrgetter("_x", "_y", "_text"), codons)]
+
+
+def violin_usage(
+        usage_table, variable, ylabel,
+        hue="aa", axis=None, **violin_kwargs):
+    """
+    Plot violin plots of codon usage biases.
+
+    *variable* should be either "codon" or "aa".
+    """
+    if hue in {"aa", "codon"}:
+        dodge = False
+        nb_violins = 1
+    else:
+        dodge = True
+        # dodge = True implies there will be one violin
+        # per possible value of *hue*, side by side
+        nb_violins = len(usage_table.index.get_level_values(hue).unique())
+    long_form = to_long_form(usage_table, ylabel, others=[hue])
+    if axis is None:
+        _, axis = plt.subplots(figsize=(18 * nb_violins, 6))
+        do_legend = True
+    else:
+        do_legend = False
+    if hue == "aa":
+        palette = aa2colour
+    else:
+        palette = None
+    kwargs = {
+        "x": variable, "y": ylabel, "order": variable2order(variable),
+        "hue": hue, "palette": palette, "dodge": dodge,
+        "data": long_form, "ax": axis, "orient": "v", "scale": "count"}
+    kwargs.update(violin_kwargs)
+    axis = sns.violinplot(**kwargs)
+    if do_legend:
+        plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0)
+    if variable == "codon":
+        ticklabels = format_codon_labels(axis.get_xticklabels())
+    else:
+        ticklabels = axis.get_xticklabels()
+    # TODO: Expect issues when variable == "aa"
+    axis.set_xticklabels(ticklabels, rotation=90)
+    return axis
+
+
+def violin_usage_vertical(
+        usage_table, variable, ylabel,
+        hue="aa", axis=None, **violin_kwargs):
+    """
+    Plot vertically stacked violin plots of codon usage biases.
+
+    *variable* should be either "codon" or "aa".
+    """
+    if hue in {"aa", "codon"}:
+        dodge = False
+        nb_violins = 1
+    else:
+        dodge = True
+        # dodge = True implies there will be one violin
+        # per possible value of *hue*, side by side
+        nb_violins = len(usage_table.index.get_level_values(hue).unique())
+    long_form = to_long_form(usage_table, ylabel, others=[hue])
+    if axis is None:
+        if variable == "codon":
+            base_height = 44
+        elif variable == "aa":
+            base_height = 18
+        else:
+            raise NotImplementedError(f"Unsupported variable: {variable}.")
+        _, axis = plt.subplots(figsize=(6, base_height * nb_violins))
+    if hue == "aa":
+        palette = aa2colour
+    else:
+        palette = None
+    kwargs = {
+        "y": variable, "x": ylabel,
+        "order": variable2order(variable),
+        "hue": hue, "palette": palette, "dodge": dodge,
+        "data": long_form, "ax": axis, "orient": "h", "scale": "count"}
+    kwargs.update(violin_kwargs)
+    axis = sns.violinplot(**kwargs)
+    if variable == "codon":
+        ticklabels = format_codon_labels(axis.get_yticklabels())
+        axis.set_yticklabels(ticklabels)
+    return axis
+
+
+def violin_usage_by_clusters(usage_with_clusters, aa,  # pylint: disable=C0103
+                             ylabel_template,
+                             cluster_level_template="cluster_{aa}",
+                             vertical=False, **violin_kwargs):
+    """
+    Plot a series of violin plots for each cluster of genes.
+
+    The clusters are defined in usage table *usage_with_clusters*
+    based on usage biases for the codons coding for amino-acid *aa*.
+
+    *usage_with_clusters* should have clustering indications as
+    an index level named according to *cluster_level_template*
+    (default "cluster_{aa}"), where {aa} is to be replaced by *aa*.
+    """
+    clusters = usage_with_clusters.groupby(
+        level=cluster_level_template.format(aa=aa))
+    # To adjust figure size
+    if "hue" in violin_kwargs and violin_kwargs["hue"] not in {"aa", "codon"}:
+        # dodge will be set to True in later plotting call.
+        # This implies that there will be one violin
+        # per possible value of *hue*, side by side
+        nb_violins = len(usage_with_clusters.index.get_level_values(
+            violin_kwargs["hue"]).unique())
+    else:
+        nb_violins = 1
+    if vertical:
+        fig, axes = plt.subplots(
+            ncols=clusters.ngroups,
+            constrained_layout=True,
+            figsize=(6 * clusters.ngroups, 44 * nb_violins))
+    else:
+        fig, axes = plt.subplots(
+            nrows=clusters.ngroups,
+            constrained_layout=True,
+            figsize=(18 * nb_violins, 6 * clusters.ngroups))
+    for ((cluster, usage_table), axis) in zip(clusters, axes):
+        kwargs = {"axis": axis}
+        kwargs.update(violin_kwargs)
+        if vertical:
+            violin_usage_vertical(
+                usage_table, "codon",
+                ylabel_template.format(aa=aa, cluster=cluster),
+                **kwargs)
+        else:
+            violin_usage(
+                usage_table, "codon",
+                ylabel_template.format(aa=aa, cluster=cluster),
+                **kwargs)
+    return fig
+
+
+def violin_usage_by_clusters_splitby(
+        usage_with_clusters, aa,  # pylint: disable=C0103
+        ylabel_template,
+        cluster_level_template="cluster_{aa}",
+        vertical=False, by_lvl="chrom",
+        **violin_kwargs):
+    """
+    Plot a series of violin plots for each cluster of genes,
+    splitting the figures according to groups defined by the
+    content of the index level *by_lvl* of *usage_with_clusters*.
+
+    The clusters are defined in usage table *usage_with_clusters*
+    based on usage biases for the codons coding for amino-acid *aa*.
+
+    *usage_with_clusters* should have clustering indications as
+    an index level named according to *cluster_level_template*
+    (default "cluster_{aa}"), where {aa} is to be replaced by *aa*.
+    """
+    classes = sorted(set(
+        usage_with_clusters.index.get_level_values(by_lvl)))
+    idx_level = usage_with_clusters.index.names.index(by_lvl)
+    for clss in classes:
+        fig = violin_usage_by_clusters(
+            usage_with_clusters.loc[
+                (*(slice(None) for _ in range(idx_level)), clss), ],
+            aa, ylabel_template + f" for {clss}",
+            cluster_level_template=cluster_level_template,
+            vertical=vertical, **violin_kwargs)
+        fig.suptitle(f"Violin plots for {clss}")
+
+
+# Based on
+# https://matplotlib.org/stable/gallery/statistics/customized_violin.html
+def adjacent_values(q0, q1, q3, q4):  # pylint: disable=C0103
+    """
+    Define whiskers positions using the 1.5 * IQR criterion.
+
+    Use the values of quartiles *q0* (min), *q1*, *q3* and *q4* (max).
+    """
+    upper_adjacent_value = q3 + (q3 - q1) * 1.5
+    upper_adjacent_value = np.clip(upper_adjacent_value, q3, q4)
+    lower_adjacent_value = q1 - (q3 - q1) * 1.5
+    lower_adjacent_value = np.clip(lower_adjacent_value, q0, q1)
+    return (lower_adjacent_value, upper_adjacent_value)
+
+
+def violin_with_thresh(data, thresh, ylabel,
+                       facecolor="grey", axis=None, **violin_kwargs):
+    """
+    Draw a violin plot from *data* with a horizontal line at value *thresh*.
+
+    If *thresh* is a single value, the line will be red.
+    If *thresh* contains pairs, it will be assumed that these
+    are (threshold, colour) pairs, to be used to plot various lines.
+    Y-axis will be labeled using *ylabel*.
+    """
+    if axis is None:
+        _, axis = plt.subplots(figsize=(2, 3.5))
+    kwargs = {"showextrema": False}
+    kwargs.update(violin_kwargs)
+    violin_parts = axis.violinplot(data, **kwargs)
+    # https://stackoverflow.com/a/26291582/1878788
+    [body] = violin_parts["bodies"]
+    body.set_facecolor(facecolor)
+    body.set_edgecolor("black")
+    try:
+        (val, col) = thresh[0]
+    except TypeError:
+        thresh = [(thresh, "red")]
+    for (val, col) in thresh:
+        axis.axhline(val, color=col)
+        axis.text(
+            1.01, val, f"{val:.3}",
+            horizontalalignment="left", verticalalignment="bottom")
+    (q0, q1, median, q3, q4) = np.percentile(  # pylint: disable=C0103
+        data, [0, 25, 50, 75, 100])
+    (w_min, w_max) = adjacent_values(q0, q1, q3, q4)
+    axis.scatter(1, median, marker="o", s=30, color="white", zorder=3)
+    axis.vlines(1, q1, q3, linestyle="-", linewidth=5, color="black")
+    axis.vlines(1, w_min, w_max, linestyle="-", linewidth=1, color="black")
+    axis.set_xticks([])
+    axis.set_ylabel(ylabel)
+    return axis
+
+
+if __name__ == "__main__":
+    import doctest
+    doctest.testmod()
diff --git a/libcodonusage/libcodonusage.py b/libcodonusage/libcodonusage.py
index 3cbb666..603a8bd 100644
--- a/libcodonusage/libcodonusage.py
+++ b/libcodonusage/libcodonusage.py
@@ -463,15 +463,25 @@ def sum_codon_counts(row, codons):
     return sum
 
 
-def group_codons_by_sum(codon_counts, class_name, dict_classes, filter):
-    
+def group_codons_by_sum(codon_counts, group_name, dict_classes, filter):
+    """
+    Group codons by sum given specific classes in *codon_counts* table.
+
+    *group_name* contains the name of the grouping, and plays the role of aa names in the original 
+    codon counts table.
+    *dict_classes* contains the different classes under this grouping as keys and the associated 
+    list of codons as values.
+    *filter* is a boolean set to True if you want to filter out other codons than the ones specified in
+    dict_classes. If set to False, the original codon_counts table is returned with additionnal columns for
+    the grouped_classes.
+    """
     list_classes = list(dict_classes.items())
     list_classes_names = []
     for key, value in dict_classes.items():
-        codon_counts[class_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
+        codon_counts[group_name, key] = codon_counts.apply(lambda row: sum_codon_counts(row, value), axis=1)
         list_classes_names.append(key)
     if filter:
-        return codon_counts.loc[:, ([class_name], list_classes_names)]
+        return codon_counts.loc[:, ([group_name], list_classes_names)]
     else:
         return codon_counts
 
-- 
GitLab