From 5660ae18776fe93b75f81d821b9c61e8d941012d Mon Sep 17 00:00:00 2001 From: manselme <marie.anselmet@pasteur.fr> Date: Thu, 19 Oct 2023 17:21:23 +0200 Subject: [PATCH] Delete libcodonusage.py --- build/lib/libcodonusage/libcodonusage.py | 1759 ---------------------- 1 file changed, 1759 deletions(-) delete mode 100644 build/lib/libcodonusage/libcodonusage.py diff --git a/build/lib/libcodonusage/libcodonusage.py b/build/lib/libcodonusage/libcodonusage.py deleted file mode 100644 index 706b30e..0000000 --- a/build/lib/libcodonusage/libcodonusage.py +++ /dev/null @@ -1,1759 +0,0 @@ -#!/usr/bin/env python3 -# Copyright (C) 2021-2023 Blaise Li -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <https://www.gnu.org/licenses/>. -"""Functions used in Jupyter notebooks.""" -from http.cookies import Morsel -from itertools import combinations -import json -from operator import attrgetter, itemgetter -from pathlib import Path -from unittest import mock -# python3 -m pip install cytoolz -from cytoolz import concat, groupby, unique, valmap -# To render mardown in a Jupyter notebook on gitlab -from IPython.display import display -from IPython.core.display import HTML -# python3 -m pip install markdown -from markdown import markdown -# Basic plotting library for Python -# python3 -m pip install matplotlib -# See https://matplotlib.org/ -import matplotlib.pyplot as plt -# Python library to make some nicer plots -# python3 -m pip install seaborn -# See https://seaborn.pydata.org/ -import seaborn as sns -# Useful Python package for bioinformatics. -# python3 -m pip install biotite -# See https://www.biotite-python.org/install.html -import biotite.sequence.graphics as bgraphs -from biotite.sequence import CodonTable, NucleotideSequence -# python3 -m pip install numpy -import numpy as np -# Python module to handle tabular data -# python3 -m pip install pandas -# See https://pandas.pydata.org/pandas-docs/stable/index.html -import pandas as pd -# Python module that facilitates the exploration of tbular data -# python3 -m pip install plydata -# from plydata import define, pull, query -# python3 -m pip install scipy -from scipy.linalg import LinAlgError -from scipy.spatial.distance import sqeuclidean as sqdist -from scipy.stats import gaussian_kde -# Python library with useful data-processing features -# python3 -m pip install scikit-learn -# https://scikit-learn.org/stable/install.html -from sklearn.decomposition import PCA -from sklearn.preprocessing import normalize -# Python module to vizualize set intersections -# python3 -m pip install upsetplot -from upsetplot import from_indicators, UpSet - - -fmt_metadata = { - "png": {"creationDate": None}, - "svg": { - "Creator": None, "Date": None, - "Title": ("Distribution of standardized codon usages biases" - " (by aa), by chromosome")} -} - - -def render_md(md_str): - """ - Render a markdown string *md_str* in a Jupyter notebook. - - More direct rendering is not possible on gitlab for security reasons. 
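-
- Illustrative call (the markdown string here is only an example):
-
-     render_md("Loaded the counts table; see the **notes** below.")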
- """ - display(HTML(markdown(md_str))) - - -codon2aa = CodonTable.load(11).codon_dict() -# Column headers made from (aa, codon) pairs -codon_headers = pd.MultiIndex.from_tuples( - ((aa, codon) for (codon, aa) in codon2aa.items()), - names=("aa", "codon")) -# Groups of column headers for each amino-acid -# key: aa, value: list of (aa, codon) pairs -columns_by_aa = groupby(itemgetter(0), codon_headers) - - -###################################### -# Associating colours to amino-acids # -###################################### -# We load amino-acid colouring information to use in graphical representations. -# We'll use colour schemes provided by the biotite package: -# https://www.biotite-python.org/examples/gallery/sequence/color_schemes_protein.html -# We look directly at the json file -# instead of using the "native" biotite mechanisms -with Path(bgraphs.colorschemes._scheme_dir).joinpath( - "jalview_zappo.json").open() as fh: - colscheme = json.load(fh) -# We add the black colour for the stop "amino-acid" and associated codons -# key: amino-acid -# value: hexadecimal html colour code -aa2colour = {**colscheme["colors"], "*": '#000000'} -###################################### -# Associating colours to nucleotides # -###################################### -nuc_alphabet = NucleotideSequence.alphabet_unamb -nuc_colours = bgraphs.get_color_scheme("rainbow", nuc_alphabet) -nuc2colour = dict(zip(nuc_alphabet, nuc_colours)) - - -def load_counts_table( - table_path, index_col="old_locus_tag", index_unique=True): - """ - Load a table of pre-computed codon counts at *table_path*. - - The lines correspond to genes. - Besides the columns containing the counts for each codon, - there are other columns containing various pieces of information - regarding those genes. - - If *index_unique* is True, a ValueError error will be raised if - some elements in the the column *index_col* are not unique. - """ - render_md(f"Loading data from [{table_path}]({table_path})...\n") - codon_counts = pd.read_table(table_path, index_col=index_col) - if index_unique and not codon_counts.index.is_unique: - raise ValueError(f"Index {index_col} contains repeated values.\n") - nb_genes = len(codon_counts) - render_md( - f""" - There are {nb_genes} genes in the table. - - The table looks as follows (first 3 lines): -""") - display(codon_counts.head(3)) - return codon_counts - -#def filter_codon_counts_table(codon_counts, ) - -def compute_criteria(codon_counts): - """ - Prepare a Boolean table and corresponging gene sets. - - There are one column and one gene set for each criterion. 
- """ - wrong_start_col = codon_counts["expected_start_aa"] == "-" - start_stop_col = codon_counts["expected_start_aa"] == "*" - no_met_start_col = codon_counts["expected_start_aa"] != "M" - start_upstream_col = codon_counts["start_upstream"] - has_stops_col = codon_counts["nb_stops"] > 0 - start_upstream_met_start_col = start_upstream_col & ~no_met_start_col - start_upstream_met_start_nostops_col = ( - start_upstream_met_start_col & ~has_stops_col) - good_met_start_col = ~start_upstream_col & ~no_met_start_col - has_stops_good_met_start_col = ( - has_stops_col & ~no_met_start_col - & ~start_upstream_met_start_col) - no_stop_good_met_start_col = ( - ~has_stops_col & ~no_met_start_col - & ~start_upstream_met_start_col) - criteria = pd.DataFrame({ - "wrong_start": wrong_start_col, - "start_stop": start_stop_col, - "no_met_start": no_met_start_col, - "start_upstream": start_upstream_col, - "has_stops": has_stops_col, - "start_upstream_met_start": start_upstream_met_start_col, - "start_upstream_met_start_nostops": ( - start_upstream_met_start_nostops_col), - "good_met_start": good_met_start_col, - "has_stops_good_met_start": has_stops_good_met_start_col}) - render_md("Number of genes fitting various selection criteria:\n\n") - display(criteria.agg(sum)) - render_md("Upset plot of the non-empty categories:\n\n") - fig = plt.figure() - try: - UpSet( - from_indicators(*[criteria.columns], data=criteria), - show_counts=True).plot(fig=fig) - except AttributeError: - pass - display(fig) - plt.close(fig) - gene_sets = { - "wrong_start": set(criteria[wrong_start_col].index), - "start_stop": set(criteria[start_stop_col].index), - "no_met_start": set(criteria[no_met_start_col].index), - "start_upstream": set(criteria[start_upstream_col].index), - # Not used - # "has_stops": set(criteria[has_stops_col].index), - "start_upstream_met_start": set( - criteria[start_upstream_met_start_col].index), - "start_upstream_met_start_nostops": set( - criteria[start_upstream_met_start_nostops_col].index), - "good_met_start": set(criteria[good_met_start_col].index), - "has_stops_good_met_start": set( - criteria[has_stops_good_met_start_col].index), - "no_stop_good_met_start": set( - criteria[no_stop_good_met_start_col].index)} - return (criteria, gene_sets) - - -def detect_fishy_genes(codon_counts): - """ - Run diagnostics about genes included in table *codon_counts*. - - These should help decide whether to exclude some of the genes. - - A table of boolean criteria is returned, with one line per gene. - """ - - def display_gene_set(gene_set, max_size=10): - """ - Print out genes in a gene set, depending on their number. - """ - if gene_set and len(gene_set) <= max_size: - print(" ", ", ".join(gene_set)) - - render_md("### Searching for mis-annotated genes") - (criteria, gene_sets) = compute_criteria(codon_counts) - render_md( - """ -We should avoid genes that are not in-frame. We can likely exclude -those that do not start with a valid start codon. -Here, this will be seen by looking at the translation of the first -codon. -If it is "*" (`start_stop`) or "-" (`wrong_start`), -the codon is not a valid start codon. 
-""") - wrong_start = gene_sets["wrong_start"] - render_md(f"There are {len(wrong_start)} genes that start with a `-`.") - display_gene_set(wrong_start) - - start_stop = gene_sets["start_stop"] - render_md( - f"There are {len(start_stop)} genes that start with a stop codon.") - display_gene_set(start_stop) - - no_met_start = gene_sets["no_met_start"] - if no_met_start == wrong_start | start_stop: - render_md( - """ -Genes not belonging to the `start_stop` or `wrong_start` -category all start with a methionine. -""") - else: - raise NotImplementedError( - "There are genes that start with something else than 'M', " - "'-' or '*'.") - - start_upstream = gene_sets["start_upstream"] - render_md( - f"{len(start_upstream)} genes have a possibly ill-defined" - " start position.") - display_gene_set(start_upstream) - - start_upstream_met_start = gene_sets["start_upstream_met_start"] - if start_upstream_met_start: - render_md(f"{len(start_upstream_met_start)} of them have a valid 'M'" - " start.") - display_gene_set(start_upstream_met_start) - - start_upstream_met_start_nostops = gene_sets[ - "start_upstream_met_start_nostops"] - if start_upstream_met_start_nostops: - render_md(f"{len(start_upstream_met_start_nostops)} of them" - " also do not contain any stop.") - display_gene_set(start_upstream_met_start_nostops) - render_md("Those genes could probably be kept.") - - has_stops_good_met_start = gene_sets["has_stops_good_met_start"] - render_md( - f"{len(has_stops_good_met_start)} genes contain stops" - " but have a well defined start position with 'M'.") - display_gene_set(has_stops_good_met_start) - - good_met_start = gene_sets["good_met_start"] - render_md( - f"{len(good_met_start)} genes have a well defined " - "start position with 'M'.") - render_md( - """ -If genes that have stop readthrough are a known phenomenon, -only those among them that do not have a valid start codon -might be excluded. -""") - - no_stop_good_met_start = gene_sets["no_stop_good_met_start"] - render_md( - f"{len(no_stop_good_met_start)} genes have a well defined" - " start position with 'M' and contain no stop codon.") - display_gene_set(no_stop_good_met_start) - return criteria - - -def make_counts_only( - counts_table, - seq_id_kw="locus_tag", alt_tag_kw="old_locus_tag", - ref_info_cols=None): - """ - Integrate "informative" columns of *counts_table* into the index. - """ - # To ensure a stable order: - if ref_info_cols is None: - ref_info_cols = [ - alt_tag_kw, seq_id_kw, - "start", "end", "length", - "start_codon", "expected_start_aa", - "first_stop", "nb_stops", "start_upstream", "end_downstream"] - # To ensure no info columns are lost: - info_cols = [ - counts_table.index.name, - *counts_table.columns.difference(codon2aa)] - assert set(info_cols) == set(ref_info_cols) - return counts_table.reset_index().set_index(ref_info_cols) - - -def make_aa_codon_columns(counts_table): - """Transform column headers into a (aa, codon) MultiIndex.""" - counts_table.columns = pd.MultiIndex.from_tuples( - ((codon2aa[codon], codon) for codon in counts_table.columns), - names=("aa", "codon")) - render_md( - "We associated amino-acid information to codons, " - "as an extra level in the table columns.") - return counts_table - - -def sort_counts_by_aa(counts_table): - """ - Sort columns so that codons are grouped by corresponding amino-acid. 
- """ - aacodons_order = list(aa2colour.keys()) - # We need to include both aas and codons in the list - # because it will be used for both the aa and the codon levels - # when sorting the (aa, codon) MultiIndex - aacodons_order.extend(codon2aa.keys()) - - def aa_sortkey(col): - """ - Key function to use when sorting a MultiIndex consisting in - (aa, codon) pairs. - """ - return col.map(aacodons_order.index) - sorted_counts = counts_table.sort_index( - axis=1, level=(0, 1), ascending=(True, True), key=aa_sortkey) - render_md( - "The columns were sorted in order to group codons" - " by associated amino-acid.") - return sorted_counts - - -def save_counts_table(counts_table, table_path): - """ - Save table *counts_table* at *table_path*. - """ - table_path.parent.mkdir(parents=True, exist_ok=True) - counts_table.to_csv(table_path, sep="\t") - render_md(f"The table was saved at [{table_path}]({table_path}).") - - -def load_table_with_info_index(table_path, nb_info_cols, nb_cluster_series=0): - """ - Load a table containing by-amino-acid codon counts or usage biases. - - The table, located at *table_path*, should have tab-separated columns. - It is expected to contain a series of informative columns - preceding the columns containing the codon usage bias values. - - The first *nb_info_cols* columns contain information about gene - identifiers and features extracted from CDS annotation data. - - Then, there can be series of gene-clustering information, where each - column of the series corresponds to clusters defined based on - codon-usage biases among the codons coding an amino-acid. - There are no columns for amino-acids coded by only one codon (M and W). - *nb_cluster_series* specifies the number of such series. - - The result is a pandas DataFrame object, where all the above information - is encoded as a MultiIndex, the codon counts or usage biases being in the - remaining columns. - """ - return pd.read_csv( - table_path, - sep="\t", - # *nb_info_cols* starting levels from the initial table, - # plus levels corresponging to clustering information. - # By default, from 2 methods (nb_cluster_series=2): - # * `cluster_{aa}_kmeans` for each amino-acid - # having more than one codon - # * `cluster_{aa}_full_bias` for each amino-acid - # having more than one codon - index_col=list( - range(nb_info_cols + nb_cluster_series * (len(aa2colour) - 2))), - header=[0, 1]) - - -def split_info_index(table, keep_index_cols): - """ - Split table *table* into info and data. - - Info is supposed to be contained in the index of DataFrame *table*, - data in its columns. - - Return a pair of tables, one for the data, one for the info, - where the index contains only the levels listed in *keep_index_cols* - """ - drop_info_cols = [ - colname for colname in table.index.names - if colname not in keep_index_cols] - info_table = table.reset_index()[ - [*keep_index_cols, *drop_info_cols]].set_index(keep_index_cols) - # To avoid loss of index levels in input by side effect: - out_table = table.copy() - out_table.index = out_table.index.droplevel(drop_info_cols) - # To ensure indices have their levels in the same order - # in out_table and in infoinfo_table: - out_table = out_table.reset_index().set_index(keep_index_cols) - return (out_table, info_table) - - -def filter_on_idx_levels(counts_table, filter_dict): - """ - Filter a table *counts_table* based on values of certain index levels. 
- - *filter_dict* should contain index level names as keys and values that - rows should have at this level to be included in the output - filtered table. - """ - row_filter = np.all([ - counts_table.index.get_level_values(idx_lvl) == idx_val - for (idx_lvl, idx_val) in filter_dict.items()], axis=0) - return counts_table[row_filter] - - -# Codon usage calculations can be done in various ways. -# Some examples are given at page 6500 (2 of the pdf) of -# [Suzuki et al (2005)](https://doi.org/10.1016/j.febslet.2005.10.032) -SUZUKI_DOI = "10.1016/j.febslet.2005.10.032" -SUZUKI_LINK = f"[Suzuki et al (2005)](https://doi.org/{SUZUKI_DOI})" - - -def gene_wide_codon_usage( - codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): - """ - Compute codon usage biases "gene-wide" as the standardized - difference between a gene's codon proportions and global - codon proportions (default). - - If *ref_filter_dict* is not None, it should be a dict of - (index_level, index_value) pairs and the global codon proportions - will actually be proportions computed on the part of the data - restricted to the genes where the *index_level* has the *index_value* - for all those pairs. - """ - render_md(f""" -We will compute codon usage "gene-wide" (i.e. not by amino-acid), -by looking at the proportion of codons within a gene's CDS. -(This corresponds to R1 in {SUZUKI_LINK}) -""") - render_md(""" -To compute codon proportions, we can divide each line by its sum, -or, equivalently (and hopefully more efficiently), normalize the data -using the "l1" norm (which, for positive-only values amounts to the sum). -""") - print("codon_proportions1") - print(codon_counts) - codon_proportions = pd.DataFrame( - normalize(codon_counts, norm="l1"), - index=codon_counts.index, - columns=codon_counts.columns) - print("codon_proportions2") - print(codon_proportions) - # Doesn't seem to be working: - # codon_proportions.style.hide(axis="index") - if verbose: - display(codon_proportions.head(3)) - # Check that the sum of proportions (columns) for a gene is 1 - colsums = codon_proportions.sum(axis=1).values - # Due to imprecision in float arithmetics, - # we can only check that the sums are close to 1 - assert np.allclose(colsums, np.full(len(colsums), 1)) - if ref_filter_dict is None: - counts_for_global = codon_counts - else: - counts_for_global = filter_on_idx_levels( - codon_counts, ref_filter_dict) - render_md(""" -To compute the usage biases, we also need to compute codon proportions -in the global usage. -""") - render_md("We compute the global usage, as the sum on columns.") - global_usage = counts_for_global.sum(axis=0) - render_md("Then we normalize it the same way as for the individual genes.") - # The values.reshape(1, -1) turns the Series data into a 2D array - # with one line. flatten() turns the data back to 1D - global_proportions = pd.Series( - normalize(global_usage.values.reshape(1, -1), norm="l1").flatten(), - index=global_usage.index) - # Can be 0.999999999999... - # if global_proportions.sum() != 1.: - # display(global_proportions) - # display(global_proportions.sum()) - assert np.isclose(global_proportions.sum(), 1.) - # assert global_proportions.sum() == 1. - render_md(""" -Codon usage biases in genes can then be computed by subtracting -the corresponding global proportion to a codon's proportion. 
-""") - codon_usage_biases = codon_proportions.sub(global_proportions) - render_md(""" -We can standardize these values (by dividing by their standard deviation -across genes) so that they are more comparable between codons. -""") - standardized_codon_usage_biases = codon_usage_biases.div( - codon_usage_biases.std()) - # Doesn't seem to be working: - # standardized_codon_usage_biases.style.hide(axis="index") - if verbose: - display(standardized_codon_usage_biases.head(3)) - if return_more: - return { - "biases": standardized_codon_usage_biases, - "proportions": codon_proportions, - "global_proportions": global_proportions} - return standardized_codon_usage_biases - - -def check_aa_codon_columns(table): - """ - Check that the columns of *table* correspond to (aa, codon) pairs. - """ - msg = "Codon proportions table should have two levels: 'aa' and 'codon'" - if table.columns.nlevels != 2: - raise ValueError(msg) - if table.columns.names[0] != "aa": - raise ValueError(msg) - if table.columns.names[1] != "codon": - raise ValueError(msg) - - -def compute_rscu(codon_proportions_by_aa): - """ - Compute Relative Synonymous Codon Usage from proportions in genes. - - *codon_proportions_by_aa* should be a pandas DataFrame where - rows correspond to genes, and columns to codons. It contains - the proportions in the gene of the codon among those coding the - same amino-acid. The column index should be a pandas MultiIndex - where the first level is the amino-acid name, and the second level - the codon. - - Return a pair where the first element is the rscu table, and - the second is the degeneracy table. - """ - check_aa_codon_columns(codon_proportions_by_aa) - degeneracy = pd.Series( - # concat "flattens" the list of iterables given as arguments - # (list of tuples of repeated degeneracy values) - list(concat([ - # we want stretches of the same degeneracy values - # of as many elements as there are codons in this degeneracy group, - # hence the tuple multiplication - ((degen),) * degen - # valmap transforms values in a dict given as second argument - # using the function given as first argument - # We extract only the value from the (key, value) pairs - # generated by the .items() methods of the resulting dict - # (Actually, we could probably have just used the .values() method) - for (_, degen) in valmap( - # len gives the length of each group, - # where a group contains the (aa, codon) pairs for a given aa, - # that is, the degeneracy for this group of codons - len, - # groupby creates a dict where keys are obtained from - # the function given as first argument and values are - # groups made from the iterable given as second argument - # (here, column names, which are (aa, codon) pairs) - # itemgetter(0) extracts the amino-acid from a pair (aa, codon) - groupby( - itemgetter(0), - codon_proportions_by_aa.columns)).items()])), - index=codon_proportions_by_aa.columns) - return ( - codon_proportions_by_aa.mul(degeneracy), - degeneracy) - - -def by_aa_codon_usage( - codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): - """ - Compute codon usage biases "by amino-acid" as the standardized - difference between a gene's codon proportions and global - codon proportions, where proportions are computed within - groups of same-amino-acid-coding codons instead of gene-wide. 
- - If *ref_filter_dict* is not None, it should be a dict of - (index_level, index_value) pairs and the global codon proportions - will actually be proportions computed on the part of the data - restricted to the genes where the *index_level* has the *index_value* - for all those pairs. - """ - check_aa_codon_columns(codon_counts) - render_md(f""" -We will compute codon usage "by amino-acid", by looking at the -proportion of codons for each amino-acid within a gene's CDS. -(This corresponds to R2 in {SUZUKI_LINK}) -""") - render_md(""" -We first need to compute, for a given gene, the total number of codons -corresponding to each amino-acid. -""") - summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() - render_md(""" -To compute codon proportions, we can then divide each codon counts by -the total for the corresponding amino-acid. -""") - codon_proportions = codon_counts.div( - summed_by_aa) - # Doesn't seem to be working: - # codon_proportions.style.hide(axis="index") - if verbose: - display(codon_proportions.head(3)) - # There are some (genes, aa) pairs where the sum is zero. - # This is normal if that gene does not have that amino-acid. - sums = codon_proportions.groupby(level=0, axis=1).sum() - # Values in sums should be either zero or one. - all_ones = np.full(sums.shape, 1.) - all_zeroes = np.full(sums.shape, 0.) - # Due to imprecision in float arithmetics, - # we can only check that the sums are close to 1 or to 0 - assert np.array_equal( - np.isclose( - sums.values, all_ones), - np.logical_not( - np.isclose( - sums.values, all_zeroes))) - if return_more: - render_md(f""" -We will also compute RSCU (Relative Synonymous Codon Usage) by multiplying -codon proportions by the degeneracy of the corresponding amino-acid, that is, -by the number of different codons existing for that amino-acid. -This amounts to computing codon counts relative to the average codon counts -for that amino-acid. -(This corresponds to R3 in {SUZUKI_LINK}) -""") - (rscu, degeneracy) = compute_rscu(codon_proportions) - render_md(f""" -We will also compute R4 ({SUZUKI_LINK}) by computing codon counts relative -to the maxium across synonymous codons of the counts for that amino-acid. -""") - max_counts = codon_counts.T.groupby("aa").max().T - r4_table = codon_counts.div( - max_counts) - if ref_filter_dict is None: - counts_for_global = codon_counts - else: - counts_for_global = filter_on_idx_levels( - codon_counts, ref_filter_dict) - render_md(""" -To compute the usage biases, we also need to compute codon proportions -in the global usage. -""") - render_md(""" -We compute the global usage, as the sum of the counts for a given codon, -across genes. -""") - global_usage = counts_for_global.sum(axis=0) - if return_more: - # Same computation as for individual genes, - # but based on the total counts. - global_max_counts = global_usage.T.groupby("aa").max().T - global_r4 = global_usage.div( - global_max_counts) - render_md("Then we sum over codons corresponding to the same amino-acid.") - global_summed_by_aa = global_usage.groupby(level=0).sum() - render_md(""" -Dividing the global usage for each codon by the total for the corresponding -amino-acid, we can now compute the global proportions the same way as for -the individual genes. -""") - global_proportions = global_usage.div( - global_summed_by_aa) - # The proportions of codons for a given amino-acid should sum to 1 - assert all( - val == 1. 
- for val in global_proportions.groupby(level=0).sum()) - render_md(""" -Codon usage biases in genes can then be computed by subtracting -the corresponding global proportion to a codon's proportion. -""") - codon_usage_biases = codon_proportions.sub(global_proportions) - render_md(""" -We can standardize these values (by dividing by their standard deviation -across genes) so that they are more comparable between codons. -""") - standardized_codon_usage_biases = codon_usage_biases.div( - codon_usage_biases.std()) - # Doesn't seem to be working: - # standardized_codon_usage_biases.style.hide(axis="index") - if verbose: - display(standardized_codon_usage_biases.head(3)) - if return_more: - return { - "biases": standardized_codon_usage_biases, - "proportions": codon_proportions, - "rscu": rscu, - "r4_table": r4_table, - "global_proportions": global_proportions, - "global_rscu": global_proportions.mul(degeneracy), - "global_r4": global_r4} - return standardized_codon_usage_biases - - -def aa_usage( - codon_counts, - verbose=False, return_more=False, ref_filter_dict=None): - """ - Compute amino-acid usage biases as the standardized - difference between a gene's amino-acid proportions - and global amino-acid proportions. - - If *ref_filter_dict* is not None, it should be a dict of - (index_level, index_value) pairs and the global codon proportions - will actually be proportions computed on the part of the data - restricted to the genes where the *index_level* has the *index_value* - for all those pairs. - """ - check_aa_codon_columns(codon_counts) - render_md(""" -We will compute amino-acid usage, by looking at the -proportions of amino-acids within a gene's CDS. -""") - render_md(""" -We first need to compute, for a given gene, the total number of codons -corresponding to each amino-acid. -""") - summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() - render_md(""" -To compute amino-acid proportions, we can divide each line by its sum, -or, equivalently (and hopefully more efficiently), normalize the data -using the "l1" norm (which, for positive-only values amounts to the sum). -""") - aa_proportions = pd.DataFrame( - normalize(summed_by_aa, norm="l1"), - index=summed_by_aa.index, - columns=summed_by_aa.columns) - # Doesn't seem to be working: - # aa_proportions.style.hide(axis="index") - if verbose: - display(aa_proportions.head(3)) - # Checking that proportions sum to 1 - colsums = aa_proportions.sum(axis=1) - assert np.allclose(colsums, np.full(len(colsums), 1)) - # Then, computing the global amino-acid proportions - if ref_filter_dict is None: - counts_for_global = summed_by_aa - else: - counts_for_global = filter_on_idx_levels( - summed_by_aa, ref_filter_dict) - render_md(""" -To compute the usage biases, we also need to compute amino-acid proportions -in the global usage. -""") -# render_md(""" -# We compute the global usage, as the sum of the counts for a given codon, -# across genes. -# """) -# global_usage = codon_counts.sum(axis=0) - render_md("Then we sum over codons corresponding to the same amino-acid.") - # global_summed_by_aa = global_usage.groupby(level=0).sum() - global_summed_by_aa = counts_for_global.sum() - render_md("Then we normalize it the same way as for the individual genes.") - # The values.reshape(1, -1) turns the Series data into a 2D array - # with one line. 
flatten() turns the data back to 1D - global_aa_proportions = pd.Series( - normalize( - global_summed_by_aa.values.reshape(1, -1), - norm="l1").flatten(), - index=global_summed_by_aa.index) - render_md(""" -Amino-acid usage biases can be computed by subtracting the corresponding -global proportion to an amino-acid's proportion. -""") - aa_usage_biases = aa_proportions.sub(global_aa_proportions) - render_md(""" -We can standardize these values (by dividing by their standard deviation -across genes) so that they are more comparable between amino-acids. -""") - standardized_aa_usage_biases = aa_usage_biases.div(aa_usage_biases.std()) - # Doesn't seem to be working: - # standardized_aa_usage_biases.style.hide(axis="index") - if verbose: - display(standardized_aa_usage_biases.head(3)) - if return_more: - return { - "biases": standardized_aa_usage_biases, - "proportions": aa_proportions, - "global_proportions": global_aa_proportions} - return standardized_aa_usage_biases - - -def exclude_all_nan_cols(usage_table, fill_other_nas=np.nan): - """ - Detect columns in *usage_table* that contain only NaNs - and remove them from the table. Other NaN values are replaced - with *fill_other_nas*. - """ - render_md(""" -Standardization may result in division by zero for usage data -that have a zero standard deviation. -This is expected to be the case for "by amino-acid" usage biases -for codons corresponding to amino-acids having only one codon: -methionine (M) and tryptophan (W). -""") - all_nan_cols = usage_table.columns[ - usage_table.isna().all()] - if len(all_nan_cols): - render_md("The following columns contain only NaNs:") - display(all_nan_cols) - render_md("This likely resulted from a division by zero.") - render_md("These columns will be excluded.") - return ( - usage_table.drop(columns=all_nan_cols).fillna(fill_other_nas), - all_nan_cols) - - -def codon_influence_in_components( - components, colnames, - figs_dir=None, more_components=False, formats=None): - """ - Plot the influence of the columns in the first 4 principal axes of a PCA. - - Each column should correspond to a codon, and will be represented as a - colour bar whose coulour is based on the last letter of the codon. - - *components* should be a numpy array representing the principal - axes of a PCA, such as the `components_` attribute of a fitted - `sklearn.decomposition.PCA` object. - - *colnames* should be the names of the columns in the *components* - array and are expected to match the following pattern: - <aa>_<codon>, where <aa> is a single-letter code for an amino-acid, - and <codon> is one of the 3-letter codons for this amino-acid, - in capital letters among A, T, G and C (i.e. in the DNA alphabet). - The last letter will be used to set the colour of a bar in the plots - - *figs_dir* should be a path to a directory that will be used - to save graphics representing the influence of each data column - on the first four principal components. - - *formats* should be a list of formats in which the figures should - be saved, such as "svg" or "png". 
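-
- Hypothetical example, for a bias table *scub* whose column names follow
- the "<aa>_<codon>" pattern and an already-created *figs_dir*:
-
-     pca = PCA().fit(scub)
-     codon_influence_in_components(
-         pca.components_, scub.columns,
-         figs_dir=Path("figures"), formats=["svg"])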
- """ - render_md( - "Vizualizing the influence of codons in the first components\n") - # TODO: *figsize* could be adapted depending on the number of columns - if more_components: - (fig, axes) = plt.subplots(12, 1, figsize=(16, 60)) - else: - (fig, axes) = plt.subplots(4, 1, figsize=(16, 16)) - for (component, axis) in enumerate(axes): - pd.Series( - components[component], - index=colnames).plot.bar( - ax=axis, - # colname is supposed to end with the 3-letters codon - color=[ - nuc2colour[colname[-1]] - for colname in colnames]) - axis.set_ylabel(f"weight in component {component}") - # axis.set_xticklabels(axis.get_xticklabels(), rotation=90) - fig.subplots_adjust(hspace=.5) - if figs_dir is not None and formats is not None: - for ext in formats: - plt.savefig( - figs_dir.joinpath(f"PCA_components.{ext}"), - metadata=fmt_metadata[ext]) - display(fig) - plt.close(fig) - - -def codon_usage_pca( - usage_data, - figs_dir=None, hue="chrom", exclude_cols=None, plot_more_components=False, - formats=None, cols_are_codons=True): - """ - Perform Principal Component Analysis on *usage_data*. - - DataFrame *usage_data* is expected to contain observations as lines - and contain numerical values in columns, without NaNs, corresponding - to codons, where the columns headers are supposed to match the following - pattern: <aa>_<codon>, where <aa> is a single-letter code for - an amino-acid, and <codon> is one of the 3-letter codons for this - amino-acid, in capital letters among A, T, G and C - (i.e. in the DNA alphabet). - - *hue* is expected to be one of the levels of the MultiIndex of - *usage_data*, and will be used to assign colours to the observations - in PCA plots. By default, colour is based on the content of a "chrom" - level in the index. - - If *figs_dir* is not None, this path to a directory will be used - to save graphics representing the projection of the observations - in the first four principal components (0 vs. 1 and 2 vs. 3). - - Unless *cols_are_codons* is set to False, there will also be - graphics representing the influence of each data column - on the first four principal components. - - *formats* should be a list of formats in which the figures should - be saved, such as "svg" or "png". - - If *exclude_cols* is not None, the columns with the names contained - in the iterable *exclude_cols* will not be included in the PCA analysis. 
- """ - if figs_dir is not None: - figs_dir = Path(figs_dir) - figs_dir.mkdir(parents=True, exist_ok=True) - if exclude_cols is not None: - usage_data = usage_data.drop(columns=exclude_cols) - pca = PCA().fit(usage_data) - transformed_data = pd.DataFrame( - pca.transform(usage_data), - index=usage_data.index).reset_index(level=hue) - render_md( - "Plotting genes on the first components\n") - if plot_more_components: - (fig, axes) = plt.subplots(3, 2, figsize=(16, 25)) - sns.scatterplot( - data=transformed_data, - x=0, y=1, hue=hue, marker=".", ax=axes[0,0]) - sns.scatterplot( - data=transformed_data, - x=2, y=3, hue=hue, marker=".", ax=axes[0,1]) - sns.scatterplot( - data=transformed_data, - x=4, y=5, hue=hue, marker=".", ax=axes[1,0]) - sns.scatterplot( - data=transformed_data, - x=6, y=7, hue=hue, marker=".", ax=axes[1,1]) - sns.scatterplot( - data=transformed_data, - x=8, y=9, hue=hue, marker=".", ax=axes[2,0]) - sns.scatterplot( - data=transformed_data, - x=10, y=11, hue=hue, marker=".", ax=axes[2,1]) - else: - (fig, axes) = plt.subplots(1, 2, figsize=(16, 8)) - sns.scatterplot( - data=transformed_data, - x=0, y=1, hue=hue, marker=".", ax=axes[0]) - sns.scatterplot( - data=transformed_data, - x=2, y=3, hue=hue, marker=".", ax=axes[1]) - - if figs_dir is not None and formats is not None: - for ext in formats: - plt.savefig( - figs_dir.joinpath(f"PCA_projections.{ext}"), - metadata=fmt_metadata[ext]) - display(fig) - plt.close(fig) - if cols_are_codons: - codon_influence_in_components( - pca.components_, usage_data.columns, - figs_dir=figs_dir, more_components=plot_more_components, formats=formats) - return (pca, transformed_data) - - -def centroid_usage(codon_counts, all_nan_cols): - """ - Define "centroids" for gene clusters, one per codon in *codon_counts*. - - The standardized usage biases of these centroids are computed so that - the codon proportions for a given amino-acid are the same as the global - proportions, except for the codons corresponding to a certain amino-acid. - For each amino-acid, there is one centroid per codon, where the - proportion for this codon is set to 1.0, and 0.0 for the other codons. - """ - check_aa_codon_columns(codon_counts) - summed_by_aa = codon_counts.groupby(level=0, axis=1).sum() - global_usage = codon_counts.sum(axis=0) - global_summed_by_aa = global_usage.groupby(level=0).sum() - global_proportions_by_aa = global_usage.div( - global_summed_by_aa) - codon_proportions_by_aa = codon_counts.div(summed_by_aa) - codon_usage_biases_by_aa = codon_proportions_by_aa.sub( - global_proportions_by_aa) - # _columns_by_aa may be different from columns_by_aa - # Groups of column headers for each amino-acid - # key: aa, value: list of (aa, codon) pairs - _columns_by_aa = groupby(itemgetter(0), codon_proportions_by_aa.columns) - - def generate_centroid_proportions(): - for (aa1, columns) in _columns_by_aa.items(): - for (_, codon1) in columns: - proportions = global_proportions_by_aa.copy() - for (aa2, codon2) in columns: - proportions[(aa2, codon2)] = 0.0 - proportions[(aa1, codon1)] = 1.0 - proportions.name = ( - f"{aa1}_{codon1}", f"{aa1}_{codon1}", - *np.full( - codon_proportions_by_aa.index.nlevels - 2, - np.nan)) - yield proportions - - render_md(""" -For each codon, a "centroid" (future center of a cluster of genes) -is defined where the proportion of this codon is set to 1.0. -For the other codons corresponding to the same amino-acid, the proportion -is therefore set to 0.0. 
-For codons corresponding to other amino-acids, the proportions are set to -the proportions observed in the global data. -""") - centroids_proportions_by_aa = pd.DataFrame( - generate_centroid_proportions()) - centroids_proportions_by_aa.index.rename( - codon_proportions_by_aa.index.names, - inplace=True) - - render_md(""" -The biases for the centroids are then computed by subtracting to those -proportions the proportions in the global data, and the biases are -standardized by dividing by the standard deviation of codon usage biases -in the data. -""") - # Compute biases using the same global proportions as the real data - # and standardize by the same standard deviations as the real data - centroids_scub_by_aa = centroids_proportions_by_aa.sub( - global_proportions_by_aa).div(codon_usage_biases_by_aa.std()).drop( - columns=all_nan_cols) - # centroids_scub_by_aa.head(3) - return centroids_scub_by_aa - - -def make_centroids_cluster_finder( - centroids_table, - aa): # pylint: disable=C0103 - """ - Make a function that, when applied to a row in the standardized - codon bias table, determines to what centroid among those - corresponding to *aa* it is the closest. - The *centroids_table* should contain standardized codon usage - biases for the centroids. - """ - # The columns that contain data pertaining to codons coding aa: - cols_for_aa = centroids_table.columns.get_level_values(0) == aa - cluster_names = [ - f"{aa}_{codon}" - for (aa, codon) in centroids_table.columns[cols_for_aa]] - - def closest_centroid(gene): - dists_to_centroids = {} - for cluster_name in cluster_names: - dists_to_centroids[cluster_name] = sqdist( - gene[cols_for_aa].values, - centroids_table.loc[cluster_name].iloc[:, cols_for_aa].values) - # We return the cluster_name (`min(...))[1]`) - # associated with the minimum distance - return min( - (dist, cluster_name) - for (cluster_name, dist) - in dists_to_centroids.items())[1] - return closest_centroid - - -def make_cluster_table(scub_table, centroids_scub_table): - """ - Make a table for the genes in standardized codon usage bias - table *scub_table* where each column tells, for a given - amino-acid to which centroid in *centroids_scub_table* - it is the closest. - """ - return pd.DataFrame( - { - (f"cluster_{aa}_full_bias", ""): scub_table.apply( - make_centroids_cluster_finder(centroids_scub_table, aa), - axis=1).values - for aa # pylint: disable=C0103 - in unique(centroids_scub_table.columns.get_level_values(0))}, - index=scub_table.index) - - -def compare_clusterings( - usage_table, - clust1_template="cluster_{aa}_full_bias", - clust2_template="cluster_{aa}_highest_bias"): - """ - Compare lists of genes in *usage_table* for the clusters - found in the columns whose name conform to - *clust1_template* and *clust2_template*. 
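-
- Illustrative call, assuming the index of *scub_with_clusters* carries both
- clustering series (the "kmeans" series name is an assumption):
-
-     compare_clusterings(
-         scub_with_clusters,
-         clust1_template="cluster_{aa}_full_bias",
-         clust2_template="cluster_{aa}_kmeans")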
- """ - crosstabs = {} - for aa in aa2colour.keys(): # pylint: disable=C0103 - if aa in {"M", "W"}: - continue - clust1_level = clust1_template.format(aa=aa) - clust2_level = clust2_template.format(aa=aa) - crosstabs[aa] = pd.crosstab( - usage_table.reset_index()[clust1_level], - usage_table.reset_index()[clust2_level]) - cols_for_aa = columns_by_aa[aa] - cluster_names = [ - f"{aa}_{codon}" - for (aa, codon) in usage_table[cols_for_aa].columns] - # Checking that the clusters are the same - same_clusters = True - for (clust_1, clust_2) in combinations(cluster_names, 2): - # assert crosstabs[aa][clust_1][clust_2] == 0 - try: - # if crosstabs[aa][clust_1][clust_2] != 0: - if (crosstabs[aa][clust_1][clust_2] != 0) \ - or (crosstabs[aa][clust_2][clust_1] != 0): - same_clusters = False - except KeyError: - # At least one cluster is absent. - # Is the absence symmetric? - miss_1_in_1 = clust_1 in usage_table.reset_index()[ - clust1_level] - miss_2_in_2 = clust_2 not in usage_table.reset_index()[ - clust2_level] - miss_1_in_2 = clust_1 in usage_table.reset_index()[ - clust2_level] - miss_2_in_1 = clust_2 not in usage_table.reset_index()[ - clust1_level] - if ( - (miss_1_in_1 != miss_1_in_2) - or (miss_2_in_1 != miss_2_in_2)): - same_clusters = False - if not same_clusters: - render_md(f"\nClusters for {aa} differ:\n") - else: - render_md(f"\nClusters for {aa} are identical:\n") - display(crosstabs[aa]) - - -def load_bias_table(table_path, nb_info_cols=9, nb_cluster_series=2): - """ - Load a table containing by-amino-acid codon usage biases. - - The table, located at *table_path*, should have tab-separated columns. - It is expected to contain a series of informative columns - preceding the columns containing the codon usage bias values. - - The first *nb_info_cols* columns contain information about gene - identifiers and features extracted from CDS annotation data. - - Then, there can be series of gene-clustering information, where each - column of the series corresponds to clusters defined based on - codon-usage biases among the codons coding an amino-acid. - There are no columns for amino-acids coded by only one codon (M and W). - *nb_cluster_series* specifies the number of such series. - - The result is a pandas DataFrame object, where all the above information - is encoded as a MultiIndex, the codon usage biases being in the - remaining columns. - """ - return load_table_with_info_index( - table_path, nb_info_cols, nb_cluster_series) - - -def star2stop(text): - """ - Replace stars with "stop", for use in file paths. - """ - return text.replace("*", "stop") - - -def write_cluster_lists( - usage_table, - aa, # pylint: disable=C0103 - clusters_dir, - cluster_level_template, - y_label_template, - alt_tag_kw="old_locus_tag"): - """ - Extract lists of genes in *usage_table** for the clusters - associated with codons for amino-acid *aa* and write the - resulting lists in a sub-folder (named based on *aa*) - of the directory *clusters_dir*. - The clusters are found in the column whose name conforms to - *cluster_level_template* - Also generate violin plots for each cluster. 
- """ - md_report = f"* Clusters for {aa}:\n\n" - aa_dir = clusters_dir.joinpath(star2stop(aa)) - aa_dir.mkdir(parents=True, exist_ok=True) - # key: cluster - # value: relative path to a file containing old locus tags - # for genes belonging to this cluster - relpaths_to_cluster_lists = {} - for (cluster, gene_list) in groupby(itemgetter(0), zip( - usage_table.index.get_level_values( - cluster_level_template.format(aa=aa)), - usage_table.index.get_level_values(alt_tag_kw))).items(): - path_to_clusterfile = aa_dir.joinpath(star2stop(f"{cluster}.txt")) - with path_to_clusterfile.open("w") as clust_fh: - clust_fh.write("\n".join(map(itemgetter(1), gene_list))) - relpaths_to_cluster_lists[cluster] = str( - path_to_clusterfile.relative_to('.')) - md_report += "\n\n".join([ - f" - [{cluster}]({relpath_to_list})" - for (cluster, relpath_to_list) in relpaths_to_cluster_lists.items()]) - violin_usage_by_clusters( - usage_table, - aa, - y_label_template, - cluster_level_template=cluster_level_template, - vertical=True) - path_to_fig = aa_dir.joinpath(star2stop( - f"usage_biases_violin_plots_by_cluster_for_{aa}.png")) - plt.savefig(path_to_fig, metadata=fmt_metadata["png"]) - plt.close() - relpath_to_fig = str(path_to_fig.relative_to('.')) - md_report += ( - f"\n\n - [Violin plots for {aa} clusters]({relpath_to_fig})\n\n") - return md_report - - -def find_valley(values): - """ - Try to find a cutting threshold for the *values*, - where the corresponding density distribution is lowest. - """ - if len(set(values)) == 1: - (single_val,) = set(values) - return single_val - min_val = min(values) - max_val = max(values) - val_range = max_val - min_val - # We use 100 bins - try: - val_density = gaussian_kde(values).evaluate( - np.linspace(min_val, max_val, 100)) - except LinAlgError as err: - print(f"min: {min_val}, max: {max_val}, N: {len(values)}.") - print(str(err)) - raise - # Where is the density lowest? (excluding first and last) - min_density_idx = val_density[1:-1].argmin() - # Finding the value corresponding to the index in the `val_density` array: - return min_val + (min_density_idx * val_range / 100) - - -def find_most_biased_genes(data, do_retries=True, alt_tag_kw="old_locus_tag"): - """ - Compute a usage bias threshold hopefully separating the most biased genes - from the remainder of the cluster where *data* contains codon usage biases - for the genes in the cluster and has an *alt_tag_kw* level in its index. - Return the threshold in terms of usage bias and the list of genes above it. - - /!\ This is likely very sensitive to the shape - of the usage bias distribution. - """ - thresh = find_valley(data.values) - # Try avoiding cases with threshold "misplaced" at the bottom - # of the distribution - retries = 0 - while do_retries and thresh < 0: - retries += 1 - thresh = find_valley(data.sort_values().iloc[retries:].values) - top_genes = data[(data > thresh)].index.get_level_values(alt_tag_kw).values - return (thresh, top_genes, retries) - - -def extract_top_genes_from_cluster( - usage_table, - aa, # pylint: disable=C0103 - codon, - clusters_dir, - axis=None, savefig=True, alt_tag_kw="old_locus_tag"): - """ - Return a few lines of markdown with links pointing to - where the saved results are expected to be found. 
- """ - md_report = f" - Most biased genes for {codon} ({aa}):\n\n" - aa_dir = clusters_dir.joinpath(star2stop(aa)) - # Files where to save the results - top_dir = aa_dir.joinpath("top_genes") - path_to_top = top_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) - path_to_top25 = top_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) - path_to_fig = top_dir.joinpath(star2stop( - f"usage_biases_violin_plot_{aa}_{codon}.png")) - # Load the list of genes belonging to the cluster - # associated with a high usage bias for *codon* - path_to_clusterfile = aa_dir.joinpath(star2stop(f"{aa}_{codon}.txt")) - with open(path_to_clusterfile) as gene_list_fh: - genes = [line.strip() for line in gene_list_fh] - # Extract the usage bias data for these genes and for this codon - idx_level = usage_table.index.names.index(alt_tag_kw) - try: - # data = usage_table.loc[genes][(aa, codon)] - data = usage_table.loc[ - (*(slice(None) for _ in range(idx_level)), genes), - (aa, codon)] - except KeyError: - print(genes[0]) - print(usage_table.index.names) - raise - q3 = data.quantile(0.75) # pylint: disable=C0103 - top25 = data[(data > q3)].index.get_level_values( - alt_tag_kw).values - (thresh, top_genes, retries) = find_most_biased_genes( - data, do_retries=True, alt_tag_kw=alt_tag_kw) - if retries != 0: - print( - f"Bottom-most {retries} values had to be discarded in order " - f"to avoid a negative threshold for the cluster {aa}_{codon}.") - top_genes_check = set( - usage_table[ - (usage_table[(aa, codon)] > thresh)].index.get_level_values( - alt_tag_kw).values) - if set(top_genes) != top_genes_check: - assert set(top_genes).issubset(top_genes_check) - extra_genes_not_in_cluster = top_genes_check - set(top_genes) - print( - f"{len(extra_genes_not_in_cluster)} genes were found " - f"with a high bias for {codon} but not belonging " - f"to the cluster {aa}_{codon}.") - # For visual verification purpose, - # display where the threshold is situated on a violin plot - axis = violin_with_thresh( - data, [(thresh, "red"), (q3, "blue")], - f"standardized usage bias for {codon} ({aa})", - facecolor=aa2colour[aa], axis=axis) - # Save the figure and close it. - plt.tight_layout() - if savefig: - top_dir.mkdir(parents=True, exist_ok=True) - plt.savefig(path_to_fig, metadata={'creationDate': None}) - plt.close() - # Save the list of the top genes - with open(path_to_top25, "w") as gene_list_fh: - gene_list_fh.write("\n".join(top25)) - with open(path_to_top, "w") as gene_list_fh: - gene_list_fh.write("\n".join(top_genes)) - relpath_to_top25 = str(path_to_top25.relative_to('.')) - relpath_to_top = str(path_to_top.relative_to('.')) - relpath_to_fig = str(path_to_fig.relative_to('.')) - md_report += "\n\n".join([ - f" * [top 25% genes]({relpath_to_top25})", - f" * [top genes]({relpath_to_top})", - f" * [violin plot with thresholds ({q3:.3}, " - f"{thresh:.3})]({relpath_to_fig})"]) - return md_report - - -# TODO: -def plot_codon_usage_for_gene_list( - gene_list_path, - usage_table, - aa, # pylint: disable=C0103 - codon, - figs_dir=None, - axis=None, savefig=True, alt_tag_kw="old_locus_tag"): - """ - Make a violin plot of codon usage biases for genes in file *gene_list_path* - using data in *usage_table*. - The plot will represent the distribution of usage bias - for codon *codon* of amino-acid *aa*. - The plot will be saved inside *figs_dir* if *savefig* is set to True. - Gene identifiers are expected to be *alt_tag_kw* identifiers. 
- """ - md_report = f" - Genes from {gene_list_path}:\n\n" - with open(gene_list_path) as gene_list_fh: - genes = [ - line.strip() for line in gene_list_fh - if line[0] != "#"] - # Extract the usage bias data for these genes and for this codon - idx_level = usage_table.index.names.index(alt_tag_kw) - try: - # data = usage_table.loc[genes][(aa, codon)] - data = usage_table.loc[ - (*(slice(None) for _ in range(idx_level)), genes), - (aa, codon)] - except KeyError: - print( - f"Genes are supposed to be provided as {alt_tag_kw} identifiers.") - print(genes[0]) - print(usage_table.index.names) - raise - q3 = data.quantile(0.75) # pylint: disable=C0103 - # top25 = data[(data > q3)].index.get_level_values( - # alt_tag_kw).values - (thresh, top_genes, retries) = find_most_biased_genes( - data, do_retries=True, alt_tag_kw=alt_tag_kw) - if retries != 0: - print( - f"Bottom-most {retries} values (out of {len(data)}) " - "had to be discarded in order to avoid a negative threshold " - f"for the cluster {aa}_{codon}.") - top_genes_check = set( - usage_table[ - (usage_table[(aa, codon)] > thresh)].index.get_level_values( - alt_tag_kw).values) - if set(top_genes) != top_genes_check: - assert set(top_genes).issubset(top_genes_check) - extra_genes_not_in_cluster = top_genes_check - set(top_genes) - print( - f"{len(extra_genes_not_in_cluster)} genes were found " - f"with a high bias for {codon} but not belonging to " - f"the cluster {aa}_{codon}.") - # For visual verification purpose, - # display where the threshold is situated on a violin plot - axis = violin_with_thresh( - data, [(thresh, "red"), (q3, "blue")], - f"standardized usage bias for {codon} ({aa})", - facecolor=aa2colour[aa], axis=axis) - # Save the figure and close it. - plt.tight_layout() - if not savefig: - return md_report - if figs_dir is None: - raise ValueError("No *figs_dir* directory specified.\n") - aa_dir = figs_dir.joinpath(star2stop(aa)) - # Files where to save the results - aa_dir.mkdir(parents=True, exist_ok=True) - path_to_fig = aa_dir.joinpath(star2stop( - f"usage_biases_violin_plot_{aa}_{codon}.png")) - plt.savefig(path_to_fig, metadata=fmt_metadata["png"]) - plt.close() - # Save the list of the top genes - # path_to_top = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top.txt")) - # path_to_top25 = aa_dir.joinpath(star2stop(f"{aa}_{codon}_top25.txt")) - # with open(path_to_top25, "w") as gene_list_fh: - # gene_list_fh.write("\n".join(top25)) - # with open(path_to_top, "w") as gene_list_fh: - # gene_list_fh.write("\n".join(top_genes)) - # relpath_to_top25 = str(path_to_top25.relative_to('.')) - # relpath_to_top = str(path_to_top.relative_to('.')) - relpath_to_fig = str(path_to_fig.relative_to('.')) - md_report += "\n\n".join([ - # f" * [top 25% genes]({relpath_to_top25})", - # f" * [top genes]({relpath_to_top})", - f" * [violin plot with thresholds ({q3:.3}, " - f"{thresh:.3})]({relpath_to_fig})"]) - return md_report - - -def boxplot_usage(usage_table, ylabel, whiskers="1.5 IQR"): - """ - Plot a boxplot from pandas DataFrame *usage_table*. - - Use *y_label* as y-axis label. 
- """ - nb_genes = usage_table.shape[0] - if whiskers == "1.5 IQR": - axis = usage_table.plot.box(figsize=(18, 6), rot=90, sym=".") - axis.set_title( - f"Distributions of codon usage biases among {nb_genes} " - "genes (whiskers at 1.5 IQR).") - elif whiskers == "5 percent tails": - axis = usage_table.plot.box( - figsize=(18, 6), rot=90, sym=".", whis=(5, 95)) - axis.set_title( - f"Distributions of codon usage biases among {nb_genes} " - "genes (whiskers at the 5th and 95th percentiles).") - axis.set_ylabel(ylabel) - - -def to_long_form(usage_table, ylabel, others=None): - """ - Transform data in *usage_table* into "long form". - - The goal is to be able to use aa as hue in violin plots. - - See https://pandas.pydata.org/docs/user_guide/10min.html#stack - and https://seaborn.pydata.org/generated/seaborn.violinplot.html - - *y_label* is the column header you want to set for the usage values. - - *others* should be a list of extra information you want to extract - from the index (which is expected to be a pandas MultiIndex). - """ - col_nb_levels = usage_table.columns.nlevels - row_nb_levels = usage_table.index.nlevels - long_form = usage_table.stack( - level=list(range(col_nb_levels))).to_frame().reset_index( - level=list(range( - row_nb_levels, row_nb_levels + col_nb_levels))) - if others is None: - others = [] - if col_nb_levels == 1: - long_form.columns = ["aa", ylabel] - others = [other for other in others if other != "aa"] - elif col_nb_levels == 2: - long_form.columns = ["aa", "codon", ylabel] - others = [other for other in others if other not in {"aa", "codon"}] - else: - raise NotImplementedError( - "Don't know how to deal with column headers " - "with more than 2 levels ('aa' and 'codon')\n") - long_form = long_form.reset_index(level=[ - lvl_name for lvl_name in usage_table.index.names - if lvl_name.startswith("cluster")]) - if not set(others).issubset(set(long_form.index.names)): - print(set(others)) - print(set(long_form.index.names)) - raise ValueError( - "All elements in *others* should be in the index " - "of *usage_table*") - return long_form.assign(**{ - other: long_form.index.get_level_values(other) for other in others}) - - -def variable2order(variable): - """ - Define the order of amino-acids or codons. - """ - if variable == "aa": - return list(columns_by_aa.keys()) - if variable == "codon": - return [codon for (_, codon) in concat(columns_by_aa.values())] - raise ValueError("variable can only be 'aa' or 'codon'.\n") - - -def format_codon_labels(codons): - """ - Add amino-acid information to codons meant to be used as tick labels. - """ - return [ - plt.Text(x, y, f"{codon} ({codon2aa[codon]})") - for (x, y, codon) in map(attrgetter("_x", "_y", "_text"), codons)] - - -def violin_usage( - usage_table, variable, ylabel, - hue="aa", axis=None, **violin_kwargs): - """ - Plot violin plots of codon usage biases. - - *variable* should be either "codon" or "aa". 
- """ - if hue in {"aa", "codon"}: - dodge = False - nb_violins = 1 - else: - dodge = True - # dodge = True implies there will be one violin - # per possible value of *hue*, side by side - nb_violins = len(usage_table.index.get_level_values(hue).unique()) - long_form = to_long_form(usage_table, ylabel, others=[hue]) - if axis is None: - _, axis = plt.subplots(figsize=(18 * nb_violins, 6)) - do_legend = True - else: - do_legend = False - if hue == "aa": - palette = aa2colour - else: - palette = None - kwargs = { - "x": variable, "y": ylabel, "order": variable2order(variable), - "hue": hue, "palette": palette, "dodge": dodge, - "data": long_form, "ax": axis, "orient": "v", "scale": "count"} - kwargs.update(violin_kwargs) - axis = sns.violinplot(**kwargs) - if do_legend: - plt.legend(bbox_to_anchor=(1.01, 1), borderaxespad=0) - if variable == "codon": - ticklabels = format_codon_labels(axis.get_xticklabels()) - else: - ticklabels = axis.get_xticklabels() - # TODO: Expect issues when variable == "aa" - axis.set_xticklabels(ticklabels, rotation=90) - return axis - - -def violin_usage_vertical( - usage_table, variable, ylabel, - hue="aa", axis=None, **violin_kwargs): - """ - Plot vertically stacked violin plots of codon usage biases. - - *variable* should be either "codon" or "aa". - """ - if hue in {"aa", "codon"}: - dodge = False - nb_violins = 1 - else: - dodge = True - # dodge = True implies there will be one violin - # per possible value of *hue*, side by side - nb_violins = len(usage_table.index.get_level_values(hue).unique()) - long_form = to_long_form(usage_table, ylabel, others=[hue]) - if axis is None: - if variable == "codon": - base_height = 44 - elif variable == "aa": - base_height = 18 - else: - raise NotImplementedError(f"Unsupported variable: {variable}.") - _, axis = plt.subplots(figsize=(6, base_height * nb_violins)) - if hue == "aa": - palette = aa2colour - else: - palette = None - kwargs = { - "y": variable, "x": ylabel, - "order": variable2order(variable), - "hue": hue, "palette": palette, "dodge": dodge, - "data": long_form, "ax": axis, "orient": "h", "scale": "count"} - kwargs.update(violin_kwargs) - axis = sns.violinplot(**kwargs) - if variable == "codon": - ticklabels = format_codon_labels(axis.get_yticklabels()) - axis.set_yticklabels(ticklabels) - return axis - - -def violin_usage_by_clusters(usage_with_clusters, aa, # pylint: disable=C0103 - ylabel_template, - cluster_level_template="cluster_{aa}", - vertical=False, **violin_kwargs): - """ - Plot a series of violin plots for each cluster of genes. - - The clusters are defined in usage table *usage_with_clusters* - based on usage biases for the codons coding for amino-acid *aa*. - - *usage_with_clusters* should have clustering indications as - an index level named according to *cluster_level_template* - (default "cluster_{aa}"), where {aa} is to be replaced by *aa*. - """ - clusters = usage_with_clusters.groupby( - level=cluster_level_template.format(aa=aa)) - # To adjust figure size - if "hue" in violin_kwargs and violin_kwargs["hue"] not in {"aa", "codon"}: - # dodge will be set to True in later plotting call. 
-        # This implies that there will be one violin
-        # per possible value of *hue*, side by side
-        nb_violins = len(usage_with_clusters.index.get_level_values(
-            violin_kwargs["hue"]).unique())
-    else:
-        nb_violins = 1
-    if vertical:
-        fig, axes = plt.subplots(
-            ncols=clusters.ngroups,
-            constrained_layout=True,
-            figsize=(6 * clusters.ngroups, 44 * nb_violins))
-    else:
-        fig, axes = plt.subplots(
-            nrows=clusters.ngroups,
-            constrained_layout=True,
-            figsize=(18 * nb_violins, 6 * clusters.ngroups))
-    # plt.subplots returns a bare Axes (not an array) when there is only
-    # one cluster; wrap it so that the zip below always works.
-    axes = np.atleast_1d(axes)
-    for ((cluster, usage_table), axis) in zip(clusters, axes):
-        kwargs = {"axis": axis}
-        kwargs.update(violin_kwargs)
-        if vertical:
-            violin_usage_vertical(
-                usage_table, "codon",
-                ylabel_template.format(aa=aa, cluster=cluster),
-                **kwargs)
-        else:
-            violin_usage(
-                usage_table, "codon",
-                ylabel_template.format(aa=aa, cluster=cluster),
-                **kwargs)
-    return fig
-
-
-def violin_usage_by_clusters_splitby(
-        usage_with_clusters, aa,  # pylint: disable=C0103
-        ylabel_template,
-        cluster_level_template="cluster_{aa}",
-        vertical=False, by_lvl="chrom",
-        **violin_kwargs):
-    """
-    Plot a series of violin plots for each cluster of genes,
-    splitting the figures according to groups defined by the
-    content of the index level *by_lvl* of *usage_with_clusters*.
-
-    The clusters are defined in usage table *usage_with_clusters*
-    based on usage biases for the codons coding for amino-acid *aa*.
-
-    *usage_with_clusters* should have clustering indications as
-    an index level named according to *cluster_level_template*
-    (default "cluster_{aa}"), where {aa} is to be replaced by *aa*.
-    """
-    classes = sorted(set(
-        usage_with_clusters.index.get_level_values(by_lvl)))
-    idx_level = usage_with_clusters.index.names.index(by_lvl)
-    for clss in classes:
-        fig = violin_usage_by_clusters(
-            usage_with_clusters.loc[
-                (*(slice(None) for _ in range(idx_level)), clss), ],
-            aa, ylabel_template + f" for {clss}",
-            cluster_level_template=cluster_level_template,
-            vertical=vertical, **violin_kwargs)
-        fig.suptitle(f"Violin plots for {clss}")
-
-
-# Based on
-# https://matplotlib.org/stable/gallery/statistics/customized_violin.html
-def adjacent_values(q0, q1, q3, q4):  # pylint: disable=C0103
-    """
-    Define whisker positions using the 1.5 * IQR criterion.
-
-    Use the values of quartiles *q0* (min), *q1*, *q3* and *q4* (max).
-    """
-    upper_adjacent_value = q3 + (q3 - q1) * 1.5
-    upper_adjacent_value = np.clip(upper_adjacent_value, q3, q4)
-    lower_adjacent_value = q1 - (q3 - q1) * 1.5
-    lower_adjacent_value = np.clip(lower_adjacent_value, q0, q1)
-    return (lower_adjacent_value, upper_adjacent_value)
-
-
-def violin_with_thresh(data, thresh, ylabel,
-                       facecolor="grey", axis=None, **violin_kwargs):
-    """
-    Draw a violin plot from *data* with a horizontal line at value *thresh*.
-
-    If *thresh* is a single value, the line will be red.
-    If *thresh* contains pairs, it will be assumed that these
-    are (threshold, colour) pairs, to be used to plot various lines.
-    Y-axis will be labeled using *ylabel*.
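-
-    Hypothetical usage sketch (added for illustration, not part of the
-    original module; *data* is assumed to be a pandas Series of usage
-    biases for a single codon)::
-
-        axis = violin_with_thresh(
-            data, [(1.5, "red"), (data.quantile(0.75), "blue")],
-            "standardized usage bias for GCT (A)")
-        plt.tight_layout()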
- """ - if axis is None: - _, axis = plt.subplots(figsize=(2, 3.5)) - kwargs = {"showextrema": False} - kwargs.update(violin_kwargs) - violin_parts = axis.violinplot(data, **kwargs) - # https://stackoverflow.com/a/26291582/1878788 - [body] = violin_parts["bodies"] - body.set_facecolor(facecolor) - body.set_edgecolor("black") - try: - (val, col) = thresh[0] - except TypeError: - thresh = [(thresh, "red")] - for (val, col) in thresh: - axis.axhline(val, color=col) - axis.text( - 1.01, val, f"{val:.3}", - horizontalalignment="left", verticalalignment="bottom") - (q0, q1, median, q3, q4) = np.percentile( # pylint: disable=C0103 - data, [0, 25, 50, 75, 100]) - (w_min, w_max) = adjacent_values(q0, q1, q3, q4) - axis.scatter(1, median, marker="o", s=30, color="white", zorder=3) - axis.vlines(1, q1, q3, linestyle="-", linewidth=5, color="black") - axis.vlines(1, w_min, w_max, linestyle="-", linewidth=1, color="black") - axis.set_xticks([]) - axis.set_ylabel(ylabel) - return axis - - -if __name__ == "__main__": - import doctest - doctest.testmod() -- GitLab