From 08452c07145fff9957b7d7e930b300d7c0599f55 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Mon, 13 May 2019 11:57:19 +0200
Subject: [PATCH] Made libhts a submodule.

---
 .gitmodules               |   3 +
 libhts                    |   1 +
 libhts/.gitignore         |  11 -
 libhts/install.sh         |   5 -
 libhts/libhts/__init__.py |  12 -
 libhts/libhts/libhts.py   | 786 --------------------------------------
 libhts/setup.py           |  33 --
 7 files changed, 4 insertions(+), 847 deletions(-)
 create mode 160000 libhts
 delete mode 100644 libhts/.gitignore
 delete mode 100755 libhts/install.sh
 delete mode 100644 libhts/libhts/__init__.py
 delete mode 100644 libhts/libhts/libhts.py
 delete mode 100644 libhts/setup.py

diff --git a/.gitmodules b/.gitmodules
index f723ad7..6430ed0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,3 +7,6 @@
 [submodule "libworkflows"]
 	path = libworkflows
 	url = git@gitlab.pasteur.fr:bli/libworkflows.git
+[submodule "libhts"]
+	path = libhts
+	url = git@gitlab.pasteur.fr:bli/libhts.git
diff --git a/libhts b/libhts
new file mode 160000
index 0000000..7e25a23
--- /dev/null
+++ b/libhts
@@ -0,0 +1 @@
+Subproject commit 7e25a230845896610a1a11a8ba91a51c824e9d0f
diff --git a/libhts/.gitignore b/libhts/.gitignore
deleted file mode 100644
index b4e1667..0000000
--- a/libhts/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-# Compiled python modules.
-*.pyc
-
-# Setuptools distribution folder.
-/dist/
-
-# Python egg metadata, regenerated from source files by setuptools.
-/*.egg-info
-
-# Backups
-*~
diff --git a/libhts/install.sh b/libhts/install.sh
deleted file mode 100755
index 5ddc41a..0000000
--- a/libhts/install.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/sh
-python3.6 setup.py build_ext
-# .egg-link does not work with PYTHONPATH ?
-python3.6 -m pip install -e .
-python3.6 -m pip install  --no-deps --ignore-installed .
diff --git a/libhts/libhts/__init__.py b/libhts/libhts/__init__.py
deleted file mode 100644
index a980938..0000000
--- a/libhts/libhts/__init__.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from .libhts import (
-    aligner2min_mapq,
-    gtf_2_genes_exon_lengths,
-    id_list_gtf2bed,
-    make_empty_bigwig,
-    make_seeding_function,
-    median_ratio_to_pseudo_ref_size_factors,
-    plot_boxplots, plot_counts_distribution, plot_histo,
-    plot_lfc_distribution, plot_MA,
-    plot_norm_correlations, plot_paired_scatters, plot_scatter,
-    repeat_bed_2_lengths, size_factor_correlations,
-    spikein_gtf_2_lengths, status_setter)
diff --git a/libhts/libhts/libhts.py b/libhts/libhts/libhts.py
deleted file mode 100644
index 5932deb..0000000
--- a/libhts/libhts/libhts.py
+++ /dev/null
@@ -1,786 +0,0 @@
-from math import floor, ceil, sqrt, log
-from functools import reduce
-from re import sub
-import warnings
-from libworkflows import texscape
-
-
-def formatwarning(message, category, filename, lineno, line):
-    """Used to format warning messages."""
-    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
-
-
-warnings.formatwarning = formatwarning
-
-import numpy as np
-import pandas as pd
-# To compute correlation coefficient, and compute linear regression
-from scipy.stats.stats import pearsonr, linregress
-# To compute geometric mean
-from scipy.stats.mstats import gmean
-import matplotlib as mpl
-import matplotlib.pyplot as plt
-# https://stackoverflow.com/a/42768093/1878788
-from matplotlib.backends.backend_pgf import FigureCanvasPgf
-mpl.backend_bases.register_backend('pdf', FigureCanvasPgf)
-TEX_PARAMS = {
-    "text.usetex": True,            # use LaTeX to write all text
-    "pgf.rcfonts": False,           # Ignore Matplotlibrc
-    "pgf.texsystem": "lualatex",  # hoping to avoid memory issues
-    "pgf.preamble": [
-        r'\usepackage{color}'     # xcolor for colours
-    ]
-}
-mpl.rcParams.update(TEX_PARAMS)
-import seaborn as sns
-# from rpy2.robjects import r, pandas2ri, Formula, StrVector
-# as_df = r("as.data.frame")
-# from rpy2.rinterface import RRuntimeError
-# from rpy2.robjects.packages import importr
-# deseq2 = importr("DESeq2")
-from pybedtools import BedTool
-import pyBigWig
-import networkx as nx
-
-
-class Exon(object):
-    __slots__ = ("chrom", "start", "end")
-    def __init__(self, chrom, start, end):
-        self.chrom = chrom
-        self.start = start
-        self.end = end
-
-    def overlap(self, other):
-        if self.chrom != other.chrom:
-            return False
-        return (self.start <= other.start < self.end) or (other.start <= self.start < other.end)
-
-    def merge(self, other):
-        # Not necessary: can be indirectly linked
-        #assert overlap(self, other)
-        return Exon(self.chrom, min(self.start, other.start), max(self.end, other.end))
-
-    def __len__(self):
-        return self.end - self.start
-
-overlap = Exon.overlap
-merge = Exon.merge
-
-class Gene(object):
-    """This object contains information obtained from a gtf file."""
-    __slots__ = ("gene_id", "exons", "union_exon_length")
-    def __init__(self, gene_id):
-        self.gene_id = gene_id
-        #self.transcripts = {}
-        self.exons = nx.Graph()
-        self.union_exon_length = None
-
-    #def add_transcript(self, feature):
-    #    the_id = feature.attrs["transcript_id"]
-    #    assert the_id not in self.transcripts
-    #    self.transcripts[the_id] = feature
-
-    def add_exon(self, feature):
-        #the_id = feature.attrs["exon_id"]
-        #assert the_id not in self.exons
-        #self.exons[the_id] = feature
-        exon = Exon(feature.chrom, feature.start, feature.end)
-        if exon not in self.exons:
-            self.exons.add_node(exon)
-
-    # The merging cannot be done on the full BedTool because we dont want
-    # to merge together exons not belonging to the same gene.
-    def set_union_exon_length(self):
-        """The exons are used to make a BedTool, which enables convenient merging of
-        overlapping features. The sum of the lengths of the merged exons is returned."""
-        if len(self.exons) == 1:
-            # No need to merge when there is only one exon
-            self.union_exon_length = len(next(iter(self.exons.nodes())))
-        else:
-            # Too slow
-            #self.union_exon_length = sum(map(
-            #    len, BedTool(self.exons.values()).merge().features()))
-            #self.union_exon_length = 0
-            # We group nodes that overlap, and merge them
-            #overlapping_exons = nx.quotient_graph(self.exons, overlap)
-            #for node in overlapping_exons.nodes():
-            #    mex = reduce(merge, node)
-            #    self.union_exon_length += len(mex)
-            self.union_exon_length = sum((len(reduce(
-                merge, node)) for node in nx.quotient_graph(self.exons, overlap).nodes()))
-
-
-def gtf_2_genes_exon_lengths(gtf_filename):
-    """Returns a pandas DataFrame where union exon lengths are associated to gene IDs."""
-    gtf_file = open(gtf_filename, "r")
-    gtf = BedTool(gtf_file)
-    genes = {}
-    for feature in gtf.features():
-        feat_type = feature[2]
-        if feat_type != "exon":
-            continue
-        attrs = feature.attrs
-        gene_id = attrs["gene_id"]
-        if gene_id not in genes:
-            genes[gene_id] = Gene(gene_id)
-        gene = genes[gene_id]
-        try:
-            gene.add_exon(feature)
-        except AssertionError:
-            # A given exon may be registered for several transcripts, hence several gtf entries
-            already = gene.exons[feature.attrs["exon_id"]]
-            assert already.attrs["transcript_id"] != feature.attrs["transcript_id"]
-            assert (already.start, already.end) == (feature.start, feature.end)
-    for gene in genes.values():
-        gene.set_union_exon_length()
-    return pd.DataFrame(pd.Series(
-        {gene.gene_id : gene.union_exon_length for gene in genes.values()},
-        name=("union_exon_len")).rename_axis("gene"))
-
-
-def repeat_bed_2_lengths(repeat_bed):
-    """Computes the lengths of repetitive elements in a bed file, grouped by families.
-    This assumes that the elements have their names composed of the family name,
-    then a colon, then a number. For instance:
-    Simple_repeat|Simple_repeat|(TTTTTTG)n:1
-    Simple_repeat|Simple_repeat|(TTTTTTG)n:2
-    Simple_repeat|Simple_repeat|(TTTTTTG)n:3
-    Simple_repeat|Simple_repeat|(TTTTTTG)n:4
-    -> Simple_repeat|Simple_repeat|(TTTTTTG)n
-    Returns a DataFrame associating the summed lengths to the family names.
-    """
-    # usecols=[1, 2, 3]: start, end, id
-    # index_col=2: id (relative to the selected columns)
-    start_ends = pd.read_table(repeat_bed, usecols=[1, 2, 3], header=None, index_col=2)
-    # bed lengths
-    lens = start_ends[2] - start_ends[1]
-    lens.name = "union_exon_len"
-    repeat_families = [":".join(name.split(":")[:-1]) for name in start_ends.index]
-    # The reads assigned to a repeated element can come
-    # from the summed length of all the members of the family
-    # We call this "gene" for convenience and compatibility
-    return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()
-
-
-def spikein_gtf_2_lengths(spikein_gtf):
-    """Computes the lengths of spike-ins, grouped by families.
-    Returns a DataFrame associating the summed lengths to the spike-in names.
-    """
-    spikein_ends = {}
-    with open(spikein_gtf) as gtf_file:
-        for line in gtf_file:
-            fields = line.strip().split("\t")
-            spikein_ends[fields[0]] = int(fields[4])
-    return pd.DataFrame(pd.Series(
-        spikein_ends,
-        name=("union_exon_len")).rename_axis("gene"))
-
-
-def id_list_gtf2bed(identifiers, gtf_filename, feature_type="transcript", id_kwd="gene_id"):
-    """
-    Extract bed coordinates of an iterable of identifiers from a gtf file.
-
-    *identifiers* is the iterable of identifiers.
-    *gtf_filename* is the name of the gtf file.
-    *feature_type* is the type of feature to be considered
-    in the gtf file (third columns).
-    *id_kwd* is the keyword under which the feature ID is expected to be found
-    in the feature annotations in the gtf_file. These feature IDs will be
-    matched against the elements in *identifiers*.
-    """
-    with open(gtf_filename, "r") as gtf_file:
-        gtf = BedTool(gtf_file)
-        if identifiers:
-            ids = set(identifiers)
-            def feature_filter(feature):
-                return feature[2] == feature_type and feature[id_kwd] in ids
-            return gtf.filter(feature_filter)
-        else:
-            # https://stackoverflow.com/a/13243870/1878788
-            def empty_bed_generator():
-                return
-                yield
-            return empty_bed_generator()
-
-
-def make_empty_bigwig(filename, chrom_sizes):
-    """Writes *filename* so that it is an empty bigwig file.
-    *chrom_sizes is a dictionary giving chromosome sizes* given
-    chomosome names.
-    """
-    bw_out = pyBigWig.open(filename, "w")
-    bw_out.addHeader(list(chrom_sizes.items()))
-    for (chrom, chrom_len) in bw_out.chroms().items():
-        bw_out.addEntries(
-            chrom, 0,
-            values=np.nan_to_num(np.zeros(chrom_len)[0::10]),
-            span=10, step=10)
-    bw_out.close()
-
-
-#################
-# Bowtie2 stuff #
-#################
-def zero(value):
-    return 0
-
-
-def identity(value):
-    return value
-
-
-bowtie2_function_selector = {
-    "C": zero,
-    "L": identity,
-    "S": sqrt,
-    "G": log}
-
-
-def make_seeding_function(seeding_string):
-    """Generates a function that computes the seeding pattern given a
-    string representing bowtie2 seeding settings (-L and -i options).
-    
-    >>> make_seeding_function("-L 6 -i S,1,0.8")(18)
-    [[0, 6], [4, 10], [8, 14], [12, 18]]
-    """
-    [opt1, val1, opt2, val2] = seeding_string.split()
-    if opt1 == "-L":
-        assert opt2 == "-i"
-        seed_len = int(val1)
-        interval_string = val2
-    else:
-        assert opt2 == "-L"
-        seed_len = int(val2)
-        interval_string = val1
-    [func_type, constant, coeff] = interval_string.split(",")
-    constant = float(constant)
-    coeff = float(coeff)
-    func_type = bowtie2_function_selector[func_type]
-    def seeding_function(read_len):
-        interval = floor(constant + (coeff * func_type(read_len)))
-        seeds = []
-        seed = [0, seed_len]
-        while seed[1] <= read_len:
-            seeds.append(seed)
-            next_seed_start = seed[0] + interval
-            seed = [next_seed_start, next_seed_start + seed_len]
-        return seeds
-    return seeding_function
-
-
-def aligner2min_mapq(aligner, wildcards):
-    """
-    Option to filter on MAPQ value in featureCounts.
-
-    What minimal MAPQ value should a read have to be considered uniquely mapped?
-    See <https://sequencing.qcfail.com/articles/mapq-values-are-really-useful-but-their-implementation-is-a-mess/>.
-    """
-    try:
-        mapping_type = wildcards.mapping_type
-    except AttributeError:
-        mapping_type = wildcards.mapped_type
-    if mapping_type.startswith("unique_"):
-        if aligner == "hisat2":
-            return "-Q 60"
-        elif aligner == "bowtie2":
-            return "-Q 23"
-        else:
-            raise NotImplementedError(f"{aligner} not handled (yet?)")
-    else:
-        return ""
-
-
-## Not sure this is a good idea...
-# def masked_gmean(a, axis=0, dtype=None):
-#     """Modified from stats.py."""
-#     # Converts the data into a masked array
-#     ma = np.ma.masked_invalid(a)
-#     # Apply gmean
-#     if not isinstance(ma, np.ndarray):
-#         # if not an ndarray object attempt to convert it
-#         log_a = np.log(np.array(ma, dtype=dtype))
-#     elif dtype:
-#         # Must change the default dtype allowing array type
-#         if isinstance(ma, np.ma.MaskedArray):
-#             log_a = np.log(np.ma.asarray(ma, dtype=dtype))
-#         else:
-#             log_a = np.log(np.asarray(ma, dtype=dtype))
-#     else:
-#         log_a = np.log(ma)
-#     return np.exp(log_a.mean(axis=axis))
-
-
-
-def median_ratio_to_pseudo_ref_size_factors(counts_data):
-    """Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106)
-    All libraries are used to define a pseudo-reference, which has
-    the geometric mean across libraries for a given gene in *counts_data*.
-    For a given library, the median across genes of the ratios to the
-    pseudo-reference is used as size factor."""
-    # Add pseudo-count to compute the geometric mean, then remove it
-    #pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
-    # Ignore lines with zeroes instead (may be bad for IP: many zeroes expected):
-    pseudo_ref = (counts_data[counts_data.prod(axis=1) > 0]).apply(gmean, axis=1)
-    # Ignore lines with only zeroes
-    # pseudo_ref = (counts_data[counts_data.sum(axis=1) > 0]).apply(masked_gmean, axis=1)
-    def median_ratio_to_pseudo_ref(col):
-        return (col / pseudo_ref).median()
-    #size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
-    median_ratios = counts_data[counts_data.prod(axis=1) > 0].apply(
-        median_ratio_to_pseudo_ref, axis=0)
-    # Not sure fillna(0) is appropriate
-    if any(median_ratios.isna()):
-        msg = "Could not compute median ratios to pseudo reference.\n"
-        warnings.warn(msg)
-        return median_ratios.fillna(1)
-    else:
-        return median_ratios
-
-
-def size_factor_correlations(counts_data, summaries, normalizer):
-    """Is there a correlation, across libraries, between normalized values and size factors?
-    The size factor type *normalizer* is either computed or taken from *summaries*.
-    The normalized data are computed by dividing *counts_data* by this size factor."""
-    if normalizer == "median_ratio_to_pseudo_ref":
-        size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
-    else:
-        size_factors = summaries.loc[normalizer]
-    #by_norm = counts_data / size_factors
-    def compute_pearsonr_with_size_factor(row):
-        return pearsonr(row, size_factors)[0]
-    #return by_norm.apply(compute_pearsonr_with_size_factor, axis=1)
-    return (counts_data / size_factors).apply(compute_pearsonr_with_size_factor, axis=1)
-
-
-def plot_norm_correlations(correlations):
-    #fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, sharex=True)
-    #correlations.plot.kde(ax=ax1)
-    #sns.violinplot(data=correlations, orient="h", ax=ax2)
-    #ax2.set_xlabel("Pearson correlation coefficient")
-    usetex = mpl.rcParams.get("text.usetex", False)
-    if usetex:
-        correlations.columns = [texscape(colname) for colname in correlations.columns]
-    axis = sns.violinplot(data=correlations, cut=0)
-    axis.set_ylabel("Pearson correlation coefficient")
-
-
-def plot_counts_distribution(data, xlabel):
-    # TODO: try to plot with semilog x axis
-    #axis = data.plot.kde(legend=None)
-    #axis.set_xlabel(xlabel)
-    #axis.legend(ncol=len(REPS))
-    usetex = mpl.rcParams.get("text.usetex", False)
-    if usetex:
-        xlabel = texscape(xlabel)
-        data.columns = [texscape(colname) for colname in data.columns]
-    try:
-        axis = data.plot.kde()
-    except ValueError as e:
-        msg = "".join([
-            "There seems to be a problem with the data.\n",
-            "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))])
-        warnings.warn(msg)
-        raise
-    axis.set_xlabel(xlabel)
-
-
-def plot_histo(outfile, data, title=None):
-    fig = plt.figure(figsize=(15,7))
-    ax = fig.add_subplot(111)
-    ax.set_xlim([data.index[0] - 0.5, data.index[-1] + 0.5])
-    #ax.set_ylim([0, 100])
-    bar_width = 0.8
-    #letter2legend = dict(zip("ACGT", "ACGT"))
-    usetex = mpl.rcParams.get("text.usetex", False)
-    if usetex:
-        data.columns = [texscape(colname) for colname in data.columns]
-        #title = sub("_", r"\_", title)
-        title = texscape(title)
-    for (read_len, count) in data.iterrows():
-        plt.bar(
-            read_len,
-            count,
-            align="center",
-            width=bar_width)
-            #color=letter2colour[letter],
-            #label=letter2legend[letter])
-    ax.legend()
-    ax.set_xticks(data.index)
-    ax.set_xticklabels(data.index)
-    ax.set_xlabel("read length")
-    ax.set_ylabel("number of reads")
-    plt.setp(ax.get_xticklabels(), rotation=90)
-    if title is not None:
-        plt.title(title)
-    ## debug
-    try:
-        plt.savefig(outfile)
-    except RuntimeError as e:
-        print(data.index)
-        print(title)
-        raise
-    ##
-
-
-def plot_boxplots(data, ylabel):
-    fig = plt.figure(figsize=(6, 12))
-    axis = fig.add_subplot(111)
-    usetex = mpl.rcParams.get("text.usetex", False)
-    if usetex:
-        ylabel = texscape(ylabel)
-        data.columns = [texscape(colname) for colname in data.columns]
-    data.plot.box(ax=axis)
-    axis.set_ylabel(ylabel)
-    for label in axis.get_xticklabels():
-        label.set_rotation(90)
-    plt.tight_layout()
-
-
-############
-# DE stuff #
-############
-# def do_deseq2(cond_names, conditions, counts_data,
-#               formula=None, contrast=None, deseq2_args=None):
-#     """Runs a DESeq2 differential expression analysis."""
-#     if formula is None:
-#         formula = Formula("~ lib")
-#     if contrast is None:
-#         # FIXME: MUT and REF are not defined
-#         # Maybe just make (formula and) contrast mandatory
-#         contrast = StrVector(["lib", MUT, REF])
-#     if deseq2_args is None:
-#         deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
-#     col_data = pd.DataFrame(conditions).assign(
-#         cond_name=pd.Series(cond_names).values).set_index("cond_name")
-#     # In case we want contrasts between factor combinations
-#     if ("lib" in col_data.columns) and ("treat" in col_data.columns):
-#         col_data = col_data.assign(
-#             lib_treat = ["%s_%s" % (lib, treat) for (lib, treat) in zip(
-#                 col_data["lib"], col_data["treat"])])
-#     # http://stackoverflow.com/a/31206596/1878788
-#     pandas2ri.activate()  # makes some conversions automatic
-#     # r_counts_data = pandas2ri.py2ri(counts_data)
-#     # r_col_data = pandas2ri.py2ri(col_data)
-#     # r.DESeqDataSetFromMatrix(countData=r_counts_data, colData=r_col_data, design=Formula("~lib"))
-#     dds = deseq2.DESeqDataSetFromMatrix(
-#         countData=counts_data,
-#         colData=col_data,
-#         design=formula)
-#     # dds = deseq2.DESeq(dds, betaPrior=deseq2_args["betaPrior"])
-#     # Decompose into the 3 steps to have more control on the options
-#     try:
-#         dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="ratio")
-#     except RRuntimeError as e:
-#         if sum(counts_data.prod(axis=1)) == 0:
-#             msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
-#                            "This is probably because every gene has at least one zero.\n",
-#                            "We will try to use the \"poscounts\" method instead."])
-#             warnings.warn(msg)
-#             try:
-#                 dds = deseq2.estimateSizeFactors_DESeqDataSet(dds, type="poscounts")
-#             except RRuntimeError as e:
-#                 msg = "".join(["Error occurred in estimateSizeFactors:\n%s\n" % e,
-#                                "We give up."])
-#                 warnings.warn(msg)
-#                 raise
-#                 #print(counts_data.dtypes)
-#                 #print(counts_data.columns)
-#                 #print(len(counts_data))
-#                 #raise
-#         else:
-#             raise
-#     size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
-#     #for cond in cond_names:
-#     #    #s = size_factors.loc[cond][0]
-#     #    #(*_, s) = size_factors.loc[cond]
-#     #pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
-#     try:
-#         dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
-#     except RRuntimeError as e:
-#         msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
-#                     "We will try with fitType=\"local\"."])
-#         warnings.warn(msg)
-#         try:
-#             dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="local")
-#         except RRuntimeError as e:
-#             msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
-#                         "We will try with fitType=\"mean\"."])
-#             warnings.warn(msg)
-#             try:
-#                 dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="mean")
-#             except RRuntimeError as e:
-#                 msg = "".join(["Error occurred in estimateDispersions:\n%s\n" % e,
-#                             "We give up."])
-#                 warnings.warn(msg)
-#                 raise
-#     dds = deseq2.nbinomWaldTest(dds, betaPrior=deseq2_args["betaPrior"])
-#     res = pandas2ri.ri2py(as_df(deseq2.results(
-#         dds,
-#         contrast=contrast,
-#         addMLE=deseq2_args["addMLE"],
-#         independentFiltering=deseq2_args["independentFiltering"])))
-#     res.index = counts_data.index
-#     return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
-
-
-# Cutoffs in log fold change
-LFC_CUTOFFS = [0.5, 1, 2]
-UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
-DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
-
-
-def status_setter(lfc_cutoffs=None, fold_type="log2FoldChange"):
-    """*fold_type* can also be "lfcMLE", which is based on uncorrected values.
-    This may not be good for genes with low expression levels."""
-    if lfc_cutoffs is None:
-        lfc_cutoffs = LFC_CUTOFFS
-    def set_status(row):
-        """Determines the up- or down-regulation status corresponding to a given
-        row of a deseq2 results table."""
-        if row["padj"] < 0.05:
-            #if row["log2FoldChange"] > 0:
-            lfc = row[fold_type]
-            if lfc > 0:
-                # Start from the highest cutoff,
-                # and decrease until below lfc
-                for cutoff in sorted(lfc_cutoffs, reverse=True):
-                    if lfc > cutoff:
-                        return f"up{cutoff}"
-                return "up"
-            else:
-                for cutoff in sorted(lfc_cutoffs, reverse=True):
-                    if lfc < -cutoff:
-                        return f"down{cutoff}"
-                return "down"
-        else:
-            return "NS"
-    return set_status
-
-
-# res = res.assign(is_DE=res.apply(set_de_status, axis=1))
-def set_de_status(row):
-    """Determines whether a gene is differentially expressed (DE) of not (NS)
-    based on the adjusted p-value in row of a deseq2 results table."""
-    if row["padj"] < 0.05:
-        return "DE"
-    else:
-        return "NS"
-DE2COLOUR = {
-    # black
-    "DE" : "k",
-    # pale grey
-    "NS" : "0.85"}
-
-
-def plot_lfc_distribution(res, contrast, fold_type=None):
-    """*fold_type* is "log2FoldChange" by default.
-    It can also be "lfcMLE", which is based on uncorrected values.
-    This may not be good for genes with low expression levels."""
-    #lfc = res.lfcMLE.dropna()
-    if fold_type is None:
-        fold_type = "log2FoldChange"
-    lfc = getattr(res, fold_type).dropna()
-    usetex = mpl.rcParams.get("text.usetex", False)
-    if usetex:
-        lfc.name = texscape(contrast)
-    else:
-        lfc.name = contrast
-        # lfc.columns = [texscape(colname) for colname in lfc.columns]
-    axis = sns.kdeplot(lfc)
-    axis.set_xlabel(fold_type)
-    axis.set_ylabel("frequency")
-
-
-def make_status2colour(down_statuses, up_statuses):
-    statuses = list(reversed(down_statuses)) + ["down", "NS", "up"] + up_statuses
-    return dict(zip(statuses, sns.color_palette("coolwarm", len(statuses))))
-
-
-STATUS2COLOUR = make_status2colour(DOWN_STATUSES, UP_STATUSES)
-# TODO: use other labelling than logfold or gene lists, i.e. biotype
-def plot_MA(res,
-            grouping=None,
-            group2colour=None,
-            mean_range=None,
-            lfc_range=None,
-            fold_type=None):
-    """*fold_type* is "log2FoldChange" by default.
-    It can also be "lfcMLE", which is based on uncorrected values.
-    This may not be good for genes with low expression levels."""
-    if not len(res):
-        raise ValueError("No data to plot.")
-    fig, ax = plt.subplots()
-    # Make a column indicating whether the gene is DE or NS
-    data = res.assign(is_DE=res.apply(set_de_status, axis=1))
-    x_column = "baseMean"
-    data = data.assign(logx=np.log10(data[x_column]))
-    if fold_type is None:
-        y_column = "log2FoldChange"
-    else:
-        y_column = fold_type
-    usetex = mpl.rcParams.get("text.usetex", False)
-    def scatter_group(group, label, colour, size=1):
-        """Plots the data in *group* on the scatterplot."""
-        if usetex:
-            label = texscape(label)
-        group.plot.scatter(
-            #x=x_column,
-            x="logx",
-            y=y_column,
-            s=size,
-            #logx=True,
-            c=colour,
-            label=label, ax=ax)
-    if usetex:
-        data.columns = [texscape(colname) for colname in data.columns]
-        y_column = texscape(y_column)
-        de_status_column = "is\_DE"
-    else:
-        de_status_column = "is_DE"
-    # First plot the data in grey and black
-    for (de_status, group) in data.groupby(de_status_column):
-        label = f"{de_status} ({len(group)})"
-        colour = DE2COLOUR[de_status]
-        scatter_group(group, label, colour, size=2)
-    if grouping is not None:
-        if isinstance(grouping, str):
-            # Overlay colours based on the "grouping" column
-            if group2colour is None:
-                group2colour = STATUS2COLOUR
-            for status, group in data.groupby(grouping):
-                label = f"{status} ({len(group)})"
-                colour = group2colour[status]
-                scatter_group(group, label, colour)
-        else:
-            (status, colour) = group2colour
-            row_indices = data.index.intersection(grouping)
-            try:
-                label=f"{status} ({len(row_indices)})"
-                scatter_group(data.ix[row_indices], label, colour)
-            except ValueError as e:
-                if str(e) != "scatter requires x column to be numeric":
-                    print(data.ix[row_indices])
-                    raise
-                else:
-                    warnings.warn(f"Nothing to plot for {status}\n")
-    ax.axhline(y=1, linewidth=0.5, color="0.5", linestyle="dashed")
-    ax.axhline(y=-1, linewidth=0.5, color="0.5", linestyle="dashed")
-    # TODO: check data basemean range
-    if mean_range is not None:
-        # ax.set_xlim(mean_range)
-        ax.set_xlim(np.log10(mean_range))
-    if lfc_range is not None:
-        (lfc_min, lfc_max) = lfc_range
-        lfc_here_min = getattr(data, y_column).min()
-        lfc_here_max = getattr(data, y_column).max()
-        if (lfc_here_min < lfc_min) or (lfc_here_max > lfc_max):
-            warnings.warn(f"Cannot plot {y_column} data ([{lfc_here_min}, {lfc_here_max}]) in requested range ([{lfc_min}, {lfc_max}])\n")
-        else:
-            ax.set_ylim(lfc_range)
-    # https://stackoverflow.com/a/24867320/1878788
-    x_ticks = np.arange(
-        floor(np.ma.masked_invalid(data["logx"]).min()),
-        ceil(np.ma.masked_invalid(data["logx"]).max()),
-        1)
-    x_ticklabels = [r"$10^{{{}}}$".format(tick) for tick in x_ticks]
-    plt.xticks(x_ticks, x_ticklabels)
-    ax.set_xlabel(x_column)
-
-
-def plot_scatter(data,
-                 x_column,
-                 y_column,
-                 regression=False,
-                 grouping=None,
-                 group2colour=None,
-                 x_range=None,
-                 y_range=None,
-                 axes_style=None):
-    if not len(data):
-        raise ValueError("No data to plot.")
-    fig, ax = plt.subplots()
-    # ax.set_adjustable('box')
-    # First plot the data in grey
-    data.plot.scatter(
-        x=x_column, y=y_column,
-        s=2, c="black", alpha=0.15, edgecolors='none',
-        ax=ax)
-    if regression:
-        linreg = linregress(data[[x_column, y_column]].dropna())
-        a = linreg.slope
-        b = linreg.intercept
-        def fit(x):
-            return (a * x) + b
-        min_x = data[[x_column]].min()[0]
-        max_x = data[[x_column]].max()[0]
-        min_y = fit(min_x)
-        max_y = fit(max_x)
-        xfit, yfit = (min_x, max_x), (min_y, max_y)
-        ax.plot(xfit, yfit, linewidth=0.5, color="0.5", linestyle="dashed")
-    # Overlay colour points
-    if grouping is not None:
-        if isinstance(grouping, str):
-            # Determine colours based on the "grouping" column
-            if group2colour is None:
-                statuses = data[grouping].unique()
-                group2colour = dict(zip(statuses, sns.color_palette("colorblind", len(statuses))))
-            for (status, group) in data.groupby(grouping):
-                group.plot.scatter(
-                    x=x_column, y=y_column, s=1, c=group2colour[status],
-                    label=f"{status} ({len(group)})", ax=ax)
-        else:
-            # Apply a colour to a list of genes
-            (status, colour) = group2colour
-            row_indices = data.index.intersection(grouping)
-            try:
-                data.ix[row_indices].plot.scatter(
-                    x=x_column, y=y_column, s=1, c=colour,
-                    label=f"{status} ({len(row_indices)})", ax=ax)
-            except ValueError as e:
-                if str(e) != "scatter requires x column to be numeric":
-                    print(data.ix[row_indices])
-                    raise
-                else:
-                    warnings.warn(f"Nothing to plot for {status}\n")
-    if axes_style is None:
-        axes_style = {"linewidth": 0.5, "color": "0.5", "linestyle": "dashed"}
-    ax.axhline(y=0, **axes_style)
-    ax.axvline(x=0, **axes_style)
-    # ax.axhline(y=0, linewidth=0.5, color="0.5", linestyle="dashed")
-    # ax.axvline(x=0, linewidth=0.5, color="0.5", linestyle="dashed")
-    # Set axis limits
-    if x_range is not None:
-        (x_min, x_max) = x_range
-        x_here_min = getattr(data, x_column).min()
-        x_here_max = getattr(data, x_column).max()
-        if (x_here_min < x_min) or (x_here_max > x_max):
-            warnings.warn(f"Cannot plot {x_column} data ([{x_here_min}, {x_here_max}]) in requested range ([{x_min}, {x_max}])\n")
-        else:
-            ax.set_xlim(x_range)
-    if y_range is not None:
-        (y_min, y_max) = y_range
-        y_here_min = getattr(data, y_column).min()
-        y_here_max = getattr(data, y_column).max()
-        if (y_here_min < y_min) or (y_here_max > y_max):
-            warnings.warn(f"Cannot plot {y_column} data ([{y_here_min}, {y_here_max}]) in requested range ([{y_min}, {y_max}])\n")
-        else:
-            ax.set_ylim(y_range)
-    return ax
-
-
-def plot_paired_scatters(data, columns=None, hue=None, log_log=False):
-    """Alternative to pairplot, in order to avoid histograms on the diagonal."""
-    if columns is None:
-        columns = data.columns
-    usetex = mpl.rcParams.get("text.usetex", False)
-    if usetex:
-        data.columns = [texscape(colname) for colname in data.columns]
-        columns = [texscape(colname) for colname in columns]
-    g = sns.PairGrid(data, vars=columns, hue=hue, size=8)
-    #g.map_offdiag(plt.scatter, marker=".")
-    g.map_lower(plt.scatter, marker=".")
-    if log_log:
-        for ax in g.axes.ravel():
-            ax.set_xscale('log')
-            ax.set_yscale('log')
-    g.add_legend()
diff --git a/libhts/setup.py b/libhts/setup.py
deleted file mode 100644
index cbe5dac..0000000
--- a/libhts/setup.py
+++ /dev/null
@@ -1,33 +0,0 @@
-from setuptools import setup, find_packages
-#from Cython.Build import cythonize
-
-name = "libhts"
-
-# Adapted from Biopython
-__version__ = "Undefined"
-for line in open("%s/__init__.py" % name):
-    if (line.startswith('__version__')):
-        exec(line.strip())
-
-
-setup(
-    name=name,
-    version=__version__,
-    description="Miscellaneous things to process high throughput sequencing data.",
-    author="Blaise Li",
-    author_email="blaise.li@normalesup.org",
-    license="MIT",
-    packages=find_packages(),
-    install_requires=[
-        # "libworkflows",
-        "matplotlib",
-        "networkx",
-        "numpy",
-        "pandas",
-        "pyBigWig",
-        "pybedtools",
-        "scipy",
-        "seaborn"])
-    #ext_modules = cythonize("libsmallrna/libsmallrna.pyx"),
-    #install_requires=["cytoolz"],
-    #zip_safe=False
-- 
GitLab