From 424f87f93a738ad257afb42c02d7bcdd5ffc849c Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Mon, 13 May 2019 12:36:53 +0200
Subject: [PATCH] Made libsmallrna a submodule.

---
 .gitmodules                             |   3 +
 libsmallrna                             |   1 +
 libsmallrna/.gitignore                  |  11 -
 libsmallrna/install.sh                  |   5 -
 libsmallrna/libsmallrna/__init__.py     |  16 -
 libsmallrna/libsmallrna/libsmallrna.pyx | 903 ------------------------
 libsmallrna/setup.py                    |  15 -
 7 files changed, 4 insertions(+), 950 deletions(-)
 create mode 160000 libsmallrna
 delete mode 100644 libsmallrna/.gitignore
 delete mode 100755 libsmallrna/install.sh
 delete mode 100644 libsmallrna/libsmallrna/__init__.py
 delete mode 100755 libsmallrna/libsmallrna/libsmallrna.pyx
 delete mode 100644 libsmallrna/setup.py

diff --git a/.gitmodules b/.gitmodules
index 6b2a212..d4b3d75 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -13,3 +13,6 @@
 [submodule "libdeseq"]
 	path = libdeseq
 	url = git@gitlab.pasteur.fr:bli/libdeseq.git
+[submodule "libsmallrna"]
+	path = libsmallrna
+	url = git@gitlab.pasteur.fr:bli/libsmallrna.git
diff --git a/libsmallrna b/libsmallrna
new file mode 160000
index 0000000..af0b527
--- /dev/null
+++ b/libsmallrna
@@ -0,0 +1 @@
+Subproject commit af0b5271e3f1ec2f4bcd4c43e108d3d442e2897b
diff --git a/libsmallrna/.gitignore b/libsmallrna/.gitignore
deleted file mode 100644
index b4e1667..0000000
--- a/libsmallrna/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-# Compiled python modules.
-*.pyc
-
-# Setuptools distribution folder.
-/dist/
-
-# Python egg metadata, regenerated from source files by setuptools.
-/*.egg-info
-
-# Backups
-*~
diff --git a/libsmallrna/install.sh b/libsmallrna/install.sh
deleted file mode 100755
index 5ddc41a..0000000
--- a/libsmallrna/install.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/sh
-python3.6 setup.py build_ext
-# .egg-link does not work with PYTHONPATH ?
-python3.6 -m pip install -e .
-python3.6 -m pip install  --no-deps --ignore-installed .
diff --git a/libsmallrna/libsmallrna/__init__.py b/libsmallrna/libsmallrna/__init__.py
deleted file mode 100644
index e98d0e1..0000000
--- a/libsmallrna/libsmallrna/__init__.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from .libsmallrna import (
-    has_pi_signature, has_endosi_signature, has_26G_signature, has_csr1_endosi_signature,
-    count_small,
-    count_annots, count_pimis, count_sis, count_all_sis, count_nucl, count_first_bases,
-    add_compositions, add_results, Composition, get_read_info,
-    PI_MIN, PI_MAX, SI_MIN, SI_MAX,
-    SI_SUFFIXES, SI_PREFIXES,
-    SMALL_TYPES_TREE,
-    type2RNA,
-    types_under,
-    SMALL_TYPES,
-    SI_TYPES,
-    SIU_TYPES,
-    SISIU_TYPES,
-    RMSK_SISIU_TYPES,
-    JOINED_SMALL_TYPES)
diff --git a/libsmallrna/libsmallrna/libsmallrna.pyx b/libsmallrna/libsmallrna/libsmallrna.pyx
deleted file mode 100755
index 1019087..0000000
--- a/libsmallrna/libsmallrna/libsmallrna.pyx
+++ /dev/null
@@ -1,903 +0,0 @@
-#!/usr/bin/env python3
-# vim: set fileencoding=<utf-8> :
-# cython: language_level=3
-"""
-This module provides things used in small RNA annotation.
-
-TODO: New criteria for small RNA identification:
-- piRNA
-- miRNA
-- all_si (no signature criteria):
-    - 22G antisense (actually 21-23, including on such things as rRNA):
-        - te_22G
-        - prot_22G
-        - pseu_22G
-        - satel_22G
-        - simprep_22G
-    - 26G antisense (including on such things as rRNA):
-        - te_26G
-        - prot_26G
-        - pseu_26G
-        - satel_26G
-        - simprep_26G
-
-Note: We don't consider 22G(T+) or 26G(T+).
-The reliable ones should come from unmapped poly-T-trimmed remapped (all_siu, etc.)
-
-TODO: Update small_RNAs.pdf
-"""
-
-# http://stackoverflow.com/a/23575771/1878788
-# Also works by setting language_level as comment
-# at the top of the file.
-#from __future__ import print_function
-import warnings
-from sys import stderr
-from collections import defaultdict
-from cytoolz import merge_with, frequencies
-from itertools import chain
-from operator import add
-from functools import reduce
-#from pysam import AlignedSegment
-
-#############################################
-# Declaring small types and their hierarchy #
-#############################################
-cdef unsigned int CPI_MIN = 21
-cdef unsigned int CPI_MAX = 26
-cdef unsigned int CSI_MIN = 22
-cdef unsigned int CSI_MAX = 26
-PI_MIN = CPI_MIN
-PI_MAX = CPI_MAX
-SI_MIN = CSI_MIN
-SI_MAX = CSI_MAX
-
-
-PIMI = {"piRNA", "miRNA"}
-CDS_UTR = {"protein_coding_CDS", "protein_coding_UTR"}
-PROTEIN_CODING = CDS_UTR | {"protein_coding_pure_intron"}
-TE = {"DNA_transposons_rmsk", "RNA_transposons_rmsk"}
-ANNOTABLE_SI_TARGETS = PROTEIN_CODING | TE | {"pseudogene"} | {"satellites_rmsk", "simple_repeats_rmsk"}
-ALLOWED_SI_TARGETS = ANNOTABLE_SI_TARGETS
-
-# annotation names that seem to be frequent and that are not protein-coding or TE
-# "22G" alone occurs when the read was not associated with relevant annotations
-COMMON_NO_TARGET = {"piRNA", "miRNA", "no_signature", "22G"}
-COMMON_NO_ENDOSI = {"piRNA", "miRNA", "no_signature"}
-# Possible read signatures for endo-siRNAs
-# ENDOSI_SIG = {f"{SI_MIN}G", f"{SI_MAX}G", f"csr1_{SI_MIN}G"}
-# Stricter version, does not allow poly-U tail (should have been trimmed)
-ENDOSI_SIG = {f"{SI_MIN}G", f"{SI_MAX}G"}
-
-RMSK_PREFIXES = ["te", "satel", "simrep"]
-PROT_PREFIXES = ["prot"]
-PSEU_PREFIXES = ["pseu"]
-SI_PREFIXES = [*PROT_PREFIXES, *PSEU_PREFIXES, *RMSK_PREFIXES]
-SI_SUFFIXES = ENDOSI_SIG
-# Should we have only non-overlapping sets ?
-SMALL_TYPES_TREE = {
-    "si": {suffix: {
-        #"joined" : ["_".join(["si", suffix])],
-        "rmsk": ["_".join([prefix, "si", suffix]) for prefix in RMSK_PREFIXES],
-        "prot": ["_".join([prefix, "si", suffix]) for prefix in PROT_PREFIXES],
-        "pseu": ["_".join([prefix, "si", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES},
-    "siu": {suffix: {
-        #"joined" : ["_".join(["siu", suffix])],
-        "rmsk": ["_".join([prefix, "siu", suffix]) for prefix in RMSK_PREFIXES],
-        "prot": ["_".join([prefix, "siu", suffix]) for prefix in PROT_PREFIXES],
-        "pseu": ["_".join([prefix, "siu", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES},
-    "pi": ["pi"],
-    "mi": ["mi"]
-}
-
-SISIU_TREE_EXTENSION = {
-    "sisiu": {suffix: {
-        #"joined" : ["_".join(["sisiu", suffix])],
-        "rmsk": ["_".join([prefix, "sisiu", suffix]) for prefix in RMSK_PREFIXES],
-        "prot": ["_".join([prefix, "sisiu", suffix]) for prefix in PROT_PREFIXES],
-        "pseu": ["_".join([prefix, "sisiu", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES}}
-
-
-def types_under(typekeys, tree=SMALL_TYPES_TREE):
-    """
-    Generate type names corresponding to the branching according to the
-    succession of keys *typekeys* in the *tree* dictionary of (ultimately)
-    lists.
-    """
-    if typekeys:
-        yield from types_under(typekeys[1:], tree[typekeys[0]])
-    else:
-        if isinstance(tree, dict):
-            for subtree in tree.values():
-                yield from types_under([], subtree)
-        else:
-            yield from tree
-
-
-SMALL_TYPES = list(types_under([]))
-SI_TYPES = list(types_under(["si"]))
-SIU_TYPES = list(types_under(["siu"]))
-SISIU_TYPES = list(types_under([], tree=SISIU_TREE_EXTENSION))
-RMSK_SISIU_TYPES = set(chain(
-    types_under(["si", f"{SI_MIN}G", "rmsk"]),
-    types_under(["si", f"{SI_MAX}G", "rmsk"]),
-    types_under(["siu", f"{SI_MIN}G", "rmsk"]),
-    types_under(["siu", f"{SI_MAX}G", "rmsk"])))
-JOINED_SMALL_TYPES = [
-    *[f"si_{suffix}" for suffix in SI_SUFFIXES],
-    *[f"siu_{suffix}" for suffix in SI_SUFFIXES],
-    f"pimi{SI_MIN}G"]
-
-
-def type2RNA(small_types):
-    """Append "RNA" to the string or list of strings *small_types*."""
-    if isinstance(small_types, (str, bytes)):
-        return f"{small_types}RNA"
-    else:
-        return list(map(type2RNA, small_types))
-
-
-FQ_TEMPLATE = b"@%s\n%s\n+\n%s\n"
-
-DNA_COMPL_TABLE = str.maketrans("ACGTRYSWKMBDHVN", "TGCAYRWSMKVHDBN")
-
-cdef str creverse_complement(str seq):
-    return seq.translate(DNA_COMPL_TABLE)[::-1]
-
-
-cdef int BAM_FREVERSE = 16
-
-
-cdef object cget_read_info(object ali):
-    """Extracts information from AlignedSegment *ali*."""
-    if (ali.flag & BAM_FREVERSE) != 0:
-        return (
-            ali.query_name,
-            creverse_complement(ali.query_sequence),
-            # ali.query_qualities[::-1],
-            ali.qual[::-1],
-            ali.query_length,
-            ali.reference_name,
-            # five prime pos:
-            ali.reference_end - 1,
-            ali.reference_start,
-            ali.reference_end,
-            "-")
-    else:
-        return (
-            ali.query_name,
-            ali.query_sequence,
-            # ali.query_qualities,
-            ali.qual,
-            ali.query_length,
-            ali.reference_name,
-            # five prime pos:
-            ali.reference_start,
-            ali.reference_start,
-            ali.reference_end,
-            "+")
-
-
-def get_read_info(ali):
-    """Extracts information from AlignedSegment *ali*."""
-    return cget_read_info(ali)
-
-# def get_read_info(ali):
-#     """Extracts information from AlignedSegment *ali*."""
-#     if ali.is_reverse:
-#         return (
-#             ali.query_name,
-#             reverse_complement(ali.query_sequence),
-#             # ali.query_qualities[::-1],
-#             ali.qual[::-1],
-#             ali.query_length,
-#             ali.reference_start,
-#             ali.reference_end,
-#             "-")
-#     else:
-#         return (
-#             ali.query_name,
-#             ali.query_sequence,
-#             # ali.query_qualities,
-#             ali.qual,
-#             ali.query_length,
-#             ali.reference_start,
-#             ali.reference_end,
-#             "+")
-
-cdef bint chas_pi_signature(str seq, size_t read_len):
-    return read_len >= PI_MIN and seq[0] == "T" and read_len <= PI_MAX
-
-
-def has_pi_signature(str seq, int read_len):
-    f"""Return True if this is a {PI_MIN}U
-    (can actually have length up to {PI_MAX})."""
-    return chas_pi_signature(seq, read_len)
-
-
-cdef bint chas_endosi_signature(str seq, size_t read_len):
-    return (SI_MIN - 1) <= read_len <= (SI_MIN + 1) and seq[0] == "G"
-
-
-def has_endosi_signature(str seq, int read_len):
-    f"""Return True if this is a {SI_MIN}G."""
-    return chas_endosi_signature(seq, read_len)
-
-
-cdef bint chas_26G_signature(str seq, size_t read_len):
-    return read_len == SI_MAX and seq[0] == "G"
-
-
-def has_26G_signature(str seq, int read_len):
-    f"""Return True if this is a {SI_MAX}G."""
-    return chas_26G_signature(seq, read_len)
-
-
-# These should have problems mapping because the T-tail is not
-# in the genome sequence
-cdef bint chas_csr1_endosi_signature(str seq, int read_len):
-    # if read_len <= 22, t_tail will be the empty string
-    cdef str t_tail
-    t_tail = (read_len - SI_MIN) * "T"
-    if t_tail:
-        return (seq[0] == "G") and (seq[SI_MIN:] == t_tail)
-    else:
-        return False
-
-
-def has_csr1_endosi_signature(str seq, int read_len):
-    return chas_csr1_endosi_signature(seq, read_len)
-
-
-cdef bint chas_allsi_signature(str seq):
-    cdef unsigned int read_len = len(seq)
-    return (seq[0] == "G" and ((SI_MIN - 1) <= read_len <= SI_MAX)) or chas_csr1_endosi_signature(seq, read_len)
-
-
-def count_annots(annot_infos, queue):
-    """Returns a dictionary counting the annotation names in the
-    iterable *annot_infos*.
-    *annot_infos* is generated by process_annotations.
-    For the moment, nothing is done with the queue."""
-    return frequencies((annot_info[0][0] for annot_info in annot_infos))
-
-
-cdef class Composition:
-    cdef readonly int a, c, g, t
-    def __init__(self, int a=0, int c=0, int g=0, int t=0):
-        self.a = a
-        self.c = c
-        self.g = g
-        self.t = t
-    def __add__(self, other):
-        return Composition(
-            self.a + other.a,
-            self.c + other.c,
-            self.g + other.g,
-            self.t + other.t)
-    def __reduce__(self):
-        d = {}
-        d["a"] = self.a
-        d["c"] = self.c
-        d["g"] = self.g
-        d["t"] = self.t
-        return (Composition, (), d)
-    def __setstate__(self, d):
-        self.a = d["a"]
-        self.c = d["c"]
-        self.g = d["g"]
-        self.t = d["t"]
-
-
-def count_nucl(int read_len, str nucl):
-    if nucl == "A":
-        return {read_len: Composition(a=1)}
-    elif nucl == "C":
-        return {read_len: Composition(c=1)}
-    elif nucl == "G":
-        return {read_len: Composition(g=1)}
-    elif nucl == "T":
-        return {read_len: Composition(t=1)}
-    else:
-        return {read_len: Composition()}
-
-
-# sum doesn't work. Why?
-def my_sum(l):
-    return reduce(add, l)
-
-
-def add_compositions(comp1, comp2):
-    return merge_with(my_sum, comp1, comp2)
-    #return {read_len: comp1.get(read_len, Composition()) + comp2.get(read_len, Composition()) for read_len in chain(comp1.keys(), comp2.keys())}
-
-
-def count_first_bases(annot_infos, queue):
-    """Returns a dictionay {read_len: Composition} based on
-    the reads retrieved from the iterable *annot_infos*.
-    *annot_infos* is generated by process_annotations.
-    For the moment, nothing is done with the queue."""
-    compositions = defaultdict(Composition)
-    for annot_info in annot_infos:
-        (_, seq, _, read_len, _) = annot_info[1][1]
-        nucl = seq[0]
-        if nucl == "A":
-            compositions[read_len] += Composition(a=1)
-        elif nucl == "C":
-            compositions[read_len] += Composition(c=1)
-        elif nucl == "G":
-            compositions[read_len] += Composition(g=1)
-        elif nucl == "T":
-            compositions[read_len] += Composition(t=1)
-        else:
-            compositions[read_len] += Composition()
-    return compositions
-
-
-def find_matching_annot(annotations, seq):
-    """Finds among annotations one to consider as the real one."""
-    warnings.warn("randomly deciding between several piRNA or miRNA annotations")
-    return next(iter(annotations[0]))
-
-
-def count_small(annot_infos, write_queue):
-    """Return dictionaries counting for the gene names contained in the
-    iterable *annot_infos* and also, mixed in the same dictionaries, counts for
-    annotation types.
-
-    *annot_infos* comes from *process_annotations* defined in
-    *make_annotation_processor*. It consists in pairs ((annot_name, signature), annot_info).
-    *annot_name* is a string obtained by joining a mix of biotype information
-    (miRNA, piRNA, protein_coding, RNA_transposons_rmsk, ...) and read signature
-    information (nothing, 21U, 22G, 26G, csr1_22G, no_signature).
-    *signature* can be 21-26U, 22G, 26G, csr1_22G, no_signature or NA.
-    *annot_info* is a pair (annotations, (name, seq, qual, read_len, strand)).
-    *annotations* is a set of tuples (biotype, strand, start, end, gene_id).
-    """
-    cdef str annot_name, signature, name, seq, qual, strand
-    cdef bytes bname, bseq, bqual
-    counters = {small_type: defaultdict(int) for small_type in [
-        "pi", "mi", "all_si", f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]}
-    for ((annot_name, signature), annot_info) in annot_infos:
-        annotations, (name, seq, qual, _, strand) = annot_info
-        bname = name.encode("UTF-8")
-        bseq = seq.encode("UTF-8")
-        bqual = qual.encode("UTF-8")
-        fastq = FQ_TEMPLATE % (bname, bseq, bqual)
-        if annot_name == "miRNA":
-            try:
-                (annot,) = annotations
-            except ValueError:
-                print("Unexpected multiple miRNA annotation: %s" % ", ".join(
-                    map(str, annotations)), file=stderr)
-                annot = find_matching_annot(annotations, seq)
-                # exit(1)
-            write_queue.put(("miRNA", fastq))
-            counters["mi"][annot[4]] += 1
-            counters["mi"]["miRNA"] += 1
-        elif annot_name == "piRNA":
-            try:
-                (annot,) = annotations
-            except ValueError:
-                # print("Unexpected multiple piRNA annotation: %s" % ", ".join(
-                #     map(str, annotations)), stderr)
-                # TODO: implement this correctly
-                annot = find_matching_annot(annotations, seq)
-            write_queue.put(("piRNA", FQ_TEMPLATE % (bname, bseq, bqual)))
-            counters["pi"][annot[4]] += 1
-            counters["pi"]["piRNA"] += 1
-        else:
-            # Consider a "general" all_si
-            write_queue.put(("all_siRNA", fastq))
-            # Determine gene_id
-            ####################
-            gene_ids = {annot[4] for annot in annotations}
-            if len(gene_ids) == 0:
-                counters["all_si"]["unknown_id"] += 1
-            elif len(gene_ids) == 1:
-                (gene_id,) = gene_ids
-                counters["all_si"][gene_id] += 1
-            else:
-                counters["all_si"]["_and_".join(sorted(gene_ids))] += 1
-            counters["all_si"]["all_siRNA"] += 1
-            if signature in ENDOSI_SIG:
-                # TODO: make an intermediate category: all_si_{SI_MIN}G and all_si_{SI_MAX}G
-                # considering signature, but not annotation
-                write_queue.put((f"all_si_{signature}RNA", fastq))
-                if len(gene_ids) == 0:
-                    counters[f"all_si_{signature}"]["unknown_id"] += 1
-                elif len(gene_ids) == 1:
-                    (gene_id,) = gene_ids
-                    counters[f"all_si_{signature}"][gene_id] += 1
-                else:
-                    counters[f"all_si_{signature}"]["_and_".join(sorted(gene_ids))] += 1
-                counters[f"all_si_{signature}"][f"all_si_{signature}RNA"] += 1
-                # From (annotations, (name, seq, qual, read_len, strand) pairs,
-                # filter out from annotations those that are same_strand with
-                # respect to the read
-                #
-                # Reason 1: for protein_coding or TE annotations, we are only
-                # interested in RDRP-generated small RNAs, which target an RNA
-                #
-                # Reason 2: for the other annotations, we consider that
-                # same_strand is likely some degradation product, and that the
-                # library preparation is good enough to make such cases
-                # negligible
-
-                antisense_annots = [
-                    annot for annot in annotations if annot[1] != strand]
-
-                # Eliminate reads that are antisense to some annotation other
-                # than protein_coding, TE, pseudogene, satellites or simple
-                # repeats
-                # The reason is that the real target of the small RNA can be
-                # the other, and not the protein_coding
-                antisense_biotypes = {annot[0] for annot in antisense_annots}
-                # Note: before 16/06/2017, satellites and simple_repeats were
-                # in ALLOWED but not in ANNOTABLE
-                # We added them to ALLOWED following a seminar by Susan Gasser
-                if antisense_biotypes - ALLOWED_SI_TARGETS:
-                    # Report as ambiguous if any annot[0] corresponds to
-                    # categories that we want to annotate
-                    if antisense_biotypes & ANNOTABLE_SI_TARGETS:
-                        # We have a mix of "not ALLOWED" and "ANNOTABLE"
-                        write_queue.put((
-                            "ambiguous_type",
-                            "%s\n" % "_and_".join(sorted(antisense_biotypes))))
-                    continue
-                # At this point, we know that antisense_biotype only contains
-                # allowed si target types.
-                # We are not interested in non annotable targets.
-                target_biotypes = antisense_biotypes & ANNOTABLE_SI_TARGETS
-                if not target_biotypes:
-                    continue
-                # Annotation priority:
-                # TE > protein_coding > pseudogene > satellites > simple_repeats
-                te_target_biotypes = TE & target_biotypes
-                if te_target_biotypes:
-                    # Write fastq
-                    ##############
-                    write_queue.put((f"te_si_{signature}RNA", fastq))
-                    # Determine te_id
-                    ##################
-                    te_ids = {
-                        annot[4] for annot in antisense_annots if annot[0] in te_target_biotypes}
-                    if len(te_ids) == 1:
-                        (te_id,) = te_ids
-                        counters[f"te_si_{signature}"][te_id] += 1
-                    else:
-                        write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(te_ids))))
-                    # Determine exact type
-                    #######################
-                    if TE <= te_target_biotypes:
-                        # Both are present, we can't decide between the two
-                        # Report as ambiguous
-                        write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes))))
-                    else:
-                        # Either DNA_transposons_rmsk or RNA_transposons_rmsk
-                        (biotype,) = te_target_biotypes
-                        counters[f"te_si_{signature}"][biotype] += 1
-                    continue
-                # At this stage, we know there is no TE annotations
-                # Next in priority are protein_coding annotations
-                prot_target_biotypes = PROTEIN_CODING & target_biotypes
-                if prot_target_biotypes:
-                    # Write fastq
-                    ##############
-                    write_queue.put((f"prot_si_{signature}RNA", fastq))
-                    # Determine gene_id
-                    ####################
-                    gene_ids = {
-                        annot[4] for annot in antisense_annots if annot[0] in prot_target_biotypes}
-                    if len(gene_ids) == 1:
-                        (gene_id,) = gene_ids
-                        counters[f"prot_si_{signature}"][gene_id] += 1
-                    else:
-                        write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(gene_ids))))
-                    # Determine exact type
-                    #######################
-                    if CDS_UTR <= prot_target_biotypes:
-                        # Both are present, we can't decide between the two
-                        # Report as ambiguous
-                        write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes))))
-                    elif ("protein_coding_pure_intron" in target_biotypes
-                        and (CDS_UTR & prot_target_biotypes)):
-                        counters[f"prot_si_{signature}"]["protein_coding_exon_junction"] += 1
-                    else:
-                        # At this point we should have only one element in target_biotype
-                        (biotype,) = prot_target_biotypes
-                        counters[f"prot_si_{signature}"][biotype] += 1
-                    continue
-                # At this stage, we know there is neither TE annotations
-                # nor protein_coding annotations
-                # Next in priority are pseudogene annotations
-                pseu_target_biotypes = {"pseudogene"} & target_biotypes
-                if pseu_target_biotypes:
-                    # Write fastq
-                    ##############
-                    write_queue.put((f"pseu_si_{signature}RNA", fastq))
-                    # Determine te_id
-                    ##################
-                    pseu_ids = {
-                        annot[4] for annot in antisense_annots if annot[0] == "pseudogene"}
-                    if len(pseu_ids) == 1:
-                        (pseu_id,) = pseu_ids
-                        counters[f"pseu_si_{signature}"][pseu_id] += 1
-                    else:
-                        write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(pseu_ids))))
-                    counters[f"pseu_si_{signature}"]["pseudogene"] += 1
-                    continue
-                # At this stage, we know there is neither TE annotations
-                # nor protein_coding annotations, nor pseudogene annotations
-                # Next in priority are satellite annotations
-                satel_target_biotypes = {"satellites_rmsk"} & target_biotypes
-                if satel_target_biotypes:
-                    # Write fastq
-                    ##############
-                    write_queue.put((f"satel_si_{signature}RNA", fastq))
-                    # Determine te_id
-                    ##################
-                    satel_ids = {
-                        annot[4] for annot in antisense_annots if annot[0] == "satellites_rmsk"}
-                    if len(satel_ids) == 1:
-                        (satel_id,) = satel_ids
-                        counters[f"satel_si_{signature}"][satel_id] += 1
-                    else:
-                        write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(satel_ids))))
-                    counters[f"satel_si_{signature}"]["satellites_rmsk"] += 1
-                    continue
-                # At this stage, we know there is neither TE annotations
-                # nor protein_coding annotations, nor pseudogene annotations
-                # nor satellites annotations
-                # Next in priority are simple_repeats annotations
-                simrep_target_biotypes = {"simple_repeats_rmsk"} & target_biotypes
-                if simrep_target_biotypes:
-                    # Write fastq
-                    ##############
-                    write_queue.put((f"simrep_si_{signature}RNA", fastq))
-                    # Determine te_id
-                    ##################
-                    simrep_ids = {
-                        annot[4] for annot in antisense_annots if annot[0] == "simple_repeats_rmsk"}
-                    if len(simrep_ids) == 1:
-                        (simrep_id,) = simrep_ids
-                        counters[f"simrep_si_{signature}"][simrep_id] += 1
-                    else:
-                        write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(simrep_ids))))
-                    counters[f"simrep_si_{signature}"]["simple_repeats_rmsk"] += 1
-                    continue
-    return tuple(counters[small_type] for small_type in [
-        "pi", "mi", "all_si", f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES])
-
-
-def count_pimis(annot_infos, write_queue):
-    """Return a dictionary counting for the gene names contained in the
-    iterable *annot_infos* and also, mixed in the same dictionary, counts for
-    annotation types.
-    *annot_infos* is generated by process_annotations."""
-    cdef str name, seq, qual
-    cdef bytes bname, bseq, bqual
-    pi_counter = defaultdict(int)
-    mi_counter = defaultdict(int)
-    for annot_info in annot_infos:
-        if annot_info[0][0] not in PIMI:
-            # return {}
-            continue
-        # (annotations, (name, seq, qual, read_len, strand)) = annot_info[1]
-        (annotations, (name, seq, qual, _, _)) = annot_info[1]
-        bname = name.encode("UTF-8")
-        bseq = seq.encode("UTF-8")
-        bqual = qual.encode("UTF-8")
-        # (annotations, _) = annot_info[1]
-        # The miRNA or piRNA annotations have previously been filtered so as to
-        # keep only those sense to the read.
-        if annot_info[0][0] == "miRNA":
-            try:
-                (annot,) = annotations
-            except ValueError:
-                print("Unexpected multiple miRNA annotation: %s" % ", ".join(
-                    map(str, annotations)), file=stderr)
-                annot = find_matching_annot(annotations, seq)
-                # exit(1)
-            write_queue.put(("miRNA", FQ_TEMPLATE % (bname, bseq, bqual)))
-            mi_counter[annot[4]] += 1
-            mi_counter["miRNA"] += 1
-            # return {annot[4]: 1, "miRNA": 1}
-        else:
-            # TODO: remove this after some time
-            assert annot_info[0][0] == "piRNA"
-            # Should be piRNA
-            try:
-                (annot,) = annotations
-            except ValueError:
-                # print("Unexpected multiple piRNA annotation: %s" % ", ".join(
-                #     map(str, annotations)), stderr)
-                # TODO: implement this correctly
-                annot = find_matching_annot(annotations, seq)
-            write_queue.put(("piRNA", FQ_TEMPLATE % (bname, bseq, bqual)))
-            pi_counter[annot[4]] += 1
-            pi_counter["piRNA"] += 1
-            # return {annot[4]: 1, "piRNA": 1}
-    return (pi_counter, mi_counter)
-
-
-def count_sis(annot_infos, write_queue):
-    """
-    Returns mixed dictionnaries to count gene identifiers and annotation types.
-    This only works if the two sets of possibilities are distinct...
-
-    *annot_infos* comes from *process_annotations* defined in
-    *make_annotation_processor*. It consists in pairs ((annot_name, signature), annot_info).
-    *annot_name* is a string obtained by joining a mix of biotype information
-    (miRNA, piRNA, protein_coding, RNA_transposons_rmsk, ...) and read signature
-    information (nothing, 21U, 22G, 26G, csr1_22G, no_signature).
-    *signature* can be 21-26U, 22G, 26G, csr1_22G, no_signature or NA.
-    *annot_info* is a pair (annotations, (name, seq, qual, read_len, strand)).
-    *annotations* is a set of tuples (biotype, strand, start, end, gene_id).
-    """
-    cdef str annot_name, signature, name, seq, qual, strand
-    cdef bytes bname, bseq, bqual
-    prot_si_counter = defaultdict(int)
-    te_si_counter = defaultdict(int)
-    pseu_si_counter = defaultdict(int)
-    satel_si_counter = defaultdict(int)
-    simrep_si_counter = defaultdict(int)
-    for ((annot_name, signature), annot_info) in annot_infos:
-        # Eliminate most common non-siRNA target cases based on annotation name.
-        # Also eliminate the reads with no signature
-        # (the annot_name should be "no_signature")
-        # if (annot_name in COMMON_NO_TARGET
-        #     or annot_name[:3] == "no_"):
-        #     continue
-        # Actually, we do accept reads that do not start with G
-        # Provided they are not plain "no_signature"
-        # (that is: without other annotations) 
-        if annot_name in COMMON_NO_TARGET:
-            continue
-        # If we only want reads starting with G, instead:
-        # if signature not in ENDOSI_SIG:
-        #     continue
-        # annotations: set of tuples
-        # (attrs["gene_biotype"], gtf.strand, gtf.start, gtf.end, attrs["gene_id"])
-        annotations, (name, seq, qual, _, strand) = annot_info
-        bname = name.encode("UTF-8")
-        bseq = seq.encode("UTF-8")
-        bqual = qual.encode("UTF-8")
-
-        # From (annotations, (name, seq, qual, read_len, strand) pairs, filter out
-        # from annotations those that are same_strand with respect to the read
-        #
-        # Reason 1: for protein_coding or TE annotations, we are only interested in
-        # RDRP-generated small RNAs, which target an RNA
-        #
-        # Reason 2: for the other annotations, we consider that same_strand is
-        # likely some degradation product, and that the library preparation is good
-        # enough to make such cases negligible
-
-        antisense_annots = [annot for annot in annotations if annot[1] != strand]
-
-        # Eliminate reads that are antisense to some annotation other than
-        # protein_coding, TE, pseudogene, satellites or simple repeats
-        # The reason is that the real target of the small RNA can be the other,
-        # and not the protein_coding
-        antisense_biotypes = {annot[0] for annot in antisense_annots}
-        # Note: before 16/06/2017, satellites and simple_repeats were
-        # in ALLOWED but not in ANNOTABLE
-        # We added them to ALLOWED following a seminar by Susan Gasser
-        if antisense_biotypes - ALLOWED_SI_TARGETS:
-            # Report as ambiguous if any annot[0] corresponds to categories that we want to annotate
-            if antisense_biotypes & ANNOTABLE_SI_TARGETS:
-                # We have a mix of "not ALLOWED" and "ANNOTABLE"
-                write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(antisense_biotypes))))
-            continue
-        # At this point, we know that antisense_biotype only contains
-        # allowed si target types.
-        # We are not interested in non annotable targets.
-        target_biotypes = antisense_biotypes & ANNOTABLE_SI_TARGETS
-        if not target_biotypes:
-            continue
-        # Annotation priority:
-        # TE > protein_coding > pseudogene > satellites > simple_repeats
-        te_target_biotypes = TE & target_biotypes
-        if te_target_biotypes:
-            # Write fastq
-            ##############
-            fastq = FQ_TEMPLATE % (bname, bseq, bqual)
-            write_queue.put(("te_siRNA", fastq))
-            # Determine te_id
-            ##################
-            te_ids = {
-                annot[4] for annot in antisense_annots if annot[0] in te_target_biotypes}
-            if len(te_ids) == 1:
-                (te_id,) = te_ids
-                te_si_counter[te_id] += 1
-            else:
-                write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(te_ids))))
-            # Determine exact type
-            #######################
-            if TE <= te_target_biotypes:
-                # Both are present, we can't decide between the two
-                # Report as ambiguous
-                write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes))))
-            else:
-                # Either DNA_transposons_rmsk or RNA_transposons_rmsk
-                (biotype,) = te_target_biotypes
-                te_si_counter[biotype] += 1
-            continue
-        # At this stage, we know there is no TE annotations
-        # Next in priority are protein_coding annotations
-        prot_target_biotypes = PROTEIN_CODING & target_biotypes
-        if prot_target_biotypes:
-            # Write fastq
-            ##############
-            fastq = FQ_TEMPLATE % (bname, bseq, bqual)
-            write_queue.put(("prot_siRNA", fastq))
-            # Determine gene_id
-            ####################
-            gene_ids = {
-                annot[4] for annot in antisense_annots if annot[0] in prot_target_biotypes}
-            if len(gene_ids) == 1:
-                (gene_id,) = gene_ids
-                prot_si_counter[gene_id] += 1
-            else:
-                write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(gene_ids))))
-            # Determine exact type
-            #######################
-            if CDS_UTR <= prot_target_biotypes:
-                # Both are present, we can't decide between the two
-                # Report as ambiguous
-                write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes))))
-            elif ("protein_coding_pure_intron" in target_biotypes
-                  and (CDS_UTR & prot_target_biotypes)):
-                prot_si_counter["protein_coding_exon_junction"] += 1
-            else:
-                # At this point we should have only one element in target_biotype
-                (biotype,) = prot_target_biotypes
-                prot_si_counter[biotype] += 1
-            continue
-        # At this stage, we know there is neither TE annotations
-        # nor protein_coding annotations
-        # Next in priority are pseudogene annotations
-        pseu_target_biotypes = {"pseudogene"} & target_biotypes
-        if pseu_target_biotypes:
-            # Write fastq
-            ##############
-            fastq = FQ_TEMPLATE % (bname, bseq, bqual)
-            write_queue.put(("pseu_siRNA", fastq))
-            # Determine te_id
-            ##################
-            pseu_ids = {
-                annot[4] for annot in antisense_annots if annot[0] == "pseudogene"}
-            if len(pseu_ids) == 1:
-                (pseu_id,) = pseu_ids
-                pseu_si_counter[pseu_id] += 1
-            else:
-                write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(pseu_ids))))
-            pseu_si_counter["pseudogene"] += 1
-            continue
-        # At this stage, we know there is neither TE annotations
-        # nor protein_coding annotations, nor pseudogene annotations
-        # Next in priority are satellite annotations
-        satel_target_biotypes = {"satellites_rmsk"} & target_biotypes
-        if satel_target_biotypes:
-            # Write fastq
-            ##############
-            fastq = FQ_TEMPLATE % (bname, bseq, bqual)
-            write_queue.put(("satel_siRNA", fastq))
-            # Determine te_id
-            ##################
-            satel_ids = {
-                annot[4] for annot in antisense_annots if annot[0] == "satellites_rmsk"}
-            if len(satel_ids) == 1:
-                (satel_id,) = satel_ids
-                satel_si_counter[satel_id] += 1
-            else:
-                write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(satel_ids))))
-            satel_si_counter["satellites_rmsk"] += 1
-            continue
-        # At this stage, we know there is neither TE annotations
-        # nor protein_coding annotations, nor pseudogene annotations
-        # nor satellites annotations
-        # Next in priority are simple_repeats annotations
-        simrep_target_biotypes = {"simple_repeats_rmsk"} & target_biotypes
-        if simrep_target_biotypes:
-            # Write fastq
-            ##############
-            fastq = FQ_TEMPLATE % (bname, bseq, bqual)
-            write_queue.put(("simrep_siRNA", fastq))
-            # Determine te_id
-            ##################
-            simrep_ids = {
-                annot[4] for annot in antisense_annots if annot[0] == "simple_repeats_rmsk"}
-            if len(simrep_ids) == 1:
-                (simrep_id,) = simrep_ids
-                simrep_si_counter[simrep_id] += 1
-            else:
-                write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(simrep_ids))))
-            simrep_si_counter["simple_repeats_rmsk"] += 1
-            continue
-    return (prot_si_counter, te_si_counter, pseu_si_counter, satel_si_counter, simrep_si_counter)
-
-
-def count_all_sis(annot_infos, write_queue):
-    """
-    Returns mixed dictionnaries to count gene identifiers and annotation types.
-    This only works if the two sets of possibilities are distinct...
-
-    *annot_infos* comes from *process_annotations* defined in
-    *make_annotation_processor*. It consists in pairs ((annot_name, signature), annot_info).
-    *annot_name* is a string obtained by joining a mix of biotype information
-    (miRNA, piRNA, protein_coding, RNA_transposons_rmsk, ...) and read signature
-    information (nothing, 21U, 22G, 26G, csr1_22G, no_signature).
-    *signature* can be 21-26U, 22G, 26G, csr1_22G, no_signature or NA.
-    *annot_info* is a pair (annotations, (name, seq, qual, read_len, strand)).
-    *annotations* is a set of tuples (biotype, strand, start, end, gene_id).
-    """
-    cdef str annot_name, signature, name, seq, qual, strand
-    cdef bytes bname, bseq, bqual
-    all_si_counter = defaultdict(int)
-    for ((annot_name, signature), annot_info) in annot_infos:
-        # Eliminate most common non-siRNA target cases based on annotation name.
-        if annot_name in COMMON_NO_ENDOSI:
-            continue
-        # annotations: set of tuples
-        # (attrs["gene_biotype"], gtf.strand, gtf.start, gtf.end, attrs["gene_id"])
-        annotations, (name, seq, qual, _, strand) = annot_info
-        if chas_allsi_signature(seq):
-            # 22G(T*)-26G(T*)
-            bname = name.encode("UTF-8")
-            bseq = seq.encode("UTF-8")
-            bqual = qual.encode("UTF-8")
-            fastq = FQ_TEMPLATE % (bname, bseq, bqual)
-            write_queue.put(("all_siRNA", fastq))
-            # Determine gene_id
-            ####################
-            gene_ids = {annot[4] for annot in annotations}
-            if len(gene_ids) == 0:
-                all_si_counter["unknown_id"] += 1
-            elif len(gene_ids) == 1:
-                (gene_id,) = gene_ids
-                all_si_counter[gene_id] += 1
-            else:
-                all_si_counter["_and_".join(sorted(gene_ids))] += 1
-            all_si_counter["all_siRNA"] += 1
-    return all_si_counter
-
-
-# TODO: adapt to counters of generic small_types
-def add_results(results1, results2):
-    """Adds dictionaries inside tuples."""
-    (
-        annot_stats1,
-        small_type_counters1,
-        nucl_counts1) = results1
-    (
-        annot_stats2,
-        small_type_counters2,
-        nucl_counts2) = results2
-    return (merge_with(sum, (annot_stats1, annot_stats1)),
-            tuple(
-                merge_with(
-                    sum,
-                    (counts1, counts2)) for (counts1, counts2) in zip(
-                        small_type_counters1, small_type_counters2)),
-                # merge_with(sum, (pi_counts1, pi_counts2)),
-                # merge_with(sum, (mi_counts1, mi_counts2)),
-                # merge_with(sum, (prot_si_counts1, prot_si_counts2)),
-                # merge_with(sum, (te_si_counts1, te_si_counts2)),
-                # merge_with(sum, (pseu_si_counts1, pseu_si_counts2)),
-                # merge_with(sum, (satel_si_counts1, satel_si_counts2)),
-                # merge_with(sum, (simrep_si_counts1, simrep_si_counts2)),
-                # merge_with(sum, (all_si_counts1, all_si_counts2))),
-            add_compositions(nucl_counts1, nucl_counts2))
-
-
-def main():
-    """Example use case."""
-    print("No examples to show here.")
-    return 0
-
-
-if __name__ == "__main__":
-    import sys
-    sys.exit(main())
diff --git a/libsmallrna/setup.py b/libsmallrna/setup.py
deleted file mode 100644
index 5ef4fb5..0000000
--- a/libsmallrna/setup.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from setuptools import setup, find_packages
-from Cython.Build import cythonize
-
-setup(
-    name="libsmallrna",
-    version="0.2",
-    description="Miscellaneous things to deal with small RNA",
-    author="Blaise Li",
-    author_email="blaise.li@normalesup.org",
-    license="MIT",
-    # packages=["libsmallrna"],
-    ext_modules = cythonize("libsmallrna/libsmallrna.pyx"),
-    install_requires=["cytoolz"],
-    zip_safe=False,
-    packages=find_packages())
-- 
GitLab