From 424f87f93a738ad257afb42c02d7bcdd5ffc849c Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Mon, 13 May 2019 12:36:53 +0200 Subject: [PATCH] Made libsmallrna a submodule. --- .gitmodules | 3 + libsmallrna | 1 + libsmallrna/.gitignore | 11 - libsmallrna/install.sh | 5 - libsmallrna/libsmallrna/__init__.py | 16 - libsmallrna/libsmallrna/libsmallrna.pyx | 903 ------------------------ libsmallrna/setup.py | 15 - 7 files changed, 4 insertions(+), 950 deletions(-) create mode 160000 libsmallrna delete mode 100644 libsmallrna/.gitignore delete mode 100755 libsmallrna/install.sh delete mode 100644 libsmallrna/libsmallrna/__init__.py delete mode 100755 libsmallrna/libsmallrna/libsmallrna.pyx delete mode 100644 libsmallrna/setup.py diff --git a/.gitmodules b/.gitmodules index 6b2a212..d4b3d75 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,3 +13,6 @@ [submodule "libdeseq"] path = libdeseq url = git@gitlab.pasteur.fr:bli/libdeseq.git +[submodule "libsmallrna"] + path = libsmallrna + url = git@gitlab.pasteur.fr:bli/libsmallrna.git diff --git a/libsmallrna b/libsmallrna new file mode 160000 index 0000000..af0b527 --- /dev/null +++ b/libsmallrna @@ -0,0 +1 @@ +Subproject commit af0b5271e3f1ec2f4bcd4c43e108d3d442e2897b diff --git a/libsmallrna/.gitignore b/libsmallrna/.gitignore deleted file mode 100644 index b4e1667..0000000 --- a/libsmallrna/.gitignore +++ /dev/null @@ -1,11 +0,0 @@ -# Compiled python modules. -*.pyc - -# Setuptools distribution folder. -/dist/ - -# Python egg metadata, regenerated from source files by setuptools. -/*.egg-info - -# Backups -*~ diff --git a/libsmallrna/install.sh b/libsmallrna/install.sh deleted file mode 100755 index 5ddc41a..0000000 --- a/libsmallrna/install.sh +++ /dev/null @@ -1,5 +0,0 @@ -#!/bin/sh -python3.6 setup.py build_ext -# .egg-link does not work with PYTHONPATH ? -python3.6 -m pip install -e . -python3.6 -m pip install --no-deps --ignore-installed . diff --git a/libsmallrna/libsmallrna/__init__.py b/libsmallrna/libsmallrna/__init__.py deleted file mode 100644 index e98d0e1..0000000 --- a/libsmallrna/libsmallrna/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -from .libsmallrna import ( - has_pi_signature, has_endosi_signature, has_26G_signature, has_csr1_endosi_signature, - count_small, - count_annots, count_pimis, count_sis, count_all_sis, count_nucl, count_first_bases, - add_compositions, add_results, Composition, get_read_info, - PI_MIN, PI_MAX, SI_MIN, SI_MAX, - SI_SUFFIXES, SI_PREFIXES, - SMALL_TYPES_TREE, - type2RNA, - types_under, - SMALL_TYPES, - SI_TYPES, - SIU_TYPES, - SISIU_TYPES, - RMSK_SISIU_TYPES, - JOINED_SMALL_TYPES) diff --git a/libsmallrna/libsmallrna/libsmallrna.pyx b/libsmallrna/libsmallrna/libsmallrna.pyx deleted file mode 100755 index 1019087..0000000 --- a/libsmallrna/libsmallrna/libsmallrna.pyx +++ /dev/null @@ -1,903 +0,0 @@ -#!/usr/bin/env python3 -# vim: set fileencoding=<utf-8> : -# cython: language_level=3 -""" -This module provides things used in small RNA annotation. - -TODO: New criteria for small RNA identification: -- piRNA -- miRNA -- all_si (no signature criteria): - - 22G antisense (actually 21-23, including on such things as rRNA): - - te_22G - - prot_22G - - pseu_22G - - satel_22G - - simprep_22G - - 26G antisense (including on such things as rRNA): - - te_26G - - prot_26G - - pseu_26G - - satel_26G - - simprep_26G - -Note: We don't consider 22G(T+) or 26G(T+). -The reliable ones should come from unmapped poly-T-trimmed remapped (all_siu, etc.) - -TODO: Update small_RNAs.pdf -""" - -# http://stackoverflow.com/a/23575771/1878788 -# Also works by setting language_level as comment -# at the top of the file. -#from __future__ import print_function -import warnings -from sys import stderr -from collections import defaultdict -from cytoolz import merge_with, frequencies -from itertools import chain -from operator import add -from functools import reduce -#from pysam import AlignedSegment - -############################################# -# Declaring small types and their hierarchy # -############################################# -cdef unsigned int CPI_MIN = 21 -cdef unsigned int CPI_MAX = 26 -cdef unsigned int CSI_MIN = 22 -cdef unsigned int CSI_MAX = 26 -PI_MIN = CPI_MIN -PI_MAX = CPI_MAX -SI_MIN = CSI_MIN -SI_MAX = CSI_MAX - - -PIMI = {"piRNA", "miRNA"} -CDS_UTR = {"protein_coding_CDS", "protein_coding_UTR"} -PROTEIN_CODING = CDS_UTR | {"protein_coding_pure_intron"} -TE = {"DNA_transposons_rmsk", "RNA_transposons_rmsk"} -ANNOTABLE_SI_TARGETS = PROTEIN_CODING | TE | {"pseudogene"} | {"satellites_rmsk", "simple_repeats_rmsk"} -ALLOWED_SI_TARGETS = ANNOTABLE_SI_TARGETS - -# annotation names that seem to be frequent and that are not protein-coding or TE -# "22G" alone occurs when the read was not associated with relevant annotations -COMMON_NO_TARGET = {"piRNA", "miRNA", "no_signature", "22G"} -COMMON_NO_ENDOSI = {"piRNA", "miRNA", "no_signature"} -# Possible read signatures for endo-siRNAs -# ENDOSI_SIG = {f"{SI_MIN}G", f"{SI_MAX}G", f"csr1_{SI_MIN}G"} -# Stricter version, does not allow poly-U tail (should have been trimmed) -ENDOSI_SIG = {f"{SI_MIN}G", f"{SI_MAX}G"} - -RMSK_PREFIXES = ["te", "satel", "simrep"] -PROT_PREFIXES = ["prot"] -PSEU_PREFIXES = ["pseu"] -SI_PREFIXES = [*PROT_PREFIXES, *PSEU_PREFIXES, *RMSK_PREFIXES] -SI_SUFFIXES = ENDOSI_SIG -# Should we have only non-overlapping sets ? -SMALL_TYPES_TREE = { - "si": {suffix: { - #"joined" : ["_".join(["si", suffix])], - "rmsk": ["_".join([prefix, "si", suffix]) for prefix in RMSK_PREFIXES], - "prot": ["_".join([prefix, "si", suffix]) for prefix in PROT_PREFIXES], - "pseu": ["_".join([prefix, "si", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES}, - "siu": {suffix: { - #"joined" : ["_".join(["siu", suffix])], - "rmsk": ["_".join([prefix, "siu", suffix]) for prefix in RMSK_PREFIXES], - "prot": ["_".join([prefix, "siu", suffix]) for prefix in PROT_PREFIXES], - "pseu": ["_".join([prefix, "siu", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES}, - "pi": ["pi"], - "mi": ["mi"] -} - -SISIU_TREE_EXTENSION = { - "sisiu": {suffix: { - #"joined" : ["_".join(["sisiu", suffix])], - "rmsk": ["_".join([prefix, "sisiu", suffix]) for prefix in RMSK_PREFIXES], - "prot": ["_".join([prefix, "sisiu", suffix]) for prefix in PROT_PREFIXES], - "pseu": ["_".join([prefix, "sisiu", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES}} - - -def types_under(typekeys, tree=SMALL_TYPES_TREE): - """ - Generate type names corresponding to the branching according to the - succession of keys *typekeys* in the *tree* dictionary of (ultimately) - lists. - """ - if typekeys: - yield from types_under(typekeys[1:], tree[typekeys[0]]) - else: - if isinstance(tree, dict): - for subtree in tree.values(): - yield from types_under([], subtree) - else: - yield from tree - - -SMALL_TYPES = list(types_under([])) -SI_TYPES = list(types_under(["si"])) -SIU_TYPES = list(types_under(["siu"])) -SISIU_TYPES = list(types_under([], tree=SISIU_TREE_EXTENSION)) -RMSK_SISIU_TYPES = set(chain( - types_under(["si", f"{SI_MIN}G", "rmsk"]), - types_under(["si", f"{SI_MAX}G", "rmsk"]), - types_under(["siu", f"{SI_MIN}G", "rmsk"]), - types_under(["siu", f"{SI_MAX}G", "rmsk"]))) -JOINED_SMALL_TYPES = [ - *[f"si_{suffix}" for suffix in SI_SUFFIXES], - *[f"siu_{suffix}" for suffix in SI_SUFFIXES], - f"pimi{SI_MIN}G"] - - -def type2RNA(small_types): - """Append "RNA" to the string or list of strings *small_types*.""" - if isinstance(small_types, (str, bytes)): - return f"{small_types}RNA" - else: - return list(map(type2RNA, small_types)) - - -FQ_TEMPLATE = b"@%s\n%s\n+\n%s\n" - -DNA_COMPL_TABLE = str.maketrans("ACGTRYSWKMBDHVN", "TGCAYRWSMKVHDBN") - -cdef str creverse_complement(str seq): - return seq.translate(DNA_COMPL_TABLE)[::-1] - - -cdef int BAM_FREVERSE = 16 - - -cdef object cget_read_info(object ali): - """Extracts information from AlignedSegment *ali*.""" - if (ali.flag & BAM_FREVERSE) != 0: - return ( - ali.query_name, - creverse_complement(ali.query_sequence), - # ali.query_qualities[::-1], - ali.qual[::-1], - ali.query_length, - ali.reference_name, - # five prime pos: - ali.reference_end - 1, - ali.reference_start, - ali.reference_end, - "-") - else: - return ( - ali.query_name, - ali.query_sequence, - # ali.query_qualities, - ali.qual, - ali.query_length, - ali.reference_name, - # five prime pos: - ali.reference_start, - ali.reference_start, - ali.reference_end, - "+") - - -def get_read_info(ali): - """Extracts information from AlignedSegment *ali*.""" - return cget_read_info(ali) - -# def get_read_info(ali): -# """Extracts information from AlignedSegment *ali*.""" -# if ali.is_reverse: -# return ( -# ali.query_name, -# reverse_complement(ali.query_sequence), -# # ali.query_qualities[::-1], -# ali.qual[::-1], -# ali.query_length, -# ali.reference_start, -# ali.reference_end, -# "-") -# else: -# return ( -# ali.query_name, -# ali.query_sequence, -# # ali.query_qualities, -# ali.qual, -# ali.query_length, -# ali.reference_start, -# ali.reference_end, -# "+") - -cdef bint chas_pi_signature(str seq, size_t read_len): - return read_len >= PI_MIN and seq[0] == "T" and read_len <= PI_MAX - - -def has_pi_signature(str seq, int read_len): - f"""Return True if this is a {PI_MIN}U - (can actually have length up to {PI_MAX}).""" - return chas_pi_signature(seq, read_len) - - -cdef bint chas_endosi_signature(str seq, size_t read_len): - return (SI_MIN - 1) <= read_len <= (SI_MIN + 1) and seq[0] == "G" - - -def has_endosi_signature(str seq, int read_len): - f"""Return True if this is a {SI_MIN}G.""" - return chas_endosi_signature(seq, read_len) - - -cdef bint chas_26G_signature(str seq, size_t read_len): - return read_len == SI_MAX and seq[0] == "G" - - -def has_26G_signature(str seq, int read_len): - f"""Return True if this is a {SI_MAX}G.""" - return chas_26G_signature(seq, read_len) - - -# These should have problems mapping because the T-tail is not -# in the genome sequence -cdef bint chas_csr1_endosi_signature(str seq, int read_len): - # if read_len <= 22, t_tail will be the empty string - cdef str t_tail - t_tail = (read_len - SI_MIN) * "T" - if t_tail: - return (seq[0] == "G") and (seq[SI_MIN:] == t_tail) - else: - return False - - -def has_csr1_endosi_signature(str seq, int read_len): - return chas_csr1_endosi_signature(seq, read_len) - - -cdef bint chas_allsi_signature(str seq): - cdef unsigned int read_len = len(seq) - return (seq[0] == "G" and ((SI_MIN - 1) <= read_len <= SI_MAX)) or chas_csr1_endosi_signature(seq, read_len) - - -def count_annots(annot_infos, queue): - """Returns a dictionary counting the annotation names in the - iterable *annot_infos*. - *annot_infos* is generated by process_annotations. - For the moment, nothing is done with the queue.""" - return frequencies((annot_info[0][0] for annot_info in annot_infos)) - - -cdef class Composition: - cdef readonly int a, c, g, t - def __init__(self, int a=0, int c=0, int g=0, int t=0): - self.a = a - self.c = c - self.g = g - self.t = t - def __add__(self, other): - return Composition( - self.a + other.a, - self.c + other.c, - self.g + other.g, - self.t + other.t) - def __reduce__(self): - d = {} - d["a"] = self.a - d["c"] = self.c - d["g"] = self.g - d["t"] = self.t - return (Composition, (), d) - def __setstate__(self, d): - self.a = d["a"] - self.c = d["c"] - self.g = d["g"] - self.t = d["t"] - - -def count_nucl(int read_len, str nucl): - if nucl == "A": - return {read_len: Composition(a=1)} - elif nucl == "C": - return {read_len: Composition(c=1)} - elif nucl == "G": - return {read_len: Composition(g=1)} - elif nucl == "T": - return {read_len: Composition(t=1)} - else: - return {read_len: Composition()} - - -# sum doesn't work. Why? -def my_sum(l): - return reduce(add, l) - - -def add_compositions(comp1, comp2): - return merge_with(my_sum, comp1, comp2) - #return {read_len: comp1.get(read_len, Composition()) + comp2.get(read_len, Composition()) for read_len in chain(comp1.keys(), comp2.keys())} - - -def count_first_bases(annot_infos, queue): - """Returns a dictionay {read_len: Composition} based on - the reads retrieved from the iterable *annot_infos*. - *annot_infos* is generated by process_annotations. - For the moment, nothing is done with the queue.""" - compositions = defaultdict(Composition) - for annot_info in annot_infos: - (_, seq, _, read_len, _) = annot_info[1][1] - nucl = seq[0] - if nucl == "A": - compositions[read_len] += Composition(a=1) - elif nucl == "C": - compositions[read_len] += Composition(c=1) - elif nucl == "G": - compositions[read_len] += Composition(g=1) - elif nucl == "T": - compositions[read_len] += Composition(t=1) - else: - compositions[read_len] += Composition() - return compositions - - -def find_matching_annot(annotations, seq): - """Finds among annotations one to consider as the real one.""" - warnings.warn("randomly deciding between several piRNA or miRNA annotations") - return next(iter(annotations[0])) - - -def count_small(annot_infos, write_queue): - """Return dictionaries counting for the gene names contained in the - iterable *annot_infos* and also, mixed in the same dictionaries, counts for - annotation types. - - *annot_infos* comes from *process_annotations* defined in - *make_annotation_processor*. It consists in pairs ((annot_name, signature), annot_info). - *annot_name* is a string obtained by joining a mix of biotype information - (miRNA, piRNA, protein_coding, RNA_transposons_rmsk, ...) and read signature - information (nothing, 21U, 22G, 26G, csr1_22G, no_signature). - *signature* can be 21-26U, 22G, 26G, csr1_22G, no_signature or NA. - *annot_info* is a pair (annotations, (name, seq, qual, read_len, strand)). - *annotations* is a set of tuples (biotype, strand, start, end, gene_id). - """ - cdef str annot_name, signature, name, seq, qual, strand - cdef bytes bname, bseq, bqual - counters = {small_type: defaultdict(int) for small_type in [ - "pi", "mi", "all_si", f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]} - for ((annot_name, signature), annot_info) in annot_infos: - annotations, (name, seq, qual, _, strand) = annot_info - bname = name.encode("UTF-8") - bseq = seq.encode("UTF-8") - bqual = qual.encode("UTF-8") - fastq = FQ_TEMPLATE % (bname, bseq, bqual) - if annot_name == "miRNA": - try: - (annot,) = annotations - except ValueError: - print("Unexpected multiple miRNA annotation: %s" % ", ".join( - map(str, annotations)), file=stderr) - annot = find_matching_annot(annotations, seq) - # exit(1) - write_queue.put(("miRNA", fastq)) - counters["mi"][annot[4]] += 1 - counters["mi"]["miRNA"] += 1 - elif annot_name == "piRNA": - try: - (annot,) = annotations - except ValueError: - # print("Unexpected multiple piRNA annotation: %s" % ", ".join( - # map(str, annotations)), stderr) - # TODO: implement this correctly - annot = find_matching_annot(annotations, seq) - write_queue.put(("piRNA", FQ_TEMPLATE % (bname, bseq, bqual))) - counters["pi"][annot[4]] += 1 - counters["pi"]["piRNA"] += 1 - else: - # Consider a "general" all_si - write_queue.put(("all_siRNA", fastq)) - # Determine gene_id - #################### - gene_ids = {annot[4] for annot in annotations} - if len(gene_ids) == 0: - counters["all_si"]["unknown_id"] += 1 - elif len(gene_ids) == 1: - (gene_id,) = gene_ids - counters["all_si"][gene_id] += 1 - else: - counters["all_si"]["_and_".join(sorted(gene_ids))] += 1 - counters["all_si"]["all_siRNA"] += 1 - if signature in ENDOSI_SIG: - # TODO: make an intermediate category: all_si_{SI_MIN}G and all_si_{SI_MAX}G - # considering signature, but not annotation - write_queue.put((f"all_si_{signature}RNA", fastq)) - if len(gene_ids) == 0: - counters[f"all_si_{signature}"]["unknown_id"] += 1 - elif len(gene_ids) == 1: - (gene_id,) = gene_ids - counters[f"all_si_{signature}"][gene_id] += 1 - else: - counters[f"all_si_{signature}"]["_and_".join(sorted(gene_ids))] += 1 - counters[f"all_si_{signature}"][f"all_si_{signature}RNA"] += 1 - # From (annotations, (name, seq, qual, read_len, strand) pairs, - # filter out from annotations those that are same_strand with - # respect to the read - # - # Reason 1: for protein_coding or TE annotations, we are only - # interested in RDRP-generated small RNAs, which target an RNA - # - # Reason 2: for the other annotations, we consider that - # same_strand is likely some degradation product, and that the - # library preparation is good enough to make such cases - # negligible - - antisense_annots = [ - annot for annot in annotations if annot[1] != strand] - - # Eliminate reads that are antisense to some annotation other - # than protein_coding, TE, pseudogene, satellites or simple - # repeats - # The reason is that the real target of the small RNA can be - # the other, and not the protein_coding - antisense_biotypes = {annot[0] for annot in antisense_annots} - # Note: before 16/06/2017, satellites and simple_repeats were - # in ALLOWED but not in ANNOTABLE - # We added them to ALLOWED following a seminar by Susan Gasser - if antisense_biotypes - ALLOWED_SI_TARGETS: - # Report as ambiguous if any annot[0] corresponds to - # categories that we want to annotate - if antisense_biotypes & ANNOTABLE_SI_TARGETS: - # We have a mix of "not ALLOWED" and "ANNOTABLE" - write_queue.put(( - "ambiguous_type", - "%s\n" % "_and_".join(sorted(antisense_biotypes)))) - continue - # At this point, we know that antisense_biotype only contains - # allowed si target types. - # We are not interested in non annotable targets. - target_biotypes = antisense_biotypes & ANNOTABLE_SI_TARGETS - if not target_biotypes: - continue - # Annotation priority: - # TE > protein_coding > pseudogene > satellites > simple_repeats - te_target_biotypes = TE & target_biotypes - if te_target_biotypes: - # Write fastq - ############## - write_queue.put((f"te_si_{signature}RNA", fastq)) - # Determine te_id - ################## - te_ids = { - annot[4] for annot in antisense_annots if annot[0] in te_target_biotypes} - if len(te_ids) == 1: - (te_id,) = te_ids - counters[f"te_si_{signature}"][te_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(te_ids)))) - # Determine exact type - ####################### - if TE <= te_target_biotypes: - # Both are present, we can't decide between the two - # Report as ambiguous - write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes)))) - else: - # Either DNA_transposons_rmsk or RNA_transposons_rmsk - (biotype,) = te_target_biotypes - counters[f"te_si_{signature}"][biotype] += 1 - continue - # At this stage, we know there is no TE annotations - # Next in priority are protein_coding annotations - prot_target_biotypes = PROTEIN_CODING & target_biotypes - if prot_target_biotypes: - # Write fastq - ############## - write_queue.put((f"prot_si_{signature}RNA", fastq)) - # Determine gene_id - #################### - gene_ids = { - annot[4] for annot in antisense_annots if annot[0] in prot_target_biotypes} - if len(gene_ids) == 1: - (gene_id,) = gene_ids - counters[f"prot_si_{signature}"][gene_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(gene_ids)))) - # Determine exact type - ####################### - if CDS_UTR <= prot_target_biotypes: - # Both are present, we can't decide between the two - # Report as ambiguous - write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes)))) - elif ("protein_coding_pure_intron" in target_biotypes - and (CDS_UTR & prot_target_biotypes)): - counters[f"prot_si_{signature}"]["protein_coding_exon_junction"] += 1 - else: - # At this point we should have only one element in target_biotype - (biotype,) = prot_target_biotypes - counters[f"prot_si_{signature}"][biotype] += 1 - continue - # At this stage, we know there is neither TE annotations - # nor protein_coding annotations - # Next in priority are pseudogene annotations - pseu_target_biotypes = {"pseudogene"} & target_biotypes - if pseu_target_biotypes: - # Write fastq - ############## - write_queue.put((f"pseu_si_{signature}RNA", fastq)) - # Determine te_id - ################## - pseu_ids = { - annot[4] for annot in antisense_annots if annot[0] == "pseudogene"} - if len(pseu_ids) == 1: - (pseu_id,) = pseu_ids - counters[f"pseu_si_{signature}"][pseu_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(pseu_ids)))) - counters[f"pseu_si_{signature}"]["pseudogene"] += 1 - continue - # At this stage, we know there is neither TE annotations - # nor protein_coding annotations, nor pseudogene annotations - # Next in priority are satellite annotations - satel_target_biotypes = {"satellites_rmsk"} & target_biotypes - if satel_target_biotypes: - # Write fastq - ############## - write_queue.put((f"satel_si_{signature}RNA", fastq)) - # Determine te_id - ################## - satel_ids = { - annot[4] for annot in antisense_annots if annot[0] == "satellites_rmsk"} - if len(satel_ids) == 1: - (satel_id,) = satel_ids - counters[f"satel_si_{signature}"][satel_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(satel_ids)))) - counters[f"satel_si_{signature}"]["satellites_rmsk"] += 1 - continue - # At this stage, we know there is neither TE annotations - # nor protein_coding annotations, nor pseudogene annotations - # nor satellites annotations - # Next in priority are simple_repeats annotations - simrep_target_biotypes = {"simple_repeats_rmsk"} & target_biotypes - if simrep_target_biotypes: - # Write fastq - ############## - write_queue.put((f"simrep_si_{signature}RNA", fastq)) - # Determine te_id - ################## - simrep_ids = { - annot[4] for annot in antisense_annots if annot[0] == "simple_repeats_rmsk"} - if len(simrep_ids) == 1: - (simrep_id,) = simrep_ids - counters[f"simrep_si_{signature}"][simrep_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(simrep_ids)))) - counters[f"simrep_si_{signature}"]["simple_repeats_rmsk"] += 1 - continue - return tuple(counters[small_type] for small_type in [ - "pi", "mi", "all_si", f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]) - - -def count_pimis(annot_infos, write_queue): - """Return a dictionary counting for the gene names contained in the - iterable *annot_infos* and also, mixed in the same dictionary, counts for - annotation types. - *annot_infos* is generated by process_annotations.""" - cdef str name, seq, qual - cdef bytes bname, bseq, bqual - pi_counter = defaultdict(int) - mi_counter = defaultdict(int) - for annot_info in annot_infos: - if annot_info[0][0] not in PIMI: - # return {} - continue - # (annotations, (name, seq, qual, read_len, strand)) = annot_info[1] - (annotations, (name, seq, qual, _, _)) = annot_info[1] - bname = name.encode("UTF-8") - bseq = seq.encode("UTF-8") - bqual = qual.encode("UTF-8") - # (annotations, _) = annot_info[1] - # The miRNA or piRNA annotations have previously been filtered so as to - # keep only those sense to the read. - if annot_info[0][0] == "miRNA": - try: - (annot,) = annotations - except ValueError: - print("Unexpected multiple miRNA annotation: %s" % ", ".join( - map(str, annotations)), file=stderr) - annot = find_matching_annot(annotations, seq) - # exit(1) - write_queue.put(("miRNA", FQ_TEMPLATE % (bname, bseq, bqual))) - mi_counter[annot[4]] += 1 - mi_counter["miRNA"] += 1 - # return {annot[4]: 1, "miRNA": 1} - else: - # TODO: remove this after some time - assert annot_info[0][0] == "piRNA" - # Should be piRNA - try: - (annot,) = annotations - except ValueError: - # print("Unexpected multiple piRNA annotation: %s" % ", ".join( - # map(str, annotations)), stderr) - # TODO: implement this correctly - annot = find_matching_annot(annotations, seq) - write_queue.put(("piRNA", FQ_TEMPLATE % (bname, bseq, bqual))) - pi_counter[annot[4]] += 1 - pi_counter["piRNA"] += 1 - # return {annot[4]: 1, "piRNA": 1} - return (pi_counter, mi_counter) - - -def count_sis(annot_infos, write_queue): - """ - Returns mixed dictionnaries to count gene identifiers and annotation types. - This only works if the two sets of possibilities are distinct... - - *annot_infos* comes from *process_annotations* defined in - *make_annotation_processor*. It consists in pairs ((annot_name, signature), annot_info). - *annot_name* is a string obtained by joining a mix of biotype information - (miRNA, piRNA, protein_coding, RNA_transposons_rmsk, ...) and read signature - information (nothing, 21U, 22G, 26G, csr1_22G, no_signature). - *signature* can be 21-26U, 22G, 26G, csr1_22G, no_signature or NA. - *annot_info* is a pair (annotations, (name, seq, qual, read_len, strand)). - *annotations* is a set of tuples (biotype, strand, start, end, gene_id). - """ - cdef str annot_name, signature, name, seq, qual, strand - cdef bytes bname, bseq, bqual - prot_si_counter = defaultdict(int) - te_si_counter = defaultdict(int) - pseu_si_counter = defaultdict(int) - satel_si_counter = defaultdict(int) - simrep_si_counter = defaultdict(int) - for ((annot_name, signature), annot_info) in annot_infos: - # Eliminate most common non-siRNA target cases based on annotation name. - # Also eliminate the reads with no signature - # (the annot_name should be "no_signature") - # if (annot_name in COMMON_NO_TARGET - # or annot_name[:3] == "no_"): - # continue - # Actually, we do accept reads that do not start with G - # Provided they are not plain "no_signature" - # (that is: without other annotations) - if annot_name in COMMON_NO_TARGET: - continue - # If we only want reads starting with G, instead: - # if signature not in ENDOSI_SIG: - # continue - # annotations: set of tuples - # (attrs["gene_biotype"], gtf.strand, gtf.start, gtf.end, attrs["gene_id"]) - annotations, (name, seq, qual, _, strand) = annot_info - bname = name.encode("UTF-8") - bseq = seq.encode("UTF-8") - bqual = qual.encode("UTF-8") - - # From (annotations, (name, seq, qual, read_len, strand) pairs, filter out - # from annotations those that are same_strand with respect to the read - # - # Reason 1: for protein_coding or TE annotations, we are only interested in - # RDRP-generated small RNAs, which target an RNA - # - # Reason 2: for the other annotations, we consider that same_strand is - # likely some degradation product, and that the library preparation is good - # enough to make such cases negligible - - antisense_annots = [annot for annot in annotations if annot[1] != strand] - - # Eliminate reads that are antisense to some annotation other than - # protein_coding, TE, pseudogene, satellites or simple repeats - # The reason is that the real target of the small RNA can be the other, - # and not the protein_coding - antisense_biotypes = {annot[0] for annot in antisense_annots} - # Note: before 16/06/2017, satellites and simple_repeats were - # in ALLOWED but not in ANNOTABLE - # We added them to ALLOWED following a seminar by Susan Gasser - if antisense_biotypes - ALLOWED_SI_TARGETS: - # Report as ambiguous if any annot[0] corresponds to categories that we want to annotate - if antisense_biotypes & ANNOTABLE_SI_TARGETS: - # We have a mix of "not ALLOWED" and "ANNOTABLE" - write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(antisense_biotypes)))) - continue - # At this point, we know that antisense_biotype only contains - # allowed si target types. - # We are not interested in non annotable targets. - target_biotypes = antisense_biotypes & ANNOTABLE_SI_TARGETS - if not target_biotypes: - continue - # Annotation priority: - # TE > protein_coding > pseudogene > satellites > simple_repeats - te_target_biotypes = TE & target_biotypes - if te_target_biotypes: - # Write fastq - ############## - fastq = FQ_TEMPLATE % (bname, bseq, bqual) - write_queue.put(("te_siRNA", fastq)) - # Determine te_id - ################## - te_ids = { - annot[4] for annot in antisense_annots if annot[0] in te_target_biotypes} - if len(te_ids) == 1: - (te_id,) = te_ids - te_si_counter[te_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(te_ids)))) - # Determine exact type - ####################### - if TE <= te_target_biotypes: - # Both are present, we can't decide between the two - # Report as ambiguous - write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes)))) - else: - # Either DNA_transposons_rmsk or RNA_transposons_rmsk - (biotype,) = te_target_biotypes - te_si_counter[biotype] += 1 - continue - # At this stage, we know there is no TE annotations - # Next in priority are protein_coding annotations - prot_target_biotypes = PROTEIN_CODING & target_biotypes - if prot_target_biotypes: - # Write fastq - ############## - fastq = FQ_TEMPLATE % (bname, bseq, bqual) - write_queue.put(("prot_siRNA", fastq)) - # Determine gene_id - #################### - gene_ids = { - annot[4] for annot in antisense_annots if annot[0] in prot_target_biotypes} - if len(gene_ids) == 1: - (gene_id,) = gene_ids - prot_si_counter[gene_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(gene_ids)))) - # Determine exact type - ####################### - if CDS_UTR <= prot_target_biotypes: - # Both are present, we can't decide between the two - # Report as ambiguous - write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes)))) - elif ("protein_coding_pure_intron" in target_biotypes - and (CDS_UTR & prot_target_biotypes)): - prot_si_counter["protein_coding_exon_junction"] += 1 - else: - # At this point we should have only one element in target_biotype - (biotype,) = prot_target_biotypes - prot_si_counter[biotype] += 1 - continue - # At this stage, we know there is neither TE annotations - # nor protein_coding annotations - # Next in priority are pseudogene annotations - pseu_target_biotypes = {"pseudogene"} & target_biotypes - if pseu_target_biotypes: - # Write fastq - ############## - fastq = FQ_TEMPLATE % (bname, bseq, bqual) - write_queue.put(("pseu_siRNA", fastq)) - # Determine te_id - ################## - pseu_ids = { - annot[4] for annot in antisense_annots if annot[0] == "pseudogene"} - if len(pseu_ids) == 1: - (pseu_id,) = pseu_ids - pseu_si_counter[pseu_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(pseu_ids)))) - pseu_si_counter["pseudogene"] += 1 - continue - # At this stage, we know there is neither TE annotations - # nor protein_coding annotations, nor pseudogene annotations - # Next in priority are satellite annotations - satel_target_biotypes = {"satellites_rmsk"} & target_biotypes - if satel_target_biotypes: - # Write fastq - ############## - fastq = FQ_TEMPLATE % (bname, bseq, bqual) - write_queue.put(("satel_siRNA", fastq)) - # Determine te_id - ################## - satel_ids = { - annot[4] for annot in antisense_annots if annot[0] == "satellites_rmsk"} - if len(satel_ids) == 1: - (satel_id,) = satel_ids - satel_si_counter[satel_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(satel_ids)))) - satel_si_counter["satellites_rmsk"] += 1 - continue - # At this stage, we know there is neither TE annotations - # nor protein_coding annotations, nor pseudogene annotations - # nor satellites annotations - # Next in priority are simple_repeats annotations - simrep_target_biotypes = {"simple_repeats_rmsk"} & target_biotypes - if simrep_target_biotypes: - # Write fastq - ############## - fastq = FQ_TEMPLATE % (bname, bseq, bqual) - write_queue.put(("simrep_siRNA", fastq)) - # Determine te_id - ################## - simrep_ids = { - annot[4] for annot in antisense_annots if annot[0] == "simple_repeats_rmsk"} - if len(simrep_ids) == 1: - (simrep_id,) = simrep_ids - simrep_si_counter[simrep_id] += 1 - else: - write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(simrep_ids)))) - simrep_si_counter["simple_repeats_rmsk"] += 1 - continue - return (prot_si_counter, te_si_counter, pseu_si_counter, satel_si_counter, simrep_si_counter) - - -def count_all_sis(annot_infos, write_queue): - """ - Returns mixed dictionnaries to count gene identifiers and annotation types. - This only works if the two sets of possibilities are distinct... - - *annot_infos* comes from *process_annotations* defined in - *make_annotation_processor*. It consists in pairs ((annot_name, signature), annot_info). - *annot_name* is a string obtained by joining a mix of biotype information - (miRNA, piRNA, protein_coding, RNA_transposons_rmsk, ...) and read signature - information (nothing, 21U, 22G, 26G, csr1_22G, no_signature). - *signature* can be 21-26U, 22G, 26G, csr1_22G, no_signature or NA. - *annot_info* is a pair (annotations, (name, seq, qual, read_len, strand)). - *annotations* is a set of tuples (biotype, strand, start, end, gene_id). - """ - cdef str annot_name, signature, name, seq, qual, strand - cdef bytes bname, bseq, bqual - all_si_counter = defaultdict(int) - for ((annot_name, signature), annot_info) in annot_infos: - # Eliminate most common non-siRNA target cases based on annotation name. - if annot_name in COMMON_NO_ENDOSI: - continue - # annotations: set of tuples - # (attrs["gene_biotype"], gtf.strand, gtf.start, gtf.end, attrs["gene_id"]) - annotations, (name, seq, qual, _, strand) = annot_info - if chas_allsi_signature(seq): - # 22G(T*)-26G(T*) - bname = name.encode("UTF-8") - bseq = seq.encode("UTF-8") - bqual = qual.encode("UTF-8") - fastq = FQ_TEMPLATE % (bname, bseq, bqual) - write_queue.put(("all_siRNA", fastq)) - # Determine gene_id - #################### - gene_ids = {annot[4] for annot in annotations} - if len(gene_ids) == 0: - all_si_counter["unknown_id"] += 1 - elif len(gene_ids) == 1: - (gene_id,) = gene_ids - all_si_counter[gene_id] += 1 - else: - all_si_counter["_and_".join(sorted(gene_ids))] += 1 - all_si_counter["all_siRNA"] += 1 - return all_si_counter - - -# TODO: adapt to counters of generic small_types -def add_results(results1, results2): - """Adds dictionaries inside tuples.""" - ( - annot_stats1, - small_type_counters1, - nucl_counts1) = results1 - ( - annot_stats2, - small_type_counters2, - nucl_counts2) = results2 - return (merge_with(sum, (annot_stats1, annot_stats1)), - tuple( - merge_with( - sum, - (counts1, counts2)) for (counts1, counts2) in zip( - small_type_counters1, small_type_counters2)), - # merge_with(sum, (pi_counts1, pi_counts2)), - # merge_with(sum, (mi_counts1, mi_counts2)), - # merge_with(sum, (prot_si_counts1, prot_si_counts2)), - # merge_with(sum, (te_si_counts1, te_si_counts2)), - # merge_with(sum, (pseu_si_counts1, pseu_si_counts2)), - # merge_with(sum, (satel_si_counts1, satel_si_counts2)), - # merge_with(sum, (simrep_si_counts1, simrep_si_counts2)), - # merge_with(sum, (all_si_counts1, all_si_counts2))), - add_compositions(nucl_counts1, nucl_counts2)) - - -def main(): - """Example use case.""" - print("No examples to show here.") - return 0 - - -if __name__ == "__main__": - import sys - sys.exit(main()) diff --git a/libsmallrna/setup.py b/libsmallrna/setup.py deleted file mode 100644 index 5ef4fb5..0000000 --- a/libsmallrna/setup.py +++ /dev/null @@ -1,15 +0,0 @@ -from setuptools import setup, find_packages -from Cython.Build import cythonize - -setup( - name="libsmallrna", - version="0.2", - description="Miscellaneous things to deal with small RNA", - author="Blaise Li", - author_email="blaise.li@normalesup.org", - license="MIT", - # packages=["libsmallrna"], - ext_modules = cythonize("libsmallrna/libsmallrna.pyx"), - install_requires=["cytoolz"], - zip_safe=False, - packages=find_packages()) -- GitLab