Commit 424f87f9 authored by Blaise Li's avatar Blaise Li
Browse files

Made libsmallrna a submodule.

parent fdce587a
......@@ -13,3 +13,6 @@
[submodule "libdeseq"]
path = libdeseq
url = git@gitlab.pasteur.fr:bli/libdeseq.git
[submodule "libsmallrna"]
path = libsmallrna
url = git@gitlab.pasteur.fr:bli/libsmallrna.git
Subproject commit af0b5271e3f1ec2f4bcd4c43e108d3d442e2897b
# Compiled python modules.
*.pyc
# Setuptools distribution folder.
/dist/
# Python egg metadata, regenerated from source files by setuptools.
/*.egg-info
# Backups
*~
#!/bin/sh
python3.6 setup.py build_ext
# .egg-link does not work with PYTHONPATH ?
python3.6 -m pip install -e .
python3.6 -m pip install --no-deps --ignore-installed .
from .libsmallrna import (
has_pi_signature, has_endosi_signature, has_26G_signature, has_csr1_endosi_signature,
count_small,
count_annots, count_pimis, count_sis, count_all_sis, count_nucl, count_first_bases,
add_compositions, add_results, Composition, get_read_info,
PI_MIN, PI_MAX, SI_MIN, SI_MAX,
SI_SUFFIXES, SI_PREFIXES,
SMALL_TYPES_TREE,
type2RNA,
types_under,
SMALL_TYPES,
SI_TYPES,
SIU_TYPES,
SISIU_TYPES,
RMSK_SISIU_TYPES,
JOINED_SMALL_TYPES)
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
# cython: language_level=3
"""
This module provides things used in small RNA annotation.
TODO: New criteria for small RNA identification:
- piRNA
- miRNA
- all_si (no signature criteria):
- 22G antisense (actually 21-23, including on such things as rRNA):
- te_22G
- prot_22G
- pseu_22G
- satel_22G
- simprep_22G
- 26G antisense (including on such things as rRNA):
- te_26G
- prot_26G
- pseu_26G
- satel_26G
- simprep_26G
Note: We don't consider 22G(T+) or 26G(T+).
The reliable ones should come from unmapped poly-T-trimmed remapped (all_siu, etc.)
TODO: Update small_RNAs.pdf
"""
# http://stackoverflow.com/a/23575771/1878788
# Also works by setting language_level as comment
# at the top of the file.
#from __future__ import print_function
import warnings
from sys import stderr
from collections import defaultdict
from cytoolz import merge_with, frequencies
from itertools import chain
from operator import add
from functools import reduce
#from pysam import AlignedSegment
#############################################
# Declaring small types and their hierarchy #
#############################################
cdef unsigned int CPI_MIN = 21
cdef unsigned int CPI_MAX = 26
cdef unsigned int CSI_MIN = 22
cdef unsigned int CSI_MAX = 26
PI_MIN = CPI_MIN
PI_MAX = CPI_MAX
SI_MIN = CSI_MIN
SI_MAX = CSI_MAX
PIMI = {"piRNA", "miRNA"}
CDS_UTR = {"protein_coding_CDS", "protein_coding_UTR"}
PROTEIN_CODING = CDS_UTR | {"protein_coding_pure_intron"}
TE = {"DNA_transposons_rmsk", "RNA_transposons_rmsk"}
ANNOTABLE_SI_TARGETS = PROTEIN_CODING | TE | {"pseudogene"} | {"satellites_rmsk", "simple_repeats_rmsk"}
ALLOWED_SI_TARGETS = ANNOTABLE_SI_TARGETS
# annotation names that seem to be frequent and that are not protein-coding or TE
# "22G" alone occurs when the read was not associated with relevant annotations
COMMON_NO_TARGET = {"piRNA", "miRNA", "no_signature", "22G"}
COMMON_NO_ENDOSI = {"piRNA", "miRNA", "no_signature"}
# Possible read signatures for endo-siRNAs
# ENDOSI_SIG = {f"{SI_MIN}G", f"{SI_MAX}G", f"csr1_{SI_MIN}G"}
# Stricter version, does not allow poly-U tail (should have been trimmed)
ENDOSI_SIG = {f"{SI_MIN}G", f"{SI_MAX}G"}
RMSK_PREFIXES = ["te", "satel", "simrep"]
PROT_PREFIXES = ["prot"]
PSEU_PREFIXES = ["pseu"]
SI_PREFIXES = [*PROT_PREFIXES, *PSEU_PREFIXES, *RMSK_PREFIXES]
SI_SUFFIXES = ENDOSI_SIG
# Should we have only non-overlapping sets ?
SMALL_TYPES_TREE = {
"si": {suffix: {
#"joined" : ["_".join(["si", suffix])],
"rmsk": ["_".join([prefix, "si", suffix]) for prefix in RMSK_PREFIXES],
"prot": ["_".join([prefix, "si", suffix]) for prefix in PROT_PREFIXES],
"pseu": ["_".join([prefix, "si", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES},
"siu": {suffix: {
#"joined" : ["_".join(["siu", suffix])],
"rmsk": ["_".join([prefix, "siu", suffix]) for prefix in RMSK_PREFIXES],
"prot": ["_".join([prefix, "siu", suffix]) for prefix in PROT_PREFIXES],
"pseu": ["_".join([prefix, "siu", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES},
"pi": ["pi"],
"mi": ["mi"]
}
SISIU_TREE_EXTENSION = {
"sisiu": {suffix: {
#"joined" : ["_".join(["sisiu", suffix])],
"rmsk": ["_".join([prefix, "sisiu", suffix]) for prefix in RMSK_PREFIXES],
"prot": ["_".join([prefix, "sisiu", suffix]) for prefix in PROT_PREFIXES],
"pseu": ["_".join([prefix, "sisiu", suffix]) for prefix in PSEU_PREFIXES]} for suffix in SI_SUFFIXES}}
def types_under(typekeys, tree=SMALL_TYPES_TREE):
"""
Generate type names corresponding to the branching according to the
succession of keys *typekeys* in the *tree* dictionary of (ultimately)
lists.
"""
if typekeys:
yield from types_under(typekeys[1:], tree[typekeys[0]])
else:
if isinstance(tree, dict):
for subtree in tree.values():
yield from types_under([], subtree)
else:
yield from tree
SMALL_TYPES = list(types_under([]))
SI_TYPES = list(types_under(["si"]))
SIU_TYPES = list(types_under(["siu"]))
SISIU_TYPES = list(types_under([], tree=SISIU_TREE_EXTENSION))
RMSK_SISIU_TYPES = set(chain(
types_under(["si", f"{SI_MIN}G", "rmsk"]),
types_under(["si", f"{SI_MAX}G", "rmsk"]),
types_under(["siu", f"{SI_MIN}G", "rmsk"]),
types_under(["siu", f"{SI_MAX}G", "rmsk"])))
JOINED_SMALL_TYPES = [
*[f"si_{suffix}" for suffix in SI_SUFFIXES],
*[f"siu_{suffix}" for suffix in SI_SUFFIXES],
f"pimi{SI_MIN}G"]
def type2RNA(small_types):
"""Append "RNA" to the string or list of strings *small_types*."""
if isinstance(small_types, (str, bytes)):
return f"{small_types}RNA"
else:
return list(map(type2RNA, small_types))
FQ_TEMPLATE = b"@%s\n%s\n+\n%s\n"
DNA_COMPL_TABLE = str.maketrans("ACGTRYSWKMBDHVN", "TGCAYRWSMKVHDBN")
cdef str creverse_complement(str seq):
return seq.translate(DNA_COMPL_TABLE)[::-1]
cdef int BAM_FREVERSE = 16
cdef object cget_read_info(object ali):
"""Extracts information from AlignedSegment *ali*."""
if (ali.flag & BAM_FREVERSE) != 0:
return (
ali.query_name,
creverse_complement(ali.query_sequence),
# ali.query_qualities[::-1],
ali.qual[::-1],
ali.query_length,
ali.reference_name,
# five prime pos:
ali.reference_end - 1,
ali.reference_start,
ali.reference_end,
"-")
else:
return (
ali.query_name,
ali.query_sequence,
# ali.query_qualities,
ali.qual,
ali.query_length,
ali.reference_name,
# five prime pos:
ali.reference_start,
ali.reference_start,
ali.reference_end,
"+")
def get_read_info(ali):
"""Extracts information from AlignedSegment *ali*."""
return cget_read_info(ali)
# def get_read_info(ali):
# """Extracts information from AlignedSegment *ali*."""
# if ali.is_reverse:
# return (
# ali.query_name,
# reverse_complement(ali.query_sequence),
# # ali.query_qualities[::-1],
# ali.qual[::-1],
# ali.query_length,
# ali.reference_start,
# ali.reference_end,
# "-")
# else:
# return (
# ali.query_name,
# ali.query_sequence,
# # ali.query_qualities,
# ali.qual,
# ali.query_length,
# ali.reference_start,
# ali.reference_end,
# "+")
cdef bint chas_pi_signature(str seq, size_t read_len):
return read_len >= PI_MIN and seq[0] == "T" and read_len <= PI_MAX
def has_pi_signature(str seq, int read_len):
f"""Return True if this is a {PI_MIN}U
(can actually have length up to {PI_MAX})."""
return chas_pi_signature(seq, read_len)
cdef bint chas_endosi_signature(str seq, size_t read_len):
return (SI_MIN - 1) <= read_len <= (SI_MIN + 1) and seq[0] == "G"
def has_endosi_signature(str seq, int read_len):
f"""Return True if this is a {SI_MIN}G."""
return chas_endosi_signature(seq, read_len)
cdef bint chas_26G_signature(str seq, size_t read_len):
return read_len == SI_MAX and seq[0] == "G"
def has_26G_signature(str seq, int read_len):
f"""Return True if this is a {SI_MAX}G."""
return chas_26G_signature(seq, read_len)
# These should have problems mapping because the T-tail is not
# in the genome sequence
cdef bint chas_csr1_endosi_signature(str seq, int read_len):
# if read_len <= 22, t_tail will be the empty string
cdef str t_tail
t_tail = (read_len - SI_MIN) * "T"
if t_tail:
return (seq[0] == "G") and (seq[SI_MIN:] == t_tail)
else:
return False
def has_csr1_endosi_signature(str seq, int read_len):
return chas_csr1_endosi_signature(seq, read_len)
cdef bint chas_allsi_signature(str seq):
cdef unsigned int read_len = len(seq)
return (seq[0] == "G" and ((SI_MIN - 1) <= read_len <= SI_MAX)) or chas_csr1_endosi_signature(seq, read_len)
def count_annots(annot_infos, queue):
"""Returns a dictionary counting the annotation names in the
iterable *annot_infos*.
*annot_infos* is generated by process_annotations.
For the moment, nothing is done with the queue."""
return frequencies((annot_info[0][0] for annot_info in annot_infos))
cdef class Composition:
cdef readonly int a, c, g, t
def __init__(self, int a=0, int c=0, int g=0, int t=0):
self.a = a
self.c = c
self.g = g
self.t = t
def __add__(self, other):
return Composition(
self.a + other.a,
self.c + other.c,
self.g + other.g,
self.t + other.t)
def __reduce__(self):
d = {}
d["a"] = self.a
d["c"] = self.c
d["g"] = self.g
d["t"] = self.t
return (Composition, (), d)
def __setstate__(self, d):
self.a = d["a"]
self.c = d["c"]
self.g = d["g"]
self.t = d["t"]
def count_nucl(int read_len, str nucl):
if nucl == "A":
return {read_len: Composition(a=1)}
elif nucl == "C":
return {read_len: Composition(c=1)}
elif nucl == "G":
return {read_len: Composition(g=1)}
elif nucl == "T":
return {read_len: Composition(t=1)}
else:
return {read_len: Composition()}
# sum doesn't work. Why?
def my_sum(l):
return reduce(add, l)
def add_compositions(comp1, comp2):
return merge_with(my_sum, comp1, comp2)
#return {read_len: comp1.get(read_len, Composition()) + comp2.get(read_len, Composition()) for read_len in chain(comp1.keys(), comp2.keys())}
def count_first_bases(annot_infos, queue):
"""Returns a dictionay {read_len: Composition} based on
the reads retrieved from the iterable *annot_infos*.
*annot_infos* is generated by process_annotations.
For the moment, nothing is done with the queue."""
compositions = defaultdict(Composition)
for annot_info in annot_infos:
(_, seq, _, read_len, _) = annot_info[1][1]
nucl = seq[0]
if nucl == "A":
compositions[read_len] += Composition(a=1)
elif nucl == "C":
compositions[read_len] += Composition(c=1)
elif nucl == "G":
compositions[read_len] += Composition(g=1)
elif nucl == "T":
compositions[read_len] += Composition(t=1)
else:
compositions[read_len] += Composition()
return compositions
def find_matching_annot(annotations, seq):
"""Finds among annotations one to consider as the real one."""
warnings.warn("randomly deciding between several piRNA or miRNA annotations")
return next(iter(annotations[0]))
def count_small(annot_infos, write_queue):
"""Return dictionaries counting for the gene names contained in the
iterable *annot_infos* and also, mixed in the same dictionaries, counts for
annotation types.
*annot_infos* comes from *process_annotations* defined in
*make_annotation_processor*. It consists in pairs ((annot_name, signature), annot_info).
*annot_name* is a string obtained by joining a mix of biotype information
(miRNA, piRNA, protein_coding, RNA_transposons_rmsk, ...) and read signature
information (nothing, 21U, 22G, 26G, csr1_22G, no_signature).
*signature* can be 21-26U, 22G, 26G, csr1_22G, no_signature or NA.
*annot_info* is a pair (annotations, (name, seq, qual, read_len, strand)).
*annotations* is a set of tuples (biotype, strand, start, end, gene_id).
"""
cdef str annot_name, signature, name, seq, qual, strand
cdef bytes bname, bseq, bqual
counters = {small_type: defaultdict(int) for small_type in [
"pi", "mi", "all_si", f"all_si_{SI_MIN}G", f"all_si_{SI_MAX}G", *SI_TYPES]}
for ((annot_name, signature), annot_info) in annot_infos:
annotations, (name, seq, qual, _, strand) = annot_info
bname = name.encode("UTF-8")
bseq = seq.encode("UTF-8")
bqual = qual.encode("UTF-8")
fastq = FQ_TEMPLATE % (bname, bseq, bqual)
if annot_name == "miRNA":
try:
(annot,) = annotations
except ValueError:
print("Unexpected multiple miRNA annotation: %s" % ", ".join(
map(str, annotations)), file=stderr)
annot = find_matching_annot(annotations, seq)
# exit(1)
write_queue.put(("miRNA", fastq))
counters["mi"][annot[4]] += 1
counters["mi"]["miRNA"] += 1
elif annot_name == "piRNA":
try:
(annot,) = annotations
except ValueError:
# print("Unexpected multiple piRNA annotation: %s" % ", ".join(
# map(str, annotations)), stderr)
# TODO: implement this correctly
annot = find_matching_annot(annotations, seq)
write_queue.put(("piRNA", FQ_TEMPLATE % (bname, bseq, bqual)))
counters["pi"][annot[4]] += 1
counters["pi"]["piRNA"] += 1
else:
# Consider a "general" all_si
write_queue.put(("all_siRNA", fastq))
# Determine gene_id
####################
gene_ids = {annot[4] for annot in annotations}
if len(gene_ids) == 0:
counters["all_si"]["unknown_id"] += 1
elif len(gene_ids) == 1:
(gene_id,) = gene_ids
counters["all_si"][gene_id] += 1
else:
counters["all_si"]["_and_".join(sorted(gene_ids))] += 1
counters["all_si"]["all_siRNA"] += 1
if signature in ENDOSI_SIG:
# TODO: make an intermediate category: all_si_{SI_MIN}G and all_si_{SI_MAX}G
# considering signature, but not annotation
write_queue.put((f"all_si_{signature}RNA", fastq))
if len(gene_ids) == 0:
counters[f"all_si_{signature}"]["unknown_id"] += 1
elif len(gene_ids) == 1:
(gene_id,) = gene_ids
counters[f"all_si_{signature}"][gene_id] += 1
else:
counters[f"all_si_{signature}"]["_and_".join(sorted(gene_ids))] += 1
counters[f"all_si_{signature}"][f"all_si_{signature}RNA"] += 1
# From (annotations, (name, seq, qual, read_len, strand) pairs,
# filter out from annotations those that are same_strand with
# respect to the read
#
# Reason 1: for protein_coding or TE annotations, we are only
# interested in RDRP-generated small RNAs, which target an RNA
#
# Reason 2: for the other annotations, we consider that
# same_strand is likely some degradation product, and that the
# library preparation is good enough to make such cases
# negligible
antisense_annots = [
annot for annot in annotations if annot[1] != strand]
# Eliminate reads that are antisense to some annotation other
# than protein_coding, TE, pseudogene, satellites or simple
# repeats
# The reason is that the real target of the small RNA can be
# the other, and not the protein_coding
antisense_biotypes = {annot[0] for annot in antisense_annots}
# Note: before 16/06/2017, satellites and simple_repeats were
# in ALLOWED but not in ANNOTABLE
# We added them to ALLOWED following a seminar by Susan Gasser
if antisense_biotypes - ALLOWED_SI_TARGETS:
# Report as ambiguous if any annot[0] corresponds to
# categories that we want to annotate
if antisense_biotypes & ANNOTABLE_SI_TARGETS:
# We have a mix of "not ALLOWED" and "ANNOTABLE"
write_queue.put((
"ambiguous_type",
"%s\n" % "_and_".join(sorted(antisense_biotypes))))
continue
# At this point, we know that antisense_biotype only contains
# allowed si target types.
# We are not interested in non annotable targets.
target_biotypes = antisense_biotypes & ANNOTABLE_SI_TARGETS
if not target_biotypes:
continue
# Annotation priority:
# TE > protein_coding > pseudogene > satellites > simple_repeats
te_target_biotypes = TE & target_biotypes
if te_target_biotypes:
# Write fastq
##############
write_queue.put((f"te_si_{signature}RNA", fastq))
# Determine te_id
##################
te_ids = {
annot[4] for annot in antisense_annots if annot[0] in te_target_biotypes}
if len(te_ids) == 1:
(te_id,) = te_ids
counters[f"te_si_{signature}"][te_id] += 1
else:
write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(te_ids))))
# Determine exact type
#######################
if TE <= te_target_biotypes:
# Both are present, we can't decide between the two
# Report as ambiguous
write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes))))
else:
# Either DNA_transposons_rmsk or RNA_transposons_rmsk
(biotype,) = te_target_biotypes
counters[f"te_si_{signature}"][biotype] += 1
continue
# At this stage, we know there is no TE annotations
# Next in priority are protein_coding annotations
prot_target_biotypes = PROTEIN_CODING & target_biotypes
if prot_target_biotypes:
# Write fastq
##############
write_queue.put((f"prot_si_{signature}RNA", fastq))
# Determine gene_id
####################
gene_ids = {
annot[4] for annot in antisense_annots if annot[0] in prot_target_biotypes}
if len(gene_ids) == 1:
(gene_id,) = gene_ids
counters[f"prot_si_{signature}"][gene_id] += 1
else:
write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(gene_ids))))
# Determine exact type
#######################
if CDS_UTR <= prot_target_biotypes:
# Both are present, we can't decide between the two
# Report as ambiguous
write_queue.put(("ambiguous_type", "%s\n" % "_and_".join(sorted(target_biotypes))))
elif ("protein_coding_pure_intron" in target_biotypes
and (CDS_UTR & prot_target_biotypes)):
counters[f"prot_si_{signature}"]["protein_coding_exon_junction"] += 1
else:
# At this point we should have only one element in target_biotype
(biotype,) = prot_target_biotypes
counters[f"prot_si_{signature}"][biotype] += 1
continue
# At this stage, we know there is neither TE annotations
# nor protein_coding annotations
# Next in priority are pseudogene annotations
pseu_target_biotypes = {"pseudogene"} & target_biotypes
if pseu_target_biotypes:
# Write fastq
##############
write_queue.put((f"pseu_si_{signature}RNA", fastq))
# Determine te_id
##################
pseu_ids = {
annot[4] for annot in antisense_annots if annot[0] == "pseudogene"}
if len(pseu_ids) == 1:
(pseu_id,) = pseu_ids
counters[f"pseu_si_{signature}"][pseu_id] += 1
else:
write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(pseu_ids))))
counters[f"pseu_si_{signature}"]["pseudogene"] += 1
continue
# At this stage, we know there is neither TE annotations
# nor protein_coding annotations, nor pseudogene annotations
# Next in priority are satellite annotations
satel_target_biotypes = {"satellites_rmsk"} & target_biotypes
if satel_target_biotypes:
# Write fastq
##############
write_queue.put((f"satel_si_{signature}RNA", fastq))
# Determine te_id
##################
satel_ids = {
annot[4] for annot in antisense_annots if annot[0] == "satellites_rmsk"}
if len(satel_ids) == 1:
(satel_id,) = satel_ids
counters[f"satel_si_{signature}"][satel_id] += 1
else:
write_queue.put(("ambiguous_id", "%s\n" % "_and_".join(sorted(satel_ids))))
counters[f"satel_si_{signature}"]["satellites_rmsk"] += 1
continue
# At this stage, we know there is neither TE annotations
# nor protein_coding annotations, nor pseudogene annotations
# nor satellites annotations
# Next in priority are simple_repeats annotations