degradome_metaprofile.py

#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""
This script takes read mapping information from sorted bam files, degradome
and small RNAs, and builds metaprofiles for degradome reads relative to small
RNAs.

We want to obtain similar figures as Shen et al., 2018
(<https://doi.org/10.1016/j.cell.2018.02.002>):

> In C. elegans, piRISC recruits RdRP to its targets. Therefore, we wished to
examine the pattern of RdRP-dependent 22G-RNA production near CLASH-defined
piRNA target sites in both WT and prg-1 mutant worms. To do this, we plotted
22G-RNA levels within a 40-nt region centered on the piRNA complementary
sequences defined by CLASH. The 5′ ends of 22G-RNAs are thought to be formed
directly from RdRP initiating at C residues within the target mRNAs. We
therefore normalized the 22G-RNA levels initiating at each position to the
frequency of C residues within the CLASH-defined targets at each position.
Because the CSR-1, and WAGO Argonaute pathway are thought to have opposing
functions, resisting and supporting piRNA silencing (Seth et al., 2013, Wedeles
                                                     et al., 2013), we
separately considered predicted piRNA targets within previously defined WAGO
and CSR-1 targeted mRNAs (Claycomb et al., 2009, Gu et al., 2009). As a control
set, we considered a target region arbitrarily set 100 nts away (within each
                                                                 mRNA) from of
the piRNA binding sites identified by CLASH. In WT animals, 22G-RNA levels were
much higher for WAGO targets than for CSR-1 targets, as expected (Figures 2A
                                                                  and 2B,
                                                                  Figures S2A
                                                                  and S2B).
However, piRNA binding sites within both WAGO and CSR-1 targets showed a
non-random distribution of 22G-RNA levels across the interval. By contrast, the
control regions within the same target mRNAs, but offset from the hybrid sites,
exhibited no such patterns (Figures 2G and 2H). WAGO targets exhibited a strong
central peak, and clusters of peaks at either end of the piRNA target sites. To
describe these patterns, we refer to the mRNA sequences near the target site as
follows: t1 through t30 includes the presumptive binding site (t1 to t21) plus
9 nucleotides 5′ of the target site (t22 to t30). The mRNA region 3′ of the
target site consists of nucleotides t–1 through t–11. Strikingly, this analysis
revealed a prominent peak in the center of the piRNA complementary region near
t12, and smaller peaks centered at t1 and t21 (Figure 2A). CSR-1 targets
exhibited a cluster of much smaller peaks near the 3′ end of the predicted
target site, with the largest peak residing in sequences located near t–5
(Figure 2B). The amplitudes of 22G-RNA levels on both the WAGO and CSR-1
targets correlated positively with the predicted free energy of piRNA binding
and to a lesser extent with piRNA abundance (Figures 2C–2F and S2C–2F).
"""

import argparse
import logging
import warnings
import os
from sys import stderr
from collections import Counter, deque
from itertools import dropwhile, takewhile
from operator import attrgetter
from cytoolz import merge_with
# from pybedtools import Attributes, BedTool, Interval
# from pybedtools import BedTool, Interval
from pysam import AlignmentFile
# To get coordinates of genes
from libhts import id_list_gtf2bed
# To collapse aligned reads to their 5' ends
from bam25prime import collapse_and_sort


def formatwarning(message, category, filename, lineno, line):
    """Used to format warning messages."""
    return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)


warnings.formatwarning = formatwarning
UNPACK = attrgetter("chrom", "start", "end", "strand")
ANTISTRAND = {"+": "-", "-": "+"}
OPJ = os.path.join


def get_gene_intervals(gene_list_filename, annot_filename, id_kwd=None):
    """
    Obtain the bed coordinates of the genomic regions covered by
    the genes listed in *gene_list_filename*, as annotated in
    *annot_filename*.
    """
    if id_kwd is None:
        id_kwd = "gene_id"
    with open(gene_list_filename, "r") as gene_list_file:
        gene_identifiers = {line.strip() for line in gene_list_file}
    # Deadlock when chaining stream=True?
    # https://daler.github.io/pybedtools/topical-iterators.html
    # intervals = id_list_gtf2bed(
    #     gene_identifiers,
    #     args.annotations, id_kwd=args.id_kwd).sort(
    #         stream=True).merge(stream=True)
    return id_list_gtf2bed(
        gene_identifiers, annot_filename,
        id_kwd=id_kwd).sort().merge(stream=True)
    # for ivl in intervals:
    #     print(ivl)
    # return 0


# def range_tester(pos, upstream, downstream):
#     """
#     Create a function that tests whether a (chrom, pos, strand) genomic point
#     belongs to a certain range from *downstream* and up to *upstream* of
#     position *pos*. The chromosome and strand are actually ignored, so the
#     test is assumed to happen within a given chromosome.
#     """
#     def in_range(neighbour):
#         """Recognize a legitimate (chrom, pos, strand) *neighbour*.
#         Only the five-prime positions are taken into account.
#         Chromosome is assumed to be the same, strand is ignored.
#         """
#         return (pos - upstream
#                 <= neighbour[1]
#                 <= pos + downstream)
#     return in_range


def filter_bam(bamfile, intervals=None):
    """
    Iterates over the *bamfile* and yields only those aligned reads
    that belong to one of the bed *intervals*.

    The output will be generated in the order obtained when sorting
    *intervals*.

    If *intervals* is None, all aligned reads are yielded.
    """
    # :func:`bam25prime.collapse_and_sort` preprocesses and re-sorts the
    # aligned segments so that we have :class:`pybedtools.Interval`s of
    # length 1, corresponding to their 5' ends.
    # Re-sorting is needed because reverse aligned segments will be placed
    # according to their original end coordinate.
    if intervals is None:
        for ali in collapse_and_sort(bamfile.fetch()):
            yield UNPACK(ali)
        # What about the following:
        # yield from collapse_and_sort(bamfile.fetch())
    else:
        # TODO: use bedtools intersect instead of all this looping
        # collapsed = collapse_and_sort(bamfile.fetch())
        # filtered = collapsed.intersect(
        #     intervals, sorted=True, stream=True)
        # for ali in filtered:
        # for ali in collapsed
        #     yield UNPACK(ali)
        # # TODO: Check that sort order of intervals
        # # is the same as the aligned segments.
        # # Chromosome order should be the same, for instance.
        for interval in intervals.sort():
            #print(interval)
            # I	2880868	2883454
            # [...]
            # 2883513
            # 2883514
            # I	2883495	2893718
            # 2883513
            #
            # I	9488487	9492623
            # [...]
            # 9492587
            # 9492616
            # I	9492629	9494589
            # 9492587
            # for ali in bamfile.fetch(interval.chrom,
            #                          interval.start,
            #                          interval.end):
            #     yield ali
            for ali in collapse_and_sort(bamfile.fetch(
                    interval.chrom,
                    interval.start,
                    interval.end)):
                # AlignmentFile.fetch may yield alignments
                # whose start is upstream interval.start
                # or
                # whose end is beyond interval.end
                # (overlap is sufficient)
                if ali.start < interval.start:
                    continue
                if ali.end > interval.end:
                    break
                yield UNPACK(ali)

## debug
#from collections import defaultdict
##
# TODO: allow different upstream and downstream maximum distances
# For CSR1-loaded siRNAs, we expect the cut to happen somewhere
# within the length of the siRNA, so it might be better to look
# from -15 to + 25, for instance
def zip_alis(alis1, alis2, max_dists=None):
    """
    Iterates over two iterators of aligned segments *alis1* and *alis2*
    so that the iteration over *alis2* is driven by that over *alis1*
    in the sense that alignments will be taken in *alis2*
    only in regions surrounding alignments in *alis1*.

    Yields pairs consiting in one alignment from *alis1* and
    the list of those found in its surroundings in *alis2*.

    The input aligned segments should consist in :class:`pybedtools.Interval`s
    of length 1 so that only their five-prime ends are taken into account
    to determine the distance separating them.
    """
    if max_dists is None:
        max_dists = {"+": (15, 25), "-": (25, 15)}
    # Will be updated to contain the alignments from alis2
    # that surround the current alignment from alis1,
    # and are on the opposite strand
    antisense_surroundings = {"+": deque([]), "-": deque([])}
    # Note that alis2 needs to be an iterator, otherwise, next will fail.
    # (An iterator can be obtained by calling iter on an iterable.)
    # In case of StopIteration before even iterating alis1,
    # we let it propagate (exit zip_alis)
    ##debug
    #five2s = {"+": defaultdict(list), "-": defaultdict(list)}
    #def record_five2(chrom_info, pos_info, strand_info):
    #    print(pos_info)
    #    five_list = five2s[strand_info]
    #    if five_list[chrom_info]:
    #        assert five_list[chrom_info][-1] <= pos_info
    #    five_list[chrom_info].append(pos_info)
    ##
    (ref2, five2, _, strand2) = next(alis2)
    ##debug
    #record_five2(ref2, five2, strand2)
    ##
    # Is the alis2 iterator exhausted?
    exhausted = False
    for (ref1, five1, _, strand1) in alis1:
        # Skip alignments in previous chromosomes
        while ref2 < ref1:
            try:
                (ref2, five2, _, strand2) = next(alis2)
                ##debug
                #record_five2(ref2, five2, strand2)
                ##
            except StopIteration:
                # We need to catch this to avoid exiting zip_alis too soon.
                exhausted = True
                break
        if ref2 > ref1:
            # There were no alignments in alis2 in the same chromosome
            # yield ((ref1, five1, strand1), [])
            continue
        # Now we have an alignment on the same chromosome
        assert ref2 == ref1
        # TODO: Check coordinates handling
        # Select the appropriate surroundings
        surroundings = antisense_surroundings[ANTISTRAND[strand1]]
        # Purge surroundings from alignments in the previous chromosomes
        try:
            while surroundings[0][0] < ref1:
                # _ = surroundings.popleft()
                surroundings.popleft()
        except IndexError:
            # surroundings has been emptied
            pass
        (before_dist, after_dist) = max_dists[strand1]
        # Skip upstream alignments
        # Example with before_dist = 4
        #        4321|--->
        # <---|      . skip
        #  <---|     . skip
        #   <---|    . skip
        #    <---|   . OK
        #        432101234
        while ref2 == ref1 and five2 < five1 - before_dist:
            try:
                (ref2, five2, _, strand2) = next(alis2)
                ##debug
                #record_five2(ref2, five2, strand2)
                ##
            except StopIteration:
                # We need to catch this to avoid exiting zip_alis too soon.
                exhausted = True
                break
        # Handle the case where the surroundings is not empty
        # and has only reads from chromosomes after ref1
        if ref2 > ref1:
            # There were no close-enough alignments in alis2
            # in the same chromosome
            # No need to register it:
            # it will be there at the beginning of next iteration
            # antisense_surroundings[strand2].append((ref2, five2, strand2))
            continue
        assert ref2 == ref1
        # Now five2 >= five1 - before_dist
        # Fill the surroundings
        # Example with after_dist = 4
        #        4321|--->
        #    <---|   . append
        #     <---|  . append
        #      <---| . append
        #       <---|. append
        #        <---| append
        #         <---| append
        #          <---| append
        #           <---| append
        #            <---| append
        #            .<---| stop
        #        432101234
        # Problem: We fill for both strands, but using the current value
        # of after_dist. This can over-fill the deque for the other strand.
        while ref2 == ref1 and five2 < five1 + after_dist + 1:
            antisense_surroundings[strand2].append((ref2, five2, strand2))
            try:
                (ref2, five2, _, strand2) = next(alis2)
                ##debug
                #record_five2(ref2, five2, strand2)
                ##
            except StopIteration:
                # We need to catch this to avoid exiting zip_alis too soon.
                exhausted = True
                break
        # Now we are either too far downstream on ref1,
        # or on another chromosome
        # Purge surroundings from upstream alignments, if any
        try:
            while surroundings[0][1] < five1 - before_dist:
                # _ = surroundings.popleft()
                (popped_ref, _, _) = surroundings.popleft()
                assert popped_ref == ref1
                # surroundings.popleft()
        except IndexError:
            # surroundings has been emptied
            pass
        if surroundings:
            # assert sorted(surroundings) == list(surroundings), ", ".join(map(str, surroundings))

            def not_downstream(neighbour):
                """Tells if *neighbour* is not downstream."""
                return neighbour[1] <= five1 + after_dist

            def upstream(neighbour):
                """Tells if *neighbour* is upstream."""
                return neighbour[1] < five1 - before_dist
            real_surroundings = list(takewhile(
                not_downstream,
                dropwhile(upstream, surroundings)))
            # real_surroundings = list(takewhile(
            #     range_tester(five1, before_dist, after_dist),
            #     surroundings))
            # in_range = range_tester(five1, before_dist, after_dist)
            # real_surroundings_2 = [
            #     neighbour for neighbour in surroundings
            #     if in_range(neighbour)]
            # real_surroundings_3 = [
            #     (ref, five, strand)
            #     for (ref, five, strand) in surroundings
            #     if five1 - before_dist <= five <= five1 + after_dist]
            # for (n1, n2, n3) in zip(
            #         real_surroundings,
            #         real_surroundings_2,
            #         real_surroundings_3):
            #     if n1[1] != n2[1]:
            #         print("takewhile-dropwhile not like range_tester: "
            #               f"{n1[1]} != {n2[1]}")
            #         msg = "\n".join([
            #             "surroundings",
            #             ", ".join(map(str, surroundings)),
            #             ", ".join([
            #                 f"five1: {five1}",
            #                 f"before: {before_dist}",
            #                 f"after: {after_dist}"]),
            #             "takewhile-dropwhile:",
            #             ", ".join(map(str, real_surroundings)),
            #             "list-comprehension:",
            #             ", ".join(map(str, real_surroundings_3))])
            #     elif n2[1] != n3[1]:
            #         print("range_tester not like list-comprehension: "
            #               f"{n2[1]} != {n3[1]}")
            #         msg = "\n".join([
            #             "surroundings",
            #             ", ".join(map(str, surroundings)),
            #             ", ".join([
            #                 f"five1: {five1}",
            #                 f"before: {before_dist}",
            #                 f"after: {after_dist}"]),
            #             "range_tester:",
            #             ", ".join(map(str, real_surroundings_2)),
            #             "list-comprehension:",
            #             ", ".join(map(str, real_surroundings_3))])
            #     else:
            #         msg = None
            #     assert n1[1] == n2[1] == n3[1], msg
        else:
            real_surroundings = []
        if real_surroundings:
            for (ref, five, strand) in real_surroundings:
                if ref != ref1:
                    print(f"wrong ref: {ref} != {ref1}", file=stderr)
                if five < five1 - before_dist:
                    print(
                        "too far upstream:"
                        f"{five} ({five-five1} < {-before_dist})",
                        file=stderr)
                if five > five1 + after_dist:
                    print(
                        "too far downstream:",
                        f"{five} ({five-five1} > {after_dist})",
                        file=stderr)
                assert ANTISTRAND[strand] == strand1
            yield ((ref1, five1, strand1), real_surroundings)
        if exhausted:
            raise StopIteration


# TODO: implement better measures (z-scores ?)
def dists_to_ali(ali, neighbours):
    """
    Compute local distance histograms.

    *ali* should be a (chrom, pos, strand) tuple, and the *neighbours* list
    should contain similar tuples with opposite strandedness and should have
    their positions within a certain range of pos (for instance resulting from
    the *max_dists* parameter of :func:`zip_alis`).
    """
    (chrom1, pos1, strand1) = ali

    def compatible(ali2):
        """
        Check that a given neighbouring alignment is a suitable neighbour
        of *ali*.

        No distance checks are made.
        """
        (chrom2, _, strand2) = ali2
        if chrom2 != chrom1 or strand2 == strand1:
            print(chrom2, strand2)
        return chrom2 == chrom1 and strand2 != strand1

    msg = "Some neighbouring reads are not compatible with " \
        + f"{chrom1}, {pos1}, {strand1}!"
    assert all(compatible(ali2) for ali2 in neighbours), msg
    if strand1 == "+":
        return Counter(pos2 - pos1 for (_, pos2, _) in neighbours)
    else:
        return Counter(pos1 - pos2 for (_, pos2, _) in neighbours)


def main():
    """Main function of the program."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "-d", "--degradome",
        required=True,
        help="Sorted and indexed bam file containing the mapped degradome."
        "A given read is expected to be aligned at only one location.")
    parser.add_argument(
        "-s", "--smallbam",
        required=True,
        help="Sorted and indexed bam file containing the mapped small RNAs."
        "A given read is expected to be aligned at only one location.")
    parser.add_argument(
        "-g", "--gene_list",
        help="Path to a file containing identifiers of genes. Among "
        "degradome reads, only those mapping within those genes' coordinates "
        "will be considered."
        "It is necessary to also provide a gtf file to enable the "
        "determination of those coordinates (with option --annotations).")
    parser.add_argument(
        "-a", "--annotations",
        help="Path to a gtf file giving genes coordinates.\n"
        "The gtf annotations must contain a gene_id field, that will "
        "be used to match gene identifiers present in the list provided "
        "with the --gene_list option.")
    parser.add_argument(
        "-i", "--id_kwd",
        default="gene_id",
        help="keyword under which the feature ID is expected to be found "
        "in the feature annotations in the gtf file. These feature IDs "
        "will be matched against identifiers in the list provided "
        "with the --gene_list option.")
    # parser.add_argument(
    #     "-o", "--out_dir",
    #     required=True,
    #     help="Directory in which results should be written.")
    parser.add_argument(
        "-l", "--logfile",
        help="File in which to write logs.")
    args = parser.parse_args()
    if not args.logfile:
        logfilename = "/dev/null"
    else:
        logfilename = args.logfile
    logging.basicConfig(
        filename=logfilename,
        level=logging.DEBUG)
    # info = logging.info
    # debug = logging.debug
    # warning = logging.warning
    if args.gene_list:
        if not args.annotations:
            warnings.warn(
                "option --gene_list requires option --annotations.\n")
            return 1
        intervals = get_gene_intervals(
            args.gene_list, args.annotations, args.id_kwd)
    else:
        intervals = None
    # TODO: Ensure that all degradome reads are mapped from their 5' end
    # (no soft-clip). Should this be done upstream?
    with AlignmentFile(args.degradome, "rb") as degradome, \
            AlignmentFile(args.smallbam, "rb") as smallbam:
        # combine the counters
        distances = merge_with(
            sum,
            # count distances to neighbouring antisense small RNA
            # for each degradome read
            (dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis(
                filter_bam(smallbam),
                filter_bam(degradome, intervals))))
        print("distance", "count", sep="\t")
        for distance in sorted(distances):
            print(distance, distances[distance], sep="\t")
    return 0


if __name__ == "__main__":
    exit(main())