Skip to content
Snippets Groups Projects
Commit 3c971b72 authored by Blaise Li's avatar Blaise Li
Browse files

Script to compute distances between mapped reads.

The script currently takes two bam files and gives the distribution of
distances between reads in one "degradome" file and their antisense
neighbours in a "small RNA" file.
parent 77f6924c
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python3
# vim: set fileencoding=<utf-8> :
"""
This script takes read mapping information from sorted bam files, degradome
and small RNAs, and builds metaprofiles for degradome reads relative to small
RNAs.
We want to obtain similar figures as Shen et al., 2018
(<https://doi.org/10.1016/j.cell.2018.02.002>):
> In C. elegans, piRISC recruits RdRP to its targets. Therefore, we wished to
examine the pattern of RdRP-dependent 22G-RNA production near CLASH-defined
piRNA target sites in both WT and prg-1 mutant worms. To do this, we plotted
22G-RNA levels within a 40-nt region centered on the piRNA complementary
sequences defined by CLASH. The 5′ ends of 22G-RNAs are thought to be formed
directly from RdRP initiating at C residues within the target mRNAs. We
therefore normalized the 22G-RNA levels initiating at each position to the
frequency of C residues within the CLASH-defined targets at each position.
Because the CSR-1, and WAGO Argonaute pathway are thought to have opposing
functions, resisting and supporting piRNA silencing (Seth et al., 2013, Wedeles
et al., 2013), we
separately considered predicted piRNA targets within previously defined WAGO
and CSR-1 targeted mRNAs (Claycomb et al., 2009, Gu et al., 2009). As a control
set, we considered a target region arbitrarily set 100 nts away (within each
mRNA) from of
the piRNA binding sites identified by CLASH. In WT animals, 22G-RNA levels were
much higher for WAGO targets than for CSR-1 targets, as expected (Figures 2A
and 2B,
Figures S2A
and S2B).
However, piRNA binding sites within both WAGO and CSR-1 targets showed a
non-random distribution of 22G-RNA levels across the interval. By contrast, the
control regions within the same target mRNAs, but offset from the hybrid sites,
exhibited no such patterns (Figures 2G and 2H). WAGO targets exhibited a strong
central peak, and clusters of peaks at either end of the piRNA target sites. To
describe these patterns, we refer to the mRNA sequences near the target site as
follows: t1 through t30 includes the presumptive binding site (t1 to t21) plus
9 nucleotides 5′ of the target site (t22 to t30). The mRNA region 3′ of the
target site consists of nucleotides t–1 through t–11. Strikingly, this analysis
revealed a prominent peak in the center of the piRNA complementary region near
t12, and smaller peaks centered at t1 and t21 (Figure 2A). CSR-1 targets
exhibited a cluster of much smaller peaks near the 3′ end of the predicted
target site, with the largest peak residing in sequences located near t–5
(Figure 2B). The amplitudes of 22G-RNA levels on both the WAGO and CSR-1
targets correlated positively with the predicted free energy of piRNA binding
and to a lesser extent with piRNA abundance (Figures 2C–2F and S2C–2F).
"""
import argparse
import logging
import warnings
import os
from collections import Counter, deque
from operator import attrgetter
from cytoolz import merge_with
# from pybedtools import Attributes, BedTool, Interval
# from pybedtools import BedTool, Interval
from pysam import AlignmentFile
# To collapse aligned reads to their 5' ends
from bam25prime import collapse_and_sort
def formatwarning(message, category, filename, lineno, line):
"""Used to format warning messages."""
return "%s:%s: %s: %s\n" % (filename, lineno, category.__name__, message)
warnings.formatwarning = formatwarning
UNPACK = attrgetter("chrom", "start", "end", "strand")
ANTISTRAND = {"+": "-", "-": "+"}
OPJ = os.path.join
def filter_bam(bamfile, intervals=None):
"""
Iterates over the *bamfile* and yields only those aligned reads
that belong to one of the bed *intervals*.
The output will be generated in the order obtained when sorting
*intervals*.
If *intervals* is None, all aligned reads are yielded.
"""
# :func:`bam25prime.collapse_and_sort` preprocesses and re-sorts the
# aligned segments so that we have :class:`pybedtools.Interval`s of
# length 1, corresponding to their 5' ends.
# Re-sorting is needed because reverse aligned segments will be placed
# according to their original end coordinate.
if intervals is None:
for ali in collapse_and_sort(bamfile.fetch()):
yield UNPACK(ali)
# What about the following:
# yield from collapse_and_sort(bamfile.fetch())
else:
# TODO: Check that sort order of intervals
# is the same as the aligned segments.
# Chromosome order should be the same, for instance.
for interval in sorted(intervals):
# for ali in bamfile.fetch(interval.chrom,
# interval.start,
# interval.end):
# yield ali
for ali in collapse_and_sort(bamfile.fetch(
interval.chrom,
interval.start,
interval.end)):
yield UNPACK(ali)
# TODO: allow different upstream and downstream maximum distances
# For CSR1-loaded siRNAs, we expect the cut to happen somewhere
# within the length of the siRNA, so it might be better to look
# from -15 to + 25, for instance
def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
"""
Iterates over two iterators of aligned segments *alis1* and *alis2*
so that the iteration over *alis2* is driven by that over *alis1*
in the sense that alignments will be taken in *alis2*
only in regions surrounding alignments in *alis1*.
Yields pairs consiting in one alignment from *alis1* and
the list of those found in its surroundings in *alis2*.
The input aligned segments should consist in :class:`pybedtools.Interval`s
of length 1 so that only their five-prime ends are taken into account
to determine the distance separating them.
"""
# Will be updated to contain the alignments from alis2
# that surround the current alignment from alis1,
# and are on the opposite strand
antisense_surroundings = {"+": deque([]), "-": deque([])}
# Note that alis2 needs to be an iterator, otherwise, next will fail.
# (An iterator can be obtained by calling iter on an iterable.)
(ref2, five2, _, strand2) = next(alis2)
# TODO: Deal with iterator exhaustion
for (ref1, five1, _, strand1) in alis1:
# Skip alignments in previous chromosomes
while ref2 < ref1:
(ref2, five2, _, strand2) = next(alis2)
if ref2 > ref1:
# There were no alignments in alis2 in the same chromosome
yield ((ref1, five1, strand1), [])
continue
# Now we have an alignment on the same chromosome
# TODO: Check coordinates handling
# Select the appropriate surroundings
surroundings = antisense_surroundings[ANTISTRAND[strand1]]
# Purge surroundings from alignments in the previous chromosomes
try:
while surroundings[0][0] < ref1:
# _ = surroundings.popleft()
surroundings.popleft()
except IndexError:
# surroundings has been emptied
pass
(before_dist, after_dist) = max_dists[strand1]
# Skip upstream alignments
while five2 < five1 - before_dist:
(ref2, five2, _, strand2) = next(alis2)
# Example with before_dist = 4
# 4321|--->
# <---| . skip
# <---| . skip
# <---| . skip
# <---| . OK
# 432101234
# Now five2 >= five1 - before_dist
# Fill the surroundings
while five2 < five1 + after_dist + 1:
antisense_surroundings[strand2].append((ref2, five2, strand2))
(ref2, five2, _, strand2) = next(alis2)
# Example with after_dist = 4
# 4321|--->
# <---| . append
# <---| . append
# <---| . append
# <---|. append
# <---| append
# <---| append
# <---| append
# <---| append
# <---| append
# .<---| stop
# 432101234
# Purge surroundings from upstream alignments, if any
try:
while surroundings[0][1] < five1 - before_dist:
# _ = surroundings.popleft()
surroundings.popleft()
except IndexError:
# surroundings has been emptied
pass
yield ((ref1, five1, strand1), list(surroundings))
def dists_to_ali(ali, neighbours):
"""
Compute local distance histograms.
*ali* should be a (chrom, pos, strand) tuple, and the *neighbours* list
should contain similar tuples with opposite strandedness and should have
their positions within a certain range of pos (for instance resulting from
the *max_dists* parameter of :func:`zip_alis`).
"""
(chrom1, pos1, strand1) = ali
def compatible(ali2):
"""
Check that a given neighbouring alignment is a suitable neighbour
of *ali*.
No distance checks are made.
"""
(chrom2, _, strand2) = ali2
return chrom2 == chrom1 and strand2 != strand1
msg = f"{chrom1}, {pos1}, {strand1}\n" + ", ".join([str(ali2) for ali2 in neighbours])
assert all(compatible(ali2) for ali2 in neighbours), msg
return Counter(pos2 - pos1 for (_, pos2, _) in neighbours)
# TODO: implement filtering from a bed or gtf file (for instance, to indicate the coordinates of genes or gens portions of interest)
def main():
"""Main function of the program."""
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-d", "--degradome",
required=True,
help="Sorted and indexed bam file containing the mapped degradome."
"A given read is expected to be aligned at only one location.")
parser.add_argument(
"-s", "--smallbam",
required=True,
help="Sorted and indexed bam file containing the mapped small RNAs."
"A given read is expected to be aligned at only one location.")
# parser.add_argument(
# "-o", "--out_dir",
# required=True,
# help="Directory in which results should be written.")
parser.add_argument(
"-l", "--logfile",
help="File in which to write logs.")
args = parser.parse_args()
if not args.logfile:
logfilename = "/dev/null"
else:
logfilename = args.logfile
logging.basicConfig(
filename=logfilename,
level=logging.DEBUG)
# info = logging.info
# debug = logging.debug
# warning = logging.warning
# TODO: Ensure that all degradome reads are mapped from their 5' end
# (no soft-clip). Should this be done upstream?
with AlignmentFile(args.degradome, "rb") as degradome, \
AlignmentFile(args.smallbam, "rb") as smallbam:
# combine the counters
distances = merge_with(
sum,
# count distances to neighbouring antisense small RNA
# for each degradome read
(dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis(
filter_bam(smallbam),
filter_bam(degradome))))
for distance in sorted(distances):
print(distance, distances[distance], sep="\t")
return 0
if __name__ == "__main__":
exit(main())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment