From ec55107a06f06a66d8153681488825d9f6686acc Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 17 Aug 2018 11:47:02 +0200
Subject: [PATCH] Correcting bugs in 5' distance calculation.

This also seems to make it faster.
---
 Degradome-seq/degradome_metaprofile.py | 188 +++++++++++++++++++++----
 1 file changed, 160 insertions(+), 28 deletions(-)

diff --git a/Degradome-seq/degradome_metaprofile.py b/Degradome-seq/degradome_metaprofile.py
index 07ddda7..73ce101 100755
--- a/Degradome-seq/degradome_metaprofile.py
+++ b/Degradome-seq/degradome_metaprofile.py
@@ -52,7 +52,7 @@ import warnings
 import os
 from sys import stderr
 from collections import Counter, deque
-from itertools import takewhile
+from itertools import dropwhile, takewhile
 from operator import attrgetter
 from cytoolz import merge_with
 # from pybedtools import Attributes, BedTool, Interval
@@ -75,6 +75,48 @@ ANTISTRAND = {"+": "-", "-": "+"}
 OPJ = os.path.join
 
 
+def get_gene_intervals(gene_list_filename, annot_filename, id_kwd=None):
+    """
+    Obtain the bed coordinates of the genomic regions covered by
+    the genes listed in *gene_list_filename*, as annotated in
+    *annot_filename*.
+    """
+    if id_kwd is None:
+        id_kwd = "gene_id"
+    with open(gene_list_filename, "r") as gene_list_file:
+        gene_identifiers = {line.strip() for line in gene_list_file}
+    # Deadlock when chaining stream=True?
+    # https://daler.github.io/pybedtools/topical-iterators.html
+    # intervals = id_list_gtf2bed(
+    #     gene_identifiers,
+    #     args.annotations, id_kwd=args.id_kwd).sort(
+    #         stream=True).merge(stream=True)
+    return id_list_gtf2bed(
+        gene_identifiers, annot_filename,
+        id_kwd=id_kwd).sort().merge(stream=True)
+    # for ivl in intervals:
+    #     print(ivl)
+    # return 0
+
+
+# def range_tester(pos, upstream, downstream):
+#     """
+#     Create a function that tests whether a (chrom, pos, strand) genomic point
+#     belongs to a certain range from *downstream* and up to *upstream* of
+#     position *pos*. The chromosome and strand are actually ignored, so the
+#     test is assumed to happen within a given chromosome.
+#     """
+#     def in_range(neighbour):
+#         """Recognize a legitimate (chrom, pos, strand) *neighbour*.
+#         Only the five-prime positions are taken into account.
+#         Chromosome is assumed to be the same, strand is ignored.
+#         """
+#         return (pos - upstream
+#                 <= neighbour[1]
+#                 <= pos + downstream)
+#     return in_range
+
+
 def filter_bam(bamfile, intervals=None):
     """
     Iterates over the *bamfile* and yields only those aligned reads
@@ -107,6 +149,20 @@ def filter_bam(bamfile, intervals=None):
         # # is the same as the aligned segments.
         # # Chromosome order should be the same, for instance.
         for interval in intervals.sort():
+            #print(interval)
+            # I	2880868	2883454
+            # [...]
+            # 2883513
+            # 2883514
+            # I	2883495	2893718
+            # 2883513
+            #
+            # I	9488487	9492623
+            # [...]
+            # 9492587
+            # 9492616
+            # I	9492629	9494589
+            # 9492587
             # for ali in bamfile.fetch(interval.chrom,
             #                          interval.start,
             #                          interval.end):
@@ -115,14 +171,25 @@ def filter_bam(bamfile, intervals=None):
                     interval.chrom,
                     interval.start,
                     interval.end)):
+                # AlignmentFile.fetch may yield alignments
+                # whose start is upstream interval.start
+                # or
+                # whose end is beyond interval.end
+                # (overlap is sufficient)
+                if ali.start < interval.start:
+                    continue
+                if ali.end > interval.end:
+                    break
                 yield UNPACK(ali)
 
-
+## debug
+#from collections import defaultdict
+##
 # TODO: allow different upstream and downstream maximum distances
 # For CSR1-loaded siRNAs, we expect the cut to happen somewhere
 # within the length of the siRNA, so it might be better to look
 # from -15 to + 25, for instance
-def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
+def zip_alis(alis1, alis2, max_dists=None):
     """
     Iterates over two iterators of aligned segments *alis1* and *alis2*
     so that the iteration over *alis2* is driven by that over *alis1*
@@ -136,6 +203,8 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
     of length 1 so that only their five-prime ends are taken into account
     to determine the distance separating them.
     """
+    if max_dists is None:
+        max_dists = {"+": (15, 25), "-": (25, 15)}
     # Will be updated to contain the alignments from alis2
     # that surround the current alignment from alis1,
     # and are on the opposite strand
@@ -144,7 +213,19 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
     # (An iterator can be obtained by calling iter on an iterable.)
     # In case of StopIteration before even iterating alis1,
     # we let it propagate (exit zip_alis)
+    ##debug
+    #five2s = {"+": defaultdict(list), "-": defaultdict(list)}
+    #def record_five2(chrom_info, pos_info, strand_info):
+    #    print(pos_info)
+    #    five_list = five2s[strand_info]
+    #    if five_list[chrom_info]:
+    #        assert five_list[chrom_info][-1] <= pos_info
+    #    five_list[chrom_info].append(pos_info)
+    ##
     (ref2, five2, _, strand2) = next(alis2)
+    ##debug
+    #record_five2(ref2, five2, strand2)
+    ##
     # Is the alis2 iterator exhausted?
     exhausted = False
     for (ref1, five1, _, strand1) in alis1:
@@ -152,6 +233,9 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
         while ref2 < ref1:
             try:
                 (ref2, five2, _, strand2) = next(alis2)
+                ##debug
+                #record_five2(ref2, five2, strand2)
+                ##
             except StopIteration:
                 # We need to catch this to avoid exiting zip_alis too soon.
                 exhausted = True
@@ -185,6 +269,9 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
         while ref2 == ref1 and five2 < five1 - before_dist:
             try:
                 (ref2, five2, _, strand2) = next(alis2)
+                ##debug
+                #record_five2(ref2, five2, strand2)
+                ##
             except StopIteration:
                 # We need to catch this to avoid exiting zip_alis too soon.
                 exhausted = True
@@ -220,6 +307,9 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
             antisense_surroundings[strand2].append((ref2, five2, strand2))
             try:
                 (ref2, five2, _, strand2) = next(alis2)
+                ##debug
+                #record_five2(ref2, five2, strand2)
+                ##
             except StopIteration:
                 # We need to catch this to avoid exiting zip_alis too soon.
                 exhausted = True
@@ -237,10 +327,64 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
             # surroundings has been emptied
             pass
         if surroundings:
-            def in_range(neighbour):
-                """Recognize aligned segments not too far downstream."""
+            # assert sorted(surroundings) == list(surroundings), ", ".join(map(str, surroundings))
+
+            def not_downstream(neighbour):
+                """Tells if *neighbour* is not downstream."""
                 return neighbour[1] <= five1 + after_dist
-            real_surroundings = list(takewhile(in_range, surroundings))
+
+            def upstream(neighbour):
+                """Tells if *neighbour* is upstream."""
+                return neighbour[1] < five1 - before_dist
+            real_surroundings = list(takewhile(
+                not_downstream,
+                dropwhile(upstream, surroundings)))
+            # real_surroundings = list(takewhile(
+            #     range_tester(five1, before_dist, after_dist),
+            #     surroundings))
+            # in_range = range_tester(five1, before_dist, after_dist)
+            # real_surroundings_2 = [
+            #     neighbour for neighbour in surroundings
+            #     if in_range(neighbour)]
+            # real_surroundings_3 = [
+            #     (ref, five, strand)
+            #     for (ref, five, strand) in surroundings
+            #     if five1 - before_dist <= five <= five1 + after_dist]
+            # for (n1, n2, n3) in zip(
+            #         real_surroundings,
+            #         real_surroundings_2,
+            #         real_surroundings_3):
+            #     if n1[1] != n2[1]:
+            #         print("takewhile-dropwhile not like range_tester: "
+            #               f"{n1[1]} != {n2[1]}")
+            #         msg = "\n".join([
+            #             "surroundings",
+            #             ", ".join(map(str, surroundings)),
+            #             ", ".join([
+            #                 f"five1: {five1}",
+            #                 f"before: {before_dist}",
+            #                 f"after: {after_dist}"]),
+            #             "takewhile-dropwhile:",
+            #             ", ".join(map(str, real_surroundings)),
+            #             "list-comprehension:",
+            #             ", ".join(map(str, real_surroundings_3))])
+            #     elif n2[1] != n3[1]:
+            #         print("range_tester not like list-comprehension: "
+            #               f"{n2[1]} != {n3[1]}")
+            #         msg = "\n".join([
+            #             "surroundings",
+            #             ", ".join(map(str, surroundings)),
+            #             ", ".join([
+            #                 f"five1: {five1}",
+            #                 f"before: {before_dist}",
+            #                 f"after: {after_dist}"]),
+            #             "range_tester:",
+            #             ", ".join(map(str, real_surroundings_2)),
+            #             "list-comprehension:",
+            #             ", ".join(map(str, real_surroundings_3))])
+            #     else:
+            #         msg = None
+            #     assert n1[1] == n2[1] == n3[1], msg
         else:
             real_surroundings = []
         if real_surroundings:
@@ -249,25 +393,21 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}):
                     print(f"wrong ref: {ref} != {ref1}", file=stderr)
                 if five < five1 - before_dist:
                     print(
-                        f"too far upstream {five} ({five-five1} < {-before_dist})",
+                        "too far upstream:"
+                        f"{five} ({five-five1} < {-before_dist})",
                         file=stderr)
                 if five > five1 + after_dist:
                     print(
-                        f"too far downstream: {five} ({five-five1} > {after_dist})",
+                        "too far downstream:",
+                        f"{five} ({five-five1} > {after_dist})",
                         file=stderr)
                 assert ANTISTRAND[strand] == strand1
-            #assert all([
-            #    ref == ref1
-            #    and (five1 - before_dist <= five <= five1 + after_dist)
-            #    for (ref, five, _) in surroundings])
-            # assert surroundings[0][0] == surroundings[-1][0] == ref1
-            # assert surroundings[0][1] >= five1 - before_dist
-            # assert surroundings[-1][1] <= five1 + after_dist
             yield ((ref1, five1, strand1), real_surroundings)
         if exhausted:
             raise StopIteration
 
 
+# TODO: implement better measures (z-scores ?)
 def dists_to_ali(ali, neighbours):
     """
     Compute local distance histograms.
@@ -292,7 +432,7 @@ def dists_to_ali(ali, neighbours):
         return chrom2 == chrom1 and strand2 != strand1
 
     msg = "Some neighbouring reads are not compatible with " \
-            + f"{chrom1}, {pos1}, {strand1}!"
+        + f"{chrom1}, {pos1}, {strand1}!"
     assert all(compatible(ali2) for ali2 in neighbours), msg
     if strand1 == "+":
         return Counter(pos2 - pos1 for (_, pos2, _) in neighbours)
@@ -300,7 +440,6 @@ def dists_to_ali(ali, neighbours):
         return Counter(pos1 - pos2 for (_, pos2, _) in neighbours)
 
 
-# TODO: implement filtering from a bed or gtf file (for instance, to indicate the coordinates of genes or gens portions of interest)
 def main():
     """Main function of the program."""
     parser = argparse.ArgumentParser(
@@ -354,20 +493,13 @@ def main():
     # info = logging.info
     # debug = logging.debug
     # warning = logging.warning
-    # TODO: implement usage of --gene_list using libhts.id_list_gtf2bed
     if args.gene_list:
         if not args.annotations:
-            warnings.warn("option --gene_list requires option --annotations.\n")
+            warnings.warn(
+                "option --gene_list requires option --annotations.\n")
             return 1
-        with open(args.gene_list, "r") as gene_list_file:
-            gene_identifiers = {line.strip() for line in gene_list_file}
-        intervals = id_list_gtf2bed(
-            gene_identifiers,
-            args.annotations, id_kwd=args.id_kwd).sort(
-                stream=True).merge(stream=True)
-        # for ivl in intervals:
-        #     print(ivl)
-        # return 0
+        intervals = get_gene_intervals(
+            args.gene_list, args.annotations, args.id_kwd)
     else:
         intervals = None
     # TODO: Ensure that all degradome reads are mapped from their 5' end
-- 
GitLab