diff --git a/Degradome-seq/degradome_metaprofile.py b/Degradome-seq/degradome_metaprofile.py index 07ddda7e2704340afaaf4282b9f313f098de6795..73ce101bffb2d61895c338bbb3abe7d4806496a8 100755 --- a/Degradome-seq/degradome_metaprofile.py +++ b/Degradome-seq/degradome_metaprofile.py @@ -52,7 +52,7 @@ import warnings import os from sys import stderr from collections import Counter, deque -from itertools import takewhile +from itertools import dropwhile, takewhile from operator import attrgetter from cytoolz import merge_with # from pybedtools import Attributes, BedTool, Interval @@ -75,6 +75,48 @@ ANTISTRAND = {"+": "-", "-": "+"} OPJ = os.path.join +def get_gene_intervals(gene_list_filename, annot_filename, id_kwd=None): + """ + Obtain the bed coordinates of the genomic regions covered by + the genes listed in *gene_list_filename*, as annotated in + *annot_filename*. + """ + if id_kwd is None: + id_kwd = "gene_id" + with open(gene_list_filename, "r") as gene_list_file: + gene_identifiers = {line.strip() for line in gene_list_file} + # Deadlock when chaining stream=True? + # https://daler.github.io/pybedtools/topical-iterators.html + # intervals = id_list_gtf2bed( + # gene_identifiers, + # args.annotations, id_kwd=args.id_kwd).sort( + # stream=True).merge(stream=True) + return id_list_gtf2bed( + gene_identifiers, annot_filename, + id_kwd=id_kwd).sort().merge(stream=True) + # for ivl in intervals: + # print(ivl) + # return 0 + + +# def range_tester(pos, upstream, downstream): +# """ +# Create a function that tests whether a (chrom, pos, strand) genomic point +# belongs to a certain range from *downstream* and up to *upstream* of +# position *pos*. The chromosome and strand are actually ignored, so the +# test is assumed to happen within a given chromosome. +# """ +# def in_range(neighbour): +# """Recognize a legitimate (chrom, pos, strand) *neighbour*. +# Only the five-prime positions are taken into account. +# Chromosome is assumed to be the same, strand is ignored. +# """ +# return (pos - upstream +# <= neighbour[1] +# <= pos + downstream) +# return in_range + + def filter_bam(bamfile, intervals=None): """ Iterates over the *bamfile* and yields only those aligned reads @@ -107,6 +149,20 @@ def filter_bam(bamfile, intervals=None): # # is the same as the aligned segments. # # Chromosome order should be the same, for instance. for interval in intervals.sort(): + #print(interval) + # I 2880868 2883454 + # [...] + # 2883513 + # 2883514 + # I 2883495 2893718 + # 2883513 + # + # I 9488487 9492623 + # [...] + # 9492587 + # 9492616 + # I 9492629 9494589 + # 9492587 # for ali in bamfile.fetch(interval.chrom, # interval.start, # interval.end): @@ -115,14 +171,25 @@ def filter_bam(bamfile, intervals=None): interval.chrom, interval.start, interval.end)): + # AlignmentFile.fetch may yield alignments + # whose start is upstream interval.start + # or + # whose end is beyond interval.end + # (overlap is sufficient) + if ali.start < interval.start: + continue + if ali.end > interval.end: + break yield UNPACK(ali) - +## debug +#from collections import defaultdict +## # TODO: allow different upstream and downstream maximum distances # For CSR1-loaded siRNAs, we expect the cut to happen somewhere # within the length of the siRNA, so it might be better to look # from -15 to + 25, for instance -def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): +def zip_alis(alis1, alis2, max_dists=None): """ Iterates over two iterators of aligned segments *alis1* and *alis2* so that the iteration over *alis2* is driven by that over *alis1* @@ -136,6 +203,8 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): of length 1 so that only their five-prime ends are taken into account to determine the distance separating them. """ + if max_dists is None: + max_dists = {"+": (15, 25), "-": (25, 15)} # Will be updated to contain the alignments from alis2 # that surround the current alignment from alis1, # and are on the opposite strand @@ -144,7 +213,19 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): # (An iterator can be obtained by calling iter on an iterable.) # In case of StopIteration before even iterating alis1, # we let it propagate (exit zip_alis) + ##debug + #five2s = {"+": defaultdict(list), "-": defaultdict(list)} + #def record_five2(chrom_info, pos_info, strand_info): + # print(pos_info) + # five_list = five2s[strand_info] + # if five_list[chrom_info]: + # assert five_list[chrom_info][-1] <= pos_info + # five_list[chrom_info].append(pos_info) + ## (ref2, five2, _, strand2) = next(alis2) + ##debug + #record_five2(ref2, five2, strand2) + ## # Is the alis2 iterator exhausted? exhausted = False for (ref1, five1, _, strand1) in alis1: @@ -152,6 +233,9 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): while ref2 < ref1: try: (ref2, five2, _, strand2) = next(alis2) + ##debug + #record_five2(ref2, five2, strand2) + ## except StopIteration: # We need to catch this to avoid exiting zip_alis too soon. exhausted = True @@ -185,6 +269,9 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): while ref2 == ref1 and five2 < five1 - before_dist: try: (ref2, five2, _, strand2) = next(alis2) + ##debug + #record_five2(ref2, five2, strand2) + ## except StopIteration: # We need to catch this to avoid exiting zip_alis too soon. exhausted = True @@ -220,6 +307,9 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): antisense_surroundings[strand2].append((ref2, five2, strand2)) try: (ref2, five2, _, strand2) = next(alis2) + ##debug + #record_five2(ref2, five2, strand2) + ## except StopIteration: # We need to catch this to avoid exiting zip_alis too soon. exhausted = True @@ -237,10 +327,64 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): # surroundings has been emptied pass if surroundings: - def in_range(neighbour): - """Recognize aligned segments not too far downstream.""" + # assert sorted(surroundings) == list(surroundings), ", ".join(map(str, surroundings)) + + def not_downstream(neighbour): + """Tells if *neighbour* is not downstream.""" return neighbour[1] <= five1 + after_dist - real_surroundings = list(takewhile(in_range, surroundings)) + + def upstream(neighbour): + """Tells if *neighbour* is upstream.""" + return neighbour[1] < five1 - before_dist + real_surroundings = list(takewhile( + not_downstream, + dropwhile(upstream, surroundings))) + # real_surroundings = list(takewhile( + # range_tester(five1, before_dist, after_dist), + # surroundings)) + # in_range = range_tester(five1, before_dist, after_dist) + # real_surroundings_2 = [ + # neighbour for neighbour in surroundings + # if in_range(neighbour)] + # real_surroundings_3 = [ + # (ref, five, strand) + # for (ref, five, strand) in surroundings + # if five1 - before_dist <= five <= five1 + after_dist] + # for (n1, n2, n3) in zip( + # real_surroundings, + # real_surroundings_2, + # real_surroundings_3): + # if n1[1] != n2[1]: + # print("takewhile-dropwhile not like range_tester: " + # f"{n1[1]} != {n2[1]}") + # msg = "\n".join([ + # "surroundings", + # ", ".join(map(str, surroundings)), + # ", ".join([ + # f"five1: {five1}", + # f"before: {before_dist}", + # f"after: {after_dist}"]), + # "takewhile-dropwhile:", + # ", ".join(map(str, real_surroundings)), + # "list-comprehension:", + # ", ".join(map(str, real_surroundings_3))]) + # elif n2[1] != n3[1]: + # print("range_tester not like list-comprehension: " + # f"{n2[1]} != {n3[1]}") + # msg = "\n".join([ + # "surroundings", + # ", ".join(map(str, surroundings)), + # ", ".join([ + # f"five1: {five1}", + # f"before: {before_dist}", + # f"after: {after_dist}"]), + # "range_tester:", + # ", ".join(map(str, real_surroundings_2)), + # "list-comprehension:", + # ", ".join(map(str, real_surroundings_3))]) + # else: + # msg = None + # assert n1[1] == n2[1] == n3[1], msg else: real_surroundings = [] if real_surroundings: @@ -249,25 +393,21 @@ def zip_alis(alis1, alis2, max_dists={"+": (15, 25), "-": (25, 15)}): print(f"wrong ref: {ref} != {ref1}", file=stderr) if five < five1 - before_dist: print( - f"too far upstream {five} ({five-five1} < {-before_dist})", + "too far upstream:" + f"{five} ({five-five1} < {-before_dist})", file=stderr) if five > five1 + after_dist: print( - f"too far downstream: {five} ({five-five1} > {after_dist})", + "too far downstream:", + f"{five} ({five-five1} > {after_dist})", file=stderr) assert ANTISTRAND[strand] == strand1 - #assert all([ - # ref == ref1 - # and (five1 - before_dist <= five <= five1 + after_dist) - # for (ref, five, _) in surroundings]) - # assert surroundings[0][0] == surroundings[-1][0] == ref1 - # assert surroundings[0][1] >= five1 - before_dist - # assert surroundings[-1][1] <= five1 + after_dist yield ((ref1, five1, strand1), real_surroundings) if exhausted: raise StopIteration +# TODO: implement better measures (z-scores ?) def dists_to_ali(ali, neighbours): """ Compute local distance histograms. @@ -292,7 +432,7 @@ def dists_to_ali(ali, neighbours): return chrom2 == chrom1 and strand2 != strand1 msg = "Some neighbouring reads are not compatible with " \ - + f"{chrom1}, {pos1}, {strand1}!" + + f"{chrom1}, {pos1}, {strand1}!" assert all(compatible(ali2) for ali2 in neighbours), msg if strand1 == "+": return Counter(pos2 - pos1 for (_, pos2, _) in neighbours) @@ -300,7 +440,6 @@ def dists_to_ali(ali, neighbours): return Counter(pos1 - pos2 for (_, pos2, _) in neighbours) -# TODO: implement filtering from a bed or gtf file (for instance, to indicate the coordinates of genes or gens portions of interest) def main(): """Main function of the program.""" parser = argparse.ArgumentParser( @@ -354,20 +493,13 @@ def main(): # info = logging.info # debug = logging.debug # warning = logging.warning - # TODO: implement usage of --gene_list using libhts.id_list_gtf2bed if args.gene_list: if not args.annotations: - warnings.warn("option --gene_list requires option --annotations.\n") + warnings.warn( + "option --gene_list requires option --annotations.\n") return 1 - with open(args.gene_list, "r") as gene_list_file: - gene_identifiers = {line.strip() for line in gene_list_file} - intervals = id_list_gtf2bed( - gene_identifiers, - args.annotations, id_kwd=args.id_kwd).sort( - stream=True).merge(stream=True) - # for ivl in intervals: - # print(ivl) - # return 0 + intervals = get_gene_intervals( + args.gene_list, args.annotations, args.id_kwd) else: intervals = None # TODO: Ensure that all degradome reads are mapped from their 5' end