diff --git a/libbamutils/libbamutils.py b/libbamutils/libbamutils.py index 409487524453d6158fefe1154b2fdf965a81de0a..0b668f7dd3750828b6efc7fe0d197b598ddb9192 100644 --- a/libbamutils/libbamutils.py +++ b/libbamutils/libbamutils.py @@ -20,11 +20,13 @@ from collections import Counter, deque from itertools import dropwhile, takewhile from operator import attrgetter from cytoolz import merge_with -from pysam import AlignmentFile # To get coordinates of genes from libhts import id_list_gtf2bed # To collapse aligned reads to their 5' ends from bam25prime import collapse_and_sort, filter_feature_size +# bam25prime re-exports those classes from pysam and pybedtools +# to enable simpler dependency management +from bam25prime import AlignmentFile, BedTool from libreads import bam2fastq @@ -372,6 +374,14 @@ class BedFile(GenposFile): def __init__(self, bedpath): self.bed = BedTool(bedpath) + # To be usable with context manager "with" syntax + # for compatibility with BamFile (not sure this really makes sense) + def __enter__(self): + return self + + def __exit__(self, exception_type, exception_value, traceback): + pass + def to_5prime(self, intervals=None, min_len=None, max_len=None): """ Iterate over the *self.bed* and yield only those features @@ -572,25 +582,25 @@ class BamFile(GenposFile): # TODO: allow replacing bam files with bed files, # using pybedtools.BedTool and collapse_and_sort_bedtool def read_distances( - bampath_1, bampath_2, + genposf_1, genposf_2, intervals=None, min_len_1=None, max_len_1=None, min_len_2=None, max_len_2=None, max_dists=None, antisense=True): """ - Compute the distribution of distances of reads in Bam formatted file - *bampath_2* with respect to reads in Bam formatted file *bampath_1*. + Compute the distribution of distances of genomic positions in GenposFile + object *genposf_2* with respect to those in other GenposFile *genposf_1*. - If *intervals* is provided, only reads from file *bampath_1* belonging to + If *intervals* is provided, only positions from file *genposf_1* belonging to these genomic intervals will be considered. - If *min_len_1* and / or *max_len_1* are provided, reads from file - *bampath_1* that fall outside those boundaries will not be considered. + If *min_len_1* and / or *max_len_1* are provided, positions from + *genposf_1* that fall outside those boundaries will not be considered. - If *min_len_2* and / or *max_len_2* are provided, reads from file + If *min_len_2* and / or *max_len_2* are provided, positions from *bampath_2* that fall outside those boundaries will not be considered. - The distances are those between 5' ends of the reads. + The distances are those between 5' ends of the positions. If *max_dists* is provided, it should be a dictionary of pairs (dist_before, dist_after) keyed by strand @@ -598,31 +608,29 @@ def read_distances( By default, it is the following: max_dists = {"+": (120, 120), "-": (120, 120)} - This means that for a given read from *bampath_1*, - reads from *bampath_2* whose 5' end falls outside a [-120, 120] range - around the 5' end of the read of interest will be ignored. + This means that for a given position from *genposf_1*, + positions from *genposf_2* whose 5' end falls outside a [-120, 120] range + around the 5' end of the position of interest will be ignored. - By default, distances will only take into account alignments + By default, distances will only take into account positions on opposite strands. If *antisense* is set to False, then distances will only consider - alignments on the same strand. + positions on the same strand. """ if max_dists is None: max_dists = {"+": (120, 120), "-": (120, 120)} - with BamFile(bampath_1) as refbam, \ - BamFile(bampath_2) as smallbam: - # combine the counters - return merge_with( - sum, - # count distances to neighbouring antisense small reads - # for each refbam read - (dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis( - smallbam.to_5prime( - min_len=min_len_2, max_len=max_len_2), - refbam.to_5prime( - intervals=intervals, - min_len=min_len_1, max_len=max_len_1), - max_dists=max_dists, antisense=antisense))) + # combine the counters + return merge_with( + sum, + # count distances to neighbouring genposf_2 positions + # for each genposf_1 positions + (dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis( + genposf_2.to_5prime( + min_len=min_len_2, max_len=max_len_2), + genposf_1.to_5prime( + intervals=intervals, + min_len=min_len_1, max_len=max_len_1), + max_dists=max_dists, antisense=antisense))) # TODO: make a separate script based on this @@ -654,7 +662,9 @@ def compute_read_distances( gene_list, annotations, id_kwd) else: intervals = None - return read_distances( - bampath_1, bampath_2, - intervals=intervals, - ansisense=antisense) + with BamFile(bampath_1) as refbam, \ + BamFile(bampath_2) as smallbam: + return read_distances( + refbam, smallbam, + intervals=intervals, + antisense=antisense) diff --git a/pyproject.toml b/pyproject.toml index 7cb5939bbbfd36c3fe3f02d515941aea7324288e..409d80d9a277b6727f5fd91ab829729c93808c45 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [build-system] requires = [ "setuptools", + "bam25prime @ git+https://gitlab.pasteur.fr/bli/bam25prime.git", "libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git", - "pysam" ] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 15629d9679e4d8d4eb4242b6f7aedfc923183562..b03977a2d4a3470bfcbd47a6413cee249190677d 100644 --- a/setup.py +++ b/setup.py @@ -19,8 +19,8 @@ setup( license="MIT", packages=find_packages(), install_requires=[ - "libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git", - "pysam"]) + "bam25prime @ git+https://gitlab.pasteur.fr/bli/bam25prime.git"]) + "libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git"]) #ext_modules = cythonize("libsmallrna/libsmallrna.pyx"), #install_requires=["cytoolz"], #zip_safe=False