From a0966db5352b2fd06f4772272cfc0d63fe5b8848 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Wed, 14 Apr 2021 09:26:07 +0200
Subject: [PATCH] Fix dependencies, distance can use bed.
---
libbamutils/libbamutils.py | 74 +++++++++++++++++++++-----------------
pyproject.toml | 2 +-
setup.py | 4 +--
3 files changed, 45 insertions(+), 35 deletions(-)
diff --git a/libbamutils/libbamutils.py b/libbamutils/libbamutils.py
index 4094875..0b668f7 100644
--- a/libbamutils/libbamutils.py
+++ b/libbamutils/libbamutils.py
@@ -20,11 +20,13 @@ from collections import Counter, deque
from itertools import dropwhile, takewhile
from operator import attrgetter
from cytoolz import merge_with
-from pysam import AlignmentFile
# To get coordinates of genes
from libhts import id_list_gtf2bed
# To collapse aligned reads to their 5' ends
from bam25prime import collapse_and_sort, filter_feature_size
+# bam25prime re-exports those classes from pysam and pybedtools
+# to enable simpler dependency management
+from bam25prime import AlignmentFile, BedTool
from libreads import bam2fastq
@@ -372,6 +374,14 @@ class BedFile(GenposFile):
def __init__(self, bedpath):
self.bed = BedTool(bedpath)
+ # To be usable with context manager "with" syntax
+ # for compatibility with BamFile (not sure this really makes sense)
+ def __enter__(self):
+ return self
+
+ def __exit__(self, exception_type, exception_value, traceback):
+ pass
+
def to_5prime(self, intervals=None, min_len=None, max_len=None):
"""
Iterate over the *self.bed* and yield only those features
@@ -572,25 +582,25 @@ class BamFile(GenposFile):
# TODO: allow replacing bam files with bed files,
# using pybedtools.BedTool and collapse_and_sort_bedtool
def read_distances(
- bampath_1, bampath_2,
+ genposf_1, genposf_2,
intervals=None,
min_len_1=None, max_len_1=None,
min_len_2=None, max_len_2=None,
max_dists=None, antisense=True):
"""
- Compute the distribution of distances of reads in Bam formatted file
- *bampath_2* with respect to reads in Bam formatted file *bampath_1*.
+ Compute the distribution of distances of genomic positions in GenposFile
+ object *genposf_2* with respect to those in other GenposFile *genposf_1*.
- If *intervals* is provided, only reads from file *bampath_1* belonging to
+ If *intervals* is provided, only positions from file *genposf_1* belonging to
these genomic intervals will be considered.
- If *min_len_1* and / or *max_len_1* are provided, reads from file
- *bampath_1* that fall outside those boundaries will not be considered.
+ If *min_len_1* and / or *max_len_1* are provided, positions from
+ *genposf_1* that fall outside those boundaries will not be considered.
- If *min_len_2* and / or *max_len_2* are provided, reads from file
+ If *min_len_2* and / or *max_len_2* are provided, positions from
*bampath_2* that fall outside those boundaries will not be considered.
- The distances are those between 5' ends of the reads.
+ The distances are those between 5' ends of the positions.
If *max_dists* is provided, it should be a dictionary
of pairs (dist_before, dist_after) keyed by strand
@@ -598,31 +608,29 @@ def read_distances(
By default, it is the following:
max_dists = {"+": (120, 120), "-": (120, 120)}
- This means that for a given read from *bampath_1*,
- reads from *bampath_2* whose 5' end falls outside a [-120, 120] range
- around the 5' end of the read of interest will be ignored.
+ This means that for a given position from *genposf_1*,
+ positions from *genposf_2* whose 5' end falls outside a [-120, 120] range
+ around the 5' end of the position of interest will be ignored.
- By default, distances will only take into account alignments
+ By default, distances will only take into account positions
on opposite strands.
If *antisense* is set to False, then distances will only consider
- alignments on the same strand.
+ positions on the same strand.
"""
if max_dists is None:
max_dists = {"+": (120, 120), "-": (120, 120)}
- with BamFile(bampath_1) as refbam, \
- BamFile(bampath_2) as smallbam:
- # combine the counters
- return merge_with(
- sum,
- # count distances to neighbouring antisense small reads
- # for each refbam read
- (dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis(
- smallbam.to_5prime(
- min_len=min_len_2, max_len=max_len_2),
- refbam.to_5prime(
- intervals=intervals,
- min_len=min_len_1, max_len=max_len_1),
- max_dists=max_dists, antisense=antisense)))
+ # combine the counters
+ return merge_with(
+ sum,
+ # count distances to neighbouring genposf_2 positions
+ # for each genposf_1 positions
+ (dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis(
+ genposf_2.to_5prime(
+ min_len=min_len_2, max_len=max_len_2),
+ genposf_1.to_5prime(
+ intervals=intervals,
+ min_len=min_len_1, max_len=max_len_1),
+ max_dists=max_dists, antisense=antisense)))
# TODO: make a separate script based on this
@@ -654,7 +662,9 @@ def compute_read_distances(
gene_list, annotations, id_kwd)
else:
intervals = None
- return read_distances(
- bampath_1, bampath_2,
- intervals=intervals,
- ansisense=antisense)
+ with BamFile(bampath_1) as refbam, \
+ BamFile(bampath_2) as smallbam:
+ return read_distances(
+ refbam, smallbam,
+ intervals=intervals,
+ antisense=antisense)
diff --git a/pyproject.toml b/pyproject.toml
index 7cb5939..409d80d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
[build-system]
requires = [
"setuptools",
+ "bam25prime @ git+https://gitlab.pasteur.fr/bli/bam25prime.git",
"libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git",
- "pysam"
]
build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
index 15629d9..b03977a 100644
--- a/setup.py
+++ b/setup.py
@@ -19,8 +19,8 @@ setup(
license="MIT",
packages=find_packages(),
install_requires=[
- "libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git",
- "pysam"])
+ "bam25prime @ git+https://gitlab.pasteur.fr/bli/bam25prime.git"])
+ "libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git"])
#ext_modules = cythonize("libsmallrna/libsmallrna.pyx"),
#install_requires=["cytoolz"],
#zip_safe=False
--
GitLab