Skip to content
Snippets Groups Projects
Commit a0966db5 authored by Blaise Li's avatar Blaise Li
Browse files

Fix dependencies, distance can use bed.

parent 4be6fea6
No related branches found
No related tags found
No related merge requests found
...@@ -20,11 +20,13 @@ from collections import Counter, deque ...@@ -20,11 +20,13 @@ from collections import Counter, deque
from itertools import dropwhile, takewhile from itertools import dropwhile, takewhile
from operator import attrgetter from operator import attrgetter
from cytoolz import merge_with from cytoolz import merge_with
from pysam import AlignmentFile
# To get coordinates of genes # To get coordinates of genes
from libhts import id_list_gtf2bed from libhts import id_list_gtf2bed
# To collapse aligned reads to their 5' ends # To collapse aligned reads to their 5' ends
from bam25prime import collapse_and_sort, filter_feature_size from bam25prime import collapse_and_sort, filter_feature_size
# bam25prime re-exports those classes from pysam and pybedtools
# to enable simpler dependency management
from bam25prime import AlignmentFile, BedTool
from libreads import bam2fastq from libreads import bam2fastq
...@@ -372,6 +374,14 @@ class BedFile(GenposFile): ...@@ -372,6 +374,14 @@ class BedFile(GenposFile):
def __init__(self, bedpath): def __init__(self, bedpath):
self.bed = BedTool(bedpath) self.bed = BedTool(bedpath)
# To be usable with context manager "with" syntax
# for compatibility with BamFile (not sure this really makes sense)
def __enter__(self):
return self
def __exit__(self, exception_type, exception_value, traceback):
pass
def to_5prime(self, intervals=None, min_len=None, max_len=None): def to_5prime(self, intervals=None, min_len=None, max_len=None):
""" """
Iterate over the *self.bed* and yield only those features Iterate over the *self.bed* and yield only those features
...@@ -572,25 +582,25 @@ class BamFile(GenposFile): ...@@ -572,25 +582,25 @@ class BamFile(GenposFile):
# TODO: allow replacing bam files with bed files, # TODO: allow replacing bam files with bed files,
# using pybedtools.BedTool and collapse_and_sort_bedtool # using pybedtools.BedTool and collapse_and_sort_bedtool
def read_distances( def read_distances(
bampath_1, bampath_2, genposf_1, genposf_2,
intervals=None, intervals=None,
min_len_1=None, max_len_1=None, min_len_1=None, max_len_1=None,
min_len_2=None, max_len_2=None, min_len_2=None, max_len_2=None,
max_dists=None, antisense=True): max_dists=None, antisense=True):
""" """
Compute the distribution of distances of reads in Bam formatted file Compute the distribution of distances of genomic positions in GenposFile
*bampath_2* with respect to reads in Bam formatted file *bampath_1*. object *genposf_2* with respect to those in other GenposFile *genposf_1*.
If *intervals* is provided, only reads from file *bampath_1* belonging to If *intervals* is provided, only positions from file *genposf_1* belonging to
these genomic intervals will be considered. these genomic intervals will be considered.
If *min_len_1* and / or *max_len_1* are provided, reads from file If *min_len_1* and / or *max_len_1* are provided, positions from
*bampath_1* that fall outside those boundaries will not be considered. *genposf_1* that fall outside those boundaries will not be considered.
If *min_len_2* and / or *max_len_2* are provided, reads from file If *min_len_2* and / or *max_len_2* are provided, positions from
*bampath_2* that fall outside those boundaries will not be considered. *bampath_2* that fall outside those boundaries will not be considered.
The distances are those between 5' ends of the reads. The distances are those between 5' ends of the positions.
If *max_dists* is provided, it should be a dictionary If *max_dists* is provided, it should be a dictionary
of pairs (dist_before, dist_after) keyed by strand of pairs (dist_before, dist_after) keyed by strand
...@@ -598,28 +608,26 @@ def read_distances( ...@@ -598,28 +608,26 @@ def read_distances(
By default, it is the following: By default, it is the following:
max_dists = {"+": (120, 120), "-": (120, 120)} max_dists = {"+": (120, 120), "-": (120, 120)}
This means that for a given read from *bampath_1*, This means that for a given position from *genposf_1*,
reads from *bampath_2* whose 5' end falls outside a [-120, 120] range positions from *genposf_2* whose 5' end falls outside a [-120, 120] range
around the 5' end of the read of interest will be ignored. around the 5' end of the position of interest will be ignored.
By default, distances will only take into account alignments By default, distances will only take into account positions
on opposite strands. on opposite strands.
If *antisense* is set to False, then distances will only consider If *antisense* is set to False, then distances will only consider
alignments on the same strand. positions on the same strand.
""" """
if max_dists is None: if max_dists is None:
max_dists = {"+": (120, 120), "-": (120, 120)} max_dists = {"+": (120, 120), "-": (120, 120)}
with BamFile(bampath_1) as refbam, \
BamFile(bampath_2) as smallbam:
# combine the counters # combine the counters
return merge_with( return merge_with(
sum, sum,
# count distances to neighbouring antisense small reads # count distances to neighbouring genposf_2 positions
# for each refbam read # for each genposf_1 positions
(dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis( (dists_to_ali(ali, neighbours) for (ali, neighbours) in zip_alis(
smallbam.to_5prime( genposf_2.to_5prime(
min_len=min_len_2, max_len=max_len_2), min_len=min_len_2, max_len=max_len_2),
refbam.to_5prime( genposf_1.to_5prime(
intervals=intervals, intervals=intervals,
min_len=min_len_1, max_len=max_len_1), min_len=min_len_1, max_len=max_len_1),
max_dists=max_dists, antisense=antisense))) max_dists=max_dists, antisense=antisense)))
...@@ -654,7 +662,9 @@ def compute_read_distances( ...@@ -654,7 +662,9 @@ def compute_read_distances(
gene_list, annotations, id_kwd) gene_list, annotations, id_kwd)
else: else:
intervals = None intervals = None
with BamFile(bampath_1) as refbam, \
BamFile(bampath_2) as smallbam:
return read_distances( return read_distances(
bampath_1, bampath_2, refbam, smallbam,
intervals=intervals, intervals=intervals,
ansisense=antisense) antisense=antisense)
[build-system] [build-system]
requires = [ requires = [
"setuptools", "setuptools",
"bam25prime @ git+https://gitlab.pasteur.fr/bli/bam25prime.git",
"libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git", "libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git",
"pysam"
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
...@@ -19,8 +19,8 @@ setup( ...@@ -19,8 +19,8 @@ setup(
license="MIT", license="MIT",
packages=find_packages(), packages=find_packages(),
install_requires=[ install_requires=[
"libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git", "bam25prime @ git+https://gitlab.pasteur.fr/bli/bam25prime.git"])
"pysam"]) "libreads @ git+https://gitlab.pasteur.fr/bli/libreads.git"])
#ext_modules = cythonize("libsmallrna/libsmallrna.pyx"), #ext_modules = cythonize("libsmallrna/libsmallrna.pyx"),
#install_requires=["cytoolz"], #install_requires=["cytoolz"],
#zip_safe=False #zip_safe=False
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment