Commit 7c79f77a authored by Blaise Li's avatar Blaise Li

Add script to copy and paste bigwig regions.

parent cde7fc33
*.bw filter=lfs diff=lfs merge=lfs -text
__copyright__ = "Copyright (C) 2020 Blaise Li"
__licence__ = "GNU GPLv3"
__version__ = 0.2
__version__ = 0.3
from .libhts import (
aligner2min_mapq,
gtf_2_genes_exon_lengths,
......@@ -8,6 +8,7 @@ from .libhts import (
make_empty_bigwig,
make_seeding_function,
median_ratio_to_pseudo_ref_size_factors,
paste_bigwig_region,
plot_boxplots, plot_counts_distribution, plot_histo,
plot_lfc_distribution, plot_MA,
plot_norm_correlations, plot_paired_scatters, plot_scatter,
......
......@@ -18,6 +18,8 @@ from functools import reduce
import warnings
import numpy as np
import pandas as pd
# To compute bins in bigwig data
from scipy.stats import binned_statistic
# To compute correlation coefficient, and compute linear regression
from scipy.stats.stats import pearsonr, linregress
# To compute geometric mean
......@@ -249,6 +251,69 @@ def make_empty_bigwig(filename, chrom_sizes):
bw_out.close()
# Possible improvements: use an itearble of (from_region, to_region) pairs,
# or two zippable iterables
def paste_bigwig_region(
from_fname, to_fname, from_region, to_region,
dest_fname, nanmean=False):
"""
Take values from a region *from_region* in a bigwig file *from_fname* and
paste them into a region *to_region* in another bigwig file *to_fname*, and
write the results in a third bigwig file *dest_fname*.
A region should be specified as a (chrom, start, stop) triplet, where start
is zero-based and stop is 1-based (like BED coordinates or Python slices).
*dest_fname* will have the same chromosomes as *to_fname*.
The values in *to_region* are substituted.
Current limitation: The regions should have the same length.
Option *nanmean* is likely not working correctly.
"""
(from_chrom, from_start, from_stop) = from_region
from_len = from_stop - from_start
(to_chrom, to_start, to_stop) = to_region
assert to_stop - to_start == from_len, (
"Regions should have the same lengths.")
from_bw = pyBigWig.open(from_fname)
to_bw = pyBigWig.open(to_fname)
chrom_sizes = list(to_bw.chroms().items())
dest_bw = pyBigWig.open(dest_fname, "w")
dest_bw.addHeader(chrom_sizes)
for (chrom, chrom_len) in chrom_sizes:
nb_bins = ceil(chrom_len / 10)
# values = to_bw.values(chrom, 0, chrom_len)
# Original values, plus some nans for binned_statistics to properly set the bin boundaries
values = np.pad(
to_bw.values(chrom, 0, chrom_len),
# pad zero on the left, and what is needed to complete the bin on the right
(0, 10 * nb_bins - chrom_len),
constant_values=np.nan)
if chrom == to_chrom:
# Replace the values in the desired region
values[to_start:to_stop] = from_bw.values(
from_chrom, from_start, from_stop)
if nanmean:
bin_means = binned_statistic(
range(0, 10 * nb_bins), values,
statistic=np.nanmean, bins=nb_bins).statistic
else:
bin_means = binned_statistic(
# range(0, chrom_len), np.nan_to_num(values),
range(0, 10 * nb_bins), values,
statistic="mean", bins=nb_bins).statistic
dest_bw.addEntries(
chrom, 0,
# Mean for each bin of size 10
values=bin_means,
# values=np.nan_to_num(np.zeros(chrom_len)[0::10]),
span=10, step=10)
dest_bw.close()
to_bw.close()
from_bw.close()
#################
# Bowtie2 stuff #
#################
......
import os
import shutil
import tempfile
import unittest
import libhts
import pyBigWig
import numpy as np
class TestPasteBigWig(unittest.TestCase):
def setUp(self):
self.bw_in_path = "data/klp7co_RPF.bw"
self.from_region = ("klp7co", 0, 2459)
self.to_region = ("III", (10808346-20), (10808346-20)+2459)
self.tmpdir = tempfile.mkdtemp()
self.bw_out_path = os.path.join(
self.tmpdir, "pasted.bw")
libhts.paste_bigwig_region(
self.bw_in_path, self.bw_in_path,
self.from_region, self.to_region,
self.bw_out_path)
def tearDown(self):
shutil.rmtree(self.tmpdir)
def test_preserved_region_num_bins(self):
bw_in = pyBigWig.open(self.bw_in_path)
bw_out = pyBigWig.open(self.bw_out_path)
bins_in = {
(start, stop): val
for (start, stop, val)
in bw_in.intervals(*self.from_region)
if (not np.isnan(val))}
# if (not np.isnan(val)) and val > 0}
# contains extra (start, stop, nan),
# that's where there was no bin
bins_out = {
(start, stop): val
for (start, stop, val)
in bw_out.intervals(*self.from_region)
if (not np.isnan(val))}
self.assertTrue(len(bins_in) == len(bins_out))
def test_preserved_region_bin_contents(self):
bw_in = pyBigWig.open(self.bw_in_path)
bw_out = pyBigWig.open(self.bw_out_path)
bins_in = [
(start, stop, val)
for (start, stop, val)
in bw_in.intervals(*self.from_region)
if (not np.isnan(val))]
bins_out = [
(start, stop, val)
for (start, stop, val)
in bw_out.intervals(*self.from_region)
if (not np.isnan(val))]
for (i, (bin_in, bin_out)) in enumerate(zip(bins_in, bins_out)):
self.assertTrue(bin_in == bin_out)
# There are differences, maybe due to phase differences with respect to bins
# def test_pasted_region_num_bins(self):
# bw_in = pyBigWig.open(self.bw_in_path)
# bw_out = pyBigWig.open(self.bw_out_path)
# bins_in = {
# (start, stop): val
# for (start, stop, val)
# in bw_in.intervals(*self.from_region)
# if (not np.isnan(val))}
# # if (not np.isnan(val)) and val > 0}
# # contains extra (start, stop, nan),
# # that's where there was no bin
# bins_out = {
# (start, stop): val
# for (start, stop, val)
# in bw_out.intervals(*self.to_region)
# if (not np.isnan(val))}
# self.assertTrue(len(bins_in) == len(bins_out))
# def test_pasted_region_bin_contents(self):
# bw_in = pyBigWig.open(self.bw_in_path)
# bw_out = pyBigWig.open(self.bw_out_path)
# bins_in = [
# (start, stop, val)
# for (start, stop, val)
# in bw_in.intervals(*self.from_region)
# if (not np.isnan(val))]
# bins_out = [
# (start, stop, val)
# for (start, stop, val)
# in bw_out.intervals(*self.to_region)
# if (not np.isnan(val))]
# for (i, (bin_in, bin_out)) in enumerate(zip(bins_in, bins_out)):
# self.assertTrue(bin_in == bin_out)
if __name__ == '__main__':
unittest.main()
#!/usr/bin/env python3
# Copyright (C) 2020 Blaise Li
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
"""Copy bigwig data from a region in a bigwig file into another region.
The other region can be in the same or in another bigwig file.
The result will be written in another bigwig file.
"""
from argparse import (
ArgumentParser,
ArgumentDefaultsHelpFormatter, RawTextHelpFormatter)
import sys
from operator import itemgetter
from yaml import safe_load as yload
from libhts import paste_bigwig_region
CONFIG_EXAMPLE = """
# bigwig file containing the region to copy
from_bw: "data/klp7co_RPF.bw"
# bigwig file containing the region to replace
to_bw: "data/klp7co_RPF.bw"
# bigwig file in which to write the result
dest_bw: "/tmp/pasted.bw"
# region from which to copy (BED-like coordinates)
from_region:
chrom: "klp7co"
# zero-based coordinate
start: 0
# 1-based coordinate
end: 2459
# region in which to copy (BED-like coordinates)
to_region:
chrom: "III"
# zero-based coordinate
start: 10808326
# 1-based coordinate
end: 10810785
"""
class Formatter(ArgumentDefaultsHelpFormatter, RawTextHelpFormatter):
"""
Hybrid help formatter.
See comments of <https://stackoverflow.com/a/3853776/1878788>
"""
def main():
"""Run the command-line script."""
# parser = ArgumentParser(
# description=__doc__,
# formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser = ArgumentParser(
description=f"{__doc__}\n\nExample configuration:\n{CONFIG_EXAMPLE}",
formatter_class=Formatter)
parser.add_argument(
"-c", "--configfile",
help="""YAML-formatted file containing information about:
- input bigwig files
- output bigwig file
- region from which to copy
- region where to paste
""")
parser.add_argument(
"-e", "--example_config",
action="store_true",
help="display an example configuration file")
args = parser.parse_args()
if args.example_config:
print(CONFIG_EXAMPLE)
return 0
if not args.configfile:
print("You must provide a configuration file using option -c. "
"See option -e for an example.", file=sys.stderr)
return 1
with open(args.configfile) as config_fh:
config = yload(config_fh)
region_extractor = itemgetter("chrom", "start", "end")
from_region = region_extractor(config["from_region"])
to_region = region_extractor(config["to_region"])
paste_bigwig_region(
config["from_bw"], config["to_bw"],
from_region, to_region,
config["dest_bw"])
return 0
if __name__ == "__main__":
sys.exit(main())
......@@ -32,7 +32,9 @@ setup(
author_email="blaise.li@normalesup.org",
license="GNU GPLv3",
packages=find_packages(),
scripts=["scripts/bam2bigwig.sh", "scripts/sam2indexedbam.sh"],
scripts=[
"scripts/bam2bigwig.sh", "scripts/sam2indexedbam.sh",
"scripts/copypaste_bigwig_regions.py"],
install_requires=[
#"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git@744dd79b579577cb6e131653260d7990946be3ad#egg=libworkflows-0.1",
#"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git#egg=libworkflows-0.1",
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment