Add script to copy and paste bigwig regions.

7c79f77a · Blaise Li · cde7fc33 · 7c79f77a · 7c79f77a · 7c79f77a
Commit 7c79f77a authored 5 years ago by Blaise Li
--- a/.gitattributes
+++ b/.gitattributes
+*.bw filter=lfs diff=lfs merge=lfs -text
--- a/libhts/__init__.py
+++ b/libhts/__init__.py
 __copyright__ = "Copyright (C) 2020 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = 0.2
+__version__ = 0.3
 from .libhts import (
    aligner2min_mapq,
    gtf_2_genes_exon_lengths,
@@ -8,6 +8,7 @@ from .libhts import (
    make_empty_bigwig,
    make_seeding_function,
    median_ratio_to_pseudo_ref_size_factors,
+    paste_bigwig_region,
    plot_boxplots, plot_counts_distribution, plot_histo,
    plot_lfc_distribution, plot_MA,
    plot_norm_correlations, plot_paired_scatters, plot_scatter,

--- a/libhts/data/klp7co_RPF.bw
+++ b/libhts/data/klp7co_RPF.bw
--- a/libhts/libhts.py
+++ b/libhts/libhts.py
@@ -18,6 +18,8 @@ from functools import reduce
 import warnings
 import numpy as np
 import pandas as pd
+# To compute bins in bigwig data
+from scipy.stats import binned_statistic
 # To compute correlation coefficient, and compute linear regression
 from scipy.stats.stats import pearsonr, linregress
 # To compute geometric mean
@@ -249,6 +251,69 @@ def make_empty_bigwig(filename, chrom_sizes):
    bw_out.close()
+# Possible improvements: use an itearble of (from_region, to_region) pairs,
+# or two zippable iterables
+def paste_bigwig_region(
+        from_fname, to_fname, from_region, to_region,
+        dest_fname, nanmean=False):
+    """
+    Take values from a region *from_region* in a bigwig file *from_fname* and
+    paste them into a region *to_region* in another bigwig file *to_fname*, and
+    write the results in a third bigwig file *dest_fname*.
+    A region should be specified as a (chrom, start, stop) triplet, where start
+    is zero-based and stop is 1-based (like BED coordinates or Python slices).
+    *dest_fname* will have the same chromosomes as *to_fname*.
+    The values in *to_region* are substituted.
+    Current limitation: The regions should have the same length.
+    Option *nanmean* is likely not working correctly.
+    """
+    (from_chrom, from_start, from_stop) = from_region
+    from_len = from_stop - from_start
+    (to_chrom, to_start, to_stop) = to_region
+    assert to_stop - to_start == from_len, (
+        "Regions should have the same lengths.")
+    from_bw = pyBigWig.open(from_fname)
+    to_bw = pyBigWig.open(to_fname)
+    chrom_sizes = list(to_bw.chroms().items())
+    dest_bw = pyBigWig.open(dest_fname, "w")
+    dest_bw.addHeader(chrom_sizes)
+    for (chrom, chrom_len) in chrom_sizes:
+        nb_bins = ceil(chrom_len / 10)
+        # values = to_bw.values(chrom, 0, chrom_len)
+        # Original values, plus some nans for binned_statistics to properly set the bin boundaries
+        values = np.pad(
+            to_bw.values(chrom, 0, chrom_len),
+            # pad zero on the left, and what is needed to complete the bin on the right
+            (0, 10 * nb_bins - chrom_len),
+            constant_values=np.nan)
+        if chrom == to_chrom:
+            # Replace the values in the desired region
+            values[to_start:to_stop] = from_bw.values(
+                from_chrom, from_start, from_stop)
+        if nanmean:
+            bin_means = binned_statistic(
+                range(0, 10 * nb_bins), values,
+                statistic=np.nanmean, bins=nb_bins).statistic
+        else:
+            bin_means = binned_statistic(
+                # range(0, chrom_len), np.nan_to_num(values),
+                range(0, 10 * nb_bins), values,
+                statistic="mean", bins=nb_bins).statistic
+        dest_bw.addEntries(
+            chrom, 0,
+            # Mean for each bin of size 10
+            values=bin_means,
+            # values=np.nan_to_num(np.zeros(chrom_len)[0::10]),
+            span=10, step=10)
+    dest_bw.close()
+    to_bw.close()
+    from_bw.close()
 #################
 # Bowtie2 stuff #
 #################

--- a/libhts/test_libhts.py
+++ b/libhts/test_libhts.py
+import os
+import shutil
+import tempfile
+import unittest
+import libhts
+import pyBigWig
+import numpy as np
+class TestPasteBigWig(unittest.TestCase):
+    def setUp(self):
+        self.bw_in_path = "data/klp7co_RPF.bw"
+        self.from_region = ("klp7co", 0, 2459)
+        self.to_region = ("III", (10808346-20), (10808346-20)+2459)
+        self.tmpdir = tempfile.mkdtemp()
+        self.bw_out_path = os.path.join(
+            self.tmpdir, "pasted.bw")
+        libhts.paste_bigwig_region(
+            self.bw_in_path, self.bw_in_path,
+            self.from_region, self.to_region,
+            self.bw_out_path)
+    def tearDown(self):
+        shutil.rmtree(self.tmpdir)
+    def test_preserved_region_num_bins(self):
+        bw_in = pyBigWig.open(self.bw_in_path)
+        bw_out = pyBigWig.open(self.bw_out_path)
+        bins_in = {
+            (start, stop): val
+            for (start, stop, val)
+            in bw_in.intervals(*self.from_region)
+            if (not np.isnan(val))}
+            # if (not np.isnan(val)) and val > 0}
+        # contains extra (start, stop, nan),
+        # that's where there was no bin
+        bins_out = {
+            (start, stop): val
+            for (start, stop, val)
+            in bw_out.intervals(*self.from_region)
+            if (not np.isnan(val))}
+        self.assertTrue(len(bins_in) == len(bins_out))
+    def test_preserved_region_bin_contents(self):
+        bw_in = pyBigWig.open(self.bw_in_path)
+        bw_out = pyBigWig.open(self.bw_out_path)
+        bins_in = [
+            (start, stop, val)
+            for (start, stop, val)
+            in bw_in.intervals(*self.from_region)
+            if (not np.isnan(val))]
+        bins_out = [
+            (start, stop, val)
+            for (start, stop, val)
+            in bw_out.intervals(*self.from_region)
+            if (not np.isnan(val))]
+        for (i, (bin_in, bin_out)) in enumerate(zip(bins_in, bins_out)):
+            self.assertTrue(bin_in == bin_out)
+    # There are differences, maybe due to phase differences with respect to bins
+    # def test_pasted_region_num_bins(self):
+    #     bw_in = pyBigWig.open(self.bw_in_path)
+    #     bw_out = pyBigWig.open(self.bw_out_path)
+    #     bins_in = {
+    #         (start, stop): val
+    #         for (start, stop, val)
+    #         in bw_in.intervals(*self.from_region)
+    #         if (not np.isnan(val))}
+    #         # if (not np.isnan(val)) and val > 0}
+    #     # contains extra (start, stop, nan),
+    #     # that's where there was no bin
+    #     bins_out = {
+    #         (start, stop): val
+    #         for (start, stop, val)
+    #         in bw_out.intervals(*self.to_region)
+    #         if (not np.isnan(val))}
+    #     self.assertTrue(len(bins_in) == len(bins_out))
+    # def test_pasted_region_bin_contents(self):
+    #     bw_in = pyBigWig.open(self.bw_in_path)
+    #     bw_out = pyBigWig.open(self.bw_out_path)
+    #     bins_in = [
+    #         (start, stop, val)
+    #         for (start, stop, val)
+    #         in bw_in.intervals(*self.from_region)
+    #         if (not np.isnan(val))]
+    #     bins_out = [
+    #         (start, stop, val)
+    #         for (start, stop, val)
+    #         in bw_out.intervals(*self.to_region)
+    #         if (not np.isnan(val))]
+    #     for (i, (bin_in, bin_out)) in enumerate(zip(bins_in, bins_out)):
+    #         self.assertTrue(bin_in == bin_out)
+if __name__ == '__main__':
+    unittest.main()
--- a/scripts/copypaste_bigwig_regions.py
+++ b/scripts/copypaste_bigwig_regions.py
+#!/usr/bin/env python3
+# Copyright (C) 2020 Blaise Li
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""Copy bigwig data from a region in a bigwig file into another region.
+The other region can be in the same or in another bigwig file.
+The result will be written in another bigwig file.
+"""
+from argparse import (
+    ArgumentParser,
+    ArgumentDefaultsHelpFormatter, RawTextHelpFormatter)
+import sys
+from operator import itemgetter
+from yaml import safe_load as yload
+from libhts import paste_bigwig_region
+CONFIG_EXAMPLE = """
+# bigwig file containing the region to copy
+from_bw: "data/klp7co_RPF.bw"
+# bigwig file containing the region to replace
+to_bw: "data/klp7co_RPF.bw"
+# bigwig file in which to write the result
+dest_bw: "/tmp/pasted.bw"
+# region from which to copy (BED-like coordinates)
+from_region:
+    chrom: "klp7co"
+    # zero-based coordinate
+    start: 0
+    # 1-based coordinate
+    end: 2459
+# region in which to copy (BED-like coordinates)
+to_region:
+    chrom: "III"
+    # zero-based coordinate
+    start: 10808326
+    # 1-based coordinate
+    end: 10810785
+"""
+class Formatter(ArgumentDefaultsHelpFormatter, RawTextHelpFormatter):
+    """
+    Hybrid help formatter.
+    See comments of <https://stackoverflow.com/a/3853776/1878788>
+    """
+def main():
+    """Run the command-line script."""
+    # parser = ArgumentParser(
+    #     description=__doc__,
+    #     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser = ArgumentParser(
+        description=f"{__doc__}\n\nExample configuration:\n{CONFIG_EXAMPLE}",
+        formatter_class=Formatter)
+    parser.add_argument(
+        "-c", "--configfile",
+        help="""YAML-formatted file containing information about:
+    - input bigwig files
+    - output bigwig file
+    - region from which to copy
+    - region where to paste
+""")
+    parser.add_argument(
+        "-e", "--example_config",
+        action="store_true",
+        help="display an example configuration file")
+    args = parser.parse_args()
+    if args.example_config:
+        print(CONFIG_EXAMPLE)
+        return 0
+    if not args.configfile:
+        print("You must provide a configuration file using option -c. "
+              "See option -e for an example.", file=sys.stderr)
+        return 1
+    with open(args.configfile) as config_fh:
+        config = yload(config_fh)
+    region_extractor = itemgetter("chrom", "start", "end")
+    from_region = region_extractor(config["from_region"])
+    to_region = region_extractor(config["to_region"])
+    paste_bigwig_region(
+        config["from_bw"], config["to_bw"],
+        from_region, to_region,
+        config["dest_bw"])
+    return 0
+if __name__ == "__main__":
+    sys.exit(main())
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,9 @@ setup(
    author_email="blaise.li@normalesup.org",
    license="GNU GPLv3",
    packages=find_packages(),
-    scripts=["scripts/bam2bigwig.sh", "scripts/sam2indexedbam.sh"],
+    scripts=[
+        "scripts/bam2bigwig.sh", "scripts/sam2indexedbam.sh",
+        "scripts/copypaste_bigwig_regions.py"],
    install_requires=[
        #"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git@744dd79b579577cb6e131653260d7990946be3ad#egg=libworkflows-0.1",
        #"libworkflows @ git+https://gitlab+deploy-token-31:isEzpsgbNf2sJMdUDy2g@gitlab.pasteur.fr/bli/libworkflows.git#egg=libworkflows-0.1",