More robust merging of bigwig files.

Started to apply to RNA-seq and GRO-seq technique used in small RNA-seq.

More robust merging of bigwig files.
efb29419 · Blaise Li · 3c815d52 · efb29419 · efb29419
Commit efb29419 authored 7 years ago by Blaise Li
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -49,7 +49,7 @@ import husl
 from libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
 from libworkflows import get_chrom_sizes, column_converter
 from libworkflows import strip_split, last_lines, save_plot, test_na_file
-from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS
+from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
 from libworkflows import read_htseq_counts, sum_htseq_counts
 from libworkflows import read_feature_counts, sum_feature_counts
@@ -969,20 +969,44 @@ rule merge_bigwig_reps:
        #expand(OPJ(output_dir, aligner, "mapped_C_elegans", "{{lib}}_{rep}_on_C_elegans_norm_{{orientation}}.bw"), rep=[rep for rep in REPS if frozenset((wildcards.lib, rep)) not in forbidden]),
    output:
        bw = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.bw"),
+    log:
+        warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"),
+    threads: 2  # to limit memory usage, actually
    run:
-        bws = [pyBigWig.open(bw_filename) for bw_filename in input]
+        with warn_context(log.warnings) as warn:
-        #for bw_filename in input:
+            try:
-        #    bws.append(pyBigWig.open(bw_filename))
+                bws = [pyBigWig.open(bw_filename) for bw_filename in input]
-        bw_out = pyBigWig.open(output.bw, "w")
+                #for bw_filename in input:
-        bw_out.addHeader(list(bws[0].chroms().items()))
+                #    bws.append(pyBigWig.open(bw_filename))
-        for (chrom, chrom_len) in bw_out.chroms().items():
+            except RuntimeError as e:
-            assert all([bw.chroms()[chrom] == chrom_len for bw in bws])
+                warn(str(e))
-            means = np.nanmean(np.vstack([bw.values(chrom, 0, chrom_len, numpy=True) for bw in bws]), axis=0)
+                warn("Generating empty file.\n")
-            # bin size is 10
+                # Make the file empty
-            bw_out.addEntries(chrom, 0, values=np.nan_to_num(means[0::10]), span=10, step=10)
+                open(output.bw, "w").close()
-        bw_out.close()
+            else:
-        for bw in bws:
+                bw_out = pyBigWig.open(output.bw, "w")
-            bw.close()
+                bw_out.addHeader(list(chrom_sizes.items()))
+                for (chrom, chrom_len) in bw_out.chroms().items():
+                    try:
+                        assert all([bw.chroms()[chrom] == chrom_len for bw in bws])
+                    except KeyError as e:
+                        warn(str(e))
+                        warn(f"Chromosome {chrom} might be missing from one of the input files.\n")
+                        for filename, bw in zip(input, bws):
+                            msg = " ".join([f"{filename}:", *list(bw.chroms().keys())])
+                            warn(f"{msg}:\n")
+                        #raise
+                        warn(f"The bigwig files without {chrom} will be skipped.\n")
+                    to_use = [bw for bw in bws if chrom in bw.chroms()]
+                    if to_use:
+                        means = np.nanmean(np.vstack([bw.values(chrom, 0, chrom_len, numpy=True) for bw in to_use]), axis=0)
+                    else:
+                        means = np.zeros(chrom_len)
+                    # bin size is 10
+                    bw_out.addEntries(chrom, 0, values=np.nan_to_num(means[0::10]), span=10, step=10)
+                bw_out.close()
+                for bw in bws:
+                    bw.close()
 rule join_all_counts:

--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -45,7 +45,7 @@ from rpy2.robjects import Formula, StrVector
 from libhts import do_deseq2, status_setter, plot_lfc_distribution, plot_MA
 from libhts import median_ratio_to_pseudo_ref_size_factors, size_factor_correlations, plot_norm_correlations, plot_counts_distribution
 from libworkflows import get_chrom_sizes, column_converter, make_id_list_getter
-from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS
+from libworkflows import strip_split, save_plot, test_na_file, SHELL_FUNCTIONS, warn_context
 from libworkflows import read_htseq_counts, sum_htseq_counts
 from libworkflows import read_intersect_counts, sum_intersect_counts
 from libworkflows import read_feature_counts, sum_feature_counts
@@ -690,19 +690,44 @@ rule merge_bigwig_reps:
        expand(OPJ(mapping_dir, aligner, "mapped_C_elegans", "{{lib}}_{rep}", "{{lib}}_{rep}_on_C_elegans_by_{{norm_type}}_{{orientation}}.bw"), rep=REPS),
    output:
        bw = OPJ(mapping_dir, aligner, "mapped_C_elegans", "{lib}_mean", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.bw"),
+    log:
+        warnings = OPJ(log_dir, "merge_bigwig_reps", "{lib}_mean_on_C_elegans_by_{norm_type}_{orientation}.warnings"),
    threads: 12  # to limit memory usage, actually
    run:
-        bws = [pyBigWig.open(bw_filename) for bw_filename in input]
+        with warn_context(log.warnings) as warn:
-        bw_out = pyBigWig.open(output.bw, "w")
+            try:
-        bw_out.addHeader(list(bws[0].chroms().items()))
+                bws = [pyBigWig.open(bw_filename) for bw_filename in input]
-        for (chrom, chrom_len) in bw_out.chroms().items():
+                #for bw_filename in input:
-            assert all([bw.chroms()[chrom] == chrom_len for bw in bws])
+                #    bws.append(pyBigWig.open(bw_filename))
-            means = np.nanmean(np.vstack([bw.values(chrom, 0, chrom_len, numpy=True) for bw in bws]), axis=0)
+            except RuntimeError as e:
-            # bin size is 10
+                warn(str(e))
-            bw_out.addEntries(chrom, 0, values=np.nan_to_num(means[0::10]), span=10, step=10)
+                warn("Generating empty file.\n")
-        bw_out.close()
+                # Make the file empty
-        for bw in bws:
+                open(output.bw, "w").close()
-            bw.close()
+            else:
+                bw_out = pyBigWig.open(output.bw, "w")
+                bw_out.addHeader(list(chrom_sizes.items()))
+                for (chrom, chrom_len) in bw_out.chroms().items():
+                    try:
+                        assert all([bw.chroms()[chrom] == chrom_len for bw in bws])
+                    except KeyError as e:
+                        warn(str(e))
+                        warn(f"Chromosome {chrom} might be missing from one of the input files.\n")
+                        for filename, bw in zip(input, bws):
+                            msg = " ".join([f"{filename}:", *list(bw.chroms().keys())])
+                            warn(f"{msg}:\n")
+                        #raise
+                        warn(f"The bigwig files without {chrom} will be skipped.\n")
+                    to_use = [bw for bw in bws if chrom in bw.chroms()]
+                    if to_use:
+                        means = np.nanmean(np.vstack([bw.values(chrom, 0, chrom_len, numpy=True) for bw in to_use]), axis=0)
+                    else:
+                        means = np.zeros(chrom_len)
+                    # bin size is 10
+                    bw_out.addEntries(chrom, 0, values=np.nan_to_num(means[0::10]), span=10, step=10)
+                bw_out.close()
+                for bw in bws:
+                    bw.close()
 rule join_all_counts: