Use correct size factor for iCLIP.

47ac1ab4 · Blaise Li · 00f001de · 47ac1ab4
Commit 47ac1ab4 authored 7 years ago by Blaise Li
--- a/CLIP/iCLIP.snakefile
+++ b/CLIP/iCLIP.snakefile
@@ -110,6 +110,7 @@ wildcard_constraints:
    lib="|".join(LIBS),
    rep="\d+",
    orientation="|".join(ORIENTATIONS),
+    norm="|".join(SIZE_FACTORS),
    #size_range="\d+-\d+"

 preprocessing = [
@@ -136,9 +137,9 @@ counting = [
    ## Will be pulled in as dependencies of other needed results:
    # expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
    ##
-    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, orientation=ORIENTATIONS),
-    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
-    expand(OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_by_{norm_type}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, norm_type=NORM_TYPES, orientation=["all"]),
+    expand(OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, orientation=ORIENTATIONS),
+    expand(OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
+    expand(OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, norm=NORM_TYPES, orientation=["all"]),
 ]

 #TODO:
@@ -378,7 +379,6 @@ def set_alignment_settings(wildcards):
 ###########
 # Mapping #
 ###########
-# TODO: replace settings by function of read_type
 rule map_on_genome:
    input:
        # fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{read_type}.fastq.gz"),
@@ -630,28 +630,46 @@ rule compute_median_ratio_to_pseudo_ref_size_factors:
        median_ratios.to_csv(output.median_ratios_file, sep="\t")


+def source_norm_file(wildcards):
+    """
+    Return the path of the file from which the size factor will be read.
+
+    *wildcards* must provide `norm` and, when `norm` is
+    "median_ratio_to_pseudo_ref", also `trimmer` and `read_type`.
+
+    For "median_ratio_to_pseudo_ref", the returned path points to the
+    "protein_coding_fwd_median_ratios_to_pseudo_ref.txt" file located in the
+    "feature_count" directory of the given trimmer and read type.
+    For any other value of `norm`, the `summary` output of rule
+    `summarize_feature_counts` is returned.
+    """
+    if wildcards.norm == "median_ratio_to_pseudo_ref":
+        # The path components are f-strings so that the wildcard values are
+        # really interpolated (a plain string would keep a literal
+        # "{wildcards.read_type}" in the path).
+        # The path is returned as a single string, like in the other branch
+        # (a trailing comma here would turn it into a 1-tuple).
+        return OPJ(
+            output_dir, wildcards.trimmer, aligner, f"mapped_{genome}",
+            "feature_count", f"all_{wildcards.read_type}_on_{genome}",
+            "protein_coding_fwd_median_ratios_to_pseudo_ref.txt")
+    else:
+        return rules.summarize_feature_counts.output.summary
+
+
 rule make_normalized_bigwig:
    input:
        bam = rules.sam2indexedbam.output.sorted_bam,
        #bam = rules.fuse_bams.output.sorted_bam,
-        # TODO: use sourcing function based on norm_type
+        # File providing the size factor, selected by source_norm_file according to the norm wildcard
+        norm_file = source_norm_file,
        #size_factor_file = rules.compute_coverage.output.coverage
-        median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
+        #median_ratios_file = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "protein_coding_fwd_median_ratios_to_pseudo_ref.txt"),
        # TODO: compute this
        #scale_factor_file = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation", "all_%s_on_C_elegans" % size_selected, "pisimi_median_ratios_to_pseudo_ref.txt"),
    output:
-        bigwig_norm = OPJ(output_dir, "{trimmer}", aligner, "mapped_%s" % genome, "{lib}_{rep}_{read_type}_on_%s_by_{norm_type}_{orientation}.bw" % genome),
+        bigwig_norm = OPJ(output_dir, "{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome),
    #params:
    #    orient_filter = bamcoverage_filter,
    threads: 12  # to limit memory usage, actually
    benchmark:
-        OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm_type}_{orientation}_benchmark.txt")
+        OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm}_{orientation}_benchmark.txt")
    params:
        genome_binned = genome_binned,
    log:
-        log = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm_type}_{orientation}.log"),
-        err = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm_type}_{orientation}.err"),
+        log = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm}_{orientation}.log"),
+        err = OPJ(log_dir, "{trimmer}", "make_normalized_bigwig", "{lib}_{rep}_{read_type}_by_{norm}_{orientation}.err"),
    run:
+        if wildcards.norm == "median_ratio_to_pseudo_ref":
+            size = float(pd.read_table(
+                input.norm_file, index_col=0, header=None).loc[
+                    f"{wildcards.lib}_{wildcards.rep}"])
+        else:
+            # We normalize by million in order not to have too small values
+            size = pd.read_table(input.norm_file).T.loc[wildcards.norm][0] / 1000000
+            #scale = 1 / pd.read_table(input.summary, index_col=0).loc[
+            #    wildcards.norm_file].loc[f"{wildcards.lib}_{wildcards.rep}"]
+        assert size > 0
        # TODO: make this a function of deeptools version
        no_reads = """Error: The generated bedGraphFile was empty. Please adjust
 your deepTools settings and check your input files.
@@ -663,14 +681,19 @@ bam2bigwig.sh: bedGraphToBigWig failed
            shell("""
                bam2bigwig.sh {input.bam} {params.genome_binned} \\
                    {wildcards.lib}_{wildcards.rep} {wildcards.orientation} %s \\
-                    {input.median_ratios_file} {output.bigwig_norm} \\
+                    %f {output.bigwig_norm} \\
                    > {log.log} 2> {log.err} \\
                    || error_exit "bam2bigwig.sh failed"
-                """ % LIB_TYPE[-1])
+                """ % (LIB_TYPE[-1], size))
        except CalledProcessError as e:
            if last_lines(log.err, 2) in {no_reads, zero_bytes}:
-                with open(output.bigwig_norm, "w") as bwfile:
-                    bwfile.write("")
+                bw_out = pyBigWig.open(output.bigwig_norm, "w")
+                bw_out.addHeader(list(chrom_sizes.items()))
+                for (chrom, chrom_len) in bw_out.chroms().items():
+                    bw_out.addEntries(chrom, 0, values=np.nan_to_num(np.zeros(chrom_len)[0::10]), span=10, step=10)
+                bw_out.close()
+                #with open(output.bigwig_norm, "w") as bwfile:
+                #    bwfile.write("")
            else:
                raise