From a7cb2a01530cab78e8c13de186e3b1fce4fe249a Mon Sep 17 00:00:00 2001 From: rlegendr <rachel.legendre@pasteur.fr> Date: Tue, 1 Mar 2022 17:07:00 +0100 Subject: [PATCH] connect seacr to IA and chipflowr --- README.md | 2 - Snakefile | 216 ++++++++++++---------- test/config.yaml | 6 +- workflow/rules/igv_session.rules | 11 +- workflow/rules/intersectionApproach.rules | 12 +- 5 files changed, 139 insertions(+), 108 deletions(-) diff --git a/README.md b/README.md index ef856a9..68edfeb 100644 --- a/README.md +++ b/README.md @@ -316,8 +316,6 @@ compute_idr: intersectionApproach: yes ia_overlap: 0.8 ``` -> SEACR may lead to produce few number of peaks so the Intersection Approach can produce an empty file with no union of peaks. We advice to set differential analysis setting to no and check the results of SEACR before tu run DE. - ## Run the pipeline on test data diff --git a/Snakefile b/Snakefile index 6da138c..1b1d84f 100755 --- a/Snakefile +++ b/Snakefile @@ -144,7 +144,7 @@ for row in design.itertuples(index=True, name='Pandas'): if mark == getattr(row, "IP_NAME").split("_")[0]: nb_rep = getattr(row, "NB_IP") if (getattr(row, "IP_NAME")).endswith(tuple(conds)) and getattr(row, "NB_IP") > 1: - if nb >= 1: + if nb >= 1 and mark not in MARK_OK: MARK_OK.append(mark) CORR_INPUT_OK.append(getattr(row, "INPUT_NAME").split("_")[0]) break @@ -152,6 +152,7 @@ for row in design.itertuples(index=True, name='Pandas'): nb += 1 nb = 0 + if config["differential_analysis"]["input_counting"]: CORR_INPUT = [] # corresponding INPUT files for input counting for mark in CORR_INPUT_OK: @@ -522,7 +523,7 @@ include: os.path.join(RULES, "EstimateLibraryComplexity.rules") # plot Fingerprint #---------------------------------- def IPandINPUT(wildcards): - return [os.path.join(analysis_dir,"{}/{{IP}}_{}_sort_dedup{}.bam".format(biasedRegions_dir, ref, biasedRegions)), str(os.path.join(analysis_dir, biasedRegions_dir +"/") + IP_INPUT[IP_INPUT.IP == wildcards.IP].iloc[0]['INPUT'] + "_{}_sort_dedup{}.bam".format(ref, biasedRegions))] + return [str(os.path.join(analysis_dir, biasedRegions_dir +"/") + IP_INPUT[IP_INPUT.IP == wildcards.IP].iloc[0]['INPUT'] + "_{}_sort_dedup{}.bam".format(ref, biasedRegions)), os.path.join(analysis_dir,"{}/{{IP}}_{}_sort_dedup{}.bam".format(biasedRegions_dir, ref, biasedRegions))] plotFingerprint_input = IPandINPUT @@ -662,14 +663,17 @@ if config["macs2"]["do"]: final_output.extend(expand(macs2_output, zip, SAMPLE=ALL_IP_PC)) include: os.path.join(RULES, "macs2.rules") - #---------------------------------- # Peak Calling with SEACR #---------------------------------- if config["seacr"]["do"]: - peak_caller += ["seacr"] - mod += [config["seacr"]["threshold"]] + if not config["macs2"]["do"]: + peak_caller = ["seacr"] + mod = [config["seacr"]["threshold"]] + else: + peak_caller += ["seacr"] + mod += [config["seacr"]["threshold"]] # produce bedgrah files bedgraph_input = "{}/{{SAMPLE}}_{}_sort_dedup{}.bam".format(biasedRegions_dir, ref, biasedRegions) bedgraph_genome = config["genome"]["fasta_file"]+".fai" @@ -707,92 +711,111 @@ def stats_pc_input(wildcards): elif wildcards.CALLER == "seacr": return expand(os.path.join(analysis_dir, "06-PeakCalling/{{CALLER}}/{IP_REP}.{{MOD}}.bed"), IP_REP=IP_ALL) -#macs2_rep = [] -#macs2_rep.extend(expand(os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{IP_REP}}_peaks.{}Peak".format(model_dir, model)), IP_REP=IP_ALL)) stats_peakCalling_input = stats_pc_input stats_peakCalling_csv = os.path.join(analysis_dir, "{CALLER}_{MOD}Peaks_metrics_mqc.out") stats_peakCalling_marks = marks stats_peakCalling_conds = conds stats_peakCalling_rep = rep_flag -stats_peakCalling_log = os.path.join(analysis_dir, "06-PeakCalling/{CALLER}/{MOD}/Peaks_metrics.out") +stats_peakCalling_log = os.path.join(analysis_dir, "06-PeakCalling/{CALLER}/{MOD}_Peaks_metrics.out") include: os.path.join(RULES, "stats_peakCalling.rules") final_output.extend(expand(stats_peakCalling_csv, zip, CALLER=peak_caller, MOD=mod)) - #---------------------------------- # IDR computing #---------------------------------- +if config["macs2"]["do"]: + + if model in ["narrow"]: + compute_idr_mode = "narrowPeak" + else : + compute_idr_mode = "broadPeak" + compute_idr_input1 = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{IP_IDR}}_{{CASE}}1_peaks.{}Peak".format(model_dir, model)) + compute_idr_input2 = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{IP_IDR}}_{{CASE}}2_peaks.{}Peak".format(model_dir, model)) + compute_idr_output = os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr.txt".format(model_dir, ref, model)) + compute_idr_output_peak = os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr{}.{}Peak".format(model_dir, + ref, model, config["compute_idr"]["thresh"], model)) + compute_idr_log = os.path.join(analysis_dir, "07-IDR/macs2/{}/logs/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_idr.out".format(model_dir,model)) + include: os.path.join(RULES, "compute_idr.rules") + final_output.extend(expand(compute_idr_output, zip, IP_IDR=REP_IDR, CASE=CASE)) -if model in ["narrow"]: - compute_idr_mode = "narrowPeak" -else : - compute_idr_mode = "broadPeak" -compute_idr_input1 = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{IP_IDR}}_{{CASE}}1_peaks.{}Peak".format(model_dir, model)) -compute_idr_input2 = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{IP_IDR}}_{{CASE}}2_peaks.{}Peak".format(model_dir, model)) -compute_idr_output = os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr.txt".format(model_dir, ref, model)) -compute_idr_output_peak = os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr{}.{}Peak".format(model_dir, - ref, model, config["compute_idr"]["thresh"], model)) -compute_idr_log = os.path.join(analysis_dir, "07-IDR/macs2/{}/logs/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_idr.out".format(model_dir,model)) -include: os.path.join(RULES, "compute_idr.rules") -final_output.extend(expand(compute_idr_output, zip, IP_IDR=REP_IDR, CASE=CASE)) #---------------------------------- # Select reproducible peaks #---------------------------------- +CALL_MOD = [] +if config["macs2"]["do"] and model in ["narrow"] and not config["compute_idr"]["intersectionApproach"]: + CALL_MOD += ["macs2_" + model] - -if model in ["narrow"] and not config["compute_idr"]["intersectionApproach"]: # Select IDR peaks - select_peaks_input_rep = os.path.join(analysis_dir, "07-IDR/{{CALLER}}/{}/{{IP_IDR}}_{}1vs{}2_{}_{}_idr{}.{{MOD}}Peak".format(model_dir, rep_flag, rep_flag, ref, model, config["compute_idr"]["thresh"])) - select_peaks_input_ppr = os.path.join(analysis_dir, "07-IDR/{{CALLER}}/{}/{{IP_IDR}}_{}1vs{}2_{}_{}_idr{}.{{MOD}}Peak".format(model_dir, "PPR", "PPR", ref, model, config["compute_idr"]["thresh"])) - select_peaks_input_pool = os.path.join(analysis_dir, "06-PeakCalling/{{CALLER}}/{}/{{IP_IDR}}_PPRPool_peaks.{{MOD}}Peak".format(model_dir)) - select_peaks_logs = os.path.join(analysis_dir, "08-ReproduciblePeaks/{{CALLER}}/{}/logs/{{IP_IDR}}_selected_{{MOD}}peaks.o".format(model_dir)) - select_peaks_output = os.path.join(analysis_dir, "08-ReproduciblePeaks/{{CALLER}}/{}/{{IP_IDR}}_select.{{MOD}}Peak".format(model_dir)) + def IDR_input_rep(wildcards): + #if wildcards.CALLER == "macs2_narrow": + return str(os.path.join(analysis_dir, "07-IDR/macs2/"+model_dir+"/"+wildcards.IP_IDR+"_"+rep_flag+"1vs"+rep_flag+"2_"+ref+"_"+model+"_idr"+str(config["compute_idr"]["thresh"])+"."+model+"Peak")) + + def IDR_input_ppr(wildcards): + #if wildcards.CALLER == "macs2_narrow": + #return [os.path.join(analysis_dir, "07-IDR/macs2/%s/{IP_IDR}_%s1vs%s2_%s_%s_idr%s.%sPeak" % (model_dir, + # "PPR", "PPR", ref, model, config["compute_idr"]["thresh"], model))] + return str(os.path.join(analysis_dir, "07-IDR/macs2/"+model_dir+"/"+wildcards.IP_IDR+"_PPR1vsPPR2_"+ref+"_"+model+"_idr"+str(config["compute_idr"]["thresh"])+"."+model+"Peak")) + + def IDR_input_pool(wildcards): + #if wildcards.CALLER == "macs2_narrow": + #return [os.path.join(analysis_dir, "06-PeakCalling/macs2/%s/{IP_IDR}_PPRPool_peaks.%sPeak" % (model_dir, model))] + return str(os.path.join(analysis_dir, "06-PeakCalling/macs2/"+model_dir+"/"+wildcards.IP_IDR+"_PPRPool_peaks."+model+"Peak")) + + select_peaks_input_rep = IDR_input_rep + select_peaks_input_ppr = IDR_input_ppr + select_peaks_input_pool = IDR_input_pool + select_peaks_logs = os.path.join(analysis_dir, "08-ReproduciblePeaks/{{CALLER}}_{}/logs/{{IP_IDR}}_selected_peaks.o".format(model_dir)) + select_peaks_output = os.path.join(analysis_dir, "08-ReproduciblePeaks/{{CALLER}}_{}/{{IP_IDR}}_select.Peak".format(model_dir)) include: os.path.join(RULES, "select_peaks.rules") - final_output.extend(expand(select_peaks_output, zip, CALLER="macs2", IP_IDR=IP_REP, MOD=model)) - + final_output.extend(expand(select_peaks_output, CALLER="macs2", IP_IDR=IP_REP)) + #---------------------------------- # Intersection approach #---------------------------------- +if config["seacr"]["do"] : + CALL_MOD += ["seacr_" + config["seacr"]["threshold"]] +if (config["macs2"]["do"] and config["compute_idr"]["intersectionApproach"]) or (config["macs2"]["do"] and config["macs2"]["mode_choice"] in ["broad"]): + CALL_MOD += ["macs2_" + model] + + def IA_input(wildcards): - if wildcards.CALLER == "macs2": - return expand(os.path.join(analysis_dir, "06-PeakCalling/{{CALLER}}/%s/{{IP_IDR}}_{CASE}_peaks.{{MOD}}Peak" % (model_dir)), CASE=rep) - elif wildcards.CALLER == "seacr": - return expand(os.path.join(analysis_dir, "06-PeakCalling/{{CALLER}}/{{IP_IDR}}_{CASE}.{{MOD}}.bed"), CASE=rep) + if wildcards.CALLER == "macs2_"+ config["macs2"]["mode_choice"]: + return expand(os.path.join(analysis_dir, "06-PeakCalling/{CALLER}/%s/{{IP_IDR}}_{CASE}_peaks.{MOD}Peak" % (model_dir)), CALLER="macs2", CASE=rep, MOD=config["macs2"]["mode_choice"]) + elif wildcards.CALLER == "seacr_"+ config["seacr"]["threshold"]: + return expand(os.path.join(analysis_dir, "06-PeakCalling/{CALLER}/{{IP_IDR}}_{CASE}.{MOD}.bed"), CALLER="seacr", CASE=rep, MOD=config["seacr"]["threshold"]) -if model in ["broad"] or config["compute_idr"]["intersectionApproach"] or config["seacr"]["do"] : +if (config["macs2"]["do"] and config["macs2"]["mode_choice"] in ["broad"]) or config["compute_idr"]["intersectionApproach"] or config["seacr"]["do"] : intersectionApproach_input_rep = IA_input - intersectionApproach_logs = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/logs/{IP_IDR}_IA_{MOD}peaks.o") - intersectionApproach_output = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/{IP_IDR}_IA.{MOD}Peak") - if model in ["broad"] : + intersectionApproach_logs = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/logs/{IP_IDR}_IA_peaks.o") + intersectionApproach_output = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/{IP_IDR}_IA.Peak") + if config["macs2"]["do"] and config["macs2"]["mode_choice"] in ["broad"] : intersectionApproach_overlap = 0.8 else: intersectionApproach_overlap = config["compute_idr"]["ia_overlap"] include: os.path.join(RULES, "intersectionApproach.rules") - final_output.extend(expand(intersectionApproach_output, zip, CALLER=peak_caller, IP_IDR=IP_REP, MOD=mod)) - select_peaks_output = intersectionApproach_output - + final_output.extend(expand(intersectionApproach_output, CALLER=CALL_MOD, IP_IDR=IP_REP)) #---------------------------------- # Compute IDR metrics #---------------------------------- - -idr_peaks = [] -idr_peaks.extend(expand(os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr{}.{}Peak".format(model_dir, - ref,model, config["compute_idr"]["thresh"], model)), zip, IP_IDR=REP_IDR, CASE=CASE)) -metrics_peaks_input = idr_peaks -metrics_peaks_marks = marks -metrics_peaks_conds = conds -metrics_peaks_rep = rep_flag -metrics_peaks_logs = os.path.join(analysis_dir, "07-IDR/macs2/{}/logs/IDR_metrics.out".format(model_dir)) -metrics_peaks_output = os.path.join(analysis_dir, "IDR_metrics_mqc.out") -include: os.path.join(RULES, "metrics_peaks.rules") -final_output.extend([metrics_peaks_output]) +if config["macs2"]["do"] : + idr_peaks = [] + idr_peaks.extend(expand(os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr{}.{}Peak".format(model_dir, + ref,model, config["compute_idr"]["thresh"], model)), zip, IP_IDR=REP_IDR, CASE=CASE)) + metrics_peaks_input = idr_peaks + metrics_peaks_marks = marks + metrics_peaks_conds = conds + metrics_peaks_rep = rep_flag + metrics_peaks_logs = os.path.join(analysis_dir, "07-IDR/macs2/{}/logs/IDR_metrics.out".format(model_dir)) + metrics_peaks_output = os.path.join(analysis_dir, "IDR_metrics_mqc.out") + include: os.path.join(RULES, "metrics_peaks.rules") + final_output.extend([metrics_peaks_output]) #---------------------------------- @@ -800,61 +823,56 @@ final_output.extend([metrics_peaks_output]) #---------------------------------- if len(conds) > 1 and config["differential_analysis"]["do"]: + def getPeakFilesByMark(wildcards): - ALL_IP = expand(select_peaks_output, zip, CALLER=peak_caller, IP_IDR=IP_REP, MOD=mod) - IP_dict = {} - for mark in marks: - IP_dict[mark] = [] - for file in ALL_IP: - if mark in file: - IP_dict[mark].append(file) + if wildcards.CALLER in ["seacr_"+config["seacr"]["threshold"], "macs2_broad" ] or (wildcards.CALLER in ["macs2_narrow"] and config["compute_idr"]["intersectionApproach"]): + return expand(os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/{MARK}_{COND}_IA.Peak"), + CALLER=wildcards.CALLER, MARK=wildcards.MARK, COND=conds) + else : + return expand(os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/{MARK}_{COND}_select.Peak"), + CALLER=wildcards.CALLER, MARK=wildcards.MARK, COND=conds) + - return IP_dict[wildcards["MARK"]] #---------------------------------- # get union peaks #---------------------------------- union_peaks_input = getPeakFilesByMark - union_peaks_logs = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_Optimal_selected_{MOD}peaks.out") + union_peaks_logs = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_Optimal_selected_peaks.out") if len(conds) == 2 : - union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.{{MOD}}Peak_overlap.bed".format(conds[0], conds[1], ref)) + union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], ref)) elif len(conds) == 3 : - union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.{{MOD}}Peak_overlap.bed".format(conds[0], conds[1], conds[2], ref)) + union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], conds[2], ref)) else : - union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.{{MOD}}Peak_overlap.bed".format(ref)) + union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.bed".format(ref)) include: os.path.join(RULES, "union_peaks.rules") - final_output.extend(expand(union_peaks_output, zip, CALLER=peak_caller, MARK=MARK_OK, MOD=mod)) + final_output.extend(expand(union_peaks_output, zip, CALLER=CALL_MOD, MARK=MARK_OK)) #---------------------------------- # get GFF from peak files #---------------------------------- if len(conds) == 2 : - bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.{{MOD}}Peak_overlap.bed".format(conds[0], conds[1], ref)) - bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.{{MOD}}Peak_overlap.gff".format(conds[0], conds[1], ref)) + bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], ref)) + bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], ref)) elif len(conds) == 3 : - bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.{{MOD}}Peak_overlap.bed".format(conds[0], conds[1], conds[2], ref)) - bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.{{MOD}}Peak_overlap.gff".format(conds[0], conds[1], conds[2], ref)) + bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], conds[2], ref)) + bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], conds[2], ref)) else : - bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.{{MOD}}Peak_overlap.bed".format(ref)) - bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.{{MOD}}Peak_overlap.gff".format(ref)) + bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.bed".format(ref)) + bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.gff".format(ref)) - bed_to_gff_logs = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_Optimal_{MOD}Peaks_bed2gff.out") - final_output.extend(expand(bed_to_gff_output, zip, CALLER=peak_caller, MARK=MARK_OK, MOD=mod)) + bed_to_gff_logs = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_Optimal_Peaks_bed2gff.out") + final_output.extend(expand(bed_to_gff_output, zip, CALLER=CALL_MOD, MARK=MARK_OK)) include: os.path.join(RULES, "bed_to_gff.rules") def getBAMFilesByMark(wildcards): - ALL_BAM = expand(os.path.join(analysis_dir, "{}/{{SAMPLE}}_{}_sort_dedup{}.bam".format(biasedRegions_dir, ref, biasedRegions)), SAMPLE=IP_ALL ) - BAM_dict = {} - for mark in marks: - BAM_dict[mark] = [] - for file in ALL_BAM: - if mark in file: - BAM_dict[mark].append(file) - return BAM_dict[wildcards["MARK"]] + return expand(os.path.join(analysis_dir, "%s/{MARK}_{COND}_{REP}_%s_sort_dedup%s.bam" % (biasedRegions_dir, ref, biasedRegions)), + MARK=wildcards.MARK, COND=conds, REP=rep) + #---------------------------------- # feature Count on peaks @@ -865,18 +883,18 @@ if len(conds) > 1 and config["differential_analysis"]["do"]: feature_counts_optional_input = [] if config["differential_analysis"]["input_counting"]: feature_counts_optional_input += expand(os.path.join(analysis_dir, "{}/{{INPUT}}_{}_sort_dedup{}.bam".format(biasedRegions_dir, ref, biasedRegions)), INPUT=CORR_INPUT) - feature_counts_output_count = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/{MARK}_Matrix_Optimal_{MOD}Peak.mtx") + feature_counts_output_count = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/{MARK}_Matrix_Optimal_Peak.mtx") if len(conds) == 2 : - feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.{{MOD}}Peak_overlap.gff".format(conds[0], conds[1], ref)) + feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], ref)) elif len(conds) == 3 : - feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.{{MOD}}Peak_overlap.gff".format(conds[0], conds[1], conds[2], ref)) + feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], conds[2], ref)) else : - feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.{{MOD}}Peak_overlap.gff".format(ref)) + feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.gff".format(ref)) - feature_counts_log = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_{MOD}counts.out") + feature_counts_log = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_counts.out") feature_counts_options = "-t peak -g gene_id" feature_counts_threads = 4 - final_output.extend(expand(feature_counts_output_count, zip, CALLER=peak_caller, MARK=MARK_OK, MOD=mod)) + final_output.extend(expand(feature_counts_output_count, zip, CALLER=CALL_MOD, MARK=MARK_OK)) include: os.path.join(RULES, "feature_counts.rules") #---------------------------------- @@ -901,17 +919,17 @@ if len(conds) > 1 and config["differential_analysis"]["do"]: chipflowr_init_padj = config["differential_analysis"]["pAdjustMethod"] chipflowr_init_alpha = config["differential_analysis"]["alpha"] chipflowr_init_batch = config["differential_analysis"]["batch"] - chipflowr_init_output_dir = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{{MOD}}_{}_{}".format(method, norm)) - chipflowr_init_config_r = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{{MOD}}_{}_{}/config.R".format(method, norm)) + chipflowr_init_output_dir = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}".format(method, norm)) + chipflowr_init_config_r = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}/config.R".format(method, norm)) chipflowr_init_genome = ref include: os.path.join(RULES, "chipflowr_init.rules") chipflowr_config_r = chipflowr_init_config_r - chipflowr_output_dir = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{{MOD}}_{}_{}".format(method, norm)) - chipflowr_report = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{{MOD}}_{}_{}/{{MARK}}_Stat_report_{}_{}.html".format(method, norm,method, norm)) - chipflowr_logs = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{{MOD}}_{}_{}/{{MARK}}_{}_{}_Log.txt".format(method, norm, method, norm)) - final_output.extend(expand(chipflowr_report, zip, CALLER=peak_caller, MARK=MARK_OK, MOD=mod)) + chipflowr_output_dir = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}".format(method, norm)) + chipflowr_report = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}/{{MARK}}_Stat_report_{}_{}.html".format(method, norm,method, norm)) + chipflowr_logs = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}/{{MARK}}_{}_{}_Log.txt".format(method, norm, method, norm)) + final_output.extend(expand(chipflowr_report, CALLER=CALL_MOD, MARK=MARK_OK)) include: os.path.join(RULES, "chipflowr.rules") #---------------------------------- @@ -921,9 +939,12 @@ if len(conds) > 1 and config["differential_analysis"]["do"]: if config["igv_session"]["do"]: igv_session_input = expand(bamCoverage_output, SAMPLE=IP_ALL, REF=ref, allow_missing=True) if len(IP_REP) > 1 and config["macs2"]["do"]: - igv_session_input += expand(select_peaks_output, CALLER="macs2", IP_IDR=IP_REP, MOD=model) + if config["compute_idr"]["intersectionApproach"]: + igv_session_input += expand(intersectionApproach_output, CALLER="macs2_" + model, IP_IDR=IP_REP) + else: + igv_session_input += expand(select_peaks_output, CALLER="macs2", IP_IDR=IP_REP) elif len(IP_REP) > 1 and config["seacr"]["do"]: - igv_session_input += expand(select_peaks_output, CALLER="seacr", IP_IDR=IP_REP, MOD=config["seacr"]["threshold"]) + igv_session_input += expand(intersectionApproach_output, CALLER="seacr_" + config["seacr"]["threshold"], IP_IDR=IP_REP) if len(IP_ALL) > 1 and config["macs2"]["do"]: igv_session_input += expand(macs2_output, SAMPLE=IP_ALL) elif len(IP_ALL) > 1 and config["seacr"]["do"]: @@ -932,6 +953,7 @@ if config["igv_session"]["do"]: igv_session_output = os.path.join(analysis_dir, "12-IGV/igv_session.xml") igv_session_genome = ref igv_session_mark = marks + igv_session_conds = conds igv_session_wdir = os.path.join(analysis_dir, "12-IGV/") igv_session_autoScale = config["igv_session"]["autoScale"] igv_session_normalize = config["igv_session"]["normalize"] @@ -942,6 +964,8 @@ if config["igv_session"]["do"]: #---------------------------------- # MultiQC report #---------------------------------- +try : model_dir +except NameError : model_dir = "seacr_" + config["seacr"]["threshold"] multiqc_input = final_output multiqc_input_dir = analysis_dir diff --git a/test/config.yaml b/test/config.yaml index 53cfef4..e441a6c 100644 --- a/test/config.yaml +++ b/test/config.yaml @@ -56,7 +56,7 @@ design: marks: H3K27ac, Klf4 condition: shCtrl, shUbc9 replicates: Rep - spike: false + spike: no spike_genome_file: /path/to/genome/directory/dmel9.fa #=============================================================================== @@ -65,14 +65,14 @@ design: # # :Parameters: # -# - index: assuming needed indexes are here if False +# - index: assuming needed indexes are here if no # - genome_directory: directory where all indexed are written # - name: name of prefix use in all output files from mapping # - fasta_file: path to Reference Genome in fasta format #=============================================================================== genome: - index: true + index: yes genome_directory: genome/ name: mm10 fasta_file: genome/mm10.fa diff --git a/workflow/rules/igv_session.rules b/workflow/rules/igv_session.rules index b6976b4..2664393 100755 --- a/workflow/rules/igv_session.rules +++ b/workflow/rules/igv_session.rules @@ -31,6 +31,7 @@ rule igv_session: genome = igv_session_genome, wdir = igv_session_wdir, marks = igv_session_mark, + conds = igv_session_conds, autoScale = igv_session_autoScale, normalize = igv_session_normalize log : @@ -40,6 +41,7 @@ rule igv_session: import os import sys import shutil + import itertools colors= ["135,86,146", "243,132,0", "161,202,241", "190,0,50", "243,195,0", @@ -47,11 +49,11 @@ rule igv_session: "249,147,121", "96,78,151", "46,166,0", "179,68,108", "220,211,0", "136,45,23", "141,182,0", "101,69,34", "226,88,34", "43,61,38"] - # create dictionnary with one color by mark + # create dictionnary with one color by mark_cond mark_color = {} i = 0 - for mark in params.marks: - mark_color[mark] = colors[i] + for combinaison in list(itertools.product(params.marks, params.conds)): + mark_color["_".join(combinaison)] = colors[i] i += 1 # get real path of xml output file @@ -81,7 +83,8 @@ rule igv_session: #print custom panels print("<Panel height=\"1180\" name=\"DataPanel\" width=\"3077\">") - for mark in marks: + for combinaison in list(itertools.product(params.marks, params.conds)): + mark = "_".join(combinaison) for file in input: id = os.path.basename(file) name = id.split("_cov")[0] diff --git a/workflow/rules/intersectionApproach.rules b/workflow/rules/intersectionApproach.rules index 1d9b9f0..eec795c 100755 --- a/workflow/rules/intersectionApproach.rules +++ b/workflow/rules/intersectionApproach.rules @@ -39,10 +39,16 @@ rule intersectionApproach: Rep1_temp=$(mktemp) Rep2_temp=$(mktemp) - awk '$9>2 && $7>3 {{print $0}}' {input.rep[0]} > $Rep1_temp - awk '$9>2 && $7>3 {{print $0}}' {input.rep[1]} > $Rep2_temp + if [[ {input.rep[0]} == *"stringent"* || {input.rep[0]} == *"relaxed"* ]] + then + cat {input.rep[0]} > $Rep1_temp + cat {input.rep[1]} > $Rep2_temp + else + awk '$9>2 && $7>3 {{print $0}}' {input.rep[0]} > $Rep1_temp + awk '$9>2 && $7>3 {{print $0}}' {input.rep[1]} > $Rep2_temp + fi - intersectBed -c -r -f {params.overlap} -a $Rep1_temp -b $Rep2_temp | awk 'BEGIN{{OFS="\t"}} {{if($NF==2) print $1,$2,$3,$4,$5,$6,$7,$8,$9, $10}}' > {output} 2> {log.out} + intersectBed -c -r -f {params.overlap} -a $Rep1_temp -b $Rep2_temp > {output} 2> {log.out} # if file empty, stop rule if [ ! -s {output} ] ;then -- GitLab