From 5febd34ee8f875cb7c3f42b0a658fefa04509780 Mon Sep 17 00:00:00 2001
From: rlegendre <rachel.legendre@pasteur.fr>
Date: Fri, 4 Aug 2023 14:49:32 +0200
Subject: [PATCH] add tested no control version

---
 README.md                          |  73 +++++++++++++++++++--
 Snakefile_noCTL                    | 100 ++++++++++++++---------------
 workflow/rules/metrics_peaks.rules |  69 ++++++++++----------
 3 files changed, 151 insertions(+), 91 deletions(-)

diff --git a/README.md b/README.md
index 7b63bf5..8faeae9 100644
--- a/README.md
+++ b/README.md
@@ -173,7 +173,7 @@ Design columns:
 Link to an Example: [design.txt](https://gitlab.pasteur.fr/hub/ePeak/-/blob/master/test/design.txt)
 
 
-**Example of config file and associated fastq file:**
+**Example of design file and associated fastq file:**
 
 For an experiment with one mark (H3K27ac) and 2 conditions (shCtrl and shUbc9) in duplicates, except for one INPUT which have failed. The list of fastq is the following :
 
@@ -187,7 +187,7 @@ INPUT_shUbc9_rep1_R1.fastq.gz
 INPUT_shUbc9_rep2_R1.fastq.gz
 
 ```
-For this experiment, the config file should be :
+For this experiment, the design file should be :
 
 ```
 IP_NAME	INPUT_NAME	NB_IP	NB_INPUT
@@ -199,6 +199,7 @@ H3K27ac_shUbc9	INPUT_shUbc9	2	2
 
 IP of the first condition are linked to the unique INPUT, and each IP file of second condition is linked to his INPUT.
 
+> For design with INPUT files, NB_IP and NB_INPUT correspond to the replicate number of files to associate IP with a specific INPUT. So, one line by IP (for each replicate) is expected.
 
 ### How to fill the config file
 
@@ -241,7 +242,9 @@ design:
 Each step has it proper setting in independent chunk.
 The second section of the configuration file is divided in multiple chunks. Each chunk gather the parameters of one step of the pipeline: `adapters`, `bowtie2_mapping`, `mark_duplicates`, `remove_blacklist`, `peak_calling`, `compute_idr` and `differential_analysis`. 
 
-The `options` parameter present in `adapters`, `bowtie2_mapping` and `peak_calling` allows you to provide any parameter recognised by cutadapt, bowtie2 and macs2 respectively. For example for the `bowtie2_mapping` chunk, `options` can be fill with "--very-sensitive".
+The `options` parameter present in `adapters`, `bowtie2_mapping` and `peak_calling` allows you to provide any parameter recognised by cutadapt, bowtie2 and macs2 respectively. For example for the `bowtie2_mapping` chunk, `options` can be fill with "--very-sensitive" only or more options (adapted to paired-end data, see example 2).
+
+example 1 :
 
 ```
 bowtie2_mapping:
@@ -249,6 +252,14 @@ bowtie2_mapping:
     threads: 4
 ```
 
+example 2 :
+
+```
+bowtie2_mapping:
+    options: "--very-sensitive -X 700 --no-mixed --no-discordant"
+    threads: 4
+```
+
 ### How to fill the multiqc config
 
 At the beginning of `config/multiqc_config.yaml` file, you have the possibility to customize header of MultiQC report according to your experiment as you can see below: 
@@ -333,9 +344,9 @@ intersectionApproach:
 
 ```
 
-### Default mode for CUT&RUN/CUT&Tag
+### Default mode for CUT&RUN (with INPUT)
 
-With CUT&RUN/CUT&Tag data, make deduplication only on INPUT/IgG data (dedup_IP to False). Then perform a stringent peak calling with SEACR and use Intersection Approach. Overlapping parameter of IA on peaks is set at 0.8. Set SEACR normalization to non if experiment have control genome (a scaling factor will be calulated from spike-in) .
+With CUT&RUN data, make deduplication only on INPUT/IgG data (dedup_IP to False). Then perform a stringent peak calling with SEACR and use Intersection Approach. Overlapping parameter of IA on peaks is set at 0.8. Set SEACR normalization to non if experiment have control genome (a scaling factor will be calulated from spike-in) .
 
 ```
 mark_duplicates:
@@ -360,6 +371,58 @@ intersectionApproach:
 ```
 
 
+### Default mode for CUT&Tag (or with no INPUT)
+
+With CUT&Tag data, that does not require an input control or IgG control sample, the "noCTL" version of ePeak is recommended.
+
+To running ePeak without any control, the design file should be adapted:
+
+```
+IP_NAME NB_IP
+H3K27ac_shCtrl    2
+H3K27ac_shUbc9    2
+```
+
+> For design without any INPUT file, NB_IP correspond to the number of replicates available for each sample. So, one line by sample (one for all replicates) is expected.
+
+
+The config file stay identical. 
+
+```
+bowtie2_mapping:
+    options: "--very-sensitive -X 700 --no-mixed --no-discordant"
+    threads: 4
+
+mark_duplicates:
+    do: yes
+    dedup_IP: 'False' 
+    threads: 4
+
+peak_calling:
+    mode_choice: 'narrow'    
+    no_model: no             
+    options: "--keep-dup all "
+    cutoff: 0.1
+    genomeSize: mm
+
+compute_idr:
+    do: yes
+    rank: 'signal.value'
+    thresh: 0.05
+
+intersectionApproach: 
+    do: no
+    ia_overlap: 0.8
+
+```
+
+To run the pipeline, the correct Snakefile (Snakefile_noCTL) should be used:
+
+
+`snakemake -s Snakefile_noCTL --use-singularity --singularity-args "-B '$HOME'" --cluster-config config/cluster_config.json --cluster "sbatch --mem={cluster.ram} --cpus-per-task={threads} " -j 200 --nolock --cores 1`
+
+
+
 ## Run the pipeline on test data 
 
 You need to have sra-toolkit installed before to download test data.
diff --git a/Snakefile_noCTL b/Snakefile_noCTL
index 5c2f436..9e4c4de 100755
--- a/Snakefile_noCTL
+++ b/Snakefile_noCTL
@@ -118,21 +118,21 @@ nb = 0
 # -----------------------------------------------
 # built list of MARK_COND_REP from config.yaml and check correspondance between design, config and fastq files
 
-if len(conds) > 1 :
-#built list of MARK_COND_REP from config.yaml
-    conf_cond = ["{mark}_{cond}_{flag}".format(cond=cond, mark=mark, flag=rep_flag) for cond in conds for mark in marks]
-    for row in design.itertuples(index=True, name='Pandas'):
-        for elem in conf_cond:
-            if elem in getattr(row, "IP_NAME"):
-                raise ValueError("Please check correspondance between config and design file: %s is not %s "
-                                 % (elem,getattr(row, "IP_NAME")))
-            elif not any(elem in s for s in samples):
-                raise ValueError("Please check correspondance between config file and fastq filenames: %s not found in %s"
-                                 % (elem, samples))
-            elif sum(getattr(row, "IP_NAME")+"_"+rep_flag in s for s in samples) is not int(getattr(row, "NB_IP")):
-                raise ValueError("Please check correspondance between number of replicates and/or prefix names in design "
-                                 "file and fastq filenames: %s not found %s times in %s" % (getattr(row, "IP_NAME"),
-                                                                                             getattr(row, "NB_IP"),samples))
+# if len(conds) > 1 :
+# #built list of MARK_COND_REP from config.yaml
+#     conf_cond = ["{mark}_{cond}_{flag}".format(cond=cond, mark=mark, flag=rep_flag) for cond in conds for mark in marks]
+#     for row in design.itertuples(index=True, name='Pandas'):
+#         for elem in conf_cond:
+#             if elem in getattr(row, "IP_NAME"):
+#                 raise ValueError("Please check correspondance between config and design file: %s is not %s "
+#                                  % (elem,getattr(row, "IP_NAME")))
+#             elif not any(elem in s for s in samples):
+#                 raise ValueError("Please check correspondance between config file and fastq filenames: %s not found in %s"
+#                                  % (elem, samples))
+#             elif sum(getattr(row, "IP_NAME")+"_"+rep_flag in s for s in samples) is not int(getattr(row, "NB_IP")):
+#                 raise ValueError("Please check correspondance between number of replicates and/or prefix names in design "
+#                                  "file and fastq filenames: %s not found %s times in %s" % (getattr(row, "IP_NAME"),
+#                                                                                              getattr(row, "NB_IP"),samples))
 
 
 # -----------------------------------------------
@@ -567,43 +567,39 @@ if config["macs2"]["do"]:
 # Peak Calling with SEACR
 #----------------------------------
 
-# if config["seacr"]["do"]:
-#     if not config["macs2"]["do"]:
-#         peak_caller = ["seacr"]
-#         mod = [config["seacr"]["threshold"]]
-#     else:
-#         peak_caller += ["seacr"]
-#         mod += [config["seacr"]["threshold"]]
-
-#     # produce bedgrah files
-#     bedgraph_input = "{}/{{SAMPLE}}_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)
-#     if config["design"]["spike"]:
-#         bedgraph_scaleFactor = compute_scaling_factor_output
-#     else:
-#         bedgraph_scaleFactor = bedgraph_input
-#     bedgraph_genome = config["genome"]["fasta_file"]+".fai"
-#     if paired:
-#         bedgraph_options = " -bedpe "
-#     else:
-#         bedgraph_options = ""
-#     bedgraph_logs = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_bedgraph.out")
-#     bedgraph_output = os.path.join(analysis_dir, "06-PeakCalling/seacr/{SAMPLE}_bedgraph.bg")
-#     #final_output.extend(expand(bedgraph_output, SAMPLE=samples))
-#     include: os.path.join(RULES, "bedgraph.rules")
-
-#     def IGGtoIP(wildcards):
-#         return str(os.path.join(analysis_dir,"06-PeakCalling/seacr/") + IP_INPUT[IP_INPUT.IP == wildcards.SAMPLE].iloc[0]['INPUT'] + "_bedgraph.bg")
-
-#     seacr_bed_ip = bedgraph_output
-#     seacr_bed_input = IGGtoIP
-#     seacr_bed_threshold = config["seacr"]["threshold"]
-#     seacr_bed_norm = config["seacr"]["norm"]
-#     seacr_output_prefix = os.path.join(analysis_dir, "06-PeakCalling/seacr/{SAMPLE}")
-#     seacr_output = os.path.join(analysis_dir, "06-PeakCalling/seacr/{{SAMPLE}}.{}.bed".format(config["seacr"]["threshold"]))
-#     seacr_logs_std = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_seacr_calling.out")
-#     seacr_logs_err = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_seacr_calling.err")
-#     final_output.extend(expand(seacr_output, SAMPLE=IP_ALL))
-#     include: os.path.join(RULES, "seacr.rules")   
+if config["seacr"]["do"]:
+    if not config["macs2"]["do"]:
+        peak_caller = ["seacr"]
+        mod = [config["seacr"]["threshold"]]
+    else:
+        peak_caller += ["seacr"]
+        mod += [config["seacr"]["threshold"]]
+
+    # produce bedgrah files
+    bedgraph_input = "{}/{{SAMPLE}}_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)
+    if config["design"]["spike"]:
+        bedgraph_scaleFactor = compute_scaling_factor_output
+    else:
+        bedgraph_scaleFactor = bedgraph_input
+    bedgraph_genome = config["genome"]["fasta_file"]+".fai"
+    if paired:
+        bedgraph_options = " -bedpe "
+    else:
+        bedgraph_options = ""
+    bedgraph_logs = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_bedgraph.out")
+    bedgraph_output = os.path.join(analysis_dir, "06-PeakCalling/seacr/{SAMPLE}_bedgraph.bg")
+    #final_output.extend(expand(bedgraph_output, SAMPLE=samples))
+    include: os.path.join(RULES, "bedgraph.rules")
+
+    seacr_bed_ip = bedgraph_output
+    seacr_bed_threshold = config["seacr"]["threshold"]
+    seacr_bed_norm = config["seacr"]["norm"]
+    seacr_output_prefix = os.path.join(analysis_dir, "06-PeakCalling/seacr/{SAMPLE}")
+    seacr_output = os.path.join(analysis_dir, "06-PeakCalling/seacr/{{SAMPLE}}.{}.bed".format(config["seacr"]["threshold"]))
+    seacr_logs_std = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_seacr_calling.out")
+    seacr_logs_err = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_seacr_calling.err")
+    final_output.extend(expand(seacr_output, SAMPLE=IP_ALL))
+    include: os.path.join(RULES, "seacr_noCTL.rules")   
 
 
 #----------------------------------
diff --git a/workflow/rules/metrics_peaks.rules b/workflow/rules/metrics_peaks.rules
index 95737d7..2c71aee 100755
--- a/workflow/rules/metrics_peaks.rules
+++ b/workflow/rules/metrics_peaks.rules
@@ -66,40 +66,41 @@ rule metrics_peaks:
         for mark in marks:
             for cond in conds:
                 sample = mark+"_"+cond
-                d['Cond'].append(mark+"_"+cond)
-                for file in inputs:
-                    if sample in file:
-                        if 'vs'+params.rep in file:
-                            nb_rep = sum(1 for line in open(file))
-                            d['Replicates'].append(nb_rep)
-                        if 'vsPPR' in file:
-                            nb_ppr = sum(1 for line in open(file))
-                            d['PPR'].append(nb_ppr)
-                        if 'vsSPR1' in file:
-                            nb_spr1 = sum(1 for line in open(file))
-                            d['SPR1'].append(nb_spr1)
-                        if 'vsSPR2' in file:
-                            nb_spr2 = sum(1 for line in open(file))
-                            d['SPR2'].append(nb_spr2)
-                #catch illegal division by zero
-                try:
-                    scr = round(max(nb_spr1,nb_spr2)/min(nb_spr1,nb_spr2), 2)
-                except ZeroDivisionError:
-                    scr = 0
-                try:
-                    rr = round(max(nb_rep,nb_ppr)/min(nb_rep,nb_ppr), 2)
-                except ZeroDivisionError:
-                    rr = 0
-                d['SCR'].append(scr)
-                d['RR'].append(rr)
-                if (scr < 2 and rr != 0 ) and (rr < 2 and rr != 0 ) :
-                    d['Score'].append(1)
-                elif (scr > 2 or scr == 0 ) and (rr < 2 and rr != 0 ) :
-                    d['Score'].append(0)
-                elif ( scr < 2 and scr != 0 ) and (rr > 2 or rr == 0 ) :
-                    d['Score'].append(0)
-                else:
-                    d['Score'].append(-1)
+                if any(sample in element for element in inputs):
+                    d['Cond'].append(sample) 
+                    for file in inputs:
+                        if sample in file:
+                            if 'vs'+params.rep in file:
+                                nb_rep = sum(1 for line in open(file))
+                                d['Replicates'].append(nb_rep)
+                            if 'vsPPR' in file:
+                                nb_ppr = sum(1 for line in open(file))
+                                d['PPR'].append(nb_ppr)
+                            if 'vsSPR1' in file:
+                                nb_spr1 = sum(1 for line in open(file))
+                                d['SPR1'].append(nb_spr1)
+                            if 'vsSPR2' in file:
+                                nb_spr2 = sum(1 for line in open(file))
+                                d['SPR2'].append(nb_spr2)
+                    #catch illegal division by zero
+                    try:
+                        scr = round(max(nb_spr1,nb_spr2)/min(nb_spr1,nb_spr2), 2)
+                    except ZeroDivisionError:
+                        scr = 0
+                    try:
+                        rr = round(max(nb_rep,nb_ppr)/min(nb_rep,nb_ppr), 2)
+                    except ZeroDivisionError:
+                        rr = 0
+                    d['SCR'].append(scr)
+                    d['RR'].append(rr)
+                    if (scr < 2 and rr != 0 ) and (rr < 2 and rr != 0 ) :
+                        d['Score'].append(1)
+                    elif (scr > 2 or scr == 0 ) and (rr < 2 and rr != 0 ) :
+                        d['Score'].append(0)
+                    elif ( scr < 2 and scr != 0 ) and (rr > 2 or rr == 0 ) :
+                        d['Score'].append(0)
+                    else:
+                        d['Score'].append(-1)
 
 
 
-- 
GitLab