#########################################################################
# ePeak: Standardize and reproducible ChIP-seq analysis from raw        #
#           data to differential analysis                               #
# Authors: Rachel Legendre, Maelle Daunesse                             #
# Copyright (c) 2019-2020  Institut Pasteur (Paris) and CNRS.           #
#                                                                       #
# This file is part of ePeak workflow.                                  #
#                                                                       #
# ePeak is free software: you can redistribute it and/or modify         #
# it under the terms of the GNU General Public License as published by  #
# the Free Software Foundation, either version 3 of the License, or     #
# (at your option) any later version.                                   #
#                                                                       #
# ePeak is distributed in the hope that it will be useful,              #
# but WITHOUT ANY WARRANTY; without even the implied warranty of        #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the          #
# GNU General Public License for more details.                          #
#                                                                       #
# You should have received a copy of the GNU General Public License     #
# along with ePeak (LICENSE).                                           #
# If not, see <https://www.gnu.org/licenses/>.                          #
#########################################################################

import pandas as pd
from fnmatch import fnmatch
from re import sub, match
from itertools import repeat, chain
import os
32
import glob
#-------------------------------------------------------
# read config files

configfile: "config/config.yaml"
# relative path to the rule files shipped with the workflow
RULES = os.path.join("workflow", "rules")

# user-supplied design table: tab-separated, one line per IP/INPUT pair
design_user = pd.read_csv(config["design"]["design_file"], header=0, sep='\t')
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78

"""
REQUIREMENTS IN DESIGN:
 - all files in one directory
 - fullname of files must be :
        MARK_COND_REP_MATE.fastq.gz
 - name on design must be:
        MARK_COND


"""

#-------------------------------------------------------
# list of all files in the directory 'input_dir'

# keep only files that end with <mate tag><extension>
filenames = [fastq for fastq in os.listdir(config["input_dir"])
             if match(r'.*' + config["input_mate"] + config["input_extension"], fastq)]
if not filenames:
    raise ValueError("Please provides input fastq files")

#-------------------------------------------------------
# paired-end data gestion

mate_pair = config["input_mate"]
rt1 = mate_pair.replace("[12]", "1")   # concrete tag of R1 files
rt2 = mate_pair.replace("[12]", "2")   # concrete tag of R2 files

# one entry per file carrying each mate tag (lengths compared below)
R1 = [1 for fastq in filenames if rt1 in fastq]
R2 = [1 for fastq in filenames if rt2 in fastq]

if not R2:
    # no R2 file at all: single-end dataset
    paired = False
elif R1 == R2:
    # same number of R1 and R2 files: proper paired-end dataset
    paired = True
else:
    raise ValueError("Please provides single or paired files only")

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
79
80
81
82
83
84
# get sample names: strip the mate tag + extension from every R1 file
r1_suffix = rt1 + config["input_extension"]
filename_R1 = [fastq for fastq in filenames if match(r'.*' + r1_suffix, fastq)]
samples = [sub(r1_suffix, '', fastq) for fastq in filename_R1]
# marks and conditions are comma-separated lists in the config file
marks = [item.strip() for item in config["design"]["marks"].split(",")]
conds = [item.strip() for item in config["design"]["condition"].split(",")]
rep_flag = config["design"]["replicates"]
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
85

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
86

87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
# -----------------------------------------------
# Compute number of replicates by IP/INPUT

# get uniq IP and INPUT names (order of first appearance)
unique_ip = pd.unique(design_user["IP_NAME"])
unique_input = pd.unique(design_user["INPUT_NAME"])
# Fill new design with informations about IP.
# reindex aligns counts by label: value_counts() sorts by frequency,
# so assigning .values positionally could mismatch names and counts.
design = pd.DataFrame(unique_ip, columns=['IP_NAME'])
design['NB_IP'] = design_user["IP_NAME"].value_counts().reindex(unique_ip).values
# initialize INPUT related columns
inputs = []
nb_input = []
# fill new design with informations about INPUT
for row in design.itertuples(index=True, name='Pandas'):
    corr_input = design_user.loc[design_user['IP_NAME'] == getattr(row, "IP_NAME"), 'INPUT_NAME']
    inputs.append(corr_input.iloc[0])
    # .item() replaces the removed int(ndarray) conversion (numpy >= 1.25):
    # it still enforces exactly one INPUT name per IP and returns its count
    nb_input.append(int(corr_input.value_counts().item()))
design['INPUT_NAME'] = inputs
design['NB_INPUT'] = nb_input


Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -----------------------------------------------
# get list of INPUT files

# the Rep1 INPUT file of each mark is repeated once per IP replicate,
# so INPUT stays index-aligned with the IP file list
INPUT = []
for row in design.itertuples(index=True, name='Pandas'):
    input_name = getattr(row, "INPUT_NAME")
    if pd.isna(input_name):
        continue
    for fastq in samples:
        if fnmatch(fastq, input_name + "*" + rep_flag + "1*"):
            stripped = sub(mate_pair, '', fastq)
            # one occurrence per IP replicate of this mark
            INPUT.extend([stripped] * getattr(row, "NB_IP"))

# -----------------------------------------------
# get list of IP files

# every sample whose name starts with an IP name, mate tag stripped,
# de-duplicated across overlapping IP prefixes
IP_ALL = []
for ip_name in design['IP_NAME']:
    for fastq in samples:
        if fastq not in IP_ALL and fnmatch(fastq, ip_name + "*"):
            IP_ALL.append(sub(mate_pair, '', fastq))


# -----------------------------------------------
# check design
# Select mark for statistical analysis if there is more than one condition (with min 2 rep) for each mark
# A mark is kept once a *second* design row with >1 replicates and a known
# condition suffix is seen for it (first qualifying row only increments nb).

MARK_OK = []
CORR_INPUT_OK = [] # corresponding INPUT condition
nb = 0
for row in design.itertuples(index=True, name='Pandas'):
    for mark in marks:
        if mark == getattr(row, "IP_NAME").split("_")[0]:
            # nb_rep leaks out of this loop on purpose: it is read later
            # by the replicate-name section (holds the last matching NB_IP)
            nb_rep = getattr(row, "NB_IP")
            if (getattr(row, "IP_NAME")).endswith(tuple(conds)) and getattr(row, "NB_IP") > 1:
                # NOTE(review): nb is shared across marks (not reset per mark),
                # so qualifying rows of different marks may interleave — confirm intended
                if nb >= 1 and mark not in MARK_OK:
                    MARK_OK.append(mark)
                    CORR_INPUT_OK.append(getattr(row, "INPUT_NAME").split("_")[0])
                    break
                else:
                    nb += 1
nb = 0

156

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
157
158
159
160
161
162
163
164
if config["differential_analysis"]["input_counting"]:
    # corresponding INPUT files for input counting: every sample whose
    # name starts with a selected INPUT mark, mate tag stripped, unique
    CORR_INPUT = []
    for input_mark in CORR_INPUT_OK:
        for fastq in samples:
            if fastq not in CORR_INPUT and fnmatch(fastq, input_mark + "_*"):
                CORR_INPUT.append(sub(mate_pair, '', fastq))
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
165
  
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184

# -----------------------------------------------
# built list of MARK_COND_REP from config.yaml and check correspondance between design, config and fastq files

if len(conds) > 1 :
    # built list of MARK_COND_REP from config.yaml
    conf_cond = ["{mark}_{cond}_{flag}".format(cond=cond, mark=mark, flag=rep_flag) for cond in conds for mark in marks]
    for row in design.itertuples(index=True, name='Pandas'):
        for elem in conf_cond:
            # NOTE(review): this branch fires when elem IS a substring of
            # IP_NAME, which looks inverted — confirm the intended test
            if elem in getattr(row, "IP_NAME"):
                raise ValueError("Please check correspondance between config and design file: %s is not %s "
                                 % (elem,getattr(row, "IP_NAME")))
            elif not any(elem in s for s in samples):
                raise ValueError("Please check correspondance between config file and fastq filenames: %s not found in %s"
                                 % (elem, samples))
            # use != instead of `is not`: identity comparison on ints is
            # unreliable outside CPython's small-integer cache
            elif sum(getattr(row, "IP_NAME")+"_"+rep_flag in s for s in samples) != int(getattr(row, "NB_IP")):
                raise ValueError("Please check correspondance between number of replicates and/or prefix names in design "
                                 "file and fastq filenames: %s not found %s times in %s" % (getattr(row, "IP_NAME"),
                                                                                             getattr(row, "NB_IP"),samples))
            elif sum(getattr(row, "INPUT_NAME")+"_"+rep_flag in s for s in samples) != int(getattr(row, "NB_INPUT")):
                raise ValueError("Please check correspondance between number of replicates and/or prefix names in design "
                                 "file and fastq filenames: %s not found %s times in %s" % (getattr(row, "INPUT_NAME"), getattr(row, "NB_INPUT"),samples))
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
188
189


190
191
192
193
194
195
196
197
198
199
# -----------------------------------------------
# get correspondance between IP and INPUT files
# one table row per design line, names suffixed <rep_flag><replicate number>
d = {"IP": [], "INPUT": []}
for row in design_user.itertuples(index=True, name='Pandas'):
    d["IP"].append("{}_{}{}".format(getattr(row, "IP_NAME"), rep_flag, getattr(row, "NB_IP")))
    d["INPUT"].append("{}_{}{}".format(getattr(row, "INPUT_NAME"), rep_flag, getattr(row, "NB_INPUT")))

IP_INPUT = pd.DataFrame.from_dict(d)
200

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
# -----------------------------------------------
# get REP names

###### INCLUDE BETTER GESTION OF AUTO REPLICATES WITH 3 REP
###### TODO use https://snakemake.readthedocs.io/en/stable/project_info/faq.html#i-don-t-want-expand-to-use-every-wildcard-what-can-i-do 
#get list with replicates names for all IP
# NOTE(review): nb_rep is assigned inside the design-check loop above and
# holds the NB_IP of the LAST matching design row; it is undefined (NameError)
# if no mark matched — confirm this is the intended source of truth.

if nb_rep > 1 :
    rep = [rep_flag+'1', rep_flag+'2']
    # deduce from rep the list for SPR & PPR
    # (self pseudo-replicates and pooled pseudo-replicates for IDR)
    ppr = ['PPR1', 'PPR2', 'PPRPool']
    spr = ['SPR1.1', 'SPR1.2', 'SPR2.1', 'SPR2.2']
else :
    rep = [rep_flag+'1']


Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# -----------------------------------------------
# From the design file, we get only IP with more than one replicate and at least one INPUT

IP_REP = [] # all IP passing IDR
IP_REP_DUP = [] # corresponding INPUT
IP_NO_INPUT = [] # all IP with replicates but without INPUT
INPUT_NA = [] # corresponding INPUT (list of NA according to number of IP)
IP_NA = [] # all IP without replicates and without INPUT
for row in design.itertuples(index=True, name='Pandas'):
    # replicated INPUT -> the pooled INPUT is used as control
    if getattr(row, "NB_IP") > 1 and getattr(row, "NB_INPUT") > 1:
        IP_REP_DUP.append(getattr(row, "INPUT_NAME")+"_Pool")
        IP_REP.append(getattr(row, "IP_NAME"))
    # single INPUT -> its Rep1 file is used as control
    elif getattr(row, "NB_IP") > 1 and getattr(row, "NB_INPUT") == 1:
        IP_REP_DUP.append(getattr(row, "INPUT_NAME")+"_"+rep_flag+"1")
        IP_REP.append(getattr(row, "IP_NAME"))
    elif getattr(row, "NB_IP") > 1 and getattr(row, "NB_INPUT") < 1:
        IP_NO_INPUT.append(getattr(row, "IP_NAME"))
        INPUT_NA.append('NA')
    elif getattr(row, "NB_IP") == 1 and getattr(row, "NB_INPUT") < 1:
        IP_NA.append(getattr(row, "IP_NAME"))

# We get only INPUT with at least one replicate that are linked to an IP with multiple replicates
INPUT_REP = []
for row in design.itertuples(index=True, name='Pandas'):
    if getattr(row, "NB_INPUT") > 1 and getattr(row, "NB_IP") > 1 and (getattr(row, "INPUT_NAME")) not in INPUT_REP:
        INPUT_REP.append(getattr(row, "INPUT_NAME"))
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
245

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
246
247
248
249
250
251
# -----------------------------------------------
# get IP passed pre IDR step (for rep, ppr and spr)

IP_IDR = []
IP_SPR = []
IP_PPR = []
PPR_POOL = []   # NOTE(review): re-initialized by the pooled-IP section below
INPUT_SPR = []
INPUT_PPR = []
#IDR_REP = []

# for only IP with more than one replicate
for cond in IP_REP:
    tmp = []
    for ip in IP_ALL:
        # name[0] = IP name, name[1] = replicate number
        name = ip.split("_" + rep_flag)
        if ip.startswith(cond):
            # derive pseudo-replicate file names from the replicate name
            spr_file = sub(rep_flag, 'SPR', ip)
            ppr_file = sub(rep_flag, 'PPR', ip)
            pool_file = sub(r'' + rep_flag + '[0-9]*', 'PPRPool', ip)
            tmp.append(ip)
            #IDR_REP.append("Rep"+name[1])
            IP_SPR.append(spr_file + ".1")
            IP_SPR.append(spr_file + ".2")
            IP_PPR.append(ppr_file)
            PPR_POOL.append(pool_file)
            # add corresponding INPUT
            # get INPUT for this IP and this rep
            row_mask = (design_user['IP_NAME'] == name[0]) & (design_user['NB_IP'] == int(name[1]))
            ctl = design_user.loc[row_mask, 'INPUT_NAME'].iloc[0] \
                  + "_" + rep_flag + str(design_user.loc[row_mask, 'NB_INPUT'].iloc[0])
            # check is INPUT is a POOL or not (replicated INPUTs are pooled)
            first_input = design_user.loc[design_user['IP_NAME'] == name[0], 'INPUT_NAME'].iloc[0]
            if design.loc[design['INPUT_NAME'] == first_input, 'NB_INPUT'].iloc[0] > 1:
                ctl_pool = first_input + '_Pool'
            else:
                ctl_pool = ctl
            # tmp_df from all preIDR + input
            tmp_df = pd.DataFrame({"IP": [spr_file + ".1", spr_file + ".2", ppr_file, pool_file],
                                   "INPUT": [ctl, ctl, ctl, ctl_pool]})
            # add to IP_INPUT; DataFrame.append was removed in pandas 2.0,
            # pd.concat is the supported equivalent
            IP_INPUT = pd.concat([IP_INPUT, tmp_df], ignore_index=True)
    IP_IDR.append(tmp)


# -----------------------------------------------
# get pooled IP passed pre IDR step and corresponding INPUT

PPR_POOL = []
INPUT_POOL = []
for row in design.itertuples(index=True, name='Pandas'):
    #print(row)
    if getattr(row, "NB_IP") > 1 and getattr(row, "NB_INPUT") > 1 :
        PPR_POOL.append(getattr(row, "IP_NAME") + "_PPRPool")
        INPUT_POOL.append(getattr(row, "INPUT_NAME") + "_Pool")
        pass_pool = 0
    elif getattr(row, "NB_IP") > 1 and getattr(row, "NB_INPUT") == 1 :
        PPR_POOL.append(getattr(row, "IP_NAME") + "_PPRPool")
        INPUT_POOL.append(getattr(row, "INPUT_NAME") + "_" + rep_flag + "1")
        pass_pool = 1
# NOTE(review): pass_pool keeps the value from the LAST matching design row
# (and is undefined if no row matches) — confirm intended

301
302


Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# all files passing PhantomPeakQualTool if no-model is chosen
ALL = IP_ALL + IP_SPR + IP_PPR + PPR_POOL


# all files passing peak calling step
ALL_IP_PC = IP_ALL + IP_SPR + IP_PPR + PPR_POOL
ALL_INPUT_PC = INPUT + INPUT_SPR + INPUT_PPR + INPUT_POOL

# get files for IDR: each replicated IP/INPUT pair is evaluated for the
# four cases (true replicates, PPR, SPR1.x, SPR2.x), so every entry of
# IP_REP / IP_REP_DUP is repeated four consecutive times
CASE = [rep_flag, "PPR", "SPR1.", "SPR2."] * len(IP_REP)
REP_IDR = [ip for ip in IP_REP for _ in range(4)]
IN_IDR = [ctl for ctl in IP_REP_DUP for _ in range(4)]

# -----------------------------------------------
# Add wildcard constraints
# Restrict each wildcard to its expected naming scheme so Snakemake does
# not mis-parse sample names that share prefixes (e.g. Rep vs SPR vs PPR).

wildcard_constraints:
    sample = "[A-Za-z-_0-9]+_{0}[0-9]+".format(rep_flag),
    IP_REP = "[A-Za-z-_0-9]+_{0}[0-9]+".format(rep_flag),
    REP = "{0}[0-9]+".format(rep_flag),
    SPR = "[A-Za-z-_0-9]+_SPR[0-9]\.[1-4]*",
    PPR = "[A-Za-z-_0-9]+_PPR[0-9]*",
    POOL = "[A-Za-z-_0-9]+_PPRPool",
    INPUT_POOL = "[A-Za-z-_0-9]+_(Pool|{0}1)".format(rep_flag),
    MARK = "[A-Za-z-_0-9]+"

# -----------------------------------------------
# initialize global variables

analysis_dir = config["analysis_dir"]
# exist_ok avoids the check-then-create race of the previous
# os.path.exists() + os.makedirs() pair
os.makedirs(analysis_dir, exist_ok=True)

sample_dir = config["input_dir"]

# raw fastq pattern(s) consumed by the first rules ({{SAMPLE}} stays a wildcard)
input_data = [sample_dir + "/{{SAMPLE}}{}{}".format(rt1, config["input_extension"])]
if paired:
    input_data += [sample_dir + "/{{SAMPLE}}{}{}".format(rt2, config["input_extension"])]
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
341
342
343
344
345
346
347
348
349
350
351
# global variable for all output files
final_output = []

# -----------------------------------------------
# begin of the rules

#----------------------------------
# quality control
#----------------------------------

fastqc_input_fastq = input_data
fastqc_output_done = os.path.join(analysis_dir, "00-Fastqc/{{SAMPLE}}{}_fastqc.done".format(rt1))
fastqc_wkdir = os.path.join(analysis_dir, "00-Fastqc")
fastqc_log = os.path.join(analysis_dir, "00-Fastqc/logs/{{SAMPLE}}{}_fastqc_raw.log".format(rt1))
final_output.extend(expand(fastqc_output_done, SAMPLE=samples))
include: os.path.join(RULES, "fastqc.rules")

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
358
359


Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
360
361
362
363
364
365
366
367
368
#----------------------------------
# Remove adapters
#----------------------------------

if config["adapters"]["remove"] :

    ## TODO add AlienTrimmer
    adapter_tool = "adapters"
    cutadapt_input_fastq = input_data
    cutadapt_wkdir = os.path.join(analysis_dir, "01-Trimming")
    cutadapt_output = [os.path.join(analysis_dir, "01-Trimming/{{SAMPLE}}{}_trim{}".format(rt1, config["input_extension"]))]
    if paired:
        cutadapt_output += [os.path.join(analysis_dir, "01-Trimming/{{SAMPLE}}{}_trim{}".format(rt2, config["input_extension"]))]

    # Set parameters
    #todo write the good line according to tool
    cutadapt_adapt_list = config["adapters"]["adapter_list"]
    cutadapt_options = config["adapters"]["options"]
    cutadapt_mode = config["adapters"]["mode"]
    cutadapt_min  = config["adapters"]["m"]
    cutadapt_qual = config["adapters"]["quality"]
    cutadapt_log = os.path.join(analysis_dir, "01-Trimming/logs/{SAMPLE}_trim.txt")
    #final_output.extend(expand(cutadapt_output, SAMPLE=samples))
    include: os.path.join(RULES, "cutadapt.rules")

else:
    # trimming disabled: downstream rules read the raw fastq directly
    cutadapt_output = input_data


#----------------------------------
# genome gestion
#----------------------------------

ref = config["genome"]["name"]

#if config["design"]["spike"]:
#    ref += [ "spikes" ]

# TODO add possibility of index name


def mapping_index(wildcards):
    """Return the fasta file matching the {REF} wildcard.

    Resolves either to the main genome fasta or, for the "spikes"
    pseudo-reference, to the spike-in genome fasta.

    Raises:
        ValueError: if REF matches neither (previously this fell through
        to an opaque UnboundLocalError).
    """
    if wildcards.REF == config["genome"]["name"]:
        return config["genome"]["fasta_file"]
    elif wildcards.REF == "spikes":
        return config["design"]["spike_genome_file"]
    raise ValueError("Unknown reference: {}".format(wildcards.REF))

#----------------------------------
# bowtie2 indexing
#----------------------------------

# index paths are needed by the mapping rule whether or not we build the
# index ourselves, so they are defined once outside the branch
bowtie2_index_output_done = os.path.join(config["genome"]["genome_directory"], "{REF}.1.bt2")
bowtie2_index_output_prefix = os.path.join(config["genome"]["genome_directory"], "{REF}")

if config["genome"]["index"]:
    # indexing for bowtie2
    bowtie2_index_fasta = mapping_index
    bowtie2_index_log = os.path.join(analysis_dir, "02-Mapping/logs/bowtie2_{REF}_indexing.log")
    #final_output.extend(expand(bowtie2_index_output_done, REF=ref))
    include: os.path.join(RULES, "bowtie2_index.rules")

#----------------------------------
# bowtie2 MAPPING
#----------------------------------

bowtie2_mapping_input = cutadapt_output
bowtie2_mapping_index_done = bowtie2_index_output_done
bowtie2_mapping_sort = os.path.join(analysis_dir, "02-Mapping/{SAMPLE}_{REF}_sort.bam")
bowtie2_mapping_bam = os.path.join(analysis_dir, "02-Mapping/{SAMPLE}_{REF}.bam")
bowtie2_mapping_sortprefix = os.path.join(analysis_dir, "02-Mapping/{SAMPLE}_{REF}_sort")
bowtie2_mapping_logs_err = os.path.join(analysis_dir, "02-Mapping/logs/{SAMPLE}_{REF}_mapping.err")
bowtie2_mapping_logs_out = os.path.join(analysis_dir, "02-Mapping/logs/{SAMPLE}_{REF}_mapping.out")
bowtie2_mapping_prefix_index = bowtie2_index_output_prefix
bowtie2_mapping_options = config["bowtie2_mapping"]["options"]
#final_output.extend(expand(bowtie2_mapping_sort, SAMPLE=samples, REF=ref))
include: os.path.join(RULES, "bowtie2_mapping.rules")

# default location/suffix of the bam consumed by downstream steps;
# updated by the deduplication and biased-region sections below
biasedRegions_dir = os.path.join(analysis_dir, "02-Mapping")
biasedRegions = ""
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
442
443
444
445
446


#----------------------------------
# Mark duplicated reads
#----------------------------------

dedup_IP = config['mark_duplicates']['dedup_IP']

def dedup(wildcards):
    """Return "true"/"false" for the duplicate-removal flag of {SAMPLE}.

    Non-IP (INPUT) samples are always deduplicated; IP samples follow
    the `dedup_IP` config value.
    """
    if dedup_IP == "true":
        return "true"
    elif (wildcards.SAMPLE in IP_ALL) and dedup_IP == "false":
        return "false"
    else:
        return "true"

if config['mark_duplicates']['do']:
    biasedRegions = "_dedup"
    biasedRegions_dir = os.path.join(analysis_dir, "03-Deduplication")
    mark_duplicates_input = bowtie2_mapping_sort
    mark_duplicates_output = os.path.join(analysis_dir, "03-Deduplication/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions))
    mark_duplicates_metrics = os.path.join(analysis_dir, "03-Deduplication/{SAMPLE}_{REF}_sort_dedup.txt")
    mark_duplicates_remove = dedup
    # fixed stray double brace ({REF}}) that injected a literal '}' into the log path
    mark_duplicates_log_std = os.path.join(analysis_dir, "03-Deduplication/logs/{SAMPLE}_{REF}_sort_dedup.out")
    mark_duplicates_log_err = os.path.join(analysis_dir, "03-Deduplication/logs/{SAMPLE}_{REF}_sort_dedup.err")
    mark_duplicates_tmpdir = config['tmpdir']
    #final_output.extend(expand(mark_duplicates_output, SAMPLE=samples, REF=ref))
    include: os.path.join(RULES, "mark_duplicates.rules")
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
469
470
471
472
473
474
475
476
477



#----------------------------------
# Spikes counting
#----------------------------------

if config["design"]["spike"]:
    # counting on spikes
    # NOTE(review): biasedRegions_dir already contains analysis_dir; this
    # os.path.join only resolves correctly when analysis_dir is absolute — confirm
    spikes_counting_input = expand(os.path.join(analysis_dir, "{}/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions_dir, biasedRegions)), SAMPLE=samples, REF="spikes")
    spikes_counting_output_json = os.path.join(analysis_dir, "09-CountMatrix/Spikes_count.json")
    spikes_counting_output = os.path.join(analysis_dir, "Spikes_metrics.out")
    spikes_counting_log = os.path.join(analysis_dir, "03-Deduplication/logs/Spikes_metrics.out")
    final_output.extend([spikes_counting_output_json])
    include: os.path.join(RULES, "spikes_counting.rules")

    # compute scale factor if spike-in
    compute_scaling_factor_input = "{}/{{SAMPLE}}_spikes_sort{}.bam".format(biasedRegions_dir, biasedRegions)
    # (dropped an unused second .format argument)
    compute_scaling_factor_output = "{}/{{SAMPLE}}_scaleFactor.txt".format(biasedRegions_dir)
    #final_output.extend(expand(compute_scaling_factor_output, SAMPLE=samples))
    include: os.path.join(RULES, "compute_scaling_factor.rules")
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
491
492

#----------------------------------
# Remove biased regions
#----------------------------------

if config["remove_biasedRegions"]["do"]:
    # input still points at the PREVIOUS stage's directory/suffix:
    # biasedRegions/_dir are updated just below for downstream steps
    remove_biasedRegions_input = os.path.join(analysis_dir, "{}/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions_dir, biasedRegions))
    biasedRegions += "_biasedRegions"
    biasedRegions_dir = os.path.join(analysis_dir, "04-NobiasedRegions")
    remove_biasedRegions_output = os.path.join(analysis_dir, "04-NobiasedRegions/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions))
    remove_biasedRegions_log_std = os.path.join(analysis_dir, "04-NobiasedRegions/logs/{{SAMPLE}}_{{REF}}_sort{}.out".format(biasedRegions))
    remove_biasedRegions_log_err = os.path.join(analysis_dir, "04-NobiasedRegions/logs/{{SAMPLE}}_{{REF}}_sort{}.err".format(biasedRegions))
    #final_output.extend(expand(remove_biasedRegions_output, SAMPLE=samples, REF=ref))
    include: os.path.join(RULES, "remove_biasedRegions.rules")
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
505

506
507
508
509
#----------------------------------
# Coverage step
#----------------------------------
if config["bamCoverage"]["do"]:
    bamCoverage_input = "{}/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions_dir, biasedRegions)
    bamCoverage_logs = os.path.join(analysis_dir, "12-IGV/logs/{SAMPLE}_{REF}.out")
    bamCoverage_output = os.path.join(analysis_dir, "12-IGV/{SAMPLE}_{REF}_coverage.bw")
    bamCoverage_options = config['bamCoverage']['options']
    if config["bamCoverage"]["spike-in"] and config["design"]["spike"]:
        bamCoverage_scaleFactor = compute_scaling_factor_output
    else:
        # no spike-in scaling: the bam itself stands in as the dependency
        bamCoverage_scaleFactor = "{}/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions_dir, biasedRegions)
    final_output.extend(expand(bamCoverage_output, SAMPLE=samples, REF=ref, allow_missing=True))
    include: os.path.join(RULES, "bamCoverage.rules")


522
523
524
#----------------------------------
# Estimate Library Complexity
#----------------------------------
lib_complexity_input = "{}/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions_dir, biasedRegions)
lib_complexity_metrics = os.path.join(analysis_dir, "05-QC/Complexity/{SAMPLE}_{REF}_metrics.txt")
lib_complexity_log_std = os.path.join(analysis_dir, "05-QC/Complexity/logs/{SAMPLE}_{REF}_complexity.out")
lib_complexity_log_err = os.path.join(analysis_dir, "05-QC/Complexity/logs/{SAMPLE}_{REF}_complexity.err")
final_output.extend(expand(lib_complexity_metrics, SAMPLE=samples, REF=ref))
include: os.path.join(RULES, "EstimateLibraryComplexity.rules")

#----------------------------------
# plot Fingerprint
#----------------------------------
def IPandINPUT(wildcards):
    """Return [INPUT bam, IP bam] for the {IP} wildcard.

    The control bam is looked up in the IP/INPUT correspondence table;
    the IP bam keeps {IP} as an unresolved wildcard for Snakemake.
    """
    ctl = IP_INPUT[IP_INPUT.IP == wildcards.IP].iloc[0]['INPUT']
    return [str(biasedRegions_dir + "/" + ctl + "_{}_sort{}.bam".format(ref, biasedRegions)),
            "{}/{{IP}}_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)]


plotFingerprint_input = IPandINPUT
plotFingerprint_output = os.path.join(analysis_dir, "05-QC/Fingerprint/{{IP}}_{}_fingerprint.pdf".format(ref))
plotFingerprint_output_raw = os.path.join(analysis_dir, "05-QC/Fingerprint/{{IP}}_{}_fingerprint_rawcounts.txt".format(ref))
plotFingerprint_options = ""
plotFingerprint_logs = os.path.join(analysis_dir, "05-QC/Fingerprint/logs/{{IP}}_{}_Fingerprint.out".format(ref))
final_output.extend(expand(plotFingerprint_output, IP=IP_ALL))
include: os.path.join(RULES, "plotFingerprint.rules")

#----------------------------------
# GeneBody plot
#----------------------------------
# computeMatrix (scale-regions) + plotHeatmap over the configured regions file.
# NOTE(review): computeMatrix_input reuses bamCoverage_output, which is only
# defined when config["bamCoverage"]["do"] is true — confirm the config schema
# forbids geneBody.do without bamCoverage.do.
if config['geneBody']['do']:
    computeMatrix_input = bamCoverage_output
    computeMatrix_output = os.path.join(analysis_dir, "05-QC/GeneBody/{SAMPLE}_{REF}.mat.gz")
    computeMatrix_regions = config['geneBody']['regionsFileName']
    computeMatrix_options = " --beforeRegionStartLength 3000 --regionBodyLength 5000 --afterRegionStartLength 3000 --skipZeros "
    computeMatrix_mode = "scale-regions"
    computeMatrix_logs = os.path.join(analysis_dir, "05-QC/GeneBody/logs/{SAMPLE}_{REF}_matrix.out")
    #final_output.extend(expand(computeMatrix_output,  SAMPLE=samples, REF=ref))
    include: os.path.join(RULES, "computeMatrix.rules")

    # Heatmap is the only target requested; it pulls the matrix via its input.
    plotHeatmap_input = computeMatrix_output
    plotHeatmap_output = os.path.join(analysis_dir, "05-QC/GeneBody/{SAMPLE}_{REF}_GeneBody.pdf")
    plotHeatmap_options = ""
    plotHeatmap_logs = os.path.join(analysis_dir, "05-QC/GeneBody/logs/{SAMPLE}_{REF}_GeneBody.out")
    final_output.extend(expand(plotHeatmap_output,  SAMPLE=samples, REF=ref))
    include: os.path.join(RULES, "plotHeatmap.rules")
#------------------------------------------------
# fragment size distribution - Picard CollectInsertSizeMetrics
#------------------------------------------------
# Only meaningful for paired-end data (insert size needs read pairs).
if paired:
    insert_size_metrics_input = "{}/{{SAMPLE}}_{{REF}}_sort{}.bam".format(biasedRegions_dir, biasedRegions)
    insert_size_metrics_output_txt = os.path.join(analysis_dir, "05-QC/FragmentSizeDistribution/{SAMPLE}_{REF}_fragmentSizeDistribution.txt")
    insert_size_metrics_histo = os.path.join(analysis_dir, "05-QC/FragmentSizeDistribution/{SAMPLE}_{REF}_fragmentSizeDistribution.pdf")
    insert_size_metrics_log_out = os.path.join(analysis_dir, "05-QC/FragmentSizeDistribution/logs/{SAMPLE}_{REF}_fragment_size.out")
    insert_size_metrics_log_err = os.path.join(analysis_dir, "05-QC/FragmentSizeDistribution/logs/{SAMPLE}_{REF}_fragment_size.err")
    final_output.extend(expand(insert_size_metrics_output_txt, SAMPLE=samples, REF=ref))
    include: os.path.join(RULES, "fragment_size_distribution.rules")
580
#-------------------------------------
# then we performed peak calling only in files against reference genome and not spike genome !
# From this point on, `ref` is a single genome name: all peak-calling paths
# below interpolate it directly instead of expanding over a REF wildcard.
ref = config["genome"]["name"]
#----------------------------------
# preIDR on INPUT
#----------------------------------
# Pool INPUT replicates into a single control bam for IDR peak calling.
# `pass_pool` (set upstream) short-circuits this when pooling is not needed.
if len(INPUT_REP) > 0 and pass_pool == 0 :
    preIDR_pool_input_bam = expand("%s/{{INPUT}}_{REP}_%s_sort%s.bam" % (biasedRegions_dir,ref, biasedRegions), REP = rep)
    preIDR_pool_log = "%s/logs/{INPUT}_preIDR_input.o" % (biasedRegions_dir)
    # With more than two replicates a MaxiPool (all replicates) is produced
    # in addition to the standard Pool.
    if len(rep) > 2:
        preIDR_pool_output = ["{}/{{INPUT}}_Pool_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions),
                                 "{}/{{INPUT}}_MaxiPool_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)]
    else:
        preIDR_pool_output = "{}/{{INPUT}}_Pool_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)
    #final_output.extend(expand(preIDR_pool_output, INPUT=INPUT_REP))
    include: os.path.join(RULES, "preIDR_Pool.rules")
#----------------------------------
# preIDR on IP
#----------------------------------
# Build IDR inputs from IP replicates: self pseudo-replicates (SPR) and
# pooled pseudo-replicates (PPR).  Outputs are intermediate, not targets.
if len(IP_REP) > 0:
    # run SPR
    preIDR_SPR_input_bam = expand("%s/{{IP}}_{REP}_%s_sort%s.bam" % (biasedRegions_dir,ref, biasedRegions) , REP = rep)
    preIDR_SPR_log = "%s/logs/{IP}_preIDR_SPR.o"% (biasedRegions_dir)
    preIDR_SPR_output = expand("%s/{{IP}}_{SPR}_%s_sort%s.bam" % (biasedRegions_dir,ref, biasedRegions),  SPR = spr)
    #final_output.extend(expand(preIDR_SPR_output, IP=IP_REP))
    include: os.path.join(RULES, "preIDR_SPR.rules")

    # run PPR
    preIDR_PPR_input_bam = expand("%s/{{IP}}_{REP}_%s_sort%s.bam" % (biasedRegions_dir,ref, biasedRegions), REP = rep)
    preIDR_PPR_log = "%s/logs/{IP}_preIDR_PPR.o" % (biasedRegions_dir)
    preIDR_PPR_output =  expand("%s/{{IP}}_{PPR}_%s_sort%s.bam" % (biasedRegions_dir,ref, biasedRegions), PPR = ppr)
    #final_output.extend(expand(preIDR_PPR_output, IP=IP_REP))
    include: os.path.join(RULES, "preIDR_PPR.rules")
#----------------------------------
# Cross correlation
#----------------------------------

# PhantomPeakQualTools (spp) cross-correlation QC.
# The original if/else duplicated every assignment and the include; the two
# branches only differed in WHICH samples are requested as targets:
# with --nomodel peak calling the fragment-length estimate is needed for all
# samples (IP and INPUT), otherwise only IP samples are assessed.
spp_input = "{}/{{ALL}}_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)
spp_output_pdf = os.path.join(analysis_dir, "05-QC/PhantomPeakQualTools/{{ALL}}_{}_sort{}_phantom.pdf".format(ref, biasedRegions))
spp_metrics = os.path.join(analysis_dir, "05-QC/PhantomPeakQualTools/{{ALL}}_{}_sort{}_spp.out".format(ref, biasedRegions))
spp_log_std = os.path.join(analysis_dir, "05-QC/PhantomPeakQualTools/logs/{ALL}_phantom.out")
spp_log_err = os.path.join(analysis_dir, "05-QC/PhantomPeakQualTools/logs/{ALL}_phantom.err")
spp_tmpdir = config['tmpdir']
if config["macs2"]["no_model"]:
    final_output.extend(expand(spp_metrics, ALL=ALL))
else:
    final_output.extend(expand(spp_metrics, ALL=IP_ALL))
include: os.path.join(RULES, "spp.rules")

#----------------------------------
# Peak Calling with MACS2
#----------------------------------

if config["macs2"]["do"]:

    def INPUTtoIP(wildcards):
        """Return the bam of the control (INPUT) matched to the {SAMPLE} IP
        in the IP_INPUT design table (first matching row)."""
        return str(biasedRegions_dir + "/" + IP_INPUT[IP_INPUT.IP == wildcards.SAMPLE].iloc[0]['INPUT'] + "_{}_sort{}.bam".format(ref, biasedRegions))

    # model: "narrow" or "broad"; model_dir additionally encodes --nomodel
    # and names the output subdirectory (e.g. "narrow-nomodel").
    model = config["macs2"]["mode_choice"]
    model_dir = model
    mod = [model]
    peak_caller = ["macs2"]
    if config["macs2"]["no_model"]:
        model_dir += "-nomodel"
    if model in ["narrow", "broad"]:
        # Peak calling options for the chosen mode.
        if model in ["narrow"]:
            macs2_options = "-p {} ".format(config["macs2"]['cutoff']) + config["macs2"]['options']
        else:
            # NOTE(review): --broad-cutoff reuses the same 'cutoff' value as -p;
            # confirm this is intended rather than a separate config key.
            macs2_options = "-p {} --broad --broad-cutoff {} ".format(config["macs2"]['cutoff'],
                                config["macs2"]['cutoff']) + config["macs2"]['options']
        if config["macs2"]["no_model"]:
            macs2_options += " --nomodel "
            # With --nomodel the fragment length is read from the spp
            # cross-correlation metrics produced above.
            macs2_shift_file = os.path.join(analysis_dir, "05-QC/PhantomPeakQualTools/{{SAMPLE}}_{}_sort{}_spp.out".format(ref, biasedRegions))
        else:
            macs2_shift_file = "Empty"
        # FIX: the original `if paired: ... else: ...` assigned "no" in BOTH
        # branches (dead conditional); collapsed to a single assignment.
        # If paired-end (BAMPE) calling is ever wanted, set "yes" for paired
        # data here — TODO confirm the intent of disabling it.
        macs2_pe_mode = "no"

        macs2_input_bam = "{}/{{SAMPLE}}_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)
        macs2_control = INPUTtoIP
        macs2_log = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/logs/{{SAMPLE}}.out".format(model_dir))
        macs2_output = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{SAMPLE}}_peaks.{}Peak".format(model_dir, model))
        macs2_output_prefix = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{SAMPLE}}".format(model_dir))

        final_output.extend(expand(macs2_output, zip, SAMPLE=ALL_IP_PC))
        include: os.path.join(RULES, "macs2.rules")
    else :
        # Fail fast on an unsupported mode (error message fixed: grammar and
        # the offending value is now reported).
        raise ValueError("Please provide a valid model for MACS2 ('narrow' or 'broad'), got {!r}".format(model))
#----------------------------------
# Peak Calling with SEACR
#----------------------------------

if config["seacr"]["do"]:
    # Register seacr alongside (or instead of) macs2 in the caller/mode lists
    # used by the peak-calling metrics step.
    if not config["macs2"]["do"]:
        peak_caller = ["seacr"]
        mod = [config["seacr"]["threshold"]]
    else:
        peak_caller += ["seacr"]
        mod += [config["seacr"]["threshold"]]

    # produce bedgraph files (SEACR consumes bedgraphs, not bams)
    bedgraph_input = "{}/{{SAMPLE}}_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions)
    # Spike-in normalisation reuses the scaling-factor rule output; otherwise
    # the bam is passed through as its own (no-op) scale factor.
    if config["design"]["spike"]:
        bedgraph_scaleFactor = compute_scaling_factor_output
    else:
        bedgraph_scaleFactor = bedgraph_input
    bedgraph_genome = config["genome"]["fasta_file"]+".fai"
    if paired:
        bedgraph_options = " -bedpe "
    else:
        bedgraph_options = ""
    bedgraph_logs = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_bedgraph.out")
    bedgraph_output = os.path.join(analysis_dir, "06-PeakCalling/seacr/{SAMPLE}_bedgraph.bg")
    #final_output.extend(expand(bedgraph_output, SAMPLE=samples))
    include: os.path.join(RULES, "bedgraph.rules")

    def IGGtoIP(wildcards):
        # Bedgraph of the IgG/INPUT control matched to the {SAMPLE} IP.
        return str(os.path.join(analysis_dir,"06-PeakCalling/seacr/") + IP_INPUT[IP_INPUT.IP == wildcards.SAMPLE].iloc[0]['INPUT'] + "_bedgraph.bg")

    seacr_bed_ip = bedgraph_output
    seacr_bed_input = IGGtoIP
    seacr_bed_threshold = config["seacr"]["threshold"]
    seacr_bed_norm = config["seacr"]["norm"]
    seacr_output_prefix = os.path.join(analysis_dir, "06-PeakCalling/seacr/{SAMPLE}")
    seacr_output = os.path.join(analysis_dir, "06-PeakCalling/seacr/{{SAMPLE}}.{}.bed".format(config["seacr"]["threshold"]))
    seacr_logs_std = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_seacr_calling.out")
    seacr_logs_err = os.path.join(analysis_dir, "06-PeakCalling/seacr/logs/{SAMPLE}_seacr_calling.err")
    final_output.extend(expand(seacr_output, SAMPLE=IP_ALL))
    include: os.path.join(RULES, "seacr.rules")
Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
731
732
733
734
#----------------------------------
# Peak Calling metrics
#----------------------------------

def stats_pc_input(wildcards):
    """All per-sample peak files of one caller/mode, fed to the metrics rule.

    Returns None for an unknown caller, mirroring the original fall-through.
    """
    if wildcards.CALLER == "seacr":
        return expand(os.path.join(analysis_dir, "06-PeakCalling/{{CALLER}}/{IP_REP}.{{MOD}}.bed"), IP_REP=IP_ALL)
    if wildcards.CALLER == "macs2":
        return expand(os.path.join(analysis_dir, "06-PeakCalling/{{CALLER}}/%s/{IP_REP}_peaks.{{MOD}}Peak" % (model_dir)), IP_REP=IP_ALL)

# Peak-calling summary metrics, one table per caller/mode pair.
# NOTE(review): `peak_caller`/`mod` are only defined inside the macs2/seacr
# sections above — this expand raises NameError if both callers are disabled;
# confirm the config schema guarantees at least one caller.
# NOTE(review): the log path ends in the same "_Peaks_metrics.out" name as the
# csv output — looks like a copy/paste; confirm intended.
stats_peakCalling_input = stats_pc_input
stats_peakCalling_csv = os.path.join(analysis_dir, "{CALLER}_{MOD}_Peaks_metrics.out")
stats_peakCalling_marks = marks
stats_peakCalling_conds = conds
stats_peakCalling_rep = rep_flag
stats_peakCalling_log = os.path.join(analysis_dir, "06-PeakCalling/{CALLER}/{MOD}_Peaks_metrics.out")
include: os.path.join(RULES, "stats_peakCalling.rules")
final_output.extend(expand(stats_peakCalling_csv, zip, CALLER=peak_caller, MOD=mod))


#----------------------------------
# IDR computing
#----------------------------------
# Pairwise IDR between replicate peak sets (true replicates, SPR, PPR);
# the {CASE} wildcard carries the replicate-pair label.
if config["macs2"]["do"] and config["compute_idr"]["do"]:

    if model in ["narrow"]:
        compute_idr_mode = "narrowPeak"
    else :
        compute_idr_mode = "broadPeak"
    compute_idr_input1 = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{IP_IDR}}_{{CASE}}1_peaks.{}Peak".format(model_dir, model))
    compute_idr_input2 = os.path.join(analysis_dir, "06-PeakCalling/macs2/{}/{{IP_IDR}}_{{CASE}}2_peaks.{}Peak".format(model_dir, model))
    compute_idr_output = os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr.txt".format(model_dir, ref, model))
    compute_idr_output_peak = os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr{}.{}Peak".format(model_dir, 
        ref, model, config["compute_idr"]["thresh"], model))
    compute_idr_log = os.path.join(analysis_dir, "07-IDR/macs2/{}/logs/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_idr.out".format(model_dir,model))
    include: os.path.join(RULES, "compute_idr.rules")
    final_output.extend(expand(compute_idr_output, zip, IP_IDR=REP_IDR, CASE=CASE))
#----------------------------------
# Select reproducible peaks
#----------------------------------
# CALL_MOD accumulates "<caller>_<mode>" labels for every peak-calling route
# actually run; downstream steps (IA, count matrix, DA) expand over it.
CALL_MOD = []

# IDR-based selection applies only to macs2 narrow peaks when the
# intersection approach has not been requested instead.
if config["macs2"]["do"] and model in ["narrow"] and not config["intersectionApproach"]["do"] and config["compute_idr"]["do"]:
    CALL_MOD += ["macs2_" + model_dir]
    # Select IDR peaks
    def IDR_input_rep(wildcards):
        """IDR-thresholded peak file for the two true replicates of one IP."""
        fname = "{}_{}1vs{}2_{}_{}_idr{}.{}Peak".format(
            wildcards.IP_IDR, rep_flag, rep_flag, ref, model,
            config["compute_idr"]["thresh"], model)
        return os.path.join(analysis_dir, "07-IDR/macs2", model_dir, fname)
    def IDR_input_ppr(wildcards):
        """IDR-thresholded peak file for the pooled pseudo-replicates (PPR)."""
        fname = "{}_PPR1vsPPR2_{}_{}_idr{}.{}Peak".format(
            wildcards.IP_IDR, ref, model,
            config["compute_idr"]["thresh"], model)
        return os.path.join(analysis_dir, "07-IDR/macs2", model_dir, fname)
    def IDR_input_pool(wildcards):
        """macs2 peak file called on the pooled pseudo-replicates (PPRPool)."""
        fname = "{}_PPRPool_peaks.{}Peak".format(wildcards.IP_IDR, model)
        return os.path.join(analysis_dir, "06-PeakCalling/macs2", model_dir, fname)
    # select_peaks picks the reproducible peak set from the three IDR inputs
    # (true replicates, PPR, pooled pseudo-replicates).
    select_peaks_input_rep = IDR_input_rep
    select_peaks_input_ppr = IDR_input_ppr
    select_peaks_input_pool = IDR_input_pool
    select_peaks_logs = os.path.join(analysis_dir, "08-ReproduciblePeaks/{{CALLER}}_{}/logs/{{IP_IDR}}_selected_peaks.o".format(model_dir))
    select_peaks_output = os.path.join(analysis_dir, "08-ReproduciblePeaks/{{CALLER}}_{}/{{IP_IDR}}_select.{}Peak".format(model_dir, model))
    include: os.path.join(RULES, "select_peaks.rules")
    final_output.extend(expand(select_peaks_output, CALLER="macs2", IP_IDR=IP_REP))
#----------------------------------
# Intersection approach
#----------------------------------

# Register the caller/mode labels that go through the intersection approach
# (seacr always; macs2 when IA is requested or in broad mode).
if config["seacr"]["do"] :
    CALL_MOD += ["seacr_" + config["seacr"]["threshold"]]
if (config["macs2"]["do"] and config["intersectionApproach"]["do"]) or (config["macs2"]["do"] and config["macs2"]["mode_choice"] in ["broad"]):
    CALL_MOD += ["macs2_" + model_dir]
def IA_input(wildcards):
    """Per-replicate peak files feeding the intersection approach.

    Dispatches on the composite "<caller>_<mode>" label; returns None for an
    unknown caller, mirroring the original fall-through.
    """
    macs2_label = "macs2_" + model_dir
    seacr_label = "seacr_" + config["seacr"]["threshold"]
    if wildcards.CALLER == macs2_label:
        pattern = os.path.join(analysis_dir, "06-PeakCalling/{CALLER}/%s/{{IP_IDR}}_{CASE}_peaks.{MOD}Peak" % (model_dir))
        return expand(pattern, CALLER="macs2", CASE=rep, MOD=config["macs2"]["mode_choice"])
    if wildcards.CALLER == seacr_label:
        pattern = os.path.join(analysis_dir, "06-PeakCalling/{CALLER}/{{IP_IDR}}_{CASE}.{MOD}.bed")
        return expand(pattern, CALLER="seacr", CASE=rep, MOD=config["seacr"]["threshold"])
817
if (config["macs2"]["do"] and config["macs2"]["mode_choice"] in ["broad"]) or config["intersectionApproach"]["do"] or config["seacr"]["do"] :
    intersectionApproach_input_rep = IA_input
    intersectionApproach_logs = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/logs/{IP_IDR}_IA_peaks.o")
    intersectionApproach_output = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/{IP_IDR}_IA.bed")
    # NOTE(review): the 0.8 overlap for macs2 broad peaks is hard-coded here
    # while other routes read it from config — consider exposing it.
    if config["macs2"]["do"] and config["macs2"]["mode_choice"] in ["broad"] :
        intersectionApproach_overlap = 0.8
    else:
        intersectionApproach_overlap = config["intersectionApproach"]["ia_overlap"]
    include: os.path.join(RULES, "intersectionApproach.rules")
    final_output.extend(expand(intersectionApproach_output, CALLER=CALL_MOD,  IP_IDR=IP_REP))
#----------------------------------
# Compute IDR metrics
#----------------------------------
# Aggregates all pairwise IDR-thresholded peak files into one summary table
# at the analysis root.
if config["macs2"]["do"] and config["compute_idr"]["do"]:
    idr_peaks = []
    idr_peaks.extend(expand(os.path.join(analysis_dir, "07-IDR/macs2/{}/{{IP_IDR}}_{{CASE}}1vs{{CASE}}2_{}_{}_idr{}.{}Peak".format(model_dir,
                            ref,model, config["compute_idr"]["thresh"], model)), zip, IP_IDR=REP_IDR, CASE=CASE))
    metrics_peaks_input = idr_peaks
    metrics_peaks_marks = marks
    metrics_peaks_conds = conds
    metrics_peaks_rep = rep_flag
    metrics_peaks_logs = os.path.join(analysis_dir, "07-IDR/macs2/{}/logs/IDR_metrics.out".format(model_dir))
    metrics_peaks_output = os.path.join(analysis_dir, "IDR_metrics.out")
    include: os.path.join(RULES, "metrics_peaks.rules")
    final_output.extend([metrics_peaks_output])
#----------------------------------
# Run differential analysis
#----------------------------------
# Requires at least two conditions; everything down to the chipflowr rules
# is nested under this guard.
if len(conds) > 1 and config["differential_analysis"]["do"]:
    def getPeakFilesByMark(wildcards):
        """Per-condition reproducible peak files for one mark/caller.

        Intersection-approach beds are used for seacr and macs2 broad modes
        (and for narrow modes when IA was requested); otherwise the
        IDR-selected peak files are used.
        """
        ia_callers = ["seacr_" + config["seacr"]["threshold"], "macs2_broad", "macs2_broad-nomodel"]
        narrow_callers = ["macs2_narrow", "macs2_narrow-nomodel"]
        use_ia = wildcards.CALLER in ia_callers or (
            wildcards.CALLER in narrow_callers and config["intersectionApproach"]["do"])
        if use_ia:
            template = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/{MARK}_{COND}_IA.bed")
        else:
            template = os.path.join(analysis_dir, "08-ReproduciblePeaks/{CALLER}/{MARK}_{COND}_select." + model + "Peak")
        return expand(template, CALLER=wildcards.CALLER, MARK=wildcards.MARK, COND=conds)

    #----------------------------------
    # get union peaks
    #----------------------------------
    # Union of per-condition peak sets; the output file name spells out the
    # conditions for 2 or 3 of them and falls back to "all_conds" beyond.

    union_peaks_input = getPeakFilesByMark
    union_peaks_logs = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_Optimal_selected_peaks.out")
    if len(conds) == 2 :
        union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], ref))
    elif len(conds) == 3 :
        union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], conds[2], ref))
    else :
       union_peaks_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.bed".format(ref))
    include: os.path.join(RULES, "union_peaks.rules")
    final_output.extend(expand(union_peaks_output, zip, CALLER=CALL_MOD, MARK=MARK_OK))

    #----------------------------------
    # get GFF from peak files
    #----------------------------------
    # Converts the union bed to GFF for featureCounts; names mirror the
    # union_peaks outputs above.

    if len(conds) == 2 :
        bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], ref))
        bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], ref))
    elif len(conds) == 3 :
        bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.bed".format(conds[0], conds[1], conds[2],  ref))
        bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], conds[2], ref))
    else :
        bed_to_gff_input = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.bed".format(ref))
        bed_to_gff_output = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.gff".format(ref))

    bed_to_gff_logs = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_Optimal_Peaks_bed2gff.out")
    final_output.extend(expand(bed_to_gff_output, zip, CALLER=CALL_MOD, MARK=MARK_OK))
    include: os.path.join(RULES, "bed_to_gff.rules")
    def getBAMFilesByMark(wildcards):
        """All condition/replicate bam files for one histone mark."""
        bam_pattern = "%s/{MARK}_{COND}_{REP}_%s_sort%s.bam" % (biasedRegions_dir, ref, biasedRegions)
        return expand(os.path.join(analysis_dir, bam_pattern),
            MARK=wildcards.MARK, COND=conds, REP=rep)

    #----------------------------------
    # feature Count on peaks
    #----------------------------------
    # featureCounts over the union-peak GFF; optionally also counts the
    # matched INPUT bams when input_counting is requested.

    feature_counts_input = getBAMFilesByMark
    feature_counts_optional_input = []
    if config["differential_analysis"]["input_counting"]:
        feature_counts_optional_input += expand("{}/{{INPUT}}_{}_sort{}.bam".format(biasedRegions_dir, ref, biasedRegions), INPUT=CORR_INPUT)
    feature_counts_output_count = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/{MARK}_Matrix_Optimal_Peak.mtx")
    # GFF name mirrors the bed_to_gff outputs above.
    if len(conds) == 2 :
        feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], ref))
    elif len(conds) == 3 :
        feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_{}u{}u{}_{}.optimal.Peak_overlap.gff".format(conds[0], conds[1], conds[2], ref))
    else :
        feature_counts_gff = os.path.join(analysis_dir, "09-CountMatrix/{{CALLER}}/{{MARK}}_all_conds_{}.optimal.Peak_overlap.gff".format(ref))

    feature_counts_log = os.path.join(analysis_dir, "09-CountMatrix/{CALLER}/logs/{MARK}_counts.out")
    # Count features of type "peak", grouped by gene_id (as written by bed_to_gff).
    feature_counts_options = "-t peak -g gene_id"
    feature_counts_threads = 4
    final_output.extend(expand(feature_counts_output_count, zip, CALLER=CALL_MOD, MARK=MARK_OK))
    include: os.path.join(RULES, "feature_counts.rules")

    #----------------------------------
    # differential analysis on peaks
    #----------------------------------
    # chipflowr_init writes the R config driving the statistical analysis.

    method = config["differential_analysis"]["method"]
    norm = config["differential_analysis"]["normalisation"]

    chipflowr_init_input = feature_counts_output_count
    chipflowr_init_conds = conds
    chipflowr_init_rep = rep
    chipflowr_init_method = method
    chipflowr_init_norm = norm
    # Spike-in counts (JSON) are only wired in when both the analysis option
    # and the design declare spike-ins.
    if config["differential_analysis"]["spikes"] and config["design"]["spike"]:
        chipflowr_init_spikes = spikes_counting_output_json
        chipflowr_init_input_done = spikes_counting_output_json
    else:
        chipflowr_init_spikes = ""
        chipflowr_init_input_done = feature_counts_output_count
    chipflowr_init_padj = config["differential_analysis"]["pAdjustMethod"]
    chipflowr_init_alpha = config["differential_analysis"]["alpha"]
    chipflowr_init_batch = config["differential_analysis"]["batch"]
    chipflowr_init_output_dir = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}".format(method, norm))
    chipflowr_init_config_r = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}/config.R".format(method, norm))
    chipflowr_init_genome = ref
    include: os.path.join(RULES, "chipflowr_init.rules")

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
954
    # chipflowr runs the analysis from the generated config.R and produces an
    # HTML statistical report per caller/mark.
    chipflowr_config_r = chipflowr_init_config_r
    chipflowr_output_dir = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}".format(method, norm))
    chipflowr_report = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}/{{MARK}}_Stat_report_{}_{}.html".format(method, norm,method, norm))
    chipflowr_logs = os.path.join(analysis_dir, "10-DifferentialAnalysis/{{CALLER}}/{{MARK}}_{}_{}/{{MARK}}_{}_{}_Log.txt".format(method, norm, method, norm))
    final_output.extend(expand(chipflowr_report,  CALLER=CALL_MOD, MARK=MARK_OK))
    include: os.path.join(RULES, "chipflowr.rules")
#----------------------------------
# IGV session
#----------------------------------

if config["igv_session"]["do"]:
    # Coverage tracks for all IP samples.
    igv_session_input = expand(bamCoverage_output, SAMPLE=IP_ALL, REF=ref, allow_missing=True)
    # Reproducible peak tracks (need >1 replicate).
    if len(IP_REP) > 1 and config["macs2"]["do"]:
        if config["intersectionApproach"]["do"]:
            # FIX: was CALLER="macs2_" + model, which does not match the
            # "macs2_" + model_dir labels the intersection approach is run
            # for (CALL_MOD) — with --nomodel, IA_input would not recognise
            # the caller label and the session build would fail.
            igv_session_input += expand(intersectionApproach_output, CALLER="macs2_" + model_dir, IP_IDR=IP_REP)
        else:
            igv_session_input += expand(select_peaks_output, CALLER="macs2", IP_IDR=IP_REP)
    elif len(IP_REP) > 1 and config["seacr"]["do"]:
        igv_session_input += expand(intersectionApproach_output, CALLER="seacr_" + config["seacr"]["threshold"], IP_IDR=IP_REP)
    # Raw per-replicate peak tracks.
    if len(IP_ALL) > 1 and config["macs2"]["do"]:
        igv_session_input += expand(macs2_output, SAMPLE=IP_ALL)
    elif len(IP_ALL) > 1 and config["seacr"]["do"]:
        igv_session_input += expand(seacr_output, SAMPLE=IP_ALL)

    igv_session_output = os.path.join(analysis_dir, "12-IGV/igv_session.xml")
    igv_session_genome = ref
    igv_session_mark = marks
    igv_session_conds = conds
    igv_session_wdir = os.path.join(analysis_dir, "12-IGV/")
    igv_session_autoScale = config["igv_session"]["autoScale"]
    igv_session_normalize = config["igv_session"]["normalize"]
    igv_session_log_out = os.path.join(analysis_dir, "12-IGV/logs/igv_session.out")
    final_output.extend([igv_session_output])
    include: os.path.join(RULES, "igv_session.rules")

991

Rachel  LEGENDRE's avatar
Rachel LEGENDRE committed
992
993
994
#----------------------------------
# MultiQC report
#----------------------------------
# model_dir only exists when the macs2 section ran; fall back to the seacr
# label so downstream uses of model_dir stay defined.
try : model_dir
except NameError : model_dir = "seacr_" + config["seacr"]["threshold"]

# MultiQC scans the whole analysis directory once every target above exists.
multiqc_input = final_output
multiqc_input_dir = analysis_dir
multiqc_logs = os.path.join(analysis_dir, "11-Multiqc/multiqc.log")