blast_approach.nf 13.1 KB
Newer Older
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
#!/usr/bin/env nextflow

workflow.onComplete = {
    // any workflow property can be used here
    println "Pipeline complete"
    println "Command line: $workflow.commandLine"
}


workflow.onError = {
    println "Oops .. something went wrong"
}

params.help=false

def usage() {
    println("blast_approach.nf --in <fastq_dir> --out <output_dir> --cpus <nb_cpus> -w <output_temp>")
    println("--in Directory containing fastq files (Single end only). Full path is mandatory")
    println("--out Output directory")
    println("--cpus Number of cpus for process (default 2, max 12)")
    println("-w Temporary output (usually /pasteur/scratch/directory)")
    println("--identity Read minimum identity in percent (Default 95)")
    println("--coverage Read minimum coverage in percent (Default 90)")
    println("--evalue E-value threshold (Default 1E-3)")
}


if(params.help){
    usage()
    exit(1)
}

params.in="${baseDir}/test/"
fastqChannel = Channel
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
35
                    .fromPath("${params.in}/*.{fastq,fastq.gz}")
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
36 37 38 39 40
                    .map{
                        file -> tuple(file.baseName,file)
                    }
                    .ifEmpty { exit 1, "Cannot find any reads matching: ${params.in}" }
                    //.subscribe{ println it }
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
41
params.cpus = 6
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
42
params.nt = "/local/databases/fasta/nt"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
43
params.identity = 50
44
params.dia_identity = 40
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
45
params.coverage = 50
46
params.dia_coverage = 40
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
47 48 49
params.evalue = 1E-3
params.hit = 1
params.wordsize = 28
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
50
params.taxadb = "/local/databases/rel/taxadb/current/db/taxadb_full.sqlite"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
51 52 53 54 55 56 57
params.out = "${baseDir}/annotation/"
params.human = "/local/databases/index/bowtie/2.1.0/hg38.fa"
params.mouse = "/local/databases/index/bowtie/2.1.0/mm10.fa"
params.phi = "/local/databases/index/bowtie/2.1.0/phiX.fa"
params.genomes = "$baseDir/databases/genomes_resume.fasta"
params.alienseq = "$baseDir/databases/alienTrimmerPF8contaminants.fasta"
params.minlength = 35
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
58
//params.nrdb = "/local/databases/index/diamond/nrprot/nr.dmnd"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
59
params.nrdb = "/pasteur/services/policy01/banques/prod/rel/nrprot/nrprot_2018-09-16/diamond/0.9/nr.dmnd"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
60
//params.not_aligned = "${baseDir}/not_aligned/"
61
params.mail = "amine.ghozlane@pasteur.fr"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
62
params.memory_mbma = 60000
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
63 64 65
myDir = file(params.out)
myDir.mkdirs()

66 67 68 69
// notalignedChannel = Channel.watchPath( "${params.not_aligned}/*.fastq" )
//                            .map{
//                               file -> tuple(file.baseName,file)
//                            }
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
70 71 72


process fastqfiltering {
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
73
    memory "5G"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
74 75 76 77 78 79 80 81 82 83
    cpus params.cpus
    //publishDir "$myDir", mode: 'copy'

    input:
    set sample_id, file(reads) from fastqChannel

    output:
    set sample_id, file("*_notmapped.fastq") into filteringChannel
    file("*.txt") into resumeChannel mode flatten

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
84
    shell:
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
85
    """
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
    #!/bin/bash
    bowtie2 -p !{params.cpus}  --sensitive-local -x !{params.human}\
        -U !{reads} -S /dev/null \
        --un !{sample_id}_notmapped_human.fastq > !{sample_id}_mapping_human.txt 2>&1
    bowtie2 -p !{params.cpus}  --sensitive-local -x !{params.mouse}\
        -U !{sample_id}_notmapped_human.fastq -S /dev/null \
        --un !{sample_id}_notmapped_mouse.fastq > !{sample_id}_mapping_mouse.txt 2>&1
    bowtie2 -p !{params.cpus}  --sensitive-local -x !{params.phi}\
        -U !{sample_id}_notmapped_mouse.fastq -S /dev/null \
        --un !{sample_id}_notmapped.fastq > !{sample_id}_mapping_phi.txt 2>&1

    nb_raw=\$(echo \$((`wc -l < !{reads}` / 4)))
    echo -e "Number of raw reads\t\$nb_raw" > !{myDir}/!{sample_id}_resume_nb_reads.tsv
    nb_filt=\$(echo \$((`wc -l < !{sample_id}_notmapped.fastq` / 4)))
    echo -e "Number of reads after filtering\t\$nb_filt" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
101 102 103 104 105 106
    """
}

resumeChannel.subscribe{ it.copyTo(myDir)}

process trimming {
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
107
    publishDir "$myDir", mode: 'copy'
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
108 109 110 111 112
    input:
    set sample_id, file(reads) from filteringChannel

    output:
    file("*.fastq") into trimmingChannel
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
113
    //set sample_id, file("*.fastq") into trimmingChannel
114
    // set sample_id, file("*.fasta") into fastafiltChannel
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
115

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137
    shell:
    """
    #!/bin/bash
    AlienTrimmer -i !{reads} -o !{sample_id}.fastq  -c !{params.alienseq} \
        -l !{params.minlength}
    nb_trim=\$(echo \$((`wc -l < !{sample_id}.fastq` / 4)))
    echo -e "Number of reads after trimming\t\$nb_trim" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv
    """
}

/*process khmer {
    cpus params.cpus
    //memory "25G"
    memory "15G"

    input:
    set sample_id, file(reads) from trimmingChannel

    output:
    set sample_id, file("khmer/*.fastq") into khmerChannel
    //file("khmer/*.fastq.gz") into khmeroutChannel mode flatten

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
138 139
    script:
    """
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
140 141 142
    normalize-by-median.py -k 20 -C 20 -N 4 -x 3e9  \
        --savegraph graph.ct --output output.pe.keep  ${reads}
    filter-abund.py -V graph.ct -T ${params.cpus} --output ${sample_id}_kh.fastq output.pe.keep
143

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
144
    """
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
145
}*/
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
146 147 148 149

process mbma {
    //queue = 'common'
    //clusterOptions='--qos=normal -p common'
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
150
    publishDir "$myDir", mode: 'copy'
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
151 152 153 154
    //errorStrategy 'finish'

    input:
    file(reads) from trimmingChannel.toList()
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
155
    //file(reads) from khmerChannel.toList()
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
156 157

    output:
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
158 159 160 161 162
    //file("abundance/count_matrix.tsv") into countChannel
    //file("not_aligned/*.fastq") into naChannel mode flatten
    file("count_matrix.tsv") into countChannel
    file("not_aligned/*.temp") into naChannel mode flatten

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
163 164
    shell:
    """
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
165 166 167 168
    #!/bin/bash
    mbma.py mapping -se ${reads} -o abundance --shared -m SE -t ${params.cpus} \
           -db ${params.genomes} -e !{params.mail} -q ${params.queue} \
           -p ${params.partition} --bowtie2 -mem ${params.memory_mbma}
169
    mkdir not_aligned/
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
170 171
    for infile in \$(ls abundance/unmapped/*)
    do
172
        nn=`basename \$infile`
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
173
        mv \$infile not_aligned/\$nn.temp
174
    done
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
175
    mv abundance/comptage/count_matrix.txt count_matrix.tsv
176 177 178 179 180 181 182 183
    """
}

process mbma_krona {
    input:
    file(counts) from countChannel

    output:
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
184
    file("*.txt") into mbmataxChannel mode flatten
185 186 187 188

    shell:
    """
    i=1
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
189
    for samp in \$(head -n1 !{counts} | cut -f 3- -d \$'\t'); do
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218
        tail -n +2 !{counts} | cut -f \$((2 + \$i)),1 -d \$'\t' | awk -F \$'\t' '{print \$2, \$1}' OFS=\$'\t'> \$samp.txt
        i=\$((\$i + 1))
    done
    """
}

// naChannel = Channel.fromPath("${params.out}/not_aligned/*.fastq").ifEmpty{exit 1, "No reads file were found in the not_aligned folder"}
// naChannel.subscribe{println it}
// naChannel = Channel.fromPath("${myDir}/not_aligned/*.fastq")

notalignedChannel = naChannel.map {
                                tmp = it.baseName
                                tuple(tmp.split('\\.')[0], it)
                            }

// notalignedChannel.subscribe{println it}

process fastqtofasta {

    input:
    set sample_id, file(reads) from notalignedChannel

    output:
    set sample_id, file("*.fasta") into notalignedfastaChannel

    shell:
    """
    python ${baseDir}/bin/fastq2fasta.py -i !{reads} \
            -o !{sample_id}.fasta
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
219 220 221 222 223 224 225 226 227
    """
}

process blast {
    publishDir "$myDir", mode: 'copy'
    cpus params.cpus
    memory "10G"

    input:
228
    set sample_id, file(fasta) from notalignedfastaChannel
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
229 230 231

    output:
    //set sample_id, file(fasta), file("*_nt.txt"), file("*_catalogue.txt") into blastChannel
232
    set sample_id, file(fasta), file("*_nt.txt") into blastChannel mode flatten
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
233 234 235 236 237 238 239 240 241 242

    shell:
    """
    # NCBI
    blastn -query !{fasta} -out !{sample_id}_nt.txt -outfmt \
           "6 qseqid sseqid qlen length mismatch gapopen qstart qend sstart send pident qcovs evalue bitscore" \
           -db !{params.nt}  -max_target_seqs !{params.hit} \
           -evalue !{params.evalue} -num_threads !{params.cpus} \
           -perc_identity !{params.identity} -qcov_hsp_perc !{params.coverage} \
           -word_size !{params.wordsize}
243 244 245

    nb_not_mapped=`grep "^>" !{fasta} | wc -l`
    echo -e "Number of unmmapped reads\t\$nb_not_mapped" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
246 247 248 249 250 251
    """
}

process taxonomy {
    publishDir "$myDir", mode: 'copy'
    memory "5G"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
252 253
    beforeScript ='source /local/gensoft2/adm/etc/profile.d/modules.sh;module use /pasteur/projets/policy01/Matrix/modules'
    module = 'Python/3.6.0:taxadb/0.6.0'
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
254 255 256 257 258 259

    input:
    set sample_id, file(fasta), file(nt) from blastChannel

    output:
    set sample_id, file("*_not_annotated.fasta") into notAnnotatedChannel
260 261
    file("*_taxonomy.txt") into restaxChannel
    file("*_annotation.txt") into resannotChannel
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279

    shell:
    """
    #!/bin/bash
    tax_count=\$(wc -l !{nt} |cut -f 1 -d " ")
    if [ "\$tax_count" -gt "0" ]
    then
        # Annot ncbi
        python3 !{baseDir}/bin/get_taxonomy3.py -i !{nt} \
                -d !{params.taxadb} \
                -o !{sample_id}_taxonomy.txt
        python2 !{baseDir}/bin/ExtractNCBIDB2.py -f !{nt} \
            -g !{sample_id}_taxonomy.txt -nb 1 -o !{sample_id}_annotation.txt
        # Get sequence not annotated
        if [ -f !{sample_id}_annotation.txt ]
        then
            python2 !{baseDir}/bin/extract_fasta.py -q !{sample_id}_annotation.txt \
             -t !{fasta} -n -o !{sample_id}_not_annotated.fasta
280
             cp !{sample_id}_not_annotated.fasta !{myDir}
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
281 282 283
        else
            touch !{sample_id}_annotation.txt !{sample_id}_not_annotated.fasta
        fi
284

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
285 286 287
    else
        cat !{fasta} > !{sample_id}_not_annotated.fasta
        #touch !{sample_id}_krona.txt
288
        touch !{sample_id}_annotation.txt !{sample_id}_taxonomy.txt
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
289
    fi
290 291 292 293 294

    nb_annot=`wc -l < !{sample_id}_annotation.txt`
    echo -e "Number of annotated reads with BLAST\t\$nb_annot" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv
    nb_not_annot=`grep "^>" !{sample_id}_not_annotated.fasta | wc -l`
    echo -e "Number of unannotated reads with BLAST\t\$nb_not_annot" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
295 296 297 298 299
    """
}

process diamond {
    publishDir "$myDir", mode: 'copy'
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
300 301
    cpus params.cpus
    memory "15G"
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
302 303

    input:
304
    set sample_id, file(fasta) from notAnnotatedChannel
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
305 306

    output:
307
    set sample_id, file("*_dia.txt") into diamondChannel //file(fasta),
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
308 309 310 311

    script:
    """
    diamond blastx -d ${params.nrdb} -q ${fasta} -o ${sample_id}_dia.txt \
312
           --outfmt 6 qseqid sseqid qlen length mismatch gapopen qstart qend sstart send pident qcovhsp evalue bitscore \
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
313
           -e ${params.evalue} -k 1 -p ${params.cpus} --id ${params.dia_identity} \
314
           --query-cover ${params.dia_coverage}
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
315 316 317 318
    """
}

process annotation_for_krona {
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
319 320 321
    beforeScript ='source /local/gensoft2/adm/etc/profile.d/modules.sh;module use /pasteur/projets/policy01/Matrix/modules'
    module = 'Python/3.6.0:taxadb/0.6.0'
    echo true
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
322 323 324

    input:
    set sample_id, file(dia) from diamondChannel
325

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
326 327
    output:
    file("*_kronablast.txt") into taxChannel
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
328
    file("*_dia_annotation.txt") into diannotChannel
329

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
330 331
    shell:
    """
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
    #!/bin/bash
    tax_count=\$(wc -l < !{dia})
    if [ "\$tax_count" -gt "0" ]
    then
        # Annot ncbi
        python3 !{baseDir}/bin/get_taxonomy3.py -i !{dia} \
                -d !{params.taxadb} \
                -o !{sample_id}_dia_taxonomy.txt
        python2 !{baseDir}/bin/ExtractNCBIDB2.py -f !{dia} \
                -g !{sample_id}_dia_taxonomy.txt -nb 1 -o !{sample_id}_dia_annotation.txt
        cat !{params.out}/!{sample_id}_annotation.txt !{sample_id}_dia_annotation.txt > !{sample_id}_final_annotation.txt
    else
        cat !{params.out}/!{sample_id}_annotation.txt > !{sample_id}_final_annotation.txt
        touch !{sample_id}_dia_annotation.txt
    fi
347 348 349 350 351 352 353 354 355 356 357 358 359 360
    # Interest column for krona
    cut -s -f 3-10 !{sample_id}_final_annotation.txt > !{sample_id}_annotation_interest.txt
    # count number of elements in annotated compared to the number of sequence
    # to annot
    count_reads=\$(grep "^>" -c !{params.out}/!{sample_id}.fasta)
    # Create Krona annotation
    while read line; do echo -e "1\t\$line"; done < !{sample_id}_annotation_interest.txt > !{sample_id}_krona.txt
    cat !{sample_id}_krona.txt > not-annotated-!{sample_id}_kronablast.txt
    annot=\$(wc -l !{sample_id}_krona.txt | cut -f 1 -d ' ')
    # Count not annoted elements
    if [ "\$count_reads" -gt "\$annot" ]; then
        val=\$(( count_reads - annot))
        echo -e "\$val\tNA\tNA\tNA\tNA\tNA\tNA\tNA" >> not-annotated-!{sample_id}_kronablast.txt
    fi
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
361 362

    nb_annot=\$(wc -l < !{sample_id}_dia_annotation.txt)
363
    echo -e "Number of annotated reads with DIAMOND\t\$nb_annot" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
364 365 366
    """
}

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
367 368 369 370 371
diannotChannel.subscribe{ it.copyTo(myDir)}
/*taxChannel.subscribe{ println it }
mbmataxChannel.subscribe{ println it }*/
//combined_taxChannel = taxChannel.toList().concat(mbmataxChannel)
    /*.subscribe{ println it }*/
372
// taxChannel = Channel.fromPath("${myDir}/krona/*.txt")
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
373 374 375 376 377 378

process krona {
    publishDir "$myDir", mode: 'copy'

    input:
    file(kr) from taxChannel.toList()
379
    file(mbma_kr) from mbmataxChannel.toList()
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
380 381

    output:
382
    file("*html") into kronaChannel mode flatten
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
383 384 385 386 387

    shell:
    """
    #!/bin/bash
    files=""
388
    for i in `ls *.txt`
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
389
    do
390 391
        name=\$(echo \$i | cut -f 1 -d "_" )
        nb_line=\$(wc -l \$i | cut -f 1 -d " ")
Amine  GHOZLANE's avatar
Amine GHOZLANE committed
392 393 394 395 396 397 398 399 400 401
        if [ "\$nb_line" -gt "0" ]
        then
            files="\${files} \$i,\${name} "
        fi
    done

    ktImportText \${files} -o result.html
    """
}

Amine  GHOZLANE's avatar
Amine GHOZLANE committed
402 403
println "Project : $workflow.projectDir"
println "Cmd line: $workflow.commandLine"