#!/usr/bin/env nextflow workflow.onComplete = { // any workflow property can be used here println "Pipeline complete" println "Command line: $workflow.commandLine" } workflow.onError = { println "Oops .. something went wrong" } params.help=false def usage() { println("blast_approach.nf --in --out --cpus -w ") println("--in Directory containing fastq files (Single end only). Full path is mandatory") println("--out Output directory") println("--cpus Number of cpus for process (default 2, max 12)") println("-w Temporary output (usually /pasteur/scratch/directory)") println("--identity Read minimum identity in percent (Default 95)") println("--coverage Read minimum coverage in percent (Default 90)") println("--evalue E-value threshold (Default 1E-3)") } if(params.help){ usage() exit(1) } params.in="${baseDir}/test/" fastqChannel = Channel .fromPath("${params.in}/*.{fastq,fastq.gz}") .map{ file -> tuple(file.baseName,file) } .ifEmpty { exit 1, "Cannot find any reads matching: ${params.in}" } //.subscribe{ println it } params.cpus = 6 params.nt = "/local/databases/fasta/nt" params.identity = 50 params.dia_identity = 40 params.coverage = 50 params.dia_coverage = 40 params.evalue = 1E-3 params.hit = 1 params.wordsize = 28 params.taxadb = "/local/databases/rel/taxadb/current/db/taxadb_full.sqlite" params.out = "${baseDir}/annotation/" params.human = "/local/databases/index/bowtie/2.1.0/hg38.fa" params.mouse = "/local/databases/index/bowtie/2.1.0/mm10.fa" params.phi = "/local/databases/index/bowtie/2.1.0/phiX.fa" params.genomes = "$baseDir/databases/genomes_resume.fasta" params.alienseq = "$baseDir/databases/alienTrimmerPF8contaminants.fasta" params.minlength = 35 //params.nrdb = "/local/databases/index/diamond/nrprot/nr.dmnd" params.nrdb = "/pasteur/services/policy01/banques/prod/rel/nrprot/nrprot_2018-09-16/diamond/0.9/nr.dmnd" //params.not_aligned = "${baseDir}/not_aligned/" params.mail = "amine.ghozlane@pasteur.fr" params.memory_mbma = 60000 myDir = file(params.out) myDir.mkdirs() // notalignedChannel = Channel.watchPath( "${params.not_aligned}/*.fastq" ) // .map{ // file -> tuple(file.baseName,file) // } process fastqfiltering { memory "5G" cpus params.cpus //publishDir "$myDir", mode: 'copy' input: set sample_id, file(reads) from fastqChannel output: set sample_id, file("*_notmapped.fastq") into filteringChannel file("*.txt") into resumeChannel mode flatten shell: """ #!/bin/bash bowtie2 -p !{params.cpus} --sensitive-local -x !{params.human}\ -U !{reads} -S /dev/null \ --un !{sample_id}_notmapped_human.fastq > !{sample_id}_mapping_human.txt 2>&1 bowtie2 -p !{params.cpus} --sensitive-local -x !{params.mouse}\ -U !{sample_id}_notmapped_human.fastq -S /dev/null \ --un !{sample_id}_notmapped_mouse.fastq > !{sample_id}_mapping_mouse.txt 2>&1 bowtie2 -p !{params.cpus} --sensitive-local -x !{params.phi}\ -U !{sample_id}_notmapped_mouse.fastq -S /dev/null \ --un !{sample_id}_notmapped.fastq > !{sample_id}_mapping_phi.txt 2>&1 nb_raw=\$(echo \$((`wc -l < !{reads}` / 4))) echo -e "Number of raw reads\t\$nb_raw" > !{myDir}/!{sample_id}_resume_nb_reads.tsv nb_filt=\$(echo \$((`wc -l < !{sample_id}_notmapped.fastq` / 4))) echo -e "Number of reads after filtering\t\$nb_filt" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv """ } resumeChannel.subscribe{ it.copyTo(myDir)} process trimming { publishDir "$myDir", mode: 'copy' input: set sample_id, file(reads) from filteringChannel output: file("*.fastq") into trimmingChannel //set sample_id, file("*.fastq") into trimmingChannel // set sample_id, file("*.fasta") into fastafiltChannel shell: """ #!/bin/bash AlienTrimmer -i !{reads} -o !{sample_id}.fastq -c !{params.alienseq} \ -l !{params.minlength} nb_trim=\$(echo \$((`wc -l < !{sample_id}.fastq` / 4))) echo -e "Number of reads after trimming\t\$nb_trim" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv """ } /*process khmer { cpus params.cpus //memory "25G" memory "15G" input: set sample_id, file(reads) from trimmingChannel output: set sample_id, file("khmer/*.fastq") into khmerChannel //file("khmer/*.fastq.gz") into khmeroutChannel mode flatten script: """ normalize-by-median.py -k 20 -C 20 -N 4 -x 3e9 \ --savegraph graph.ct --output output.pe.keep ${reads} filter-abund.py -V graph.ct -T ${params.cpus} --output ${sample_id}_kh.fastq output.pe.keep """ }*/ process mbma { //queue = 'common' //clusterOptions='--qos=normal -p common' publishDir "$myDir", mode: 'copy' //errorStrategy 'finish' input: file(reads) from trimmingChannel.toList() //file(reads) from khmerChannel.toList() output: //file("abundance/count_matrix.tsv") into countChannel //file("not_aligned/*.fastq") into naChannel mode flatten file("count_matrix.tsv") into countChannel file("not_aligned/*.temp") into naChannel mode flatten shell: """ #!/bin/bash mbma.py mapping -se ${reads} -o abundance --shared -m SE -t ${params.cpus} \ -db ${params.genomes} -e !{params.mail} -q ${params.queue} \ -p ${params.partition} --bowtie2 -mem ${params.memory_mbma} mkdir not_aligned/ for infile in \$(ls abundance/unmapped/*) do nn=`basename \$infile` mv \$infile not_aligned/\$nn.temp done mv abundance/comptage/count_matrix.txt count_matrix.tsv """ } process mbma_krona { input: file(counts) from countChannel output: file("*.txt") into mbmataxChannel mode flatten shell: """ i=1 for samp in \$(head -n1 !{counts} | cut -f 3- -d \$'\t'); do tail -n +2 !{counts} | cut -f \$((2 + \$i)),1 -d \$'\t' | awk -F \$'\t' '{print \$2, \$1}' OFS=\$'\t'> \$samp.txt i=\$((\$i + 1)) done """ } // naChannel = Channel.fromPath("${params.out}/not_aligned/*.fastq").ifEmpty{exit 1, "No reads file were found in the not_aligned folder"} // naChannel.subscribe{println it} // naChannel = Channel.fromPath("${myDir}/not_aligned/*.fastq") notalignedChannel = naChannel.map { tmp = it.baseName tuple(tmp.split('\\.')[0], it) } // notalignedChannel.subscribe{println it} process fastqtofasta { input: set sample_id, file(reads) from notalignedChannel output: set sample_id, file("*.fasta") into notalignedfastaChannel shell: """ python ${baseDir}/bin/fastq2fasta.py -i !{reads} \ -o !{sample_id}.fasta """ } process blast { publishDir "$myDir", mode: 'copy' cpus params.cpus memory "10G" input: set sample_id, file(fasta) from notalignedfastaChannel output: //set sample_id, file(fasta), file("*_nt.txt"), file("*_catalogue.txt") into blastChannel set sample_id, file(fasta), file("*_nt.txt") into blastChannel mode flatten shell: """ # NCBI blastn -query !{fasta} -out !{sample_id}_nt.txt -outfmt \ "6 qseqid sseqid qlen length mismatch gapopen qstart qend sstart send pident qcovs evalue bitscore" \ -db !{params.nt} -max_target_seqs !{params.hit} \ -evalue !{params.evalue} -num_threads !{params.cpus} \ -perc_identity !{params.identity} -qcov_hsp_perc !{params.coverage} \ -word_size !{params.wordsize} nb_not_mapped=`grep "^>" !{fasta} | wc -l` echo -e "Number of unmmapped reads\t\$nb_not_mapped" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv """ } process taxonomy { publishDir "$myDir", mode: 'copy' memory "5G" beforeScript ='source /local/gensoft2/adm/etc/profile.d/modules.sh;module use /pasteur/projets/policy01/Matrix/modules' module = 'Python/3.6.0:taxadb/0.6.0' input: set sample_id, file(fasta), file(nt) from blastChannel output: set sample_id, file("*_not_annotated.fasta") into notAnnotatedChannel file("*_taxonomy.txt") into restaxChannel file("*_annotation.txt") into resannotChannel shell: """ #!/bin/bash tax_count=\$(wc -l !{nt} |cut -f 1 -d " ") if [ "\$tax_count" -gt "0" ] then # Annot ncbi python3 !{baseDir}/bin/get_taxonomy3.py -i !{nt} \ -d !{params.taxadb} \ -o !{sample_id}_taxonomy.txt python2 !{baseDir}/bin/ExtractNCBIDB2.py -f !{nt} \ -g !{sample_id}_taxonomy.txt -nb 1 -o !{sample_id}_annotation.txt # Get sequence not annotated if [ -f !{sample_id}_annotation.txt ] then python2 !{baseDir}/bin/extract_fasta.py -q !{sample_id}_annotation.txt \ -t !{fasta} -n -o !{sample_id}_not_annotated.fasta cp !{sample_id}_not_annotated.fasta !{myDir} else touch !{sample_id}_annotation.txt !{sample_id}_not_annotated.fasta fi else cat !{fasta} > !{sample_id}_not_annotated.fasta #touch !{sample_id}_krona.txt touch !{sample_id}_annotation.txt !{sample_id}_taxonomy.txt fi nb_annot=`wc -l < !{sample_id}_annotation.txt` echo -e "Number of annotated reads with BLAST\t\$nb_annot" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv nb_not_annot=`grep "^>" !{sample_id}_not_annotated.fasta | wc -l` echo -e "Number of unannotated reads with BLAST\t\$nb_not_annot" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv """ } process diamond { publishDir "$myDir", mode: 'copy' cpus params.cpus memory "15G" input: set sample_id, file(fasta) from notAnnotatedChannel output: set sample_id, file("*_dia.txt") into diamondChannel //file(fasta), script: """ diamond blastx -d ${params.nrdb} -q ${fasta} -o ${sample_id}_dia.txt \ --outfmt 6 qseqid sseqid qlen length mismatch gapopen qstart qend sstart send pident qcovhsp evalue bitscore \ -e ${params.evalue} -k 1 -p ${params.cpus} --id ${params.dia_identity} \ --query-cover ${params.dia_coverage} """ } process annotation_for_krona { beforeScript ='source /local/gensoft2/adm/etc/profile.d/modules.sh;module use /pasteur/projets/policy01/Matrix/modules' module = 'Python/3.6.0:taxadb/0.6.0' echo true input: set sample_id, file(dia) from diamondChannel output: file("*_kronablast.txt") into taxChannel file("*_dia_annotation.txt") into diannotChannel shell: """ #!/bin/bash tax_count=\$(wc -l < !{dia}) if [ "\$tax_count" -gt "0" ] then # Annot ncbi python3 !{baseDir}/bin/get_taxonomy3.py -i !{dia} \ -d !{params.taxadb} \ -o !{sample_id}_dia_taxonomy.txt python2 !{baseDir}/bin/ExtractNCBIDB2.py -f !{dia} \ -g !{sample_id}_dia_taxonomy.txt -nb 1 -o !{sample_id}_dia_annotation.txt cat !{params.out}/!{sample_id}_annotation.txt !{sample_id}_dia_annotation.txt > !{sample_id}_final_annotation.txt else cat !{params.out}/!{sample_id}_annotation.txt > !{sample_id}_final_annotation.txt touch !{sample_id}_dia_annotation.txt fi # Interest column for krona cut -s -f 3-10 !{sample_id}_final_annotation.txt > !{sample_id}_annotation_interest.txt # count number of elements in annotated compared to the number of sequence # to annot count_reads=\$(grep "^>" -c !{params.out}/!{sample_id}.fasta) # Create Krona annotation while read line; do echo -e "1\t\$line"; done < !{sample_id}_annotation_interest.txt > !{sample_id}_krona.txt cat !{sample_id}_krona.txt > not-annotated-!{sample_id}_kronablast.txt annot=\$(wc -l !{sample_id}_krona.txt | cut -f 1 -d ' ') # Count not annoted elements if [ "\$count_reads" -gt "\$annot" ]; then val=\$(( count_reads - annot)) echo -e "\$val\tNA\tNA\tNA\tNA\tNA\tNA\tNA" >> not-annotated-!{sample_id}_kronablast.txt fi nb_annot=\$(wc -l < !{sample_id}_dia_annotation.txt) echo -e "Number of annotated reads with DIAMOND\t\$nb_annot" >> !{myDir}/!{sample_id}_resume_nb_reads.tsv """ } diannotChannel.subscribe{ it.copyTo(myDir)} /*taxChannel.subscribe{ println it } mbmataxChannel.subscribe{ println it }*/ //combined_taxChannel = taxChannel.toList().concat(mbmataxChannel) /*.subscribe{ println it }*/ // taxChannel = Channel.fromPath("${myDir}/krona/*.txt") process krona { publishDir "$myDir", mode: 'copy' input: file(kr) from taxChannel.toList() file(mbma_kr) from mbmataxChannel.toList() output: file("*html") into kronaChannel mode flatten shell: """ #!/bin/bash files="" for i in `ls *.txt` do name=\$(echo \$i | cut -f 1 -d "_" ) nb_line=\$(wc -l \$i | cut -f 1 -d " ") if [ "\$nb_line" -gt "0" ] then files="\${files} \$i,\${name} " fi done ktImportText \${files} -o result.html """ } println "Project : $workflow.projectDir" println "Cmd line: $workflow.commandLine"