diff --git a/README.md b/README.md index 8710596c757beb71bd78f7a0cfd960bdc488c318..debb2590fa9160e3210cfc1a22465e4adbc9d09c 100644 --- a/README.md +++ b/README.md @@ -203,6 +203,11 @@ Gitlab developers <br /><br /> ## WHAT'S NEW IN +### v3.0 + +1) vcf_fisher.nf now parallelized for the whole genome + + ### v2.1 1) vcf_ficher.nf improved for the whole genome diff --git a/bin/fisher_lod.py b/bin/fisher_lod.py index a827afe38d4f37724d4676ce04df5e954b197875..0a2ac08b21febb6459e2b8e78d20d38556540e5e 100644 --- a/bin/fisher_lod.py +++ b/bin/fisher_lod.py @@ -182,11 +182,11 @@ random.seed(1) ################ Ignition -if region == "None": - region = None -else: - # region = region.strip('[]').replace('"', '').replace(' ', '').split(',') # old version when no parall in nf - region = region.replace('"', '').replace(' ', '') +# if region == "None": +# region = None +#else: +# # region = region.strip('[]').replace('"', '').replace(' ', '').split(',') # old version when no parall in nf +# region = region.replace('"', '').replace(' ', '') ################ End ignition @@ -240,9 +240,10 @@ with warnings.catch_warnings(): # for v in vcf(i1): # tempo = fisher(v = v, columns = columns) # df = df.append(tempo) - for v in vcf: - tempo = fisher(v = v, columns = columns) - df = df.append(tempo) + for v in vcf: # keep only the vcf records whose chromosome (v.CHROM) equals region + if v.CHROM == region: + tempo = fisher(v = v, columns = columns) + df = df.append(tempo) # on ecrit la dataframe dans un fichier df.to_csv('./fisher.tsv', sep='\t', index=False) diff --git a/vcf_fisher.config b/vcf_fisher.config index 45fa722c4015e2a2477e24f1ca99c70563fec29d..00ab124f22e096d2a0ea007d6c8e5a2f2cd7044c 100644 --- a/vcf_fisher.config +++ b/vcf_fisher.config @@ -20,7 +20,7 @@ env { //Warning: do not write the out_path now. See below. 
If written here, the one below is not considered" ped_path = "/pasteur/zeus/projets/p01/BioIT/gmillot/08002_bourgeron/dataset/Dyslexia.pedigree.txt" // pedigree chr_path = "/pasteur/zeus/projets/p01/BioIT/gmillot/reference_genomes/human hg19_grch37/hg19_grch37p5_chr_size_cumul.txt" - region = "chr7, chr1, chr2" // region to parse. Write "chr7:0-147000000, chr10:1000000-2000000" for a single region, "chr7:0-147000000, chr10:1000000-2000000" if two regions, ""chr7" for a whole chromosome, "chr7, chr1" for two chromosomes and "None" for the complete genome // Warning : replace eval() by ast.literal_eval() from ast package in the main py code ? + region = "None" // region to parse. Write "chr7:0-147000000" for a single region, "chr7:0-147000000, chr10:1000000-2000000" for two regions, "chr7" for a whole chromosome, "chr7, chr1" for two chromosomes and "None" for the complete genome // Warning : replace eval() by ast.literal_eval() from ast package in the main py code ? y_lim1 = 5 // max y-axis limit of the top panel in the miami plot, in log10, i.e., 5 means up to score 10^5 y_lim2 = 5 // max y-axis limit of the bottom panel in the miami plot, in log10, i.e., 5 means up to score 10^5 cute_path = "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v11.4.0/cute_little_R_functions.R" // single character string indicating the file (and absolute pathway) of the required cute_little_R_functions toolbox. 
With ethernet connection available, this can also be used: "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v5.1.0/cute_little_R_functions.R" or local "C:\\Users\\Gael\\Documents\\Git_projects\\cute_little_R_functions\\cute_little_R_functions.R" diff --git a/vcf_fisher.nf b/vcf_fisher.nf index 92b7e6b789d75ec5caa17eba2ef9427bd697aea9..6946670dd0015cff09816dafc9d3433d369389cc 100644 --- a/vcf_fisher.nf +++ b/vcf_fisher.nf @@ -131,7 +131,7 @@ process WorkflowVersion { // create a file with the workflow version in out_path process fisher { label 'python' // see the withLabel: bash in the nextflow config file - publishDir path: "${out_path}", mode: 'copy', overwrite: false + //publishDir path: "${out_path}", mode: 'copy', overwrite: false cache 'true' //no channel input here for the vcf, because I do not transform it @@ -143,7 +143,7 @@ process fisher { //val region2 from region_ch output: - file "*.tsv" into fisher_ch + file "*.tsv" into fisher_ch1 // multi channel script: """ @@ -151,17 +151,20 @@ process fisher { fisher_lod.py ${vcf} ${ped} "${region2}" """ } -fisher_ch.collectFile(name: "fisher.tsv").subscribe{it -> it.copyTo("${out_path}")} // concatenate all the cov_report.txt files in channel cov_report_ch into a single file published into ${out_path}/reports + +fisher_ch1.collectFile(name: "fisher.tsv", skip:1, keepHeader:true).into{fisher_ch2 ; fisher_ch3} +fisher_ch2.subscribe{it -> it.copyTo("${out_path}")} process miamiplot { label 'r_ext' // see the withLabel: bash in the nextflow config file - publishDir path: "${out_path}", mode: 'copy', overwrite: false + publishDir "${out_path}", mode: 'copy', pattern: "{*.png}", overwrite: false // https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob + publishDir "${out_path}/reports", mode: 'copy', pattern: "{miami_report.txt}", overwrite: false // https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob cache 'true' //no channel input here for the vcf, because I do not 
transform it input: - file fisher from fisher_ch + file fisher from fisher_ch3 file chr from chr_ch val y_lim1 val y_lim2 @@ -180,6 +183,7 @@ process miamiplot { } + process Backup { label 'bash' // see the withLabel: bash in the nextflow config file publishDir "${out_path}/reports", mode: 'copy', overwrite: false // since I am in mode copy, all the output files will be copied into the publishDir. See \\wsl$\Ubuntu-20.04\home\gael\work\aa\a0e9a739acae026fb205bc3fc21f9b