diff --git a/bin/parse_correlation_results.py b/bin/parse_correlation_results.py index 68e7dbaf9f4f25a2995d87bd6267dd9f242693f8..b7f524141f8a53b12b662e6fba86454fb0928be2 100644 --- a/bin/parse_correlation_results.py +++ b/bin/parse_correlation_results.py @@ -11,6 +11,7 @@ def get_trait(fi): traits = re.search('([-0-9A-Z]+_[-0-9A-Z-]+).log', fi) return [traits.group(1)] +print(file_pairs) print(file_h2) traits = [get_trait(fi) for fi in file_h2] # trait for trait in diff --git a/input_files/Data_test_EAS.csv b/input_files/Data_test_EAS.csv deleted file mode 100644 index e3ecf588841598d3aa22e671e15cb81ce243eee6..0000000000000000000000000000000000000000 --- a/input_files/Data_test_EAS.csv +++ /dev/null @@ -1,4 +0,0 @@ -"filename" "Consortium" "Outcome" "FullName" "internalDataLink" "Type" "Reference" "ReferenceLink" "dataLink" "Nsample" "Ncase" "Ncontrol" "Nsnp" "snpid" "POS" "CHR" "a1" "a2" "freq" "pval" "n" "z" "OR" "se" "index_type" "imp" "ncas" "ncont" -"WBC_EAS_chr22.tsv" "BCT" "WBC" "White blood cell count" "Cellular" "Chen MH et al. 2020" "https://pubmed.ncbi.nlm.nih.gov/32888493/" "http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90002001-GCST90003000/GCST90002373/harmonised/32888493-GCST90002373-EFO_0007988.h.tsv.gz" 15061 36864690 "hm_rsid" "hm_pos" "hm_chrom" "hm_effect_allele" "hm_other_allele" "hm_effect_allele_frequency" "p_value" "hm_beta" "standard_error" "rs-number" -"PLT_EAS_chr22.tsv" "BCT" "PLT" "Platelet count" "Cellular" "Chen MH et al. 2020" "https://pubmed.ncbi.nlm.nih.gov/32888493/" "http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90002001-GCST90003000/GCST90002356/harmonised/32888493-GCST90002356-EFO_0004309.h.tsv.gz" 15171 36877391 "hm_rsid" "hm_pos" "hm_chrom" "hm_effect_allele" "hm_other_allele" "hm_effect_allele_frequency" "p_value" "hm_beta" "standard_error" "rs-number" -"RBC_EAS_chr22.tsv" "BCT" "RBC" "Red blood cell count" "Cellular" "Chen MH et al. 2020" "https://pubmed.ncbi.nlm.nih.gov/32888493/" "http://ftp.ebi.ac.uk/pub/databases/gwas/summary_statistics/GCST90002001-GCST90003000/GCST90002362/harmonised/32888493-GCST90002362-EFO_0007978.h.tsv.gz" 14222 36679691 "hm_rsid" "hm_pos" "hm_chrom" "hm_effect_allele" "hm_other_allele" "hm_effect_allele_frequency" "p_value" "hm_beta" "standard_error" "rs-number" diff --git a/input_files/group.txt b/input_files/group.txt index 601b632230cdfbbc7c6bf89c059e92ae24a0ca1a..f20cd7aa48493942a6d86603c916fd3a5786bbb1 100644 --- a/input_files/group.txt +++ b/input_files/group.txt @@ -1,2 +1 @@ -GRP1;z_BCT_WBC z_BCT_PLT z_BCT_RBC -GRP2;z_BCT_PLT z_BCT_RBC +GRP1;z_SHRINE_FVC z_SHRINE_PEF z_SHRINE_FEV1 z_SHRINE_RATIO diff --git a/jass_pipeline.nf b/jass_pipeline.nf index 67d0d7509a131e4f00ed53bd1f69d303730ca678..9841c2fd61b0852fff59edbcd46bcfe19782be65 100644 --- a/jass_pipeline.nf +++ b/jass_pipeline.nf @@ -1,4 +1,4 @@ -nextflow.enable.dsl=1 +nextflow.enable.dsl=2 /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ JASS suite pipeline @@ -8,25 +8,28 @@ nextflow.enable.dsl=1 /* Parameter to set if optional pipeline steps are performed */ params.compute_project=true // Compute JASS runs -params.compute_LDSC_matrix=false // Infer the genetic covariance and residual covariance using the LDscore regression (Bulik-Sullivan, et al, 2015). 
The residual covariance is necessary to perform multi-trait GWAS (see julienne, et al 2021) If set to false, the residual covariance will be infered from Zscores -params.compute_imputation=false +params.compute_LDSC_matrix=true // Infer the genetic covariance and residual covariance using the LDscore regression (Bulik-Sullivan et al., 2015). The residual covariance is necessary to perform multi-trait GWAS (see Julienne et al., 2021). If set to false, the residual covariance will be inferred from Z-scores +params.compute_imputation=true /* Path of input data */ -params.meta_data = "${projectDir}"+"/input_files/Data_test_EAS.csv" // file describing gwas summary statistic format -params.gwas_folder = "/pasteur/zeus/projets/p02/GGS_JASS/1._DATA/Summary_stat_hg38/SAS/" +params.meta_data = "${projectDir}"+"/input_files/Meta_data_preliminary_analysis.csv" // file describing the GWAS summary statistic format +params.gwas_folder = "${projectDir}" + "/RAW_SUMSTAT/" -params.region = "${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_SAS.bed" -params.ref_panel_WG = "${projectDir}"+"/Ref_Panel/1000G_SAS_0_01.csv"//"${projectDir}/Ref_Panel/1000G_SAS_0_01_chr22.csv" +// //"/pasteur/zeus/projets/p02/GGS_JASS/1._DATA/Summary_stat_hg38/EAS/" -params.ancestry="SAS" -params.prefix="ALL_ensemble_1000G_hg38_SAS_chr" -params.prefix_Impute_GWAS="ALL_ensemble_1000G_hg38_SAS_" +params.region = Channel.fromPath("${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_EUR.bed") +params.ref_panel_WG = "${projectDir}"+"/Ref_Panel/1000G_EUR_0_01.csv" +//"${projectDir}/Ref_Panel/1000G_SAS_0_01_chr22.csv" + +params.ancestry="EUR" +params.prefix="ALL_ensemble_1000G_hg38_EUR_chr" +params.prefix_Impute_GWAS="ALL_ensemble_1000G_hg38_EUR_" params.suffix="" -params.LD_SCORE_folder='/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LDSCORE/SAS/' +params.LD_SCORE_folder='/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LDSCORE/EUR/' /* Folder to store result output */ params.output_folder = "${projectDir}" -diagnostic_folder= params.output_folder + "/sample_size/" +params.diagnostic_folder= params.output_folder + "/sample_size/" /**********************/ /* Process Parameters */ /**********************/ @@ -36,343 +39,110 @@ diagnostic_folder= params.output_folder + "/sample_size/" the following parameters to your data to ensure imputation accuracy. 
see https://statistical-genetics.pages.pasteur.fr/raiss/#optimizing-raiss-parameters-for-your-data */ - params.r2threshold = 0.6 params.eigenthreshold = 0.05 params.minimumld = 5 -params.ld_folder="/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LD_RAISS/SAS/*.ld" -params.ref_panel = '/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/SAS/' +params.ld_type = "plink" +params.ld_folder = Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LD_RAISS/EUR/*.ld").collect() +params.ref_panel = '/pasteur/zeus/projets/p02/GGS_JASS/jass_analysis_pipeline/Ref_panel_by_chr/' +chr_channel = Channel.from(1..22) +params.ref_chr_path= Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/EUR/ALL_ensemble_1000G_hg38_EUR_chr*.bim").collect() + +params.perform_sanity_checks=true +params.perform_accuracy_checks=true + /* Project group */ params.group = "${projectDir}/input_files/group.txt" group = file(params.group) - -/* Generate channel from path */ -Region_channel = Channel.fromPath(params.region) -Region_channel2 = Channel.fromPath(params.region) - -chr_channel = Channel.from(21..22) -ref_chr_path=params.ref_panel+"/ALL_ensemble_1000G_hg38_SAS_chr*.bim" - - -extract_sample_size_script_channel = Channel.fromPath("${projectDir}/bin/extract_sample_size.py") -generate_trait_pairs_channel = Channel.fromPath("${projectDir}/bin/generate_trait_pairs.py") -parse_correlation_channel = Channel.fromPath("${projectDir}/bin/parse_correlation_results.py") - - -process meta_data_GWAS{ - output: - path "meta_data_chk*.csv" into meta_data mode flatten - """ - d=`wc -l ${params.meta_data}` - e=`echo \$d | cut -d ' ' -f 1` - - for ((i = 2; i <= \$e; i++)); - do - head -n1 ${params.meta_data} > "meta_data_chk\$i.csv" - head -n \$i ${params.meta_data} | tail -n 1 >> "meta_data_chk\$i.csv" - done - """ -} - -process Clean_GWAS { - publishDir "${params.output_folder}/harmonized_GWAS_files/", pattern: "*.txt", mode: 'copy' - publishDir "${params.output_folder}", pattern: "harmonized_GWAS_1_file/*.txt", mode: 'copy' - input: - path ref_panel from params.ref_panel_WG - path meta_chunk from meta_data - output: - path "harmonized_GWAS_1_file/*.txt" into cleaned_gwas_channel - path "harmonized_GWAS_1_file/*.txt" into cleaned_gwas_channel2 - path "harmonized_GWAS_1_file/*.txt" into cleaned_gwas_channel3 - path "*.txt" into cleaned_gwas_chr_channel mode flatten - path "*.txt" into cleaned_gwas_chr_channel2 mode flatten - """ - mkdir -p harmonized_GWAS_1_file - pwd - ls ${params.gwas_folder} - echo ${params.gwas_folder} - working_dir=\$(pwd) - echo \$working_dir - full_path=\$working_dir/${meta_chunk} - echo "full_path" - echo \$full_path - - jass_preprocessing --gwas-info \$full_path --ref-path ${ref_panel} \ - --input-folder ${params.gwas_folder} --diagnostic-folder ${diagnostic_folder} \ - --output-folder ./ --output-folder-1-file harmonized_GWAS_1_file/ - """ -} - - -process Impute_GWAS { - publishDir "${params.output_folder}", pattern: "imputed_GWAS/*.txt", mode: 'copy' - input: - path gwas_files from cleaned_gwas_chr_channel - path ref_file from Channel.fromPath(ref_chr_path).collect() - path ld_file from Channel.fromPath(params.ld_folder).collect() - output: - path "imputed_GWAS/*.txt" into imputed_gwas_channel - path "imputed_GWAS/*.txt" into imputed_gwas_channel2 - when: - params.compute_imputation - script: - """ - mkdir -p imputed_GWAS - - chrom=\$(echo ${gwas_files} | cut -d '_' -f4 | cut -d "." 
-f1) - study=\$(echo ${gwas_files} | cut -d '_' -f2,3) - - echo \$chrom - echo \$study - - raiss --chrom \${chrom} --gwas \${study} --ref-folder ./ --R2-threshold ${params.r2threshold} --eigen-threshold ${params.eigenthreshold} --ld-folder ./ --zscore-folder ./ --output-folder ./imputed_GWAS --ref-panel-prefix ${params.prefix_Impute_GWAS} --ref-panel-suffix ${params.suffix}.bim --minimum-ld ${params.minimumld} - """ -} - -process Do_not_Impute_GWAS { - queue 'dedicated,common,ggs' - input: - path gwas_files from cleaned_gwas_chr_channel2.collect() - output: - path "clean_gwas/*.txt" into not_imputed_gwas_channel - path "clean_gwas/*.txt" into not_imputed_gwas_channel2 - when: - !params.compute_imputation - script: - """ - if [ ! -d "clean_gwas" ] - then - mkdir clean_gwas - fi - cp -L *.txt ./clean_gwas - - """ -} - -/* - Process related to LD-score calculation -*/ -process Munge_LDSC_data { - publishDir "${params.output_folder}", pattern: "ldsc_data/data_*.sumstats.gz", mode: 'copy' - publishDir "${params.output_folder}", pattern: "ldsc_data/*.log", mode: 'copy' - time = {1.h * task.attempt} - maxRetries = 4 - queue 'dedicated,common,ggs' - //errorStrategy='ignore' - input: - path clean_gwas from cleaned_gwas_channel.flatten() - - output: - path "ldsc_data/*.sumstats.gz" into ldsc_data_channel - path "ldsc_data/*.sumstats.gz" into ldsc_data_channel_bis - path "ldsc_data/*.sumstats.gz" into ldsc_data_channel_three - when: - params.compute_LDSC_matrix - """ - cp ${projectDir}/bin/extract_sample_size.py ./ - Nsamp=\$(python2.7 extract_sample_size.py ${clean_gwas} ${params.meta_data}) - - if [ ! -d "ldsc_data" ] - then - mkdir ldsc_data - fi - tag=\$(echo ${clean_gwas} | cut -d '.' -f1 | cut -d '_' -f2,3) - echo \$tag - echo \$Nsamp - cat $clean_gwas | sed '1s/A1/A2/g' | sed '1s/A0/A1/g' > corrected_gwas.txt - - head corrected_gwas.txt - munge_sumstats.py --sumstats corrected_gwas.txt --N \$Nsamp --out ldsc_data/data_\$tag - """ -} - -process Generate_trait_pair { - time '1h' - queue 'dedicated,common,ggs' - input: - path generate_trait_pairs_script from generate_trait_pairs_channel - path ldsc_data from ldsc_data_channel.unique().collect() - output: - path "pairs_chunk_*.txt" into combi_channel mode flatten - when: - params.compute_LDSC_matrix - """ - python3 ${generate_trait_pairs_script} - """ -} - -process heritability_LDSC_data { - memory {8.GB * task.attempt} - time {24.h * task.attempt} - queue 'dedicated,common,ggs' - publishDir "${params.output_folder}/h2_data/", pattern: "*.log", mode: 'copy' - input: - path ldsc_data from ldsc_data_channel_three.flatten() - output: - path "*.log" into h2_log_channel - when: - params.compute_LDSC_matrix - """ - my_trait=\$(ls ${ldsc_data} | cut -d "_" -f2,3 | cut -d '.' -f1) - ldsc.py --h2 ${ldsc_data} --ref-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ --w-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ --out \${my_trait} - """ -} - - -process Correlation_LDSC_data { - memory {8.GB * task.attempt} - time {24.h * task.attempt} - queue 'dedicated,common,ggs' - publishDir "${params.output_folder}/cor_data/", pattern: "*.log", mode: 'copy' - input: - path trait_pair from combi_channel - path ldsc_data from ldsc_data_channel_bis.unique().collect() - output: - path "*.log" into cor_log_channel - when: - params.compute_LDSC_matrix - - """ - - echo ${trait_pair} - IFS=';' read -ra my_trait <<< "\$(cat ${trait_pair})" - i=1 - - for trait_p in \${my_trait[@]} - do - - echo \$trait_p - trait1=\$(echo \$trait_p | cut -d '.' 
-f1 | cut -d '_' -f2,3) - trait2=\$(echo \$trait_p | cut -d ',' -f2 | cut -d '.' -f1 | cut -d '_' -f2,3) - - ldsc.py --rg \$trait_p --out \${trait1}-_-\${trait2} --ref-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ --w-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ - - done - """ -} - - -process Correlation_matrices { - - publishDir "${params.output_folder}/Correlation_matrices/", pattern: "*.csv", mode: 'copy' - time '1h' - queue 'dedicated,common,ggs' - input: - path parsing_script from parse_correlation_channel - path ldsc_data from cor_log_channel.collect() - path h2_ld from h2_log_channel.collect() - output: - path "Covariance_matrix_H0.csv" into cov_H0_matrice_channel - path "Covariance_matrix_genetic.csv" into cov_gen_matrice_channel - path "*.csv" into parsing_results - when: - params.compute_LDSC_matrix - """ - python3 ${parsing_script} - - """ -} - - -process Create_inittable_LDSC { - publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy' - memory {16.GB * task.attempt} - time {24.h * task.attempt} - queue 'dedicated,common,ggs' - input: - path cleaned_gwas_chr from imputed_gwas_channel.mix(not_imputed_gwas_channel).unique().collect() - path cleaned_gwas from cleaned_gwas_channel2.collect() - path Covariance_H0 from cov_H0_matrice_channel - path Covariance_gen from cov_gen_matrice_channel - path Regions from Region_channel - output: - path "*.hdf5" into init_table_channel - when: - params.compute_LDSC_matrix - """ - - gwas_list_str=`echo ${cleaned_gwas} | sed s'/.txt//'g` - echo \$gwas_list_str - date_init=\$(date +"%m_%d_%Y-%H:%M") - - init_name="inittable_LDSC_\$date_init.hdf5" - jass create-inittable --input-data-path "./*.txt" --init-covariance-path ${Covariance_H0} --init-genetic-covariance-path ${Covariance_gen} --regions-map-path ${Regions} --description-file-path ${params.meta_data} --init-table-path \$init_name - - """ -} - - -process Create_inittable { - - publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy' - memory {16.GB * task.attempt} - time {24.h * task.attempt} - - input: - path cleaned_gwas_chr from imputed_gwas_channel2.mix(not_imputed_gwas_channel2).unique().collect() - path cleaned_gwas from cleaned_gwas_channel3.collect() - path Regions from Region_channel2 - output: - path "*.hdf5" into init_table_channel_no_ldsc - when: - !params.compute_LDSC_matrix - """ - - gwas_list_str=`echo ${cleaned_gwas} | sed s'/.txt//'g` - echo \$gwas_list_str - date_init=\$(date +"%m_%d_%Y-%H:%M") - - init_name="inittable_NO_LDSC_\$date_init.hdf5" - jass create-inittable --input-data-path "./*.txt" --regions-map-path ${Regions} --description-file-path ${params.meta_data} --init-table-path \$init_name - - """ -} - - -/* - Generate one list of phenotype -*/ -process get_pheno_group { - input: - path 'group.txt' from group - output: - path "pheno_group_*" into pheno_group_channel - when: - params.compute_project - """ - split -l 1 group.txt pheno_group_ - """ -} - -process Create_project_data { - publishDir "${projectDir}/worktable/", pattern:"worktable_*.hdf5", mode: 'move' - publishDir "${projectDir}/quadrant/", pattern:"quadrant_*.png", mode: 'move' - publishDir "${projectDir}/manhattan/", pattern:"manhattan_*.png", mode: 'move' - - input: - path pheno_group from pheno_group_channel.collect() - path init_table from init_table_channel.mix(init_table_channel_no_ldsc) - - output: - path "worktable*.hdf5" into worktable_channel - path "*.png" into plot_channel - stdout result - - when: - params.compute_project - """ - for 
pheno_line in ${pheno_group} - do - echo \$pheno_line - group_tag="\$(cat \$pheno_line | cut -d';' -f1)" - pheno_list="\$(cat \$pheno_line | cut -d';' -f2)" - - echo \$group_tag - echo \$pheno_list - - jass create-project-data --phenotypes \$pheno_list --init-table-path ./${init_table} --worktable-path ./worktable_bis_\$group_tag.hdf5 --manhattan-plot-path ./manhattan_\$group_tag.png --quadrant-plot-path ./quadrant_\$group_tag.png - done - """ -} -//result.println { it.trim() } +extract_sample_size_script_channel = "${projectDir}/bin/extract_sample_size.py" +generate_trait_pairs_channel = "${projectDir}/bin/generate_trait_pairs.py" +parse_correlation_channel = "${projectDir}/bin/parse_correlation_results.py" + +/*****************************/ +/* process inclusion */ +/*****************************/ +include {Meta_data_GWAS; Clean_GWAS} from "./modules/Clean_GWAS" +include {Impute_GWAS} from "./modules/RAISS_imputation" +include {Perf_RAISS; Sanity_Checks_RAISS; Put_in_harmonized_folder; Put_in_imputed_folder} from "./modules/RAISS_sanity_checks" +include {Munge_LDSC_data; Heritability_LDSC_data;Generate_trait_pair; Correlation_LDSC_data;Parsing_correlation_matrices} from "./modules/LDSC" //; Generate_trait_pair; +include {Create_inittable_LDSC; Create_inittable; Get_pheno_group; Create_project_data} from "./modules/JASS" + + +/*******************************/ +workflow{ + /****** PREPROCESSING ******/ + Meta_data_GWAS(params.meta_data) + Clean_GWAS(params.ref_panel_WG, Meta_data_GWAS.out.flatten()) + + /******** IMPUTATION *******/ + if(params.compute_imputation){ + + Impute_GWAS(Clean_GWAS.out.cleaned_gwas_chr_channel.flatten(), params.ref_chr_path.collect(), params.ld_folder.collect()) + + sanity_cleaned_in_folder = Put_in_harmonized_folder(Clean_GWAS.out.cleaned_gwas_chr_channel.collect()) + sanity_imputed_in_folder = Put_in_imputed_folder(Impute_GWAS.out.imputed_gwas_channel.collect()) + + chr_channel + .max() + .map{it + 1} + .set {chr_max} + + chr_channel + .min() + .set {chr_min} + + Clean_GWAS.out.cleaned_gwas_chr_channel + .map {(it.name.toString() =~ /(z_[0-9A-Za-z-]+_[0-9A-Za-z-]+)/ )[0][1]} + .unique() + .set {trait_channel} + + Clean_GWAS.out.cleaned_gwas_chr_channel.flatten().filter( ~/.*z.*chr22\.txt/ ).view() + if(params.perform_accuracy_checks){ + Perf_RAISS(Clean_GWAS.out.cleaned_gwas_chr_channel.flatten().filter( ~/.*z.*_chr22\.txt/ ), params.ref_chr_path.collect(), params.ld_folder.collect()) + } + + if(params.perform_sanity_checks){ + Sanity_Checks_RAISS(sanity_cleaned_in_folder.collect(), sanity_imputed_in_folder.collect(), trait_channel, chr_min, chr_max) + } + } + + // /****** Genetic variance and corresponding inittable ******/ + + if(params.compute_LDSC_matrix){ + Munge_LDSC_data(Clean_GWAS.out.cleaned_gwas_channel.flatten()) + Heritability_LDSC_data(Munge_LDSC_data.out.ldsc_data_channel) + Generate_trait_pair(generate_trait_pairs_channel, Munge_LDSC_data.out.ldsc_data_channel.collect()) + Correlation_LDSC_data(Generate_trait_pair.out.flatten(), Munge_LDSC_data.out.ldsc_data_channel.collect()) + Parsing_correlation_matrices(parse_correlation_channel, Correlation_LDSC_data.out.cor_log_channel.collect(), Heritability_LDSC_data.out.h2_log_channel.collect()) + + /****** JASS Inittable with LDSC covariance matrix (preferred method) *****/ + if(params.compute_imputation){ + Create_inittable_LDSC(Impute_GWAS.out.imputed_gwas_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), Parsing_correlation_matrices.out.cov_H0_matrice_channel, 
Parsing_correlation_matrices.out.cov_gen_matrice_channel, params.region) + } + else{ + Create_inittable_LDSC(Clean_GWAS.out.cleaned_gwas_chr_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), Parsing_correlation_matrices.out.cov_H0_matrice_channel, Parsing_correlation_matrices.out.cov_gen_matrice_channel, params.region) + } + + /*Compute multi-trait GWAS*/ + if(params.compute_project){ + Get_pheno_group(params.group) + Create_project_data(Get_pheno_group.out.pheno_group, Create_inittable_LDSC.out.init_table) + } + } + else { + if(params.compute_imputation){ + Create_inittable(Impute_GWAS.out.imputed_gwas_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), params.region) + } + else{ + Create_inittable(Clean_GWAS.out.cleaned_gwas_chr_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), params.region) + } + + /*Compute multi-trait GWAS*/ + if(params.compute_project){ + Get_pheno_group(params.group) + Get_pheno_group.out.pheno_group.flatten().view() + Create_project_data(Get_pheno_group.out.pheno_group.collect(), Create_inittable.out.init_table_channel_no_ldsc) + } + } +} \ No newline at end of file diff --git a/jass_pipeline_v2.nf b/jass_pipeline_v2.nf deleted file mode 100644 index 3001e340a18907d4075f888ffa9af637590af53d..0000000000000000000000000000000000000000 --- a/jass_pipeline_v2.nf +++ /dev/null @@ -1,152 +0,0 @@ -nextflow.enable.dsl=2 -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - JASS suite pipeline - Authors : Hanna Julienne, Hervé Ménager & Lucie Troubat -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -/* Parameter to set if optional pipeline steps are performed */ -params.compute_project=true // Compute JASS runs -params.compute_LDSC_matrix=false // Infer the genetic covariance and residual covariance using the LDscore regression (Bulik-Sullivan, et al, 2015). The residual covariance is necessary to perform multi-trait GWAS (see julienne, et al 2021) If set to false, the residual covariance will be infered from Zscores -params.compute_imputation=false - -/* Path of input data */ -params.meta_data = "${projectDir}"+"/input_files/Data_test_EAS.csv" // file describing gwas summary statistic format -params.gwas_folder = "${projectDir}" + "/test_data/hg38_EAS/" - -// //"/pasteur/zeus/projets/p02/GGS_JASS/1._DATA/Summary_stat_hg38/EAS/" - -params.region = Channel.fromPath("${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_EAS.bed") -params.ref_panel_WG = "${projectDir}"+"/Ref_Panel/1000G_EAS_0_01_chr22_21.csv" -//"${projectDir}/Ref_Panel/1000G_SAS_0_01_chr22.csv" - -params.ancestry="EAS" -params.prefix="ALL_ensemble_1000G_hg38_EAS_chr" -params.prefix_Impute_GWAS="ALL_ensemble_1000G_hg38_EAS_" -params.suffix="" -params.LD_SCORE_folder='/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LDSCORE/EAS/' - -/* Folder to store result output */ -params.output_folder = "${projectDir}" -params.diagnostic_folder= "/pasteur/zeus/projets/p02/GGS_JASS/jass_pipeline_dev_copie/jass_suite_pipeline/sample_size/" - -/**********************/ -/* Process Parameters */ -/**********************/ -/* Imputation parameter (for RAISS) */ -/* If you decide to run RAISS, we strongly encourage you to tune - the following parameters to your data to ensure imputation accuracy. 
- see https://statistical-genetics.pages.pasteur.fr/raiss/#optimizing-raiss-parameters-for-your-data -*/ -params.r2threshold = 0.6 -params.eigenthreshold = 0.05 -params.minimumld = 5 -params.ld_type = "plink" -params.ld_folder = Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LD_RAISS/EAS/*.ld").collect() -params.ref_panel = '/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/EAS/' -chr_channel = Channel.from(21..22) -params.ref_chr_path= Channel.fromPath(params.ref_panel+"/ALL_ensemble_1000G_hg38_EAS_chr*.bim").collect() -params.perform_sanity_checks=true -params.perform_accuracy_checks=true - - -/* Project group */ -params.group = "${projectDir}/input_files/group.txt" -group = file(params.group) -extract_sample_size_script_channel = "${projectDir}/bin/extract_sample_size.py" -generate_trait_pairs_channel = "${projectDir}/bin/generate_trait_pairs.py" -parse_correlation_channel = "${projectDir}/bin/parse_correlation_results.py" - -/*****************************/ -/* process inclusion */ -/*****************************/ -include {Meta_data_GWAS; Clean_GWAS} from "./modules/Clean_GWAS" -include {Impute_GWAS} from "./modules/RAISS_imputation" -include {Perf_RAISS; Sanity_Checks_RAISS; Put_in_harmonized_folder; Put_in_imputed_folder} from "./modules/RAISS_sanity_checks" -include {Munge_LDSC_data; Heritability_LDSC_data;Generate_trait_pair; Correlation_LDSC_data;Parsing_correlation_matrices} from "./modules/LDSC" //; Generate_trait_pair; -include {Create_inittable_LDSC; Create_inittable; Get_pheno_group; Create_project_data} from "./modules/JASS" - - -/*******************************/ -workflow{ - /****** PREPROCESSING ******/ - Meta_data_GWAS(params.meta_data) - Clean_GWAS(params.ref_panel_WG, Meta_data_GWAS.out.flatten()) - - /******** IMPUTATION *******/ - if(params.compute_imputation){ - - Impute_GWAS(Clean_GWAS.out.cleaned_gwas_chr_channel.flatten(), params.ref_chr_path.collect(), params.ld_folder.collect()) - - sanity_cleaned_in_folder = Put_in_harmonized_folder(Clean_GWAS.out.cleaned_gwas_chr_channel.collect()) - sanity_imputed_in_folder = Put_in_imputed_folder(Impute_GWAS.out.imputed_gwas_channel.collect()) - - chr_channel - .max() - .map{it + 1} - .set {chr_max} - - chr_channel - .min() - .set {chr_min} - - Clean_GWAS.out.cleaned_gwas_chr_channel - .map {(it.name.toString() =~ /(z_[0-9A-Za-z-]+_[0-9A-Za-z-]+)/ )[0][1]} - .unique() - .set {trait_channel} - - Clean_GWAS.out.cleaned_gwas_chr_channel.flatten().filter( ~/.*z.*chr22\.txt/ ).view() - if(params.perform_accuracy_checks){ - Perf_RAISS(Clean_GWAS.out.cleaned_gwas_chr_channel.flatten().filter( ~/.*z.*_chr22\.txt/ ), params.ref_chr_path.collect(), params.ld_folder.collect()) - //Perf_RAISS(Clean_GWAS.out.cleaned_gwas_chr_channel.collect(), params.ref_chr_path.collect(), params.ld_folder.collect()) - } - - if(params.perform_sanity_checks){ - Sanity_Checks_RAISS(sanity_cleaned_in_folder.collect(), sanity_imputed_in_folder.collect(), trait_channel, chr_min, chr_max) - } - } - - // /****** Genetic variance and corresponding inittable ******/ - - if(params.compute_LDSC_matrix){ - Munge_LDSC_data(Clean_GWAS.out.cleaned_gwas_channel.flatten()) - Heritability_LDSC_data(Munge_LDSC_data.out.ldsc_data_channel) - Generate_trait_pair(generate_trait_pairs_channel, Munge_LDSC_data.out.ldsc_data_channel.collect()) - Correlation_LDSC_data(Generate_trait_pair.out.flatten(), Munge_LDSC_data.out.ldsc_data_channel.collect()) - Parsing_correlation_matrices(parse_correlation_channel, 
Correlation_LDSC_data.out.cor_log_channel, Heritability_LDSC_data.out.h2_log_channel) - - // /****** JASS Inittable with LDSC covariance matrix (preferred method)*****/ - if(params.compute_imputation){ - Create_inittable_LDSC(Impute_GWAS.out.imputed_gwas_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), Parsing_correlation_matrices.out.cov_H0_matrice_channel, Parsing_correlation_matrices.out.cov_gen_matrice_channel, params.region) - } - else{ - Create_inittable_LDSC(Clean_GWAS.out.cleaned_gwas_chr_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), Parsing_correlation_matrices.out.cov_H0_matrice_channel, Parsing_correlation_matrices.out.cov_gen_matrice_channel, params.region) - } - - /*Compute multi-trait GWAS*/ - if(params.compute_project){ - Get_pheno_group(params.group) - Create_project_data(Get_pheno_group.out.pheno_group, Create_inittable_LDSC.out.init_table) - //Create_project_data.out.result.view() - } - } - else { - if(params.compute_imputation){ - Create_inittable(Impute_GWAS.out.imputed_gwas_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), params.region) - } - else{ - Create_inittable(Clean_GWAS.out.cleaned_gwas_chr_channel.collect(), Clean_GWAS.out.cleaned_gwas_channel.collect(), params.region) - } - - - - /*Compute multi-trait GWAS*/ - if(params.compute_project){ - Get_pheno_group(params.group) - Get_pheno_group.out.pheno_group.flatten().view() - Create_project_data(Get_pheno_group.out.pheno_group.collect(), Create_inittable.out.init_table_channel_no_ldsc) - //Create_project_data.out.result.view() - } - } -} \ No newline at end of file diff --git a/modules/LDSC.nf b/modules/LDSC.nf index d690536bdd0a8a6eb5109ba28ec69ddd681042b9..85092ca6e23b78676f627c16bf90c85a5bf710cf 100644 --- a/modules/LDSC.nf +++ b/modules/LDSC.nf @@ -41,7 +41,6 @@ process Heritability_LDSC_data { process Generate_trait_pair { time '1h' - queue 'dedicated,common,ggs' input: path generate_trait_pairs_script path ldsc_data @@ -86,7 +85,6 @@ process Correlation_LDSC_data { process Parsing_correlation_matrices { publishDir "${params.output_folder}/Correlation_matrices/", pattern: "*.csv", mode: 'copy' time '1h' - queue 'dedicated,common,ggs' input: path parsing_script path ldsc_data diff --git a/modules/RAISS_imputation.nf b/modules/RAISS_imputation.nf index ffc50afe9f31bf69878e6f59ae14b33ad3358eb5..0e461b2a75042ac68b0c4b5c4f3ee79307a4401e 100644 --- a/modules/RAISS_imputation.nf +++ b/modules/RAISS_imputation.nf @@ -21,7 +21,6 @@ process Impute_GWAS { } process Do_not_Impute_GWAS { - queue 'dedicated,common,ggs' input: path gwas_files from cleaned_gwas_chr_channel2.collect() output: diff --git a/modules/RAISS_sanity_checks.nf b/modules/RAISS_sanity_checks.nf index e08dc1016044909168dded160e5be72acfe095b5..03abb8bbca59b91ebf1a94272517a7cab2a417d4 100755 --- a/modules/RAISS_sanity_checks.nf +++ b/modules/RAISS_sanity_checks.nf @@ -6,6 +6,9 @@ process Perf_RAISS { path gwas_files path ref_file path ld_file + output: + path "imputed_GWAS/*.txt", emit: perf_imputed_file + path "raiss_report/perf_report_*", emit: raiss_report script: """ mkdir -p imputed_GWAS diff --git a/nextflow_slurm.config b/nextflow_slurm.config index 47c22e0f789493666e792675c4896ca12e384dcc..3708887cafee374a2c20586041785d55e1f6813f 100644 --- a/nextflow_slurm.config +++ b/nextflow_slurm.config @@ -22,56 +22,59 @@ singularity { runOptions = '--home $HOME:/home/$USER -B /pasteur/zeus/projets/p02/' } executor { - submitRateLimit = '10 sec' - maxErrors=20 - maxRetries=4 - errorStrategy='terminate' - 
maxForks=400 - queueSize = 500 + name = 'slurm' + + + queueSize = 400 } process{ + errorStrategy = 'ignore' + maxErrors = 20 + maxRetries = 2 + withName: 'Compute_MAF' { container='plink_1.90b6.21--h516909a_0.sif' time='1h' - queue='dedicated,common,ggs' + queue = "ggs,common" cpus=1 } withName: 'Create_WG_reference_panel' { memory = '32G' time='1h' - queue='dedicated,common,ggs' + queue = "ggs,common" cpus=1 } withName: 'Meta_data_GWAS' { - time='1h' - queue='dedicated,common,ggs' + qos='ultrafast' + queue = "ggs,common" cpus=1 } withName: 'Clean_GWAS' { memory = '32G' - time='1h' - queue='dedicated,common,ggs' + time='2h' + queue = "ggs,common" cpus=1 } withName: 'Impute_GWAS' { memory={8.GB * task.attempt} - time={72.h * task.attempt} + errorStrategy = 'ignore' + time='24h' maxRetries=4 - queue='dedicated,ggs,common' + queue = "ggs,common" cpus=1 } withName: 'Perf_RAISS' { memory={8.GB * task.attempt} - time={72.h * task.attempt} + time='24h' maxRetries=4 - queue='dedicated,ggs,common' + queue = "ggs,common" cpus=1 } @@ -80,50 +83,70 @@ process{ cpus=1 time={1.h * task.attempt} maxRetries=4 - queue='dedicated,common,ggs' + queue = "ggs,common" } withName: 'Generate_trait_pair' { + container='jass_preprocessing_2.2--pyhdfd78af_0.sif' + queue = "ggs,common" cpus=1 } withName: 'Heritability_LDSC_data' { memory={8.GB * task.attempt} - time={24.h * task.attempt} - queue='dedicated,common,ggs' + time={2.h * task.attempt} + queue = "ggs,common" container="ldsc_1.0.1--py_0.sif" cpus=1 } withName: 'Correlation_LDSC_data' { container="ldsc_1.0.1--py_0.sif" + queue = "ggs,common" cpus=1 } withName: 'Parsing_correlation_matrices' { container='jass_preprocessing_2.2--pyhdfd78af_0.sif' - cpus=1 + qos='ultrafast' + queue = "ggs,common" + cpus=1 } withName: 'Create_inittable_LDSC' { memory={16.GB * task.attempt} - time={24.h * task.attempt} + queue = "ggs,common" + time={2.h * task.attempt} + cpus=1 + } + + withName: 'sanity_cleaned_in_folder' { cpus=1 + qos='ultrafast' + } + + withName: 'sanity_imputed_in_folder' { + cpus=1 + qos='ultrafast' } withName: 'Create_inittable' { memory={16.GB * task.attempt} - time={24.h * task.attempt} + queue = "ggs,common" + time={2.h * task.attempt} cpus=1 } withName: 'get_pheno_group' { - cpus=1 + cpus=1 + qos='ultrafast' } withName: 'Create_project_data' { cpus=1 + queue = "ggs,common" + time={2.h * task.attempt} } } diff --git a/old_versions/jass_pipeline.nf b/old_versions/jass_pipeline.nf new file mode 100644 index 0000000000000000000000000000000000000000..67d0d7509a131e4f00ed53bd1f69d303730ca678 --- /dev/null +++ b/old_versions/jass_pipeline.nf @@ -0,0 +1,378 @@ +nextflow.enable.dsl=1 +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + JASS suite pipeline + Authors : Hanna Julienne, Hervé Ménager & Lucie Troubat +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +/* Parameter to set if optional pipeline steps are performed */ +params.compute_project=true // Compute JASS runs +params.compute_LDSC_matrix=false // Infer the genetic covariance and residual covariance using the LDscore regression (Bulik-Sullivan, et al, 2015). 
The residual covariance is necessary to perform multi-trait GWAS (see julienne, et al 2021) If set to false, the residual covariance will be infered from Zscores +params.compute_imputation=false + +/* Path of input data */ +params.meta_data = "${projectDir}"+"/input_files/Data_test_EAS.csv" // file describing gwas summary statistic format +params.gwas_folder = "/pasteur/zeus/projets/p02/GGS_JASS/1._DATA/Summary_stat_hg38/SAS/" + +params.region = "${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_SAS.bed" +params.ref_panel_WG = "${projectDir}"+"/Ref_Panel/1000G_SAS_0_01.csv"//"${projectDir}/Ref_Panel/1000G_SAS_0_01_chr22.csv" + +params.ancestry="SAS" +params.prefix="ALL_ensemble_1000G_hg38_SAS_chr" +params.prefix_Impute_GWAS="ALL_ensemble_1000G_hg38_SAS_" +params.suffix="" +params.LD_SCORE_folder='/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LDSCORE/SAS/' + +/* Folder to store result output */ +params.output_folder = "${projectDir}" +diagnostic_folder= params.output_folder + "/sample_size/" + +/**********************/ +/* Process Parameters */ +/**********************/ +/* Imputation parameter (for RAISS) */ +/* If you decide to run RAISS, we strongly encourage you to tune + the following parameters to your data to ensure imputation accuracy. + see https://statistical-genetics.pages.pasteur.fr/raiss/#optimizing-raiss-parameters-for-your-data +*/ + +params.r2threshold = 0.6 +params.eigenthreshold = 0.05 +params.minimumld = 5 +params.ld_folder="/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LD_RAISS/SAS/*.ld" +params.ref_panel = '/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/SAS/' + +/* Project group */ +params.group = "${projectDir}/input_files/group.txt" +group = file(params.group) + +/* Generate channel from path */ +Region_channel = Channel.fromPath(params.region) +Region_channel2 = Channel.fromPath(params.region) + +chr_channel = Channel.from(21..22) +ref_chr_path=params.ref_panel+"/ALL_ensemble_1000G_hg38_SAS_chr*.bim" + + +extract_sample_size_script_channel = Channel.fromPath("${projectDir}/bin/extract_sample_size.py") +generate_trait_pairs_channel = Channel.fromPath("${projectDir}/bin/generate_trait_pairs.py") +parse_correlation_channel = Channel.fromPath("${projectDir}/bin/parse_correlation_results.py") + + +process meta_data_GWAS{ + output: + path "meta_data_chk*.csv" into meta_data mode flatten + """ + d=`wc -l ${params.meta_data}` + e=`echo \$d | cut -d ' ' -f 1` + + for ((i = 2; i <= \$e; i++)); + do + head -n1 ${params.meta_data} > "meta_data_chk\$i.csv" + head -n \$i ${params.meta_data} | tail -n 1 >> "meta_data_chk\$i.csv" + done + """ +} + +process Clean_GWAS { + publishDir "${params.output_folder}/harmonized_GWAS_files/", pattern: "*.txt", mode: 'copy' + publishDir "${params.output_folder}", pattern: "harmonized_GWAS_1_file/*.txt", mode: 'copy' + input: + path ref_panel from params.ref_panel_WG + path meta_chunk from meta_data + output: + path "harmonized_GWAS_1_file/*.txt" into cleaned_gwas_channel + path "harmonized_GWAS_1_file/*.txt" into cleaned_gwas_channel2 + path "harmonized_GWAS_1_file/*.txt" into cleaned_gwas_channel3 + path "*.txt" into cleaned_gwas_chr_channel mode flatten + path "*.txt" into cleaned_gwas_chr_channel2 mode flatten + """ + mkdir -p harmonized_GWAS_1_file + pwd + ls ${params.gwas_folder} + echo ${params.gwas_folder} + working_dir=\$(pwd) + echo \$working_dir + full_path=\$working_dir/${meta_chunk} + echo "full_path" + echo \$full_path + + jass_preprocessing --gwas-info \$full_path --ref-path 
${ref_panel} \ + --input-folder ${params.gwas_folder} --diagnostic-folder ${diagnostic_folder} \ + --output-folder ./ --output-folder-1-file harmonized_GWAS_1_file/ + """ +} + + +process Impute_GWAS { + publishDir "${params.output_folder}", pattern: "imputed_GWAS/*.txt", mode: 'copy' + input: + path gwas_files from cleaned_gwas_chr_channel + path ref_file from Channel.fromPath(ref_chr_path).collect() + path ld_file from Channel.fromPath(params.ld_folder).collect() + output: + path "imputed_GWAS/*.txt" into imputed_gwas_channel + path "imputed_GWAS/*.txt" into imputed_gwas_channel2 + when: + params.compute_imputation + script: + """ + mkdir -p imputed_GWAS + + chrom=\$(echo ${gwas_files} | cut -d '_' -f4 | cut -d "." -f1) + study=\$(echo ${gwas_files} | cut -d '_' -f2,3) + + echo \$chrom + echo \$study + + raiss --chrom \${chrom} --gwas \${study} --ref-folder ./ --R2-threshold ${params.r2threshold} --eigen-threshold ${params.eigenthreshold} --ld-folder ./ --zscore-folder ./ --output-folder ./imputed_GWAS --ref-panel-prefix ${params.prefix_Impute_GWAS} --ref-panel-suffix ${params.suffix}.bim --minimum-ld ${params.minimumld} + """ +} + +process Do_not_Impute_GWAS { + queue 'dedicated,common,ggs' + input: + path gwas_files from cleaned_gwas_chr_channel2.collect() + output: + path "clean_gwas/*.txt" into not_imputed_gwas_channel + path "clean_gwas/*.txt" into not_imputed_gwas_channel2 + when: + !params.compute_imputation + script: + """ + if [ ! -d "clean_gwas" ] + then + mkdir clean_gwas + fi + cp -L *.txt ./clean_gwas + + """ +} + +/* + Process related to LD-score calculation +*/ +process Munge_LDSC_data { + publishDir "${params.output_folder}", pattern: "ldsc_data/data_*.sumstats.gz", mode: 'copy' + publishDir "${params.output_folder}", pattern: "ldsc_data/*.log", mode: 'copy' + time = {1.h * task.attempt} + maxRetries = 4 + queue 'dedicated,common,ggs' + //errorStrategy='ignore' + input: + path clean_gwas from cleaned_gwas_channel.flatten() + + output: + path "ldsc_data/*.sumstats.gz" into ldsc_data_channel + path "ldsc_data/*.sumstats.gz" into ldsc_data_channel_bis + path "ldsc_data/*.sumstats.gz" into ldsc_data_channel_three + when: + params.compute_LDSC_matrix + """ + cp ${projectDir}/bin/extract_sample_size.py ./ + Nsamp=\$(python2.7 extract_sample_size.py ${clean_gwas} ${params.meta_data}) + + if [ ! -d "ldsc_data" ] + then + mkdir ldsc_data + fi + tag=\$(echo ${clean_gwas} | cut -d '.' -f1 | cut -d '_' -f2,3) + echo \$tag + echo \$Nsamp + cat $clean_gwas | sed '1s/A1/A2/g' | sed '1s/A0/A1/g' > corrected_gwas.txt + + head corrected_gwas.txt + munge_sumstats.py --sumstats corrected_gwas.txt --N \$Nsamp --out ldsc_data/data_\$tag + """ +} + +process Generate_trait_pair { + time '1h' + queue 'dedicated,common,ggs' + input: + path generate_trait_pairs_script from generate_trait_pairs_channel + path ldsc_data from ldsc_data_channel.unique().collect() + output: + path "pairs_chunk_*.txt" into combi_channel mode flatten + when: + params.compute_LDSC_matrix + """ + python3 ${generate_trait_pairs_script} + """ +} + +process heritability_LDSC_data { + memory {8.GB * task.attempt} + time {24.h * task.attempt} + queue 'dedicated,common,ggs' + publishDir "${params.output_folder}/h2_data/", pattern: "*.log", mode: 'copy' + input: + path ldsc_data from ldsc_data_channel_three.flatten() + output: + path "*.log" into h2_log_channel + when: + params.compute_LDSC_matrix + """ + my_trait=\$(ls ${ldsc_data} | cut -d "_" -f2,3 | cut -d '.' 
-f1) + ldsc.py --h2 ${ldsc_data} --ref-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ --w-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ --out \${my_trait} + """ +} + + +process Correlation_LDSC_data { + memory {8.GB * task.attempt} + time {24.h * task.attempt} + queue 'dedicated,common,ggs' + publishDir "${params.output_folder}/cor_data/", pattern: "*.log", mode: 'copy' + input: + path trait_pair from combi_channel + path ldsc_data from ldsc_data_channel_bis.unique().collect() + output: + path "*.log" into cor_log_channel + when: + params.compute_LDSC_matrix + + """ + + echo ${trait_pair} + IFS=';' read -ra my_trait <<< "\$(cat ${trait_pair})" + i=1 + + for trait_p in \${my_trait[@]} + do + + echo \$trait_p + trait1=\$(echo \$trait_p | cut -d '.' -f1 | cut -d '_' -f2,3) + trait2=\$(echo \$trait_p | cut -d ',' -f2 | cut -d '.' -f1 | cut -d '_' -f2,3) + + ldsc.py --rg \$trait_p --out \${trait1}-_-\${trait2} --ref-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ --w-ld-chr ${params.LD_SCORE_folder}${params.prefix}@ + + done + """ +} + + +process Correlation_matrices { + + publishDir "${params.output_folder}/Correlation_matrices/", pattern: "*.csv", mode: 'copy' + time '1h' + queue 'dedicated,common,ggs' + input: + path parsing_script from parse_correlation_channel + path ldsc_data from cor_log_channel.collect() + path h2_ld from h2_log_channel.collect() + output: + path "Covariance_matrix_H0.csv" into cov_H0_matrice_channel + path "Covariance_matrix_genetic.csv" into cov_gen_matrice_channel + path "*.csv" into parsing_results + when: + params.compute_LDSC_matrix + """ + python3 ${parsing_script} + + """ +} + + +process Create_inittable_LDSC { + publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy' + memory {16.GB * task.attempt} + time {24.h * task.attempt} + queue 'dedicated,common,ggs' + input: + path cleaned_gwas_chr from imputed_gwas_channel.mix(not_imputed_gwas_channel).unique().collect() + path cleaned_gwas from cleaned_gwas_channel2.collect() + path Covariance_H0 from cov_H0_matrice_channel + path Covariance_gen from cov_gen_matrice_channel + path Regions from Region_channel + output: + path "*.hdf5" into init_table_channel + when: + params.compute_LDSC_matrix + """ + + gwas_list_str=`echo ${cleaned_gwas} | sed s'/.txt//'g` + echo \$gwas_list_str + date_init=\$(date +"%m_%d_%Y-%H:%M") + + init_name="inittable_LDSC_\$date_init.hdf5" + jass create-inittable --input-data-path "./*.txt" --init-covariance-path ${Covariance_H0} --init-genetic-covariance-path ${Covariance_gen} --regions-map-path ${Regions} --description-file-path ${params.meta_data} --init-table-path \$init_name + + """ +} + + +process Create_inittable { + + publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy' + memory {16.GB * task.attempt} + time {24.h * task.attempt} + + input: + path cleaned_gwas_chr from imputed_gwas_channel2.mix(not_imputed_gwas_channel2).unique().collect() + path cleaned_gwas from cleaned_gwas_channel3.collect() + path Regions from Region_channel2 + output: + path "*.hdf5" into init_table_channel_no_ldsc + when: + !params.compute_LDSC_matrix + """ + + gwas_list_str=`echo ${cleaned_gwas} | sed s'/.txt//'g` + echo \$gwas_list_str + date_init=\$(date +"%m_%d_%Y-%H:%M") + + init_name="inittable_NO_LDSC_\$date_init.hdf5" + jass create-inittable --input-data-path "./*.txt" --regions-map-path ${Regions} --description-file-path ${params.meta_data} --init-table-path \$init_name + + """ +} + + +/* + Generate one list of phenotype +*/ +process 
get_pheno_group { + input: + path 'group.txt' from group + output: + path "pheno_group_*" into pheno_group_channel + when: + params.compute_project + """ + split -l 1 group.txt pheno_group_ + """ +} + +process Create_project_data { + publishDir "${projectDir}/worktable/", pattern:"worktable_*.hdf5", mode: 'move' + publishDir "${projectDir}/quadrant/", pattern:"quadrant_*.png", mode: 'move' + publishDir "${projectDir}/manhattan/", pattern:"manhattan_*.png", mode: 'move' + + input: + path pheno_group from pheno_group_channel.collect() + path init_table from init_table_channel.mix(init_table_channel_no_ldsc) + + output: + path "worktable*.hdf5" into worktable_channel + path "*.png" into plot_channel + stdout result + + when: + params.compute_project + """ + for pheno_line in ${pheno_group} + do + echo \$pheno_line + group_tag="\$(cat \$pheno_line | cut -d';' -f1)" + pheno_list="\$(cat \$pheno_line | cut -d';' -f2)" + + echo \$group_tag + echo \$pheno_list + + jass create-project-data --phenotypes \$pheno_list --init-table-path ./${init_table} --worktable-path ./worktable_bis_\$group_tag.hdf5 --manhattan-plot-path ./manhattan_\$group_tag.png --quadrant-plot-path ./quadrant_\$group_tag.png + done + """ +} +//result.println { it.trim() }
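Usage note (not part of the patch above): a minimal sketch of how the reworked DSL2 entry point would typically be launched, assuming a recent Nextflow release with DSL2 support and only the files present in this repository; the queues, containers and reference-panel paths set in nextflow_slurm.config are site-specific and must be adapted before running.

    # run the DSL2 pipeline with the SLURM configuration updated in this patch
    nextflow run jass_pipeline.nf -c nextflow_slurm.config

    # optional stages are governed by the switches declared at the top of jass_pipeline.nf:
    #   params.compute_imputation   -> RAISS imputation (Impute_GWAS plus the RAISS sanity/accuracy checks)
    #   params.compute_LDSC_matrix  -> LDSC covariance matrices and Create_inittable_LDSC
    #   params.compute_project      -> Get_pheno_group / Create_project_data multi-trait runs

Phenotype groups consumed by Create_project_data are read from input_files/group.txt, one project per line in the form "TAG;z_CONSORTIUM_TRAIT z_CONSORTIUM_TRAIT ...", as in the GRP1 line added above.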