diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..ab9ccccb6bdab67862e1dc706dcbfc6260c81cb9 --- /dev/null +++ b/LICENSE @@ -0,0 +1 @@ +JASS Pipeline © 2019 by Institut Pasteur is licensed under Attribution-NonCommercial-ShareAlike 4.0 International. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/ diff --git a/README.md b/README.md index 083e00ff5e7b8744e43a7ccaaa58b0db77aa7217..91bd08354e4c5e50138240b72a4d6a34b4cbd935 100644 --- a/README.md +++ b/README.md @@ -81,7 +81,9 @@ Once done you can launch the pipeline using by replacing {ABSOLUTE_PATH_TO_PIPEL nextflow run jass_pipeline.nf --ref_panel_WG {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}Ref_Panel/1000G_EAS_0_01_chr22_21.csv --gwas_folder {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/test_data/hg38_EAS/ --meta-data {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/input_files/Data_test_EAS.csv --region {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/input_files/All_Regions_ALL_ensemble_1000G_hg38_EAS.bed --group {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/input_files/group.txt -with-report jass_report.html -c nextflow_local.config ``` See the description of required parameters in the next section. -You can specify parameter in the jass_pipeline.nf header if prefered. + +On the command line above, parameters are passed down to nextflow directly on the command line. This should override parameters specified in the jass_pipeline.nf. +> **However, you can specify/modify parameter in the jass_pipeline.nf header if prefered.** If all went well, you have cleaned the three summary statistic files, aligned them on the reference panel, and integrated them in one database. This database was used to perform a multi-trait GWAS on the three traits. @@ -153,7 +155,7 @@ and require a HPC cluster. tar -jxvf eur_w_ld_chr.tar.bz2 ``` -For hg38 and Admixed American, African, East Asian, European and South-east Asian ancestries you can download our precomputed panels: +For **hg38 and Admixed American, African, East Asian, European and South-east Asian ancestries** you can download our precomputed panels: [https://zenodo.org/record/8096589](https://zenodo.org/record/8096589) For details on the computation method for this panel consult [https://www.biorxiv.org/content/10.1101/2023.06.23.546248v1](https://www.biorxiv.org/content/10.1101/2023.06.23.546248v1). diff --git a/bin/make_heatmap.R b/bin/make_heatmap.R index 45b4aa367134072546f4d5b90e219e0a5b10b5a9..7419fba60da09b8468e443dea27947bf9f058e1f 100644 --- a/bin/make_heatmap.R +++ b/bin/make_heatmap.R @@ -23,6 +23,12 @@ setkey(trait_name, "ID") Rho = fread(correlation_file) setkey(Rho, "V1") +D = as.matrix(1-Rho[,-1]) +D[is.na(D)] = 0 +h = hclust(as.dist(D)) +order_trait <- trait_name$ID[h$order] + + Pval = fread(p_file) setkey(Pval, "V1") @@ -30,7 +36,6 @@ Rho = Rho[trait_name$ID , c("V1",trait_name$ID), with=FALSE] Pval = Pval[trait_name$ID , c("V1",trait_name$ID), with=FALSE] cor_long = melt(as.data.table(Rho),id.vars=c('V1')) -order_trait <- trait_name$ID pval_long =melt(as.data.table(Pval),id.vars=c('V1')) @@ -45,6 +50,8 @@ pval_long[,cor_val:= signif(round(cor_long$value, 2), 2)] pval_long[,full_annot := str_trim(paste0(cor_val, "\n",pval_annote))] pval_long$full_annot[pval_long$full_annot == "1"] <- "" +pval_long$full_annot[pval_long$full_annot == "NA"] <- "" + cor_long[, variable:=ordered(cor_long$variable, levels=order_trait)] cor_long[, V1:=ordered(cor_long$V1, levels=order_trait)] @@ -65,7 +72,10 @@ pG = pG + theme(legend.key.size = unit(1, 'cm'), #change legend key size pG = pG + geom_text(data=pval_long, aes(x=V1,y=variable, label=full_annot), size=3.5) -png(paste("HeatMap_", strsplit(Args[6], split=".", fixed=T)[[1]][1],".png", sep=""),width = 20, height = 20, +png_width <- max(length(trait_name$ID)*0.5, 10) +png_height <- max(length(trait_name$ID)*0.5, 10) + +png(paste("HeatMap_", strsplit(Args[6], split=".", fixed=T)[[1]][1],".png", sep=""),width =png_width , height = png_height, units = "in", res = 300, pointsize = 4) diff --git a/bin/parse_correlation_results.py b/bin/parse_correlation_results.py index b7f524141f8a53b12b662e6fba86454fb0928be2..5509f9fe1cd52ce0b9f6b72c591ba3fda1870a41 100644 --- a/bin/parse_correlation_results.py +++ b/bin/parse_correlation_results.py @@ -2,6 +2,10 @@ import re import pandas as pd import glob import numpy as np +import sys + +ancestry = sys.argv[1] +current_date = sys.argv[2] print("Parsing_correlation") file_pairs = set(glob.glob("*-_-*.log")) @@ -134,13 +138,13 @@ for i1, t1 in enumerate(traits): Sd_cov_matrix_genetic.loc[t2_col, t2_col] = float(L_h2_t2[0].split(":")[1].split(" ")[2].strip("()\n")) -Covariance_matrix_genetic.to_csv("Covariance_matrix_genetic.csv", sep="\t") -Covariance_matrix_H0.to_csv("Covariance_matrix_H0.csv", sep="\t") -Correlation_matrix_genetic.to_csv("Correlation_matrix_genetic.csv", sep="\t") +Covariance_matrix_genetic.to_csv("Covariance_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t") +Covariance_matrix_H0.to_csv("Covariance_matrix_H0_"+ancestry+"_"+current_date+".csv", sep="\t") +Correlation_matrix_genetic.to_csv("Correlation_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t") -Sd_cov_matrix_genetic.to_csv("Sd_cov_matrix_genetic.csv", sep="\t") -Sd_matrix_H0.to_csv("Sd_matrix_H0.csv", sep="\t") -Sd_cor_matrix_genetic.to_csv("Sd_cor_matrix_genetic.csv", sep="\t") -Pval_matrix_genetic.to_csv("Pval_cor_matrix_genetic.csv", sep="\t") +Sd_cov_matrix_genetic.to_csv("Sd_cov_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t") +Sd_matrix_H0.to_csv("Sd_matrix_H0_"+ancestry+"_"+current_date+".csv", sep="\t") +Sd_cor_matrix_genetic.to_csv("Sd_cor_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t") +Pval_matrix_genetic.to_csv("Pval_cor_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t") print("Parsing_correlation") diff --git a/input_files/test1.tsv b/input_files/test1.tsv new file mode 100644 index 0000000000000000000000000000000000000000..9e0dcc57f48d14afdf2c3fc2f8c45471af5731c2 --- /dev/null +++ b/input_files/test1.tsv @@ -0,0 +1,3 @@ +filename internalDataLink Consortium Outcome FullName Type Reference ReferenceLink dataLink Nsample Ncase Ncontrol Nsnp snpid a1 a2 freq pval n z OR se code imp ncas ncont altNcas altNcont Unnamed: 28 +GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt GIANT HEIGHT Height Anthropometry Wood et al. 2014 http://www.ncbi.nlm.nih.gov/pubmed/25282103 https://www.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files 253288 2550858.0 MarkerName Allele1 Allele2 Freq.Allele1.HapMapCEU p N b SE +SNP_gwas_mc_merge_nogc.tbl.uniq GIANT BMI Body Mass Index Anthropometry Locke et al. 2015 http://www.ncbi.nlm.nih.gov/pubmed/25673413 https://www.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files 339224 2554637.0 SNP A1 A2 Freq1.Hapmap p N b se diff --git a/jass_pipeline.nf b/jass_pipeline.nf index 5fef9b0441568fd837113486ec79b1f938ab257d..ee66f3994542e078e66d1201867646ea3292dd82 100644 --- a/jass_pipeline.nf +++ b/jass_pipeline.nf @@ -16,12 +16,12 @@ params.meta_data = "${projectDir}"+"/input_files/Data_test_EAS.csv" // "${projectDir}"+"/input_files/Meta_data_preliminary_analysis.csv" file describing gwas summary statistic format params.gwas_folder = "${projectDir}"+'/test_data/hg38_EAS/' -params.region = Channel.fromPath("${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_EUR.bed") +params.region = Channel.fromPath("${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_EAS.bed") params.ref_panel_WG = "${projectDir}"+"/Ref_Panel/1000G_EAS_0_01_chr22_21.csv" params.ancestry="EAS" -params.prefix="ALL_ensemble_1000G_hg38_EUR_chr" +params.prefix="ALL_ensemble_1000G_hg38_EAS_chr" params.prefix_Impute_GWAS="ALL_ensemble_1000G_hg38_EAS_" params.suffix="" params.LD_SCORE_folder='/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LDSCORE/EAS/' @@ -45,7 +45,7 @@ params.ld_type = "plink" params.ld_folder = Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LD_RAISS/EAS/*.ld").collect() params.ref_panel = '/pasteur/zeus/projets/p02/GGS_JASS/jass_analysis_pipeline/Ref_panel_by_chr/' chr_channel = Channel.from(1..22) -params.ref_chr_path= Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/EUR/ALL_ensemble_1000G_hg38_EUR_chr*.bim").collect() +params.ref_chr_path= Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/EAS/ALL_ensemble_1000G_hg38_EAS_chr*.bim").collect() params.perform_sanity_checks=false params.perform_accuracy_checks=false @@ -58,6 +58,34 @@ generate_trait_pairs_channel = "${projectDir}/bin/generate_trait_pairs.py" parse_correlation_channel = "${projectDir}/bin/parse_correlation_results.py" make_heatmap_channel = "${projectDir}/bin/make_heatmap.R" +// folder id output will be a hash of all parameters +string_input = String.valueOf(params.compute_project) + + String.valueOf(params.compute_LDSC_matrix) + + String.valueOf(params.compute_imputation) + + String.valueOf(params.meta_data) + + String.valueOf(params.gwas_folder) + + String.valueOf(params.region) + + String.valueOf(params.ref_panel_WG) + + String.valueOf(params.ancestry) + + String.valueOf(params.prefix) + + String.valueOf(params.prefix_Impute_GWAS) + + String.valueOf(params.suffix) + + String.valueOf(params.LD_SCORE_folder) + + String.valueOf(params.output_folder) + + String.valueOf(params.diagnostic_folder) + + String.valueOf(params.r2threshold) + + String.valueOf(params.eigenthreshold) + + String.valueOf(params.minimumld) + + String.valueOf(params.ld_type) + + String.valueOf(params.ld_folder) + + String.valueOf(params.ref_panel) + + String.valueOf(params.ref_chr_path) + + String.valueOf(params.perform_sanity_checks) + + String.valueOf(params.perform_accuracy_checks) + + String.valueOf(params.group) + +params.hash_params = string_input.md5() + /*****************************/ /* process inclusion */ /*****************************/ diff --git a/modules/Clean_GWAS.nf b/modules/Clean_GWAS.nf index 962ef2c3fb149b4fa2ee334ad387e1c7c03b1de7..b193fe930b6ecd5e9491fc3d28c33379fbdb6a7e 100644 --- a/modules/Clean_GWAS.nf +++ b/modules/Clean_GWAS.nf @@ -5,8 +5,7 @@ process Meta_data_GWAS{ output: path "meta_data_chk*.csv" """ - d=`wc -l ${pheno_list}` - e=`echo \$d | cut -d ' ' -f 1` + e=\$(grep -c '' ${pheno_list}) for ((i = 2; i <= \$e; i++)); do @@ -17,16 +16,16 @@ process Meta_data_GWAS{ } process Clean_GWAS{ - publishDir "${params.output_folder}/harmonized_GWAS_files/", pattern: "*.txt", mode: 'copy' - publishDir "${params.output_folder}", pattern: "harmonized_GWAS_1_file/*.txt", mode: 'copy' + publishDir "${params.output_folder}/harmonized_GWAS_files_${params.ancestry}_${params.hash_input}/", pattern: "*.txt", mode: 'copy' + publishDir "${params.output_folder}", pattern: "harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input}/*.txt", mode: 'copy' input: path ref_panel path meta_chunk output: - path "harmonized_GWAS_1_file/*.txt", emit: cleaned_gwas_channel + path "harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input}/*.txt", emit: cleaned_gwas_channel path "*.txt", emit: cleaned_gwas_chr_channel """ - mkdir -p harmonized_GWAS_1_file + mkdir -p harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input} pwd ls ${params.gwas_folder} echo ${params.gwas_folder} @@ -38,6 +37,6 @@ process Clean_GWAS{ jass_preprocessing --gwas-info \$full_path --ref-path ${ref_panel} \ --input-folder ${params.gwas_folder} --diagnostic-folder ${params.diagnostic_folder} \ - --output-folder ./ --output-folder-1-file harmonized_GWAS_1_file/ + --output-folder ./ --output-folder-1-file harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input}/ """ } diff --git a/modules/JASS.nf b/modules/JASS.nf index 1ac35dedc9fe60d0602a64a3c89d11164a3d47b6..393ff3f79fec9fbbdb63780a08eac32f0eea2647 100644 --- a/modules/JASS.nf +++ b/modules/JASS.nf @@ -1,6 +1,6 @@ process Create_inittable_LDSC { - publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy' + publishDir "${params.output_folder}/init_table_${params.ancestry}_${params.hash_input}/", pattern: "*.hdf5", mode: 'copy' input: path cleaned_gwas_chr path cleaned_gwas @@ -22,7 +22,7 @@ process Create_inittable_LDSC { } process Create_inittable { - publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy' + publishDir "${params.output_folder}/init_table_${params.ancestry}_${params.hash_input}/", pattern: "*.hdf5", mode: 'copy' input: path cleaned_gwas_chr path cleaned_gwas @@ -54,7 +54,7 @@ process Get_pheno_group { } process Create_project_data { - publishDir "${params.output_folder}/worktable/", pattern:"worktable_*.hdf5", mode: 'copy' + publishDir "${params.output_folder}/worktable_${params.ancestry}_${params.hash_input}/", pattern:"worktable_*.hdf5", mode: 'copy' publishDir "${params.output_folder}/quadrant/", pattern:"quadrant_*.png", mode: 'copy' publishDir "${params.output_folder}/manhattan/", pattern:"manhattan_*.png", mode: 'copy' input: @@ -77,4 +77,4 @@ process Create_project_data { jass create-project-data --phenotypes \$pheno_list --init-table-path ./${init_table} --worktable-path ./worktable_bis_\$group_tag.hdf5 --manhattan-plot-path ./manhattan_\$group_tag.png --quadrant-plot-path ./quadrant_\$group_tag.png done """ -} \ No newline at end of file +} diff --git a/modules/LDSC.nf b/modules/LDSC.nf index f6e8ce981d33638b9f723a9d4332139efa3baea3..b6bc0748ac5eff01fa9381739b2d0738f5f1cc3d 100644 --- a/modules/LDSC.nf +++ b/modules/LDSC.nf @@ -1,7 +1,7 @@ process Munge_LDSC_data { - publishDir "${params.output_folder}", pattern: "ldsc_data/data_*.sumstats.gz", mode: 'copy' - publishDir "${params.output_folder}", pattern: "ldsc_data/*.log", mode: 'copy' + publishDir "${params.output_folder}", pattern: "ldsc_data_${params.ancestry}_${params.hash_input}/data_*.sumstats.gz", mode: 'copy' + publishDir "${params.output_folder}", pattern: "ldsc_data_${params.ancestry}_${params.hash_input}/*.log", mode: 'copy' input: path clean_gwas @@ -27,7 +27,7 @@ process Munge_LDSC_data { process Heritability_LDSC_data { - publishDir "${params.output_folder}/h2_data/", pattern: "*.log", mode: 'copy' + publishDir "${params.output_folder}/h2_data_${params.ancestry}_${params.hash_input}/", pattern: "*.log", mode: 'copy' input: path ldsc_data output: @@ -56,7 +56,7 @@ process Correlation_LDSC_data { memory {8.GB * task.attempt} time {24.h * task.attempt} - publishDir "${params.output_folder}/cor_data/", pattern: "*.log", mode: 'copy' + publishDir "${params.output_folder}/cor_data_${params.ancestry}_${params.hash_input}/", pattern: "*.log", mode: 'copy' input: path trait_pair path ldsc_data @@ -90,13 +90,13 @@ process Parsing_correlation_matrices { path ldsc_data path h2_ld output: - path "Covariance_matrix_H0.csv", emit: cov_H0_matrice_channel - path "Covariance_matrix_genetic.csv", emit: cov_gen_matrice_channel + path "Covariance_matrix_H0_${params.ancestry}_${params.hash_input}.csv", emit: cov_H0_matrice_channel + path "Covariance_matrix_genetic_${params.ancestry}_${params.hash_input}.csv", emit: cov_gen_matrice_channel path "*.csv", emit: parsing_results when: params.compute_LDSC_matrix """ - python3 ${parsing_script} + python3 ${parsing_script} ${params.ancestry} ${params.hash_input} """ } @@ -116,7 +116,7 @@ process Make_HeatMap { params.compute_LDSC_matrix """ - Rscript ${make_heatmap_script} Correlation_matrix_genetic.csv Pval_cor_matrix_genetic.csv + Rscript ${make_heatmap_script} Correlation_matrix_genetic_${params.ancestry}_${params.hash_input}.csv Pval_cor_matrix_genetic_${params.ancestry}_${params.hash_input}.csv """