Commit c3298448 authored by Hanna JULIENNE

Merge branch 'master' into 'heatmap'

# Conflicts:
#   README.md
parents 2e1c3cdd 84ce2ec4
1 merge request: !10 Updated doc + provided easy link to reference panel
JASS Pipeline © 2019 by Institut Pasteur is licensed under Attribution-NonCommercial-ShareAlike 4.0 International. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc-sa/4.0/
@@ -81,7 +81,9 @@ Once done you can launch the pipeline by replacing {ABSOLUTE_PATH_TO_PIPEL
nextflow run jass_pipeline.nf --ref_panel_WG {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/Ref_Panel/1000G_EAS_0_01_chr22_21.csv --gwas_folder {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/test_data/hg38_EAS/ --meta-data {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/input_files/Data_test_EAS.csv --region {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/input_files/All_Regions_ALL_ensemble_1000G_hg38_EAS.bed --group {ABSOLUTE_PATH_TO_PIPELINE_FOLDER}/input_files/group.txt -with-report jass_report.html -c nextflow_local.config
```
See the description of required parameters in the next section.
-You can specify parameter in the jass_pipeline.nf header if prefered.
+In the command above, parameters are passed directly to Nextflow on the command line; these override any values specified in the jass_pipeline.nf header.
+> **However, you can still specify or modify parameters in the jass_pipeline.nf header if preferred.**
If all went well, you have cleaned the three summary statistics files, aligned them on the reference panel, and integrated them into one database. This database was then used to perform a multi-trait GWAS on the three traits.
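To make the precedence concrete, here is a minimal, hypothetical Nextflow script (`demo.nf` is not part of the pipeline): a default assigned in the script header is used unless the same parameter is supplied with a double-dash flag, while single-dash options such as `-with-report` and `-c` are consumed by Nextflow itself.

```groovy
// demo.nf -- hypothetical example, not a JASS file
params.gwas_folder = "${projectDir}/test_data/hg38_EAS/"  // header default

workflow {
    // prints the header default, or the CLI value if --gwas_folder was given
    println "gwas_folder = ${params.gwas_folder}"
}

// nextflow run demo.nf                            -> header default
// nextflow run demo.nf --gwas_folder /other/dir/  -> CLI value wins
```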
@@ -153,7 +155,7 @@ and require an HPC cluster.
tar -jxvf eur_w_ld_chr.tar.bz2
```
-For hg38 and Admixed American, African, East Asian, European and South-east Asian ancestries you can download our precomputed panels:
+For **hg38 and Admixed American, African, East Asian, European and South-east Asian ancestries** you can download our precomputed panels:
[https://zenodo.org/record/8096589](https://zenodo.org/record/8096589)
For details on the computation method for this panel consult [https://www.biorxiv.org/content/10.1101/2023.06.23.546248v1](https://www.biorxiv.org/content/10.1101/2023.06.23.546248v1).
......
@@ -23,6 +23,12 @@ setkey(trait_name, "ID")
Rho = fread(correlation_file)
setkey(Rho, "V1")
+D = as.matrix(1 - Rho[,-1])               # distance = 1 - correlation
+D[is.na(D)] = 0
+h = hclust(as.dist(D))                    # hierarchical clustering of traits
+order_trait <- trait_name$ID[h$order]     # display traits in clustering order
Pval = fread(p_file)
setkey(Pval, "V1")
@@ -30,7 +36,6 @@ Rho = Rho[trait_name$ID , c("V1",trait_name$ID), with=FALSE]
Pval = Pval[trait_name$ID , c("V1",trait_name$ID), with=FALSE]
cor_long = melt(as.data.table(Rho), id.vars=c('V1'))
-order_trait <- trait_name$ID
pval_long = melt(as.data.table(Pval), id.vars=c('V1'))
@@ -45,6 +50,8 @@ pval_long[,cor_val:= signif(round(cor_long$value, 2), 2)]
pval_long[,full_annot := str_trim(paste0(cor_val, "\n",pval_annote))]
pval_long$full_annot[pval_long$full_annot == "1"] <- ""
pval_long$full_annot[pval_long$full_annot == "NA"] <- ""
cor_long[, variable:=ordered(cor_long$variable, levels=order_trait)]
cor_long[, V1:=ordered(cor_long$V1, levels=order_trait)]
@@ -65,7 +72,10 @@ pG = pG + theme(legend.key.size = unit(1, 'cm'), #change legend key size
pG = pG + geom_text(data=pval_long, aes(x=V1,y=variable, label=full_annot), size=3.5)
png(paste("HeatMap_", strsplit(Args[6], split=".", fixed=T)[[1]][1],".png", sep=""),width = 20, height = 20,
png_width <- max(length(trait_name$ID)*0.5, 10)
png_height <- max(length(trait_name$ID)*0.5, 10)
png(paste("HeatMap_", strsplit(Args[6], split=".", fixed=T)[[1]][1],".png", sep=""),width =png_width , height = png_height,
units = "in",
res = 300,
pointsize = 4)
......
@@ -2,6 +2,10 @@ import re
import pandas as pd
import glob
import numpy as np
+import sys
+ancestry = sys.argv[1]       # ancestry tag, e.g. "EAS"
+current_date = sys.argv[2]   # run identifier (the pipeline passes params.hash_input here)
print("Parsing_correlation")
file_pairs = set(glob.glob("*-_-*.log"))
@@ -134,13 +138,13 @@ for i1, t1 in enumerate(traits):
Sd_cov_matrix_genetic.loc[t2_col, t2_col] = float(L_h2_t2[0].split(":")[1].split(" ")[2].strip("()\n"))
-Covariance_matrix_genetic.to_csv("Covariance_matrix_genetic.csv", sep="\t")
-Covariance_matrix_H0.to_csv("Covariance_matrix_H0.csv", sep="\t")
-Correlation_matrix_genetic.to_csv("Correlation_matrix_genetic.csv", sep="\t")
+# suffix outputs with ancestry and run id so runs do not overwrite each other
+Covariance_matrix_genetic.to_csv("Covariance_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t")
+Covariance_matrix_H0.to_csv("Covariance_matrix_H0_"+ancestry+"_"+current_date+".csv", sep="\t")
+Correlation_matrix_genetic.to_csv("Correlation_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t")
-Sd_cov_matrix_genetic.to_csv("Sd_cov_matrix_genetic.csv", sep="\t")
-Sd_matrix_H0.to_csv("Sd_matrix_H0.csv", sep="\t")
-Sd_cor_matrix_genetic.to_csv("Sd_cor_matrix_genetic.csv", sep="\t")
-Pval_matrix_genetic.to_csv("Pval_cor_matrix_genetic.csv", sep="\t")
+Sd_cov_matrix_genetic.to_csv("Sd_cov_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t")
+Sd_matrix_H0.to_csv("Sd_matrix_H0_"+ancestry+"_"+current_date+".csv", sep="\t")
+Sd_cor_matrix_genetic.to_csv("Sd_cor_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t")
+Pval_matrix_genetic.to_csv("Pval_cor_matrix_genetic_"+ancestry+"_"+current_date+".csv", sep="\t")
print("Parsing_correlation")
filename internalDataLink Consortium Outcome FullName Type Reference ReferenceLink dataLink Nsample Ncase Ncontrol Nsnp snpid a1 a2 freq pval n z OR se code imp ncas ncont altNcas altNcont Unnamed: 28
GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt GIANT HEIGHT Height Anthropometry Wood et al. 2014 http://www.ncbi.nlm.nih.gov/pubmed/25282103 https://www.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files 253288 2550858.0 MarkerName Allele1 Allele2 Freq.Allele1.HapMapCEU p N b SE
SNP_gwas_mc_merge_nogc.tbl.uniq GIANT BMI Body Mass Index Anthropometry Locke et al. 2015 http://www.ncbi.nlm.nih.gov/pubmed/25673413 https://www.broadinstitute.org/collaboration/giant/index.php/GIANT_consortium_data_files 339224 2554637.0 SNP A1 A2 Freq1.Hapmap p N b se
@@ -16,12 +16,12 @@ params.meta_data = "${projectDir}"+"/input_files/Data_test_EAS.csv"
// "${projectDir}"+"/input_files/Meta_data_preliminary_analysis.csv" file describing gwas summary statistic format
params.gwas_folder = "${projectDir}"+'/test_data/hg38_EAS/'
-params.region = Channel.fromPath("${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_EUR.bed")
+params.region = Channel.fromPath("${projectDir}"+"/input_files/All_Regions_ALL_ensemble_1000G_hg38_EAS.bed")
params.ref_panel_WG = "${projectDir}"+"/Ref_Panel/1000G_EAS_0_01_chr22_21.csv"
params.ancestry="EAS"
params.prefix="ALL_ensemble_1000G_hg38_EUR_chr"
params.prefix="ALL_ensemble_1000G_hg38_EAS_chr"
params.prefix_Impute_GWAS="ALL_ensemble_1000G_hg38_EAS_"
params.suffix=""
params.LD_SCORE_folder='/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LDSCORE/EAS/'
@@ -45,7 +45,7 @@ params.ld_type = "plink"
params.ld_folder = Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/Matrix_LD_RAISS/EAS/*.ld").collect()
params.ref_panel = '/pasteur/zeus/projets/p02/GGS_JASS/jass_analysis_pipeline/Ref_panel_by_chr/'
chr_channel = Channel.from(1..22)
-params.ref_chr_path= Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/EUR/ALL_ensemble_1000G_hg38_EUR_chr*.bim").collect()
+params.ref_chr_path= Channel.fromPath("/pasteur/zeus/projets/p02/GGS_WKD/DATA_1000G/Panels/EAS/ALL_ensemble_1000G_hg38_EAS_chr*.bim").collect()
params.perform_sanity_checks=false
params.perform_accuracy_checks=false
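As an aside on the channel definitions above, here is a minimal sketch of how `Channel.fromPath(...).collect()` behaves (the path below is illustrative, not a real location):

```groovy
// fromPath expands the glob into one item per chromosome file;
// collect() bundles them into a single list so a downstream process
// can stage all .bim files in one task.
Channel
    .fromPath("/data/Panels/EAS/ALL_ensemble_1000G_hg38_EAS_chr*.bim")
    .collect()
    .view { files -> "staged ${files.size()} .bim files" }
```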
@@ -58,6 +58,34 @@ generate_trait_pairs_channel = "${projectDir}/bin/generate_trait_pairs.py"
parse_correlation_channel = "${projectDir}/bin/parse_correlation_results.py"
make_heatmap_channel = "${projectDir}/bin/make_heatmap.R"
+// the output folder id will be a hash of all parameter values
+string_input = String.valueOf(params.compute_project) +
+    String.valueOf(params.compute_LDSC_matrix) +
+    String.valueOf(params.compute_imputation) +
+    String.valueOf(params.meta_data) +
+    String.valueOf(params.gwas_folder) +
+    String.valueOf(params.region) +
+    String.valueOf(params.ref_panel_WG) +
+    String.valueOf(params.ancestry) +
+    String.valueOf(params.prefix) +
+    String.valueOf(params.prefix_Impute_GWAS) +
+    String.valueOf(params.suffix) +
+    String.valueOf(params.LD_SCORE_folder) +
+    String.valueOf(params.output_folder) +
+    String.valueOf(params.diagnostic_folder) +
+    String.valueOf(params.r2threshold) +
+    String.valueOf(params.eigenthreshold) +
+    String.valueOf(params.minimumld) +
+    String.valueOf(params.ld_type) +
+    String.valueOf(params.ld_folder) +
+    String.valueOf(params.ref_panel) +
+    String.valueOf(params.ref_chr_path) +
+    String.valueOf(params.perform_sanity_checks) +
+    String.valueOf(params.perform_accuracy_checks) +
+    String.valueOf(params.group)
+params.hash_params = string_input.md5()
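A quick sketch of the idea behind `params.hash_params` (the values below are illustrative, not the real parameter set): Nextflow extends strings with an `md5()` helper, so hashing the concatenated parameter values yields a run identifier that is stable for identical settings and changes whenever any parameter changes.

```groovy
// illustrative values only
def run_a = "EAS" + "plink" + "false"
def run_b = "EUR" + "plink" + "false"   // one parameter differs
assert run_a.md5() != run_b.md5()       // any changed parameter changes the hash
println run_a.md5()                     // 32-character hex digest
```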
/*****************************/
/* process inclusion */
/*****************************/
......
@@ -5,8 +5,7 @@ process Meta_data_GWAS{
output:
path "meta_data_chk*.csv"
"""
-d=`wc -l ${pheno_list}`
-e=`echo \$d | cut -d ' ' -f 1`
+e=\$(grep -c '' ${pheno_list})   # robust line count: also counts a final line without a trailing newline
for ((i = 2; i <= \$e; i++));
do
@@ -17,16 +16,16 @@
}
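As an aside, and not what the pipeline does: Nextflow's built-in `splitCsv` operator can produce a similar one-row-per-chunk split of the metadata file without a shell loop (the tab separator and `filename` column are assumptions based on the metadata format shown earlier in this diff):

```groovy
// hypothetical alternative to the shell loop above
Channel
    .fromPath("${projectDir}/input_files/Data_test_EAS.csv")
    .splitCsv(header: true, sep: '\t')
    .view { row -> "one chunk per GWAS: ${row.filename}" }
```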
process Clean_GWAS{
publishDir "${params.output_folder}/harmonized_GWAS_files/", pattern: "*.txt", mode: 'copy'
publishDir "${params.output_folder}", pattern: "harmonized_GWAS_1_file/*.txt", mode: 'copy'
publishDir "${params.output_folder}/harmonized_GWAS_files_${params.ancestry}_${params.hash_input}/", pattern: "*.txt", mode: 'copy'
publishDir "${params.output_folder}", pattern: "harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input}/*.txt", mode: 'copy'
input:
path ref_panel
path meta_chunk
output:
path "harmonized_GWAS_1_file/*.txt", emit: cleaned_gwas_channel
path "harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input}/*.txt", emit: cleaned_gwas_channel
path "*.txt", emit: cleaned_gwas_chr_channel
"""
-mkdir -p harmonized_GWAS_1_file
+mkdir -p harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input}
pwd
ls ${params.gwas_folder}
echo ${params.gwas_folder}
@@ -38,6 +37,6 @@ process Clean_GWAS{
jass_preprocessing --gwas-info \$full_path --ref-path ${ref_panel} \
--input-folder ${params.gwas_folder} --diagnostic-folder ${params.diagnostic_folder} \
---output-folder ./ --output-folder-1-file harmonized_GWAS_1_file/
+--output-folder ./ --output-folder-1-file harmonized_GWAS_1_file_${params.ancestry}_${params.hash_input}/
"""
}
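The renamed `publishDir` targets in this commit all follow one pattern; here is a minimal, hypothetical process showing how interpolating `params.ancestry` and `params.hash_input` into the target gives every ancestry/run combination its own output directory:

```groovy
// hypothetical stand-alone example of the publishDir naming scheme
params.output_folder = "results"
params.ancestry      = "EAS"
params.hash_input    = "abc123"   // would normally come from the parameter hash

process Demo {
    publishDir "${params.output_folder}/harmonized_GWAS_files_${params.ancestry}_${params.hash_input}/",
               pattern: "*.txt", mode: 'copy'
    output:
    path "*.txt"
    """
    echo ok > demo.txt
    """
}

workflow { Demo() }
```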
process Create_inittable_LDSC {
publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy'
publishDir "${params.output_folder}/init_table_${params.ancestry}_${params.hash_input}/", pattern: "*.hdf5", mode: 'copy'
input:
path cleaned_gwas_chr
path cleaned_gwas
@@ -22,7 +22,7 @@ process Create_inittable_LDSC {
}
process Create_inittable {
publishDir "${params.output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy'
publishDir "${params.output_folder}/init_table_${params.ancestry}_${params.hash_input}/", pattern: "*.hdf5", mode: 'copy'
input:
path cleaned_gwas_chr
path cleaned_gwas
@@ -54,7 +54,7 @@ process Get_pheno_group {
}
process Create_project_data {
publishDir "${params.output_folder}/worktable/", pattern:"worktable_*.hdf5", mode: 'copy'
publishDir "${params.output_folder}/worktable_${params.ancestry}_${params.hash_input}/", pattern:"worktable_*.hdf5", mode: 'copy'
publishDir "${params.output_folder}/quadrant/", pattern:"quadrant_*.png", mode: 'copy'
publishDir "${params.output_folder}/manhattan/", pattern:"manhattan_*.png", mode: 'copy'
input:
@@ -77,4 +77,4 @@
jass create-project-data --phenotypes \$pheno_list --init-table-path ./${init_table} --worktable-path ./worktable_bis_\$group_tag.hdf5 --manhattan-plot-path ./manhattan_\$group_tag.png --quadrant-plot-path ./quadrant_\$group_tag.png
done
"""
-}
\ No newline at end of file
+}
process Munge_LDSC_data {
publishDir "${params.output_folder}", pattern: "ldsc_data/data_*.sumstats.gz", mode: 'copy'
publishDir "${params.output_folder}", pattern: "ldsc_data/*.log", mode: 'copy'
publishDir "${params.output_folder}", pattern: "ldsc_data_${params.ancestry}_${params.hash_input}/data_*.sumstats.gz", mode: 'copy'
publishDir "${params.output_folder}", pattern: "ldsc_data_${params.ancestry}_${params.hash_input}/*.log", mode: 'copy'
input:
path clean_gwas
@@ -27,7 +27,7 @@
process Heritability_LDSC_data {
publishDir "${params.output_folder}/h2_data/", pattern: "*.log", mode: 'copy'
publishDir "${params.output_folder}/h2_data_${params.ancestry}_${params.hash_input}/", pattern: "*.log", mode: 'copy'
input:
path ldsc_data
output:
@@ -56,7 +56,7 @@ process Correlation_LDSC_data {
memory {8.GB * task.attempt}
time {24.h * task.attempt}
publishDir "${params.output_folder}/cor_data/", pattern: "*.log", mode: 'copy'
publishDir "${params.output_folder}/cor_data_${params.ancestry}_${params.hash_input}/", pattern: "*.log", mode: 'copy'
input:
path trait_pair
path ldsc_data
@@ -90,13 +90,13 @@ process Parsing_correlation_matrices {
path ldsc_data
path h2_ld
output:
path "Covariance_matrix_H0.csv", emit: cov_H0_matrice_channel
path "Covariance_matrix_genetic.csv", emit: cov_gen_matrice_channel
path "Covariance_matrix_H0_${params.ancestry}_${params.hash_input}.csv", emit: cov_H0_matrice_channel
path "Covariance_matrix_genetic_${params.ancestry}_${params.hash_input}.csv", emit: cov_gen_matrice_channel
path "*.csv", emit: parsing_results
when:
params.compute_LDSC_matrix
"""
-python3 ${parsing_script}
+python3 ${parsing_script} ${params.ancestry} ${params.hash_input}
"""
}
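For clarity on how `${params.ancestry}` and `${params.hash_input}` reach the Python script: values interpolated into a process script block become ordinary command-line arguments, which the script reads back via `sys.argv`. A minimal, hypothetical process:

```groovy
params.ancestry   = "EAS"
params.hash_input = "abc123"

process ParseDemo {
    output:
    stdout
    """
    python3 -c 'import sys; print(sys.argv[1], sys.argv[2])' ${params.ancestry} ${params.hash_input}
    """
}

workflow { ParseDemo() | view }
```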
@@ -116,7 +116,7 @@ process Make_HeatMap {
params.compute_LDSC_matrix
"""
-Rscript ${make_heatmap_script} Correlation_matrix_genetic.csv Pval_cor_matrix_genetic.csv
+Rscript ${make_heatmap_script} Correlation_matrix_genetic_${params.ancestry}_${params.hash_input}.csv Pval_cor_matrix_genetic_${params.ancestry}_${params.hash_input}.csv
"""
......