Commit d2bb4089 authored by Hanna JULIENNE

Pipeline integrating jass_preprocessing, raiss and jass
import glob
import pandas as pd


def list_intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))


check_jass_prepro = False
check_imputation = True

if check_jass_prepro:
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/1._DATA/ldscore_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)
    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/pasteur/projets/policy01/PCMA/1._DATA/ldscore_data/" + tag, index_col=1, sep="\t")
        zs_aft = pd.read_csv("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/" + tag, index_col=1, sep="\t")
        print(zs_bef.head())
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "Z"]))

if check_imputation:
    print("Check imputation")
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/1._DATA/Imputed_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/imputed_GWAS/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)
    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/pasteur/projets/policy01/PCMA/1._DATA/Imputed_data/" + tag, index_col=0, sep="\t")
        zs_aft = pd.read_csv("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/imputed_GWAS/" + tag, index_col=0, sep="\t")
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        # Split on the imputation variance column: Var > -1.0 keeps imputed
        # SNPs, Var < -0.5 keeps the original (typed) ones.
        shared_imputed_SNP = zs_bef.loc[zs_bef.Var > -1.0].index.intersection(zs_aft.index)
        shared_original_SNP = zs_bef.loc[zs_bef.Var < -0.5].index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "z_score"]))
        print("Correlation imputed Z_score:")
        cor_imp = zs_aft.loc[shared_imputed_SNP, "Z"].corr(zs_bef.loc[shared_imputed_SNP, "z_score"])
        print(cor_imp)
        print("Correlation original Z_score:")
        print(zs_aft.loc[shared_original_SNP, "Z"].corr(zs_bef.loc[shared_original_SNP, "z_score"]))
        if cor_imp < 0.98:
            print("WARNING: Imputation is incoherent ({0}) for case {1}".format(cor_imp, tag))
import glob
import pandas as pd


def list_intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))


check_jass_prepro = False
check_imputation = True

if check_jass_prepro:
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/1._DATA/ldscore_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)
    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/mnt/atlas/PCMA/1._DATA/ldscore_data/" + tag, index_col=1, sep="\t")
        zs_aft = pd.read_csv("/mnt/atlas/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/" + tag, index_col=1, sep="\t")
        print(zs_bef.head())
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "Z"]))

if check_imputation:
    print("Check imputation")
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/1._DATA/Imputed_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/JASS_pipeline_output/imputed_GWAS/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)
    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/mnt/atlas/PCMA/1._DATA/Imputed_data/" + tag, index_col=0, sep="\t")
        zs_aft = pd.read_csv("/mnt/atlas/PCMA/JASS_pipeline_output/imputed_GWAS/" + tag, index_col=0, sep="\t")
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        shared_imputed_SNP = zs_bef.loc[zs_bef.Var > -1.0].index.intersection(zs_aft.index)
        shared_original_SNP = zs_bef.loc[zs_bef.Var < -0.5].index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "z_score"]))
        print("Correlation imputed Z_score:")
        print(zs_aft.loc[shared_imputed_SNP, "Z"].corr(zs_bef.loc[shared_imputed_SNP, "z_score"]))
        print("Correlation original Z_score:")
        print(zs_aft.loc[shared_original_SNP, "Z"].corr(zs_bef.loc[shared_original_SNP, "z_score"]))
import sys
import pandas as pd

# Recover the study tag (z_{Consortium}_{Outcome}) from the GWAS file name
# and look up its sample size in the metadata table.
f = sys.argv[1]
meta_data_loc = sys.argv[2]
d = f.split('.')[0]

D = pd.read_csv(meta_data_loc, sep="\t")
D.index = "z_" + D.Consortium + "_" + D.Outcome
Nsample = D.loc[d, "Nsample"]
print(Nsample)
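# Example invocation, as issued by the Munge_LDSC_data process below
# (the GWAS file name here is hypothetical):
#   python2.7 extract_sample_size.py z_CONSORTIUM_OUTCOME.txt test_final.csv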
import glob
import os
from itertools import combinations

cwd = os.getcwd()
print(cwd)

L = glob.glob('{}/*.gz'.format(cwd))
L_s = [j.split('/')[-1] for j in L]
L_combi = [",".join(map(str, comb)) for comb in combinations(L_s, 2)]
print(L_s)

# Write the trait pairs as ';'-separated lists, in chunks of 100 pairs per file.
size_chunk = 100
N_files = len(L_combi) // size_chunk
for i in range(N_files + 1):
    start = i * size_chunk
    end = (i + 1) * size_chunk
    if i < N_files:
        print(";".join(L_combi[start:end]), file=open("pairs_chunk_{0}.txt".format(i), "w"))
    else:
        print(";".join(L_combi[start:]), file=open("pairs_chunk_{0}.txt".format(i), "w"))
params.compute_project = false
params.compute_LDSC_matrix = false
params.compute_imputation = true
params.group = "$baseDir/input_files/group.txt"
group = file(params.group)

netPath = "/pasteur/projets/policy01/PCMA/"
output_folder = "/pasteur/projets/policy01/PCMA/JASS_pipeline_output/"
GWAS_labels = netPath + 'jass_analysis_pipeline/input_files/test_final.csv'
GWAS_path = "/pasteur/projets/policy01/PCMA/1._DATA/RAW.GWAS/"
diagnostic_folder = output_folder + "sample_size/"
harmonized_GWAS_folder = output_folder + "harmonized_GWAS/"
harmonized_GWAS_1_file_folder = output_folder + "harmonized_GWAS/"

// DSL1 channels can be consumed only once, hence the duplicated declarations.
chr_channel = Channel.from(1..22)
ref_chr_channel = Channel.fromPath("/pasteur/projets/policy01/PCMA/1._DATA/ImpG_refpanel/chr*.eur.1pct.bim")
ref_chr_channel2 = Channel.fromPath("/pasteur/projets/policy01/PCMA/1._DATA/ImpG_refpanel/chr*.eur.1pct.bim")
ref_chr_channel3 = Channel.fromPath("/pasteur/projets/policy01/PCMA/1._DATA/ImpG_refpanel/chr*.eur.1pct.*")
ld_channel = Channel.fromPath("/pasteur/projets/policy01/PCMA/WKD_Hanna/impute_for_jass/ld_block_new_plink/*.ld")
Region_channel = Channel.fromPath('/pasteur/projets/policy01/PCMA/jass_analysis_pipeline/input_files/fourier_ls-all.bed')
Region_channel2 = Channel.fromPath('/pasteur/projets/policy01/PCMA/jass_analysis_pipeline/input_files/fourier_ls-all.bed')
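/*
 * Compute per-chromosome allele frequencies of the reference panel with
 * plink --freq (one task per chromosome).
 */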
process Compute_MAF {

    input:
    file ref_panel from ref_chr_channel3.collect()
    val chr from chr_channel

    output:
    file "*.frq" into MAF_channel

    """
    echo "Compute_MAF"
    prefix="chr"
    suffix=".eur.1pct"
    bfile="\${prefix}${chr}\${suffix}"
    echo \$bfile
    plink --bfile \${bfile} --freq --out ./chr${chr}
    """
}
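/*
 * Merge the per-chromosome .frq and .bim files into a single whole-genome
 * reference table, dropping strand-ambiguous (A/T, C/G) variants.
 */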
process create_WG_reference_panel {
    publishDir "${baseDir}/Ref_Panel", pattern: "*.csv", mode: 'copy'

    input:
    file maf_files from MAF_channel.collect()
    file chr_files from ref_chr_channel.collect()

    output:
    file "1000G_EUR_0_01.csv" into ref_panel_wg_channel

    """
    #!/usr/bin/env python3
    import pandas as pd

    prefix = "chr"
    suffix = ".eur.1pct"
    refchr_list = []
    for chrom in range(1, 23):
        fi = "{0}{1}{2}.bim".format(prefix, chrom, suffix)
        print(fi)
        position = pd.read_csv(fi, sep='\t', names=['chr', "rsid", "?", "pos", "ref_al", "alt_al"])
        position.set_index("rsid", inplace=True)
        ref_chr = pd.read_csv("./chr{0}.frq".format(chrom), sep="\\s+")
        ref_chr['pos'] = position.loc[ref_chr.SNP, "pos"].values
        refchr_list.append(ref_chr[["CHR", "pos", "SNP", "A1", "A2", "MAF"]])

    ref = pd.concat(refchr_list)
    # Drop strand-ambiguous (palindromic) variants before writing the panel.
    ref.loc[~(ref.A1 + ref.A2).isin(["AT", 'TA', 'CG', 'GC'])].to_csv("1000G_EUR_0_01.csv", index=False, header=False, sep="\t")
    """
}
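/*
 * Split the GWAS description file into one two-line metadata file per study
 * (header plus study row), so each study can be cleaned independently.
 */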
process meta_data_GWAS {

    output:
    file "meta_data_chk*" into meta_data mode flatten

    """
    d=`wc -l ${GWAS_labels}`
    e=`echo \$d | cut -d ' ' -f 1`
    for ((i = 2; i <= \$e; i++));
    do
        head -n1 ${GWAS_labels} > "meta_data_chk\$i.csv"
        head -n \$i ${GWAS_labels} | tail -n 1 >> "meta_data_chk\$i.csv"
    done
    """
}
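/*
 * Harmonize each raw GWAS against the whole-genome reference panel with
 * jass_preprocessing (per-chromosome files plus one whole-genome file).
 */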
process Clean_GWAS {
    //publishDir "${output_folder}/harmonized_GWAS", pattern: "*.txt", mode: 'copy'
    //publishDir "${output_folder}", pattern: "harmonized_GWAS_1_file/*.txt", mode: 'copy'

    input:
    file ref_panel from ref_panel_wg_channel
    file meta_chunk from meta_data

    output:
    file "harmonized_GWAS_1_file/*.txt" into cleaned_gwas mode flatten
    file "*.txt" into cleaned_gwas_chr_channel mode flatten
    file "*.txt" into cleaned_gwas_chr_channel2 mode flatten

    """
    mkdir harmonized_GWAS_1_file
    echo ${GWAS_path}
    working_dir=\$(pwd)
    echo \$working_dir
    full_path=\$working_dir/${meta_chunk}
    echo "full_path"
    echo \$full_path
    jass_preprocessing --gwas-info \$full_path --ref-path ${ref_panel} \
        --input-folder ${GWAS_path} --diagnostic-folder ${diagnostic_folder} \
        --output-folder ./ --output-folder-1-file harmonized_GWAS_1_file/
    """
}
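/*
 * Impute missing z-scores per chromosome with RAISS; input files are expected
 * to be named like z_{consortium}_{outcome}_chr{N}.txt, from which the study
 * tag and chromosome are parsed.
 */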
process Impute_GWAS {
    //publishDir "${output_folder}", pattern: "imputed_GWAS/*.txt", mode: 'copy'
    memory '8 GB'
    time '8h'

    input:
    file gwas_files from cleaned_gwas_chr_channel
    file ref_file from ref_chr_channel2.collect()
    file ld_file from ld_channel.collect()

    output:
    file "imputed_GWAS/*.txt" into imputed_gwas_channel
    file "imputed_GWAS/*.txt" into imputed_gwas_channel2

    when:
    params.compute_imputation

    script:
    """
    mkdir imputed_GWAS
    chrom=\$(echo ${gwas_files} | cut -d '_' -f4 | cut -d "." -f1)
    study=\$(echo ${gwas_files} | cut -d '_' -f2,3)
    echo \$chrom
    echo \$study
    raiss --chrom \${chrom} --gwas \${study} --ref-folder ./ --R2-threshold 0.8 --ld-folder ./ --zscore-folder ./ --output-folder ./imputed_GWAS
    """
}
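/*
 * Pass-through used when imputation is disabled: forwards the harmonized
 * per-chromosome files unchanged to the init-table processes.
 */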
process Do_not_Impute_GWAS {

    input:
    file gwas_files from cleaned_gwas_chr_channel2.collect()

    output:
    file "*.txt" into not_imputed_gwas_channel
    file "*.txt" into not_imputed_gwas_channel2

    when:
    !params.compute_imputation

    script:
    """
    """
}
/*
 * Processes related to LD-score (LDSC) calculation.
 */
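/*
 * Reformat each harmonized GWAS with LDSC's munge_sumstats.py: the header is
 * first rewritten so the A0/A1 allele columns match LDSC's A1/A2 convention,
 * and the study sample size is retrieved via extract_sample_size.py.
 */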
process Munge_LDSC_data {
    //publishDir "${baseDir}", pattern: "ldsc_data/data_*.sumstats.gz", mode: 'copy'

    input:
    file clean_gwas from cleaned_gwas

    output:
    file "ldsc_data/*.sumstats.gz" into ldsc_data_channel
    file "ldsc_data/*.sumstats.gz" into ldsc_data_channel_bis

    when:
    params.compute_LDSC_matrix

    """
    Nsamp=\$(python2.7 ${baseDir}/extract_sample_size.py ${clean_gwas} ${GWAS_labels})
    if [ ! -d "ldsc_data" ]
    then
        mkdir ldsc_data
    fi
    tag=\$(echo ${clean_gwas} | cut -d '.' -f1 | cut -d '_' -f2,3)
    echo \$tag
    echo \$Nsamp
    cat $clean_gwas | sed '1s/A1/A2/g' | sed '1s/A0/A1/g' > corrected_gwas.txt
    head corrected_gwas.txt
    munge_sumstats.py --sumstats corrected_gwas.txt --N \$Nsamp --out ldsc_data/data_\$tag
    """
}
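/*
 * Enumerate all pairs of munged traits and write them in chunks
 * (see generate_trait_pairs.py above).
 */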
process Generate_trait_pair {

    input:
    file ldsc_data from ldsc_data_channel.collect()

    output:
    file "pairs_chunk_*.txt" into combi_channel mode flatten

    when:
    params.compute_LDSC_matrix

    """
    python3 ${baseDir}/generate_trait_pairs.py
    """
}
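/*
 * Run LDSC genetic correlation (ldsc.py --rg) on each trait pair of a chunk;
 * one .log file is produced per pair.
 */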
process Correlation_LDSC_data {
    //publishDir "${baseDir}/cor_data/", pattern: "*.log", mode: 'copy'

    input:
    file trait_pair from combi_channel
    file ldsc_data from ldsc_data_channel_bis.collect()

    output:
    file "*.log" into cor_log_channel

    when:
    params.compute_LDSC_matrix

    """
    echo ${trait_pair}
    IFS=';' read -ra my_trait <<< "\$(cat ${trait_pair})"
    for trait_p in \${my_trait[@]}
    do
        echo \$trait_p
        trait1=\$(echo \$trait_p | cut -d '.' -f1 | cut -d '_' -f2,3)
        trait2=\$(echo \$trait_p | cut -d ',' -f2 | cut -d '.' -f1 | cut -d '_' -f2,3)
        ldsc.py --rg \$trait_p --out \${trait1}-_-\${trait2} --ref-ld-chr ${baseDir}/eur_w_ld_chr/ --w-ld-chr ${baseDir}/eur_w_ld_chr/
    done
    """
}
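/*
 * Parse all pairwise LDSC logs into the covariance matrix under H0 consumed
 * by jass create-inittable (see parse_correlation_results.py below).
 */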
process Correlation_matrices {
    //publishDir "${baseDir}/Correlation_matrices/", pattern: "*.csv", mode: 'copy'

    input:
    file ldsc_data from cor_log_channel.collect()

    output:
    file "Covariance_matrix_H0.csv" into cov_H0_matrice_channel

    when:
    params.compute_LDSC_matrix

    """
    python3 ${baseDir}/parse_correlation_results.py
    """
}
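/*
 * Build the JASS init table from the (optionally imputed) GWAS files,
 * including the LDSC covariance matrix under H0.
 */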
process Create_inittable_LDSC {
    publishDir "${output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy'

    input:
    file cleaned_gwas_chr from imputed_gwas_channel.mix(not_imputed_gwas_channel).collect()
    file Covariance_H0 from cov_H0_matrice_channel
    file Regions from Region_channel

    output:
    file "*.hdf5" into init_table_channel

    when:
    params.compute_LDSC_matrix

    """
    gwas_list_str=`echo ${cleaned_gwas_chr}`
    e="\${gwas_list_str// /;}"
    echo "\$e"
    date_init=\$(date +"%m_%d_%Y-%H:%M")
    init_name="inittable_LDSC_\$date_init.hdf5"
    jass create-inittable --input-data-path "\$e" --init-covariance-path ${Covariance_H0} --regions-map-path ${Regions} --description-file-path ${GWAS_labels} --init-table-path \$init_name
    """
}
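/*
 * Variant of Create_inittable_LDSC used when params.compute_LDSC_matrix is
 * disabled: builds the init table without the LDSC covariance matrix.
 */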
process Create_inittable {
    publishDir "${output_folder}/init_table/", pattern: "*.hdf5", mode: 'copy'

    input:
    file cleaned_gwas_chr from imputed_gwas_channel2.mix(not_imputed_gwas_channel2).collect()
    file Regions from Region_channel2

    output:
    file "*.hdf5" into init_table_channel_no_ldsc

    when:
    !params.compute_LDSC_matrix

    """
    gwas_list_str=`echo ${cleaned_gwas_chr}`
    e="\${gwas_list_str// /;}"
    echo "\$e"
    date_init=\$(date +"%m_%d_%Y-%H:%M")
    init_name="inittable_NO_LDSC_\$date_init.hdf5"
    jass create-inittable --input-data-path "\$e" --regions-map-path ${Regions} --description-file-path ${GWAS_labels} --init-table-path \$init_name
    """
}
/*
 * Generate one phenotype list per group (one line of group.txt per file).
 */
process get_pheno_group {

    input:
    file 'group.txt' from group

    output:
    file "pheno_group_*" into pheno_group_channel

    when:
    params.compute_project

    """
    split -l 1 group.txt pheno_group_
    """
}
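/*
 * For each phenotype group, derive a JASS worktable plus quadrant and
 * Manhattan plots from the init table.
 */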
process Create_project_data {
    publishDir "${baseDir}/worktable/", pattern: "worktable_*.hdf5", mode: 'move'
    publishDir "${baseDir}/quadrant/", pattern: "quadrant_*.png", mode: 'move'
    publishDir "${baseDir}/manhattan/", pattern: "manhattan_*.png", mode: 'move'

    input:
    file pheno_group from pheno_group_channel.collect()
    file init_table from init_table_channel.mix(init_table_channel_no_ldsc)

    output:
    file "worktable*.hdf5" into worktable_channel
    file "*.png" into plot_channel
    stdout result

    when:
    params.compute_project

    """
    for pheno_line in ${pheno_group}
    do
        echo \$pheno_line
        group_tag="\$(cat \$pheno_line | cut -d';' -f1)"
        pheno_list="\$(cat \$pheno_line | cut -d';' -f2)"
        echo \$group_tag
        echo \$pheno_list
        jass create-project-data --phenotypes \$pheno_list --init-table-path ./${init_table} --worktable-path ./worktable_bis_\$group_tag.hdf5 --manhattan-plot-path ./manhattan_\$group_tag.png --quadrant-plot-path ./quadrant_\$group_tag.png
    done
    """
}
result.println { it.trim() }
dag {
    enabled = true
    file = 'dag.dot'
}

report {
    enabled = true
    file = 'nextflow_logs/report.html'
}

trace {
    enabled = true
    file = 'nextflow_logs/trace.txt'
}

singularity {
    enabled = true
    autoMounts = true
    runOptions = '--home $HOME:/home/$USER'
}

process {
    executor = 'slurm'
    maxErrors = 10
    maxRetries = 3
    maxForks = 400
    queueSize = 500
    errorStrategy = 'retry'
    cache = 'deep'

    withName: 'Compute_MAF' {
        container = 'plink_1.90b5--heea4ae3_0.sif'
        cpus = 1
    }
    //docker://quay.io/biocontainers/jass_preprocessing:1.0--py_0
    withName: 'create_WG_reference_panel' {
        container = 'jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'meta_data_GWAS' {
        cpus = 1
    }
    withName: 'Clean_GWAS' {
        //container='jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'Impute_GWAS' {
        container = 'raiss_2.0--py_0.sif'
        cpus = 1
    }
    withName: 'Munge_LDSC_data' {
        container = 'ldsc_1.0.1--py_0.sif'
        cpus = 1
    }
    withName: 'Generate_trait_pair' {
        container = 'jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'Correlation_LDSC_data' {
        container = 'ldsc_1.0.1--py_0.sif'
        cpus = 1
    }
    withName: 'Correlation_matrices' {
        container = 'jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'Create_inittable_LDSC' {
        container = 'jass_2.0--pyh5ca1d4c_0.sif'
        cpus = 1
    }
    withName: 'Create_inittable' {
        container = 'jass_2.0--pyh5ca1d4c_0.sif'
        cpus = 1
    }
    withName: 'get_pheno_group' {
        cpus = 1
    }
    withName: 'Create_project_data' {
        container = 'jass_2.0--pyh5ca1d4c_0.sif'
        cpus = 1
    }
}
import glob
import re

import numpy as np
import pandas as pd

print("Parsing_correlation")
file_input = glob.glob("*.log")
print(file_input)


def get_trait(fi):
    # Log files are named {trait1}-_-{trait2}.log by Correlation_LDSC_data.
    traits = re.search('([-0-9A-Z]+_[-0-9A-Z-]+)-_-([-0-9A-Z]+_[-0-9A-Z-]+).log', fi)
    return [traits.group(1), traits.group(2)]


traits = [get_trait(fi) for fi in file_input]
# Flatten the list of pairs and deduplicate.
traits = list(set([trait for trait_pair in traits for trait in trait_pair]))
print(traits)
traits_ind = 'z_' + pd.Series(traits)

### Create matrices:
Covariance_matrix_H0 = pd.DataFrame(index=traits_ind, columns=traits_ind)
Covariance_matrix_genetic = pd.DataFrame(index=traits_ind, columns=traits_ind)
Correlation_matrix_genetic = pd.DataFrame(index=traits_ind, columns=traits_ind)