Commit d2bb4089 authored by Hanna Julienne

Pipeline integrating jass_preprocessing, raiss and jass

import pandas as pd
import glob


def list_intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))


check_jass_prepro = False
check_imputation = True

if check_jass_prepro:
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/1._DATA/ldscore_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)

    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/pasteur/projets/policy01/PCMA/1._DATA/ldscore_data/" + tag, index_col=1, sep="\t")
        zs_aft = pd.read_csv("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/" + tag, index_col=1, sep="\t")
        print(zs_bef.head())
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "Z"]))

if check_imputation:
    print("Check imputation")
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/1._DATA/Imputed_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/imputed_GWAS/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)

    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/pasteur/projets/policy01/PCMA/1._DATA/Imputed_data/" + tag, index_col=0, sep="\t")
        zs_aft = pd.read_csv("/pasteur/projets/policy01/PCMA/JASS_pipeline_output/imputed_GWAS/" + tag, index_col=0, sep="\t")
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        # SNPs imputed by raiss have Var > -1; SNPs kept from the original GWAS are flagged with Var == -1
        shared_imputed_SNP = zs_bef.loc[zs_bef.Var > -1.0].index.intersection(zs_aft.index)
        shared_original_SNP = zs_bef.loc[zs_bef.Var < -0.5].index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "z_score"]))
        print("Correlation imputed Z_score:")
        cor_imp = zs_aft.loc[shared_imputed_SNP, "Z"].corr(zs_bef.loc[shared_imputed_SNP, "z_score"])
        print(cor_imp)
        print("Correlation original Z_score:")
        print(zs_aft.loc[shared_original_SNP, "Z"].corr(zs_bef.loc[shared_original_SNP, "z_score"]))
        if cor_imp < 0.98:
            print("WARNING: Imputation is incoherent ({0}) for case {1}".format(cor_imp, tag))
import pandas as pd
import glob


def list_intersection(lst1, lst2):
    return list(set(lst1) & set(lst2))


check_jass_prepro = False
check_imputation = True

if check_jass_prepro:
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/1._DATA/ldscore_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)

    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/mnt/atlas/PCMA/1._DATA/ldscore_data/" + tag, index_col=1, sep="\t")
        zs_aft = pd.read_csv("/mnt/atlas/PCMA/JASS_pipeline_output/harmonized_GWAS_1_LD/" + tag, index_col=1, sep="\t")
        print(zs_bef.head())
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "Z"]))

if check_imputation:
    print("Check imputation")
    LD_BEF = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/1._DATA/Imputed_data/z_*.txt")]
    LD_AFT = [fp.split("/")[-1] for fp in glob.glob("/mnt/atlas/PCMA/JASS_pipeline_output/imputed_GWAS/z_*.txt")]
    LD_set = list_intersection(LD_BEF, LD_AFT)

    for tag in LD_set:
        print(tag)
        zs_bef = pd.read_csv("/mnt/atlas/PCMA/1._DATA/Imputed_data/" + tag, index_col=0, sep="\t")
        zs_aft = pd.read_csv("/mnt/atlas/PCMA/JASS_pipeline_output/imputed_GWAS/" + tag, index_col=0, sep="\t")
        shared_SNP = zs_bef.index.intersection(zs_aft.index)
        shared_imputed_SNP = zs_bef.loc[zs_bef.Var > -1.0].index.intersection(zs_aft.index)
        shared_original_SNP = zs_bef.loc[zs_bef.Var < -0.5].index.intersection(zs_aft.index)
        print("Shared SNP prop: {0}".format(len(shared_SNP) / len(zs_bef.index)))
        print("Correlation Z_score:")
        print(zs_aft.loc[shared_SNP, "Z"].corr(zs_bef.loc[shared_SNP, "z_score"]))
        print("Correlation imputed Z_score:")
        print(zs_aft.loc[shared_imputed_SNP, "Z"].corr(zs_bef.loc[shared_imputed_SNP, "z_score"]))
        print("Correlation original Z_score:")
        print(zs_aft.loc[shared_original_SNP, "Z"].corr(zs_bef.loc[shared_original_SNP, "z_score"]))
import pandas as pd
import sys

# Retrieve the sample size of a GWAS file from the study metadata table
f = sys.argv[1]
meta_data_loc = sys.argv[2]
d = f.split('.')[0]

D = pd.read_csv(meta_data_loc, sep="\t")
D.index = "z_" + D.Consortium + "_" + D.Outcome
Nsample = D.loc[d, "Nsample"]
print(Nsample)
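# For context, a minimal sketch (not part of the pipeline) of the metadata layout
# the script above assumes: a tab-separated table with at least Consortium, Outcome
# and Nsample columns. The trait names and sample sizes below are purely illustrative.
example_meta = pd.DataFrame({
    "Consortium": ["GIANT", "MAGIC"],
    "Outcome": ["HEIGHT", "FG"],
    "Nsample": [250000, 58000],
})
# Rows are indexed as "z_" + Consortium + "_" + Outcome, so a summary statistics
# file named z_GIANT_HEIGHT.txt maps to the row z_GIANT_HEIGHT.
example_meta.index = "z_" + example_meta.Consortium + "_" + example_meta.Outcome
print(example_meta.loc["z_GIANT_HEIGHT", "Nsample"])  # illustrative value: 250000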
import glob
from itertools import combinations
import os

cwd = os.getcwd()
print(cwd)

# List the GWAS files in the working directory and build all trait pairs
L = glob.glob('{}/*.gz'.format(cwd))
L_s = [j.split('/')[-1] for j in L]
L_combi = [",".join(map(str, comb)) for comb in combinations(L_s, 2)]
print(L_s)

# Write the pairs in chunks of 100 to pairs_chunk_<i>.txt
size_chunk = 100
N_files = len(L_combi) // size_chunk

for i in range(N_files + 1):
    start = i * size_chunk
    end = (i + 1) * size_chunk
    if i < N_files:
        print(";".join(L_combi[start:end]), file=open("pairs_chunk_{0}.txt".format(i), "w"))
    else:
        print(";".join(L_combi[start:]), file=open("pairs_chunk_{0}.txt".format(i), "w"))
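# A small worked example (hypothetical file names, chunk size of 2 instead of 100)
# of how the chunking above splits the list of trait pairs:
demo_files = ["z_A.gz", "z_B.gz", "z_C.gz"]
demo_pairs = [",".join(comb) for comb in combinations(demo_files, 2)]
print(demo_pairs)  # ['z_A.gz,z_B.gz', 'z_A.gz,z_C.gz', 'z_B.gz,z_C.gz']
# With size_chunk = 2, pairs_chunk_0.txt would contain "z_A.gz,z_B.gz;z_A.gz,z_C.gz"
# and pairs_chunk_1.txt would contain "z_B.gz,z_C.gz".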
dag {
    enabled = true
    file = 'dag.dot'
}

report {
    enabled = true
    file = 'nextflow_logs/report.html'
}

trace {
    enabled = true
    file = 'nextflow_logs/trace.txt'
}

singularity {
    enabled = true
    autoMounts = true
    runOptions = '--home $HOME:/home/$USER'
}

process {
    executor = 'slurm'
    maxErrors = 10
    maxRetries = 3
    maxForks = 400
    queueSize = 500
    errorStrategy = 'retry'
    cache = 'deep'

    withName: 'Compute_MAF' {
        container = 'plink_1.90b5--heea4ae3_0.sif'
        cpus = 1
    }
    // docker://quay.io/biocontainers/jass_preprocessing:1.0--py_0
    withName: 'create_WG_reference_panel' {
        container = 'jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'meta_data_GWAS' {
        cpus = 1
    }
    withName: 'Clean_GWAS' {
        //container = 'jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'Impute_GWAS' {
        container = 'raiss_2.0--py_0.sif'
        cpus = 1
    }
    withName: 'Munge_LDSC_data' {
        container = 'ldsc_1.0.1--py_0.sif'
        cpus = 1
    }
    withName: 'Generate_trait_pair' {
        container = 'jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'Correlation_LDSC_data' {
        container = 'ldsc_1.0.1--py_0.sif'
        cpus = 1
    }
    withName: 'Correlation_matrices' {
        container = 'jass_preprocessing_1.0--py_0.sif'
        cpus = 1
    }
    withName: 'Create_inittable_LDSC' {
        container = 'jass_2.0--pyh5ca1d4c_0.sif'
        cpus = 1
    }
    withName: 'Create_inittable' {
        container = 'jass_2.0--pyh5ca1d4c_0.sif'
        cpus = 1
    }
    withName: 'get_pheno_group' {
        cpus = 1
    }
    withName: 'Create_project_data' {
        container = 'jass_2.0--pyh5ca1d4c_0.sif'
        cpus = 1
    }
}
import re
import pandas as pd
import glob
import numpy as np

print("Parsing_correlation")
file_input = glob.glob("*.log")
print(file_input)


def get_trait(fi):
    # Extract the two trait names from a log file named <trait1>-_-<trait2>.log
    traits = re.search('([-0-9A-Z]+_[-0-9A-Z-]+)-_-([-0-9A-Z]+_[-0-9A-Z-]+).log', fi)
    return [traits.group(1), traits.group(2)]


traits = [get_trait(fi) for fi in file_input]
traits = list(set([trait for trait_pair in traits for trait in trait_pair]))  # flatten and deduplicate
print(traits)
traits_ind = 'z_' + pd.Series(traits)

### Create matrices:
Covariance_matrix_H0 = pd.DataFrame(index=traits_ind, columns=traits_ind)
Covariance_matrix_genetic = pd.DataFrame(index=traits_ind, columns=traits_ind)
Correlation_matrix_genetic = pd.DataFrame(index=traits_ind, columns=traits_ind)

for i1, t1 in enumerate(traits):
    for t2 in traits[(i1 + 1):]:
        print(t1, t2)
        f = 0
        flip = False
        try:
            cov_fi = "{0}-_-{1}.log".format(t1, t2)
            fi = open(cov_fi, "r")
            f = 1
        except FileNotFoundError:
            try:
                cov_fi = "{0}-_-{1}.log".format(t2, t1)
                fi = open(cov_fi, "r")
                f = 1
                flip = True
            except FileNotFoundError:
                print("Not found")
                print(t1, t2)
                f = 0

        if f == 1:
            print("PARSING")
            print(cov_fi)
            L = fi.readlines()
            # retrieve intercepts, genetic covariance, genetic correlation and heritabilities
            L_intercept = list(filter(lambda x: re.match("Intercept:", x) != None, L))
            L_gencov = list(filter(lambda x: re.match('Total Observed scale gencov', x) != None, L))
            L_gencor = list(filter(lambda x: re.match('Genetic Correlation', x) != None, L))
            L_h2 = list(filter(lambda x: re.match('Total Observed scale h2', x) != None, L))
            t1_col = "z_" + t1
            t2_col = "z_" + t2
            if len(L_intercept) == 3:
                Covariance_matrix_H0.loc[t1_col, t2_col] = float(L_intercept[2].split(" ")[1])
                Covariance_matrix_H0.loc[t2_col, t1_col] = float(L_intercept[2].split(" ")[1])
                Covariance_matrix_genetic.loc[t1_col, t2_col] = float(L_gencov[0].split(":")[1].split(" ")[1])
                Covariance_matrix_genetic.loc[t2_col, t1_col] = float(L_gencov[0].split(":")[1].split(" ")[1])
                Correlation_matrix_genetic.loc[t1_col, t1_col] = 1
                Correlation_matrix_genetic.loc[t2_col, t2_col] = 1
                print(L_gencor)
                Correlation_matrix_genetic.loc[t1_col, t2_col] = float(L_gencor[1].split(":")[1].split(" ")[1])
                Correlation_matrix_genetic.loc[t2_col, t1_col] = float(L_gencor[1].split(":")[1].split(" ")[1])
                if flip:
                    Covariance_matrix_H0.loc[t1_col, t1_col] = float(L_intercept[1].split(" ")[1])
                    Covariance_matrix_H0.loc[t2_col, t2_col] = float(L_intercept[0].split(" ")[1])
                    Covariance_matrix_genetic.loc[t1_col, t1_col] = float(L_h2[1].split(":")[1].split(" ")[1])
                    Covariance_matrix_genetic.loc[t2_col, t2_col] = float(L_h2[0].split(":")[1].split(" ")[1])
                else:
                    Covariance_matrix_H0.loc[t1_col, t1_col] = float(L_intercept[0].split(" ")[1])
                    Covariance_matrix_H0.loc[t2_col, t2_col] = float(L_intercept[1].split(" ")[1])
                    Covariance_matrix_genetic.loc[t1_col, t1_col] = float(L_h2[0].split(":")[1].split(" ")[1])
                    Covariance_matrix_genetic.loc[t2_col, t2_col] = float(L_h2[1].split(":")[1].split(" ")[1])

Covariance_matrix_genetic.to_csv("Covariance_matrix_genetic.csv", sep="\t")
Covariance_matrix_H0.to_csv("Covariance_matrix_H0.csv", sep="\t")
Correlation_matrix_genetic.to_csv("Correlation_matrix_genetic.csv", sep="\t")
print("Parsing_correlation")
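# Rough sketch of the LDSC --rg log lines the parser above relies on (values are
# illustrative, layout as in ldsc 1.0.1 output): three "Intercept:" lines are
# expected (h2 of each trait, then the genetic covariance section), and the
# genetic correlation value sits on the second line matching "Genetic Correlation"
# (the first match is the section header), which is why the code takes L_gencor[1].
#
#   Total Observed scale h2: 0.1021 (0.0061)      <- heritability of trait 1
#   Intercept: 1.0132 (0.0074)
#   Total Observed scale h2: 0.0874 (0.0055)      <- heritability of trait 2
#   Intercept: 1.0087 (0.0069)
#   Total Observed scale gencov: 0.0312 (0.0048)  <- genetic covariance section
#   Intercept: 0.0154 (0.0041)
#   Genetic Correlation: 0.5231 (0.0712)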
#!/bin/bash -l
##############################
# Job blueprint #
##############################
# Give your job a name, so you can recognize it in the queue overview
#SBATCH --job-name=nextflow_jass
#SBATCH -o /pasteur/projets/policy01/PCMA/jass_analysis_pipeline/sbatch.log
#SBATCH -e /pasteur/projets/policy01/PCMA/jass_analysis_pipeline/error.log
# Define, how many nodes you need. Here, we ask for 1 node.
# Each node has 16 or 20 CPU cores.
#SBATCH --nodes=1
# You can further define the number of tasks with --ntasks-per-*
# See "man sbatch" for details. e.g. --ntasks=4 will ask for 4 cpus.
# Define, how long the job will run in real time. This is a hard cap meaning
# that if the job runs longer than what is written here, it will be
# force-stopped by the server. If you make the expected time too long, it will
# take longer for the job to start. Here, we ask for at most 23 hours.
# d-hh:mm:ss
#SBATCH --time=0-23:00:00
# How much memory you need.
# --mem will define memory per node and
# --mem-per-cpu will define memory per CPU/core. Choose one of those.
##SBATCH --mem-per-cpu=1500MB # not in effect, due to the double hash
#SBATCH --mem=5GB
module load java/1.8.0
module load nextflow/19.07.0
module load singularity/3.4.0
cd /pasteur/projets/policy01/PCMA/jass_analysis_pipeline
source /pasteur/homes/hjulienn/jass_suite/jass_suite/bin/activate
nextflow run jass_pipeline.nf -with-report jass_report.html -with-timeline jass_timeline.html -with-dag dag.png
# Finish the script
deactivate
exit 0