Commit f083000c authored by Hanna JULIENNE

Factorize code further for simulating Z-scores for more cohort structures

parent b9936427
@@ -37,14 +37,24 @@ Project Organization
│ │
│   ├── data <- Scripts to download or generate data
│   │   └── make_dataset.py
│   │   ├── 1. generate genotype with a specified LD structure
│   │   ├── 2. generate_Zscores.py
│   │   │      Generate estimated Z-scores by simulating a continuous phenotype from the genotype plus
│   │   │      normal noise, then estimating beta and the Z-score with a linear regression. Cohort sizes
│   │   │      are generated based on the hgcovid cohort sizes (a minimal sketch of this workflow follows the tree below).
│   │   └── 3. generate_Zscores_small_cohort.py
│   │          Variant of the previous script that simulates Z-scores using only small cohorts. Hypothesis to
│   │          test: can we recover a proper signal in the meta-analysis using very noisy input?
│ │
│   ├── features <- Scripts to turn raw data into features for modeling
│   │   └── build_features.py
│ │
│   ├── models <- Scripts to train models and then use trained models to make
│ │ │ predictions
│   │   ├── predict_model.py
│   │   └── train_model.py
│   ├── models
│   │   │
│   │   ├── Impute_simulated_signal.py
│   │   │      Impute Z-scores using simulated data: mask 50 out of 200 SNPs, re-impute them, then save the results.
│   │   └──
│ │
│   └── visualization <- Scripts to create exploratory and results oriented visualizations
│   └── visualize.py
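A minimal, hypothetical sketch of the simulation workflow described for generate_Zscores.py above (illustrative only; names, shapes, and values are placeholders, not the repository's actual code): simulate a continuous phenotype as a linear combination of standardized genotypes plus Gaussian noise, then compute one marginal Z-score per SNP from a simple linear regression.

```python
# Hypothetical sketch of the simulation described above, under assumed shapes and parameters.
import numpy as np

rng = np.random.default_rng(0)

n_ind, n_snp = 1000, 200
X = rng.standard_normal((n_ind, n_snp))      # standardized genotypes (placeholder for the LD-structured simulation)
beta = np.zeros(n_snp)
beta[n_snp // 2] = 0.05                      # one causal SNP, as in the "one_causal" scenario
Y = X @ beta + rng.standard_normal(n_ind)    # continuous phenotype = genetic signal + unit-variance noise

# Marginal Z-score of each SNP from a simple linear regression of Y on that SNP
Z = np.array([x @ Y / np.linalg.norm(x) for x in X.T])
beta_hat = Z / np.sqrt(n_ind)                # corresponding per-SNP effect-size estimates
```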
@@ -26,6 +26,7 @@ def compute_Y(beta, X, sig=1.0):
Y = X.dot(beta) + noise
return Y
def compute_Zscore(Y, X, sig):
nind = X.shape[0]
@@ -33,22 +34,27 @@ def compute_Zscore(Y, X, sig):
for i in range(X.shape[1]):
x = X.iloc[:,i]
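# Marginal Z-score of SNP i: for the simple regression of Y on x, beta_hat = x.Y/||x||^2 and se = sig/||x||, so Z = x.Y/(||x||*sig)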
Zscores[i] = x.dot(Y) / (np.linalg.norm(x,2)*sig)
#X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
#Zscores = Zscores*nind**0.5
#Zscores = (Zscores - Zscores.mean()) / np.std(Zscores)
return Zscores
def generate_one_causal(amplitude, LD_cor, X, sig=1.0):
def generate_null_signal(amp_dummy, LD_cor, X, sig=1.0, sig_genet =0.00001):
nsnp = LD_cor.shape[0]
beta = np.random.normal(size=nsnp,scale=sig_genet)
Y = compute_Y(beta, X, sig)
Zscore = compute_Zscore(Y, X, sig)
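# R2_genet reported below is the fraction of phenotypic variance explained by the simulated genetic effects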
return({"causal":beta, "Zscore":Zscore,"R2_genet":np.var(X.dot(beta))/np.var(Y)})
def generate_one_causal(amplitude, LD_cor, X, sig=1.0, sig_genet =0.00001):
nsnp = LD_cor.shape[0]
beta = np.random.normal(size=nsnp,scale=0.005)
beta = np.random.normal(size=nsnp,scale=sig_genet)
beta[nsnp//2] = amplitude
Y = compute_Y(beta, X, sig)
Zscore = compute_Zscore(Y, X, sig)
return({"causal":beta, "Zscore":Zscore,"R2_genet":np.var(X.dot(beta))/np.var(Y)})
def generate_two_causal(amplitude, LD_cor, X, sig=1.0):
def generate_two_causal(amplitude, LD_cor, X, sig=1.0, sig_genet =0.00001):
nsnp = LD_cor.shape[0]
beta = np.random.normal(size=nsnp,scale=0.005)
beta = np.random.normal(size=nsnp,scale=sig_genet)
beta[nsnp//4] = amplitude
beta[3*(nsnp//4)] = amplitude
Y = compute_Y(beta, X, sig)
@@ -56,9 +62,9 @@ def generate_two_causal(amplitude, LD_cor, X, sig=1.0):
Zscore = compute_Zscore(Y, X, sig)
return({"causal":beta, "Zscore":Zscore,"R2_genet":np.var(X.dot(beta))/np.var(Y)})
def generate_two_opposite(amplitude, LD_cor, X, sig=1.0):
def generate_two_opposite(amplitude, LD_cor, X, sig=1.0, sig_genet =0.00001):
nsnp = LD_cor.shape[0]
beta = np.random.normal(size=nsnp,scale=0.005)
beta = np.random.normal(size=nsnp,scale=sig_genet)
beta[nsnp//4] = amplitude
beta[3*(nsnp//4)] = -amplitude
Y = compute_Y(beta, X, sig)
@@ -66,6 +72,8 @@ def generate_two_opposite(amplitude, LD_cor, X, sig=1.0):
return({"causal":beta, "Zscore":Zscore,"R2_genet":np.var(X.dot(beta))/np.var(Y)})
def define_signal_generator(tag):
if tag=="null":
return generate_null_signal
if tag=="one_causal":
return generate_one_causal
if tag=="two_causal":
@@ -73,33 +81,31 @@ def define_signal_generator(tag):
if tag=="two_opposite":
return generate_two_opposite
if __name__ == '__main__':
# import sample size each study
Ssize = pd.read_csv("../../data/external/meta_data/Ncases_B2_vs_ALL.tsv", sep="\t")
Ssize = Ssize.loc[[re.search("EUR",i)!=None for i in Ssize.Name],]
Ssize["per"] = Ssize.n_cases / (Ssize.n_cases + Ssize.n_controls)
Ssize["N_effective"] = np.floor(Ssize.per * (1- Ssize.per) * (Ssize.n_cases + Ssize.n_controls))
Ssize.N_effective = Ssize.N_effective.astype(int, copy=False)
Ssize.to_csv("../../data/external/meta_data/N_effective.csv")
# get the type of simulation from command line
tag = "one_causal" # possibles values are two_opposite, two_causal, one_causal
for tag in ['two_opposite', 'two_causal', 'one_causal']:
def simulate_Z_scores(geno_file, LD_file, cohort_meta,Z_score_path, amp=0.05):
"""
Parameters:
geno_file (str): path to the genotype file
LD_file (str): path to the LD matrix file
cohort_meta (pd.DataFrame): description of the cohorts (the main point is
to simulate according to the specified sample sizes). Must contain the columns N_effective and study.
Z_score_path (str): path to the output folder plus the simulation prefix used to save the results
amp (float): amplitude of the causal signal
"""
# possible values are null, two_opposite, two_causal, one_causal
for tag in ["null",'two_opposite', 'two_causal', 'one_causal']:
print("Simulating values for {}".format(tag))
#get genotype
X = pd.read_csv("../../data/processed/Simulated/Genotypes/genotype2.csv", sep="\t")
LD_cor = pd.read_csv("../../data/processed/Simulated/Genotypes/LD_matrix2.csv", sep="\t")
X = pd.read_csv(geno_file, sep="\t")
LD_cor = pd.read_csv(LD_file, sep="\t")
LD_cor = LD_cor.values
nsnp = X.shape[1]
# Estimate LD from genotype:
scaler = StandardScaler()
# transform to centered scaled
X = pd.DataFrame(scaler.fit_transform(X))
#LD_cor = X.corr().values
n_snp = LD_cor.shape[0]
n_masked = 20
amp = 1
print('search of best rd for amplitude {}'.format(amp))
signal_generator = define_signal_generator(tag)
@@ -108,24 +114,39 @@ if __name__ == '__main__':
Zscore = signal["Zscore"]
beta = signal["causal"]
All_Zscore = pd.DataFrame({"Beta_meta-analysis" : Zscore / X.shape[0]**0.5, "Z_meta-analysis" : Zscore})
All_Zscore = pd.DataFrame({"Beta_all_SNPs" : Zscore / X.shape[0]**0.5, "Zscore_all_SNPs" : Zscore})
for i, neff in enumerate(Ssize.N_effective):
for i, neff in enumerate(cohort_meta.N_effective):
print(i, neff)
id_start = Ssize.N_effective[:i].sum()
id_end = Ssize.N_effective[:(i+1)].sum()
id_start = cohort_meta.N_effective[:i].sum()
id_end = cohort_meta.N_effective[:(i+1)].sum()
study = Ssize.iloc[i,0]
study = cohort_meta.study.iloc[i]
print("Number of sample for {}".format(study))
print(Ssize.N_effective.iloc[i])
print(cohort_meta.N_effective.iloc[i])
X_study = X.iloc[id_start:id_end,]
signal = signal_generator(amp, LD_cor, X_study, sig=1)
Zscore = signal["Zscore"]
print("R2_genet {}".format(signal["R2_genet"]))
All_Zscore["Z_{0}".format(study)] = Zscore
All_Zscore["Beta_{0}".format(study)] = Zscore / X_study.shape[0]**0.5
All_Zscore.to_csv("../../data/processed/Simulated/Zscores/Zscore_{0}.csv".format(tag))
All_Zscore.to_csv("{0}_{1}.csv".format(Z_score_path, tag))
if __name__ == '__main__':
# import sample size each study
Ssize = pd.read_csv("../../data/external/meta_data/Ncases_B2_vs_ALL.tsv", sep="\t")
Ssize = Ssize.loc[[re.search("EUR",i)!=None for i in Ssize.Name],]
Ssize["per"] = Ssize.n_cases / (Ssize.n_cases + Ssize.n_controls)
Ssize["study"] = Ssize.Name
Ssize["N_effective"] = np.floor(Ssize.per * (1- Ssize.per) * (Ssize.n_cases + Ssize.n_controls))
Ssize.N_effective = Ssize.N_effective.astype(int, copy=False)
Ssize.to_csv("../../data/external/meta_data/N_effective.csv")
print(Ssize.head())
simulate_Z_scores("../../data/processed/Simulated/Genotypes/genotype2.csv", "../../data/processed/Simulated/Genotypes/LD_matrix2.csv", Ssize,"../../data/processed/Simulated/Zscores/Zscore", amp=0.05)