diff --git a/Figures_manuscript/scripts/Fig_SUP_gain_vs_Ntrait.R b/Figures_manuscript/scripts/Fig_SUP_gain_vs_Ntrait.R index 3f643ce9f220d5577de2212ed41ffb5acbcc0638..5380f02a9af140add97baf7a5d25761c5e398cea 100644 --- a/Figures_manuscript/scripts/Fig_SUP_gain_vs_Ntrait.R +++ b/Figures_manuscript/scripts/Fig_SUP_gain_vs_Ntrait.R @@ -39,8 +39,6 @@ for(key in set_features$Id){ } - - summary(set_features$percent_increase) summary(set_features[, Joint]) summary(set_features[, Both+Univariate]) @@ -73,8 +71,10 @@ long_set$variable = feats_name2[as.character(long_set$variable)] unique(long_set$variable) long_set[,`#trait`:= k] +long_set = long_set[variable != "# of univariate loci",] -png("../outputs/Figure_SUP_gain_Ntraits.png", width=8.5, height=8.5, unit="in", res=300) -p = ggplot(long_set, aes(x=`#trait`, y=value, color=overlap)) + geom_point(alpha=0.65) + facet_wrap(.~variable, scales = "free")+ theme_minimal() +ylab('') +png("../outputs/Figure_SUP_gain_Ntraits.png", width=12, height=4.5, unit="in", res=300) +p = ggplot(long_set, aes(x=`#trait`, y=value, color=overlap)) + geom_point(alpha=0.65) + facet_wrap(.~variable, scales = "free", + strip.position = "left")+ theme_minimal() + ylab(NULL) print(p) dev.off() \ No newline at end of file diff --git a/Figures_manuscript/scripts/Fig_SUP_simulation_H0.R b/Figures_manuscript/scripts/Fig_SUP_simulation_H0.R new file mode 100644 index 0000000000000000000000000000000000000000..63e6a71202c4d9218fb1c4b65f495435774a8d9c --- /dev/null +++ b/Figures_manuscript/scripts/Fig_SUP_simulation_H0.R @@ -0,0 +1,141 @@ + + +library(MASS) +library(ggplot2) +library(data.table) + + + +M = 1000 +N_samp = 50000# c(seq(2000, 10000, by=1000), 50000, 10000) +Ns_frac = c(1) # Sample overlap +herit = c(0.4, 0.33, 0.3, 0.35, 0.31) +pgm= 0.4 * min(herit) +pgh = 0.25 * min(herit) + +Beta = matrix(0,nrow=M,ncol=2) +epsgh = matrix(pgh, ncol=5, nrow=5)/ M +diag(epsgh) = herit /M + + +#Genetic effects +BG_mat = list(# + "null_k5" = data.frame(matrix(0,nrow=M,ncol=5)) + ) + + +N= 2000 +overlap= 1 +sign_threshold = 0.05 /M + +C_env = seq(0.0, 0.5, len=3) +ce=-0.2 + +Zscore_list = list() + +for(hypothesis in names(BG_mat)){ + print(hypothesis) + if(hypothesis=="null_k5"){ + herit = rep(0, 5) + }else{ + herit = c(0.4, 0.33, 0.3, 0.35, 0.31) + } + + for(overlap in Ns_frac){ + simu = 1 + D_pval = data.frame(matrix(NA, nrow=length(N_samp)* length(C_env), ncol=4)) + names(D_pval) = c("N_samp", "env_cor","Omnibus", "MTAG") + + for(ce in C_env){ + + pe = ce * min(1-herit) + eps = matrix(pe*runif(25,0.75, 1.25), ncol=5, nrow=5) + eps = (eps + t(eps))/2 + diag(eps) = 1-herit + print(eps) + print(rep(1,5) %*% eps %*% as.matrix(rep(1,5))) + Beta = BG_mat[[hypothesis]] + + # Genetic effects for the two trait + print(ce) + print(simu) + for(N in N_samp){ + X1 = matrix(0, nrow=N, ncol=M) + X2 = matrix(0, nrow=N, ncol=M) + + D_pval[simu,"N_samp"] = N + D_pval[simu,"env_cor"] = ce + # Draw SNPs : + MAF = runif(M,0.05, 0.95) + i= 1 + + for(p in MAF){ + X1[,i] = rbinom(N, 2, p) + X2[,i] = rbinom(N, 2, p) + i = i +1 + } + + NS = dim(X1)[1] * overlap + + X2[1:NS,] = X1[1:NS,] + X1 = scale(X1) + X2 = scale(X2) + env1 = mvrnorm(N, rep(0,5), eps) + env2 = mvrnorm(N, rep(0,5), eps) + env2[1:NS, ] = env1[1:NS,] + + Y1 = (as.matrix(X1) %*% as.matrix(Beta) + env1) + Y2 = (as.matrix(X2) %*% as.matrix(Beta) + env2) + + Y = cbind(Y1[,1:2], Y2[,3:5]) + + rho_ov= cor(Y) # equal to (pe+pg) * Sample_overlap + + Beta_estimate1 = t((t(Y[,1]) %*% X1) %*% solve(t(X1) %*% X1)) + Beta_estimate2 = t((t(Y[,2]) %*% X1) %*% solve(t(X1) %*% X1)) + Beta_estimate3 = t((t(Y[,3]) %*% X2) %*% solve(t(X2) %*% X2)) + Beta_estimate4 = t((t(Y[,4]) %*% X2) %*% solve(t(X2) %*% X2)) + Beta_estimate5 = t((t(Y[,5]) %*% X2) %*% solve(t(X2) %*% X2)) + + Beta_est = data.frame(B1 = Beta_estimate1, B2=Beta_estimate2, B3 = Beta_estimate3, B4=Beta_estimate4,B5 = Beta_estimate5) + + Z = Beta_est* sqrt(N) + Z = as.data.frame(Z) + + names(Z) = c("z1", "z2", "z3", "z4","z5") + sigr = rho_ov + diag(sigr) = 1 + + # Omnibus + Omni_stat= diag(as.matrix(Z) %*% solve(sigr) %*% as.matrix(t(Z))) + pval = pchisq(Omni_stat, df=dim(Z)[2], lower.tail=FALSE) + + D_pval[simu, "Omnibus"]= mean(pval < sign_threshold) + Z["pval_omni"] = pval + simu_name = paste0(hypothesis, "_ce_", ce, "_ov_", overlap) + Zscore_list[[simu_name]] = Z + + write.table(Z, paste0("./Figures_manuscript/outputs/Z_score_H0_",simu_name,".tsv"), sep="\t", quote=FALSE,row.names = FALSE) + write.table(sigr, paste0("./Figures_manuscript/outputs/COV_H0_",simu_name,".tsv"), sep='\t',quote=FALSE,row.names = FALSE) + + simu = simu+1 + + } + } + } +} + +for(hypo in names(Zscore_list)){ + + Zscore_list[[hypo]]["hypothesis"] = hypo +} + +dim(Zscore_list[[1]]) +ZH = rbindlist(Zscore_list) + + +ZH[1:4,] + +png(paste("./Figures_manuscript/outputs/Fig_SUP_H0_distribution.png")) +ggplot(ZH, aes(x=pval_omni)) + geom_histogram() + facet_wrap(.~hypothesis)+ theme_minimal() +dev.off() \ No newline at end of file diff --git a/Figures_manuscript/scripts/Figure_6_compare_trait_selection_method_without_duplicates.py b/Figures_manuscript/scripts/Figure_6_compare_trait_selection_method_without_duplicates.py index 440b8e1a4e86c133e3ac35b3f6d7c40b33cf327d..e61fe652c7fd58d2236df2cf6bc3cacc80510759 100644 --- a/Figures_manuscript/scripts/Figure_6_compare_trait_selection_method_without_duplicates.py +++ b/Figures_manuscript/scripts/Figure_6_compare_trait_selection_method_without_duplicates.py @@ -13,8 +13,10 @@ df_train = pd.read_csv('../inputs/clinical_grouping_analysis_2023-09-06/category df_val = pd.read_csv('../inputs/clinical_grouping_analysis_2023-09-06/category_traitset_with_mean_test_jass.tsv',delimiter='\t') output = '../outputs/Figure_6_compare_trait_selection_method_jass_without_duplicates.pdf' +np.random.seed(484) similar_train = df_train.loc[df_train.n_group==1] + dif_train = df_train.loc[(df_train.n_group>1)&(df_train.n_group<=4)] verydif_train = df_train.loc[df_train.n_group>4] random_baseline_train = df_train.iloc[np.random.choice(range(df_train.shape[0]), 100)] @@ -24,12 +26,12 @@ dif_val = df_val.loc[(df_val.n_group>1)&(df_val.n_group<=4)] verydif_val = df_val.loc[df_val.n_group>4] random_baseline_val = df_val.iloc[np.random.choice(range(df_val.shape[0]), 100)] -datadriven = df_val.loc[df_val.rank_datadriven < df_val.rank_datadriven.nsmallest(101).iloc[-1]] +datadriven = df_val.loc[df_val.rank_datadriven < df_val.rank_datadriven.nsmallest(101).iloc[-1]] datadriven.reset_index(inplace=True,drop=True) + print(len(datadriven)) - def ttest(pair, metric): return ttest_ind(dict_data_train[pair[0]][metric].values,dict_data_val[pair[1]][metric].values,axis=0,equal_var=False,alternative='two-sided') diff --git a/Regression_analysis/scripts/5f-cross_validation_linear.py b/Regression_analysis/scripts/5f-cross_validation_linear.py index 90886a28327bb4fc91742f2a9bd50557e2b4ed9e..c0308e9d5dab7919da23e6e57506ad422f45cea0 100644 --- a/Regression_analysis/scripts/5f-cross_validation_linear.py +++ b/Regression_analysis/scripts/5f-cross_validation_linear.py @@ -41,6 +41,7 @@ corR2_val_adj = [] cor_p_train = [] cor_p_val = [] df_jass_all = pd.DataFrame() + for i in [2,3,4,5,6]: # read the saved files df_jass_training = pd.read_csv('../inputs/traitset_jass_CVtraining{}-newSUMMARY_remove-nan.tsv'.format(i),delimiter='\t') @@ -72,12 +73,21 @@ for i in [2,3,4,5,6]: df_jass_test['train'] = False df_jass_all = pd.concat([df_jass_all,df_jass_training],axis=0) df_jass_all = pd.concat([df_jass_all,df_jass_test],axis=0) + + # Apply Scaling on the whole data at once. X_all = df_jass_all[feature_candidate] +X_ranges = pd.DataFrame({"minimum_value": X_all.min(), + "maximum_value": X_all.max()}) + +X_ranges.to_csv("../outputs/range_features.tsv", sep="\t") + X_scaled_all = preprocessing.MinMaxScaler().fit(X_all).transform(X_all.astype(float)) #preprocessing.StandardScaler().fit(X_train).transform(X_train.astype(float)) y_all = df_jass_all[[target]] y_scaled_all = preprocessing.MinMaxScaler().fit(y_all).transform(y_all.astype(float)).reshape([1,-1])[0] + + for i in [2,3,4,5,6]: # linear model (multivariate) trained with the training set y_train_scaled = y_scaled_all[((df_jass_all.CV==i)&(df_jass_all.train==True)).values.tolist()] @@ -134,6 +144,7 @@ for i in [2,3,4,5,6]: pvalues_univ.append(ps_univ) SEs_univ.append(stes_univ) + pvalues = np.array(pvalues) SEs = np.array(SEs) coefficients = np.array(coefficients)