diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index f3450bfca21766036ff88bcc8ce650619568bc5b..2081c6810096fc43d93e5f79b255105650a843c5 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -83,7 +83,7 @@ def add_preprocessing_argument(): parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)") parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)") - parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7) + parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.75) parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default='0.01') parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. 
default is False", default='False') diff --git a/jass_preprocessing/compute_score.py b/jass_preprocessing/compute_score.py index e10d6cec28697aca6f4c11caf1ba9788d8171afd..0e6ffc46e21969bcc3aff7101b0b454c037b417b 100644 --- a/jass_preprocessing/compute_score.py +++ b/jass_preprocessing/compute_score.py @@ -27,8 +27,11 @@ def compute_z_score(mgwas): return mgwas -def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio = 0.1): - +def compute_sample_size(mgwas, diagnostic_folder, trait, min_sample_size_ratio = 0.95): + """ + Infer sample size when not present and filter sample size to reach homogeneous sample + size + """ if 'n' in mgwas.columns: myN = mgwas.n #--- freq, case-cont N exist @@ -58,13 +61,15 @@ def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio = myW_thres_low = np.percentile(myN.dropna(), perc_low) myW_thres_max = np.percentile(myN.dropna(), perc_max) - while (1 - (myW_thres_low /myW_thres_max)) > max_sample_size_ratio: # narrow treshold until sample size can be considered as homogeneous - perc_low += 10 - perc_max -= 10 + while ((myW_thres_low / myW_thres_max) < min_sample_size_ratio): # narrow threshold until sample size can be considered as homogeneous + perc_low += 2 + perc_max -= 0.5 myW_thres_low = np.percentile(myN.dropna(), perc_low) myW_thres_max = np.percentile(myN.dropna(), perc_max) + + print("Finished filtering") mgwas["computed_N"] = myN plt.clf() p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()]) diff --git a/jass_preprocessing/save_output.py b/jass_preprocessing/save_output.py index de6817896a5f939ce586d4a50d881b8547841fd6..39f41e107c8817418dd86e7c95ba9a51c93b6c39 100644 --- a/jass_preprocessing/save_output.py +++ b/jass_preprocessing/save_output.py @@ -19,8 +19,9 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): 'A0': mgwas_copy.loc[chrom].ref, 'A1':mgwas_copy.loc[chrom].alt, 'Z': mgwas_copy.loc[chrom].computed_z, - 'P': mgwas_copy.loc[chrom].pval - }, 
columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ]) + 'P': mgwas_copy.loc[chrom].pval, + 'N_effective': mgwas_copy.loc[chrom].computed_N + }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P", "N_effective"]) impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt" print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder)) @@ -43,8 +44,9 @@ def save_output(mgwas, ImpG_output_Folder, my_study): 'A0': mgwas_copy.ref, 'A1':mgwas_copy.alt, 'Z': mgwas_copy.computed_z, - 'P': mgwas_copy.pval - }, columns= ['chrom','rsID', 'pos', 'A0', "A1", "Z", "P" ]) + 'P': mgwas_copy.pval, + 'N_effective': mgwas_copy.computed_N + }, columns= ['chrom','rsID', 'pos', 'A0', "A1", "Z", "P", 'N_effective' ]) impg_output_file = ImpG_output_Folder + 'z_'+ my_study +".txt" print("WRITING results for {} to: {}".format( my_study, ImpG_output_Folder))