Skip to content
Snippets Groups Projects
Commit 5f9ea33f authored by Hanna JULIENNE
Browse files

adjust sample filtering parameter

parent 43dba7f5
No related branches found
No related tags found
No related merge requests found
......@@ -83,7 +83,7 @@ def add_preprocessing_argument():
parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.75)
parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default='0.01')
parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default='False')
......
......@@ -27,8 +27,11 @@ def compute_z_score(mgwas):
return mgwas
def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio = 0.1):
def compute_sample_size(mgwas, diagnostic_folder, trait, min_sample_size_ratio = 0.95):
"""
Infer sample size when not present and filter sample size to reach homogeneous sample
size
"""
if 'n' in mgwas.columns:
myN = mgwas.n
#--- freq, case-cont N exist
......@@ -58,13 +61,15 @@ def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio =
myW_thres_low = np.percentile(myN.dropna(), perc_low)
myW_thres_max = np.percentile(myN.dropna(), perc_max)
while (1 - (myW_thres_low /myW_thres_max)) > max_sample_size_ratio: # narrow treshold until sample size can be considered as homogeneous
perc_low += 10
perc_max -= 10
while ((myW_thres_low / myW_thres_max) < min_sample_size_ratio): # narrow treshold until sample size can be considered as homogeneous
perc_low += 2
perc_max -= 0.5
myW_thres_low = np.percentile(myN.dropna(), perc_low)
myW_thres_max = np.percentile(myN.dropna(), perc_max)
print("Finished filtering")
mgwas["computed_N"] = myN
plt.clf()
p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
......
......@@ -19,8 +19,9 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
'A0': mgwas_copy.loc[chrom].ref,
'A1':mgwas_copy.loc[chrom].alt,
'Z': mgwas_copy.loc[chrom].computed_z,
'P': mgwas_copy.loc[chrom].pval
}, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])
'P': mgwas_copy.loc[chrom].pval,
'N_effective': mgwas_copy.loc[chrom].computed_N
}, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P", "N_effective"])
impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))
......@@ -43,8 +44,9 @@ def save_output(mgwas, ImpG_output_Folder, my_study):
'A0': mgwas_copy.ref,
'A1':mgwas_copy.alt,
'Z': mgwas_copy.computed_z,
'P': mgwas_copy.pval
}, columns= ['chrom','rsID', 'pos', 'A0', "A1", "Z", "P" ])
'P': mgwas_copy.pval,
'N_effective': mgwas_copy.computed_N
}, columns= ['chrom','rsID', 'pos', 'A0', "A1", "Z", "P", 'N_effective' ])
impg_output_file = ImpG_output_Folder + 'z_'+ my_study +".txt"
print("WRITING results for {} to: {}".format( my_study, ImpG_output_Folder))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment