Skip to content
Snippets Groups Projects
Commit 43dba7f5 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

modified filtering of sample size

parent 46ff85c7
Branches
No related tags found
No related merge requests found
...@@ -45,12 +45,11 @@ def launch_preprocessing(args): ...@@ -45,12 +45,11 @@ def launch_preprocessing(args):
print('processing GWAS: {}'.format(tag)) print('processing GWAS: {}'.format(tag))
start = time.time() start = time.time()
print(args.input_folder)
GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2] GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
print(GWAS_link)
mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag]) mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
print(mapgw)
print(args.imputation_quality_treshold)
if args.imputation_quality_treshold is not 'None': if args.imputation_quality_treshold is not 'None':
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold)) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold))
...@@ -58,19 +57,15 @@ def launch_preprocessing(args): ...@@ -58,19 +57,15 @@ def launch_preprocessing(args):
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
print(ref.shape)
print(ref.head())
print(gw_df.shape)
print(gw_df.head())
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"]) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"])
print(mgwas.shape)
mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas)
mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size) mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size)
end = time.time() end = time.time()
print("Preprocessing of {0} in {1}s".format(tag, end-start)) print("Preprocessing of {0} in {1}s".format(tag, end-start))
print(mgwas.head())
jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag) jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
if(args.output_folder_1_file): if(args.output_folder_1_file):
......
...@@ -27,7 +27,7 @@ def compute_z_score(mgwas): ...@@ -27,7 +27,7 @@ def compute_z_score(mgwas):
return mgwas return mgwas
def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7): def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio = 0.1):
if 'n' in mgwas.columns: if 'n' in mgwas.columns:
myN = mgwas.n myN = mgwas.n
...@@ -45,22 +45,38 @@ def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7): ...@@ -45,22 +45,38 @@ def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7):
else: else:
raise ValueError( raise ValueError(
'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied') 'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied')
# replace infinite value by the max finite one # replace infinite value by the max finite one
if(np.isinf(myN).any()): if(np.isinf(myN).any()):
myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)]) myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
warnings.warn("Some snp had an infinite sample size") warnings.warn("Some snp had an infinite sample size")
myW_thres = np.percentile(myN.dropna(), 90)
ss_thres = float(perSS) * myW_thres perc_low = 0
perc_max = 100
myW_thres_low = np.percentile(myN.dropna(), perc_low)
myW_thres_max = np.percentile(myN.dropna(), perc_max)
while (1 - (myW_thres_low /myW_thres_max)) > max_sample_size_ratio: # narrow treshold until sample size can be considered as homogeneous
perc_low += 10
perc_max -= 10
myW_thres_low = np.percentile(myN.dropna(), perc_low)
myW_thres_max = np.percentile(myN.dropna(), perc_max)
mgwas["computed_N"] = myN mgwas["computed_N"] = myN
plt.clf() plt.clf()
p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()]) p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
p1.axvline(x=ss_thres) p1.axvline(x=myW_thres_low, color='r')
p1.axvline(x=myW_thres_max, color="r")
fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait) fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait)
p1.figure.savefig(fo) p1.figure.savefig(fo)
# Filter SNP with a too small sample _SampleSize # Filter SNP with a too small sample _SampleSize
print("NSNP before sample size filtering: {}".format(mgwas.shape[0])) print("NSNP before sample size filtering: {}".format(mgwas.shape[0]))
mgwas = mgwas.loc[(myN >= ss_thres)]
mgwas = mgwas.loc[(myN >= myW_thres_low) & (myN <= myW_thres_max)]
mgwas = mgwas.loc[~mgwas.computed_N.isna()] mgwas = mgwas.loc[~mgwas.computed_N.isna()]
print("NSNP after sample size filtering: {}".format(mgwas.shape[0])) print("NSNP after sample size filtering: {}".format(mgwas.shape[0]))
......
...@@ -141,8 +141,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): ...@@ -141,8 +141,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
else: else:
compression = None compression = None
print(column_map.values)
print(column_map.index)
fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
usecols = column_map.values, usecols = column_map.values,
compression=compression, compression=compression,
...@@ -152,7 +150,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): ...@@ -152,7 +150,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
'-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
'NA', 'NULL', 'NaN', 'NA', 'NULL', 'NaN',
'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float}) 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float})
print(fullGWAS.head())
#Ensure that allele are written in upper cases: #Ensure that allele are written in upper cases:
fullGWAS.a1 = fullGWAS.a1.str.upper() fullGWAS.a1 = fullGWAS.a1.str.upper()
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment