diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index b089d3cbd56aaf02b8e4d7cca8c3d01e0e9e3256..f3450bfca21766036ff88bcc8ce650619568bc5b 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -45,12 +45,11 @@ def launch_preprocessing(args): print('processing GWAS: {}'.format(tag)) start = time.time() - print(args.input_folder) + GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2] - print(GWAS_link) + mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag]) - print(mapgw) - print(args.imputation_quality_treshold) + if args.imputation_quality_treshold is not 'None': gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold)) @@ -58,19 +57,15 @@ def launch_preprocessing(args): gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) - print(ref.shape) - print(ref.head()) - print(gw_df.shape) - print(gw_df.head()) + mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"]) - print(mgwas.shape) mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size) end = time.time() print("Preprocessing of {0} in {1}s".format(tag, end-start)) - print(mgwas.head()) + jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag) if(args.output_folder_1_file): diff --git a/jass_preprocessing/compute_score.py b/jass_preprocessing/compute_score.py index fd176a381082aa0ee3c2b88b6123fade5d73a5e9..e10d6cec28697aca6f4c11caf1ba9788d8171afd 100644 --- a/jass_preprocessing/compute_score.py +++ b/jass_preprocessing/compute_score.py @@ -27,7 +27,7 @@ def compute_z_score(mgwas): return mgwas -def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7): +def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio = 0.1): if 'n' in mgwas.columns: myN = mgwas.n @@ -45,22 +45,38 @@ def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7): else: raise ValueError( 'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied') + # replace infinite value by the max finite one if(np.isinf(myN).any()): myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)]) warnings.warn("Some snp had an infinite sample size") - myW_thres = np.percentile(myN.dropna(), 90) - ss_thres = float(perSS) * myW_thres + + perc_low = 0 + perc_max = 100 + + myW_thres_low = np.percentile(myN.dropna(), perc_low) + myW_thres_max = np.percentile(myN.dropna(), perc_max) + + while (1 - (myW_thres_low /myW_thres_max)) > max_sample_size_ratio: # narrow treshold until sample size can be considered as homogeneous + perc_low += 10 + perc_max -= 10 + + myW_thres_low = np.percentile(myN.dropna(), perc_low) + myW_thres_max = np.percentile(myN.dropna(), perc_max) + mgwas["computed_N"] = myN plt.clf() p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()]) - p1.axvline(x=ss_thres) + p1.axvline(x=myW_thres_low, color='r') + p1.axvline(x=myW_thres_max, color="r") fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait) p1.figure.savefig(fo) # Filter SNP with a too small sample _SampleSize print("NSNP before sample size filtering: {}".format(mgwas.shape[0])) - mgwas = mgwas.loc[(myN >= ss_thres)] + + + mgwas = mgwas.loc[(myN >= myW_thres_low) & (myN <= myW_thres_max)] mgwas = mgwas.loc[~mgwas.computed_N.isna()] print("NSNP after sample size filtering: {}".format(mgwas.shape[0])) diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 44da0084c4d4477f23b3bba9d8fa62d7548981c2..be99e7671f46ef5d89caef7ad5576e8d9bbe9311 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -141,8 +141,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): else: compression = None - print(column_map.values) - print(column_map.index) fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, usecols = column_map.values, compression=compression, @@ -152,7 +150,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float}) - print(fullGWAS.head()) #Ensure that allele are written in upper cases: fullGWAS.a1 = fullGWAS.a1.str.upper()