modified filtering of sample size

43dba7f5 · Hanna JULIENNE · 46ff85c7 · 43dba7f5 · 43dba7f5 · 43dba7f5
Commit 43dba7f5 authored 2 years ago by Hanna JULIENNE
--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -45,12 +45,11 @@ def launch_preprocessing(args):
        print('processing GWAS: {}'.format(tag))
        start = time.time()
-        print(args.input_folder)
        GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
-        print(GWAS_link)
        mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
-        print(mapgw)
-        print(args.imputation_quality_treshold)
        if args.imputation_quality_treshold is not 'None':
            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold))
@@ -58,19 +57,15 @@ def launch_preprocessing(args):
            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
        ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
-        print(ref.shape)
-        print(ref.head())
-        print(gw_df.shape)
-        print(gw_df.head())
        mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"])
-        print(mgwas.shape)
        mgwas = jp.map_reference.compute_snp_alignement(mgwas)
        mgwas = jp.compute_score.compute_z_score(mgwas)
        mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size)
        end = time.time()
        print("Preprocessing of {0} in {1}s".format(tag, end-start))
-        print(mgwas.head())
        jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
        if(args.output_folder_1_file):

--- a/jass_preprocessing/compute_score.py
+++ b/jass_preprocessing/compute_score.py
@@ -27,7 +27,7 @@ def compute_z_score(mgwas):
    return mgwas
-def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7):
+def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio = 0.1):
    if 'n' in mgwas.columns:
        myN = mgwas.n
@@ -45,22 +45,38 @@ def compute_sample_size(mgwas, diagnostic_folder, trait, perSS = 0.7):
    else:
        raise ValueError(
            'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied')
    # replace infinite value by the max finite one
    if(np.isinf(myN).any()):
        myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
        warnings.warn("Some snp had an infinite sample size")
-    myW_thres = np.percentile(myN.dropna(), 90)
-    ss_thres = float(perSS) * myW_thres
+    perc_low = 0
+    perc_max = 100
+    myW_thres_low = np.percentile(myN.dropna(), perc_low)
+    myW_thres_max = np.percentile(myN.dropna(), perc_max)
+    while (1 - (myW_thres_low /myW_thres_max)) >  max_sample_size_ratio: # narrow treshold until sample size can be considered as homogeneous
+        perc_low += 10
+        perc_max -= 10
+        myW_thres_low = np.percentile(myN.dropna(), perc_low)
+        myW_thres_max = np.percentile(myN.dropna(), perc_max)
    mgwas["computed_N"] = myN
    plt.clf()
    p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
-    p1.axvline(x=ss_thres)
+    p1.axvline(x=myW_thres_low, color='r')
+    p1.axvline(x=myW_thres_max, color="r")
    fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait)
    p1.figure.savefig(fo)
    # Filter SNP with a too small sample _SampleSize
    print("NSNP before sample size filtering: {}".format(mgwas.shape[0]))
-    mgwas = mgwas.loc[(myN >= ss_thres)]
+    mgwas = mgwas.loc[(myN >= myW_thres_low) & (myN <= myW_thres_max)]
    mgwas = mgwas.loc[~mgwas.computed_N.isna()]
    print("NSNP after sample size filtering: {}".format(mgwas.shape[0]))

--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -141,8 +141,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
    else:
        compression = None
-    print(column_map.values)
-    print(column_map.index)
    fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
                               usecols = column_map.values,
                               compression=compression,
@@ -152,7 +150,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
                                                 '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
                                                 'NA', 'NULL', 'NaN',
                                                 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float})
-    print(fullGWAS.head())
    #Ensure that allele are written in upper cases:
    fullGWAS.a1 = fullGWAS.a1.str.upper()