adjust sample filtering parameter

5f9ea33f · Hanna JULIENNE · 43dba7f5 · 5f9ea33f · 5f9ea33f · 5f9ea33f
Commit 5f9ea33f authored 2 years ago by Hanna JULIENNE
--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -83,7 +83,7 @@ def add_preprocessing_argument():

    parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
    parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
-    parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
+    parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.75)

    parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by  minimum allele frequency", default='0.01')
    parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default='False')

--- a/jass_preprocessing/compute_score.py
+++ b/jass_preprocessing/compute_score.py
@@ -27,8 +27,11 @@ def compute_z_score(mgwas):

    return mgwas

-def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio = 0.1):
-
+def compute_sample_size(mgwas, diagnostic_folder, trait, min_sample_size_ratio = 0.95):
+    """
+    Infer the sample size when it is not present, then filter on sample size
+    until the retained sample sizes are homogeneous.
+    """
    if 'n' in mgwas.columns:
        myN = mgwas.n
    #--- freq, case-cont N exist
@@ -58,13 +61,15 @@ def compute_sample_size(mgwas, diagnostic_folder, trait, max_sample_size_ratio =
    myW_thres_low = np.percentile(myN.dropna(), perc_low)
    myW_thres_max = np.percentile(myN.dropna(), perc_max)

-    while (1 - (myW_thres_low /myW_thres_max)) >  max_sample_size_ratio: # narrow treshold until sample size can be considered as homogeneous
-        perc_low += 10
-        perc_max -= 10
+    while ((myW_thres_low / myW_thres_max) < min_sample_size_ratio): # narrow threshold until sample size can be considered homogeneous
+        perc_low += 2
+        perc_max -= 0.5

        myW_thres_low = np.percentile(myN.dropna(), perc_low)
        myW_thres_max = np.percentile(myN.dropna(), perc_max)

+
+    print("Finished filtering")
    mgwas["computed_N"] = myN
    plt.clf()
    p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])

--- a/jass_preprocessing/save_output.py
+++ b/jass_preprocessing/save_output.py
@@ -19,8 +19,9 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
                            'A0': mgwas_copy.loc[chrom].ref,
                            'A1':mgwas_copy.loc[chrom].alt,
                            'Z': mgwas_copy.loc[chrom].computed_z,
-                            'P': mgwas_copy.loc[chrom].pval
-                }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])
+                            'P': mgwas_copy.loc[chrom].pval,
+                            'N_effective': mgwas_copy.loc[chrom].computed_N
+                }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P", "N_effective"])

            impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
            print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))
@@ -43,8 +44,9 @@ def save_output(mgwas, ImpG_output_Folder, my_study):
                    'A0': mgwas_copy.ref,
                    'A1':mgwas_copy.alt,
                    'Z': mgwas_copy.computed_z,
-                    'P': mgwas_copy.pval
-        }, columns= ['chrom','rsID', 'pos', 'A0', "A1", "Z", "P" ])
+                    'P': mgwas_copy.pval,
+                    'N_effective': mgwas_copy.computed_N
+        }, columns= ['chrom','rsID', 'pos', 'A0', "A1", "Z", "P", 'N_effective' ])

    impg_output_file = ImpG_output_Folder + 'z_'+ my_study +".txt"
    print("WRITING results for {} to: {}".format( my_study, ImpG_output_Folder))