add filter on reference panel

cc9337de · Hanna JULIENNE · 849b8ee4 · cc9337de · cc9337de · cc9337de
Commit cc9337de authored 5 years ago by Hanna JULIENNE
--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -39,9 +39,7 @@ def launch_preprocessing(args):
        mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
        gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
-        ref = pd.read_csv(args.ref_path, header=None, sep= "\t",
+        ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
-                          names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
-                           index_col="snp_id")
        mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
        mgwas = jp.map_reference.compute_snp_alignement(mgwas)
@@ -69,6 +67,10 @@ def add_preprocessing_argument():
    parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
    parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
    parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
+    parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by  minimum allele frequency", default=0.01)
+    parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False)
    parser.set_defaults(func=launch_preprocessing)
    return parser

--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -88,7 +88,7 @@ def map_columns_position(gwas_internal_link,  GWAS_labels):
    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
    column_dict.set_index("filename", inplace=True)
+    print(gwas_internal_link)
    gwas_file = gwas_internal_link.split('/')[-1]
    my_labels = column_dict.loc[gwas_file]
@@ -133,8 +133,7 @@ def read_gwas( gwas_internal_link, column_map):
                               usecols = column_map.values, #column_dict['label_position'].keys(),
                               names= column_map.index,
                                index_col=0,
-                                 header=0, na_values= ['', '#N/A', '#N/A', 'N/A',
+                                 header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
-                                                       '#NA', '-1.#IND', '-1.#QNAN',
                                                 '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
                                                 'NA', 'NULL', 'NaN',
                                                 'nan', 'na', '.'])

--- a/jass_preprocessing/map_reference.py
+++ b/jass_preprocessing/map_reference.py
@@ -8,17 +8,23 @@ import numpy as np
 import jass_preprocessing.dna_utils as dna_u
 import warnings
+def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None):
-def read_reference(gwas_reference_panel):
    """
    helper function to name correctly the column
    """
-    ref = pd.read_csv(REF_filename, header=None, sep= "\t",
+    ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
                      names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
                       index_col="snp_id")
+    if mask_MHC:
+        ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)]
+    if minimum_MAF is not None:
+        ref = ref.loc[ref.MAF > minimum_MAF]
    return(ref)
 def map_on_ref_panel(gw_df , ref_panel):
    """
    Merge Gwas dataframe with the reference panel

--- a/jass_preprocessing/save_output.py
+++ b/jass_preprocessing/save_output.py
@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
        mgwas_chr = pd.DataFrame({
                        'rsID': mgwas_copy.loc[chrom].snp_id,
                        'pos': mgwas_copy.loc[chrom].pos,
-                        'A0': mgwas_copy.loc[chrom].ref,
+                        'A1': mgwas_copy.loc[chrom].ref,
-                        'A1':mgwas_copy.loc[chrom].alt,
+                        'A2':mgwas_copy.loc[chrom].alt,
                        'Z': mgwas_copy.loc[chrom].computed_z,
                        'P': mgwas_copy.loc[chrom].pval
-            }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])
+            }, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ])
        impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
        print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))