From cc9337de57872b2d37289fb6787e488be95439c2 Mon Sep 17 00:00:00 2001
From: hanna julienne <hanna.julienne@pasteur.fr>
Date: Wed, 30 Oct 2019 10:48:52 +0100
Subject: [PATCH] add filter on reference panel

---
 jass_preprocessing/__main__.py      |  8 +++++---
 jass_preprocessing/map_gwas.py      |  5 ++---
 jass_preprocessing/map_reference.py | 12 +++++++++---
 jass_preprocessing/save_output.py   |  6 +++---
 4 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py
index 482b83e..e9fc35e 100644
--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -39,9 +39,7 @@ def launch_preprocessing(args):
         mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
 
         gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
-        ref = pd.read_csv(args.ref_path, header=None, sep= "\t",
-                          names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
-                           index_col="snp_id")
+        ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
 
         mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
         mgwas = jp.map_reference.compute_snp_alignement(mgwas)
@@ -69,6 +67,10 @@ def add_preprocessing_argument():
     parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
     parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
     parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
+
+    parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by  minimum allele frequency", default=0.01)
+    parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False)
+
     parser.set_defaults(func=launch_preprocessing)
 
     return parser
diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py
index 83f01c5..17ac79d 100644
--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -88,7 +88,7 @@ def map_columns_position(gwas_internal_link,  GWAS_labels):
     column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
 
     column_dict.set_index("filename", inplace=True)
-
+    print(gwas_internal_link)
     gwas_file = gwas_internal_link.split('/')[-1]
     my_labels = column_dict.loc[gwas_file]
 
@@ -133,8 +133,7 @@ def read_gwas( gwas_internal_link, column_map):
                                usecols = column_map.values, #column_dict['label_position'].keys(),
                                names= column_map.index,
                                 index_col=0,
-                                 header=0, na_values= ['', '#N/A', '#N/A', 'N/A',
-                                                       '#NA', '-1.#IND', '-1.#QNAN',
+                                 header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
                                                  '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
                                                  'NA', 'NULL', 'NaN',
                                                  'nan', 'na', '.'])
diff --git a/jass_preprocessing/map_reference.py b/jass_preprocessing/map_reference.py
index e04589f..088e249 100644
--- a/jass_preprocessing/map_reference.py
+++ b/jass_preprocessing/map_reference.py
@@ -8,17 +8,23 @@ import numpy as np
 import jass_preprocessing.dna_utils as dna_u
 import warnings
 
-
-def read_reference(gwas_reference_panel):
+def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None):
     """
     helper function to name correctly the column
     """
-    ref = pd.read_csv(REF_filename, header=None, sep= "\t",
+    ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
                       names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
                        index_col="snp_id")
+    if mask_MHC:
+        ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)]
+
+    if minimum_MAF is not None:
+        ref = ref.loc[ref.MAF > minimum_MAF]
+
     return(ref)
 
 
+
 def map_on_ref_panel(gw_df , ref_panel):
     """
     Merge Gwas dataframe with the reference panel
diff --git a/jass_preprocessing/save_output.py b/jass_preprocessing/save_output.py
index c00fef4..daf240c 100644
--- a/jass_preprocessing/save_output.py
+++ b/jass_preprocessing/save_output.py
@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
         mgwas_chr = pd.DataFrame({
                         'rsID': mgwas_copy.loc[chrom].snp_id,
                         'pos': mgwas_copy.loc[chrom].pos,
-                        'A0': mgwas_copy.loc[chrom].ref,
-                        'A1':mgwas_copy.loc[chrom].alt,
+                        'A1': mgwas_copy.loc[chrom].ref,
+                        'A2':mgwas_copy.loc[chrom].alt,
                         'Z': mgwas_copy.loc[chrom].computed_z,
                         'P': mgwas_copy.loc[chrom].pval
-            }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])
+            }, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ])
 
         impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
         print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))
-- 
GitLab