From cc9337de57872b2d37289fb6787e488be95439c2 Mon Sep 17 00:00:00 2001 From: hanna julienne <hanna.julienne@pasteur.fr> Date: Wed, 30 Oct 2019 10:48:52 +0100 Subject: [PATCH] add filter on reference panel --- jass_preprocessing/__main__.py | 8 +++++--- jass_preprocessing/map_gwas.py | 5 ++--- jass_preprocessing/map_reference.py | 12 +++++++++--- jass_preprocessing/save_output.py | 6 +++--- 4 files changed, 19 insertions(+), 12 deletions(-) diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index 482b83e..e9fc35e 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -39,9 +39,7 @@ def launch_preprocessing(args): mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) - ref = pd.read_csv(args.ref_path, header=None, sep= "\t", - names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], - index_col="snp_id") + ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF)) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.compute_snp_alignement(mgwas) @@ -69,6 +67,10 @@ def add_preprocessing_argument(): parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)") parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)") parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7) + + parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01) + parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False) + parser.set_defaults(func=launch_preprocessing) return parser diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 83f01c5..17ac79d 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -88,7 +88,7 @@ def map_columns_position(gwas_internal_link, GWAS_labels): column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') column_dict.set_index("filename", inplace=True) - + print(gwas_internal_link) gwas_file = gwas_internal_link.split('/')[-1] my_labels = column_dict.loc[gwas_file] @@ -133,8 +133,7 @@ def read_gwas( gwas_internal_link, column_map): usecols = column_map.values, #column_dict['label_position'].keys(), names= column_map.index, index_col=0, - header=0, na_values= ['', '#N/A', '#N/A', 'N/A', - '#NA', '-1.#IND', '-1.#QNAN', + header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']) diff --git a/jass_preprocessing/map_reference.py b/jass_preprocessing/map_reference.py index e04589f..088e249 100644 --- a/jass_preprocessing/map_reference.py +++ b/jass_preprocessing/map_reference.py @@ -8,17 +8,23 @@ import numpy as np import jass_preprocessing.dna_utils as dna_u import warnings - -def read_reference(gwas_reference_panel): +def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None): """ helper function to name correctly the column """ - ref = pd.read_csv(REF_filename, header=None, sep= "\t", + ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id") + if mask_MHC: + ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)] + + if minimum_MAF is not None: + ref = ref.loc[ref.MAF > minimum_MAF] + return(ref) + def map_on_ref_panel(gw_df , ref_panel): """ Merge Gwas dataframe with the reference panel diff --git a/jass_preprocessing/save_output.py b/jass_preprocessing/save_output.py index c00fef4..daf240c 100644 --- a/jass_preprocessing/save_output.py +++ b/jass_preprocessing/save_output.py @@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): mgwas_chr = pd.DataFrame({ 'rsID': mgwas_copy.loc[chrom].snp_id, 'pos': mgwas_copy.loc[chrom].pos, - 'A0': mgwas_copy.loc[chrom].ref, - 'A1':mgwas_copy.loc[chrom].alt, + 'A1': mgwas_copy.loc[chrom].ref, + 'A2':mgwas_copy.loc[chrom].alt, 'Z': mgwas_copy.loc[chrom].computed_z, 'P': mgwas_copy.loc[chrom].pval - }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ]) + }, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ]) impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt" print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder)) -- GitLab