From fa601f187c660b80d4b760a908ef912036d7fcad Mon Sep 17 00:00:00 2001 From: hanna julienne <hanna.julienne@pasteur.fr> Date: Wed, 30 Oct 2019 11:42:56 +0100 Subject: [PATCH] add flexible region masker --- jass_preprocessing/__main__.py | 6 +++++- jass_preprocessing/map_reference.py | 13 ++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index e9fc35e..112b75f 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -39,8 +39,11 @@ def launch_preprocessing(args): mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) - ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF)) + if args.additional_masked_region is None: + ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF)) + else: + ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas) @@ -70,6 +73,7 @@ def add_preprocessing_argument(): parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01) parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False) + parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default=None) parser.set_defaults(func=launch_preprocessing) diff --git a/jass_preprocessing/map_reference.py b/jass_preprocessing/map_reference.py index 088e249..1b0f395 100644 --- a/jass_preprocessing/map_reference.py +++ b/jass_preprocessing/map_reference.py @@ -8,9 +8,17 @@ import numpy as np import jass_preprocessing.dna_utils as dna_u import warnings -def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None): +def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, region_to_mask=None): """ helper function to name correctly the column + Args: + gwas_reference_panel (str): path toward the reference panel file + mask_MHC (bool): Whether the MHC region should be masked or not. default is False + Filter the reference panel by minimum allele frequency (hg19 coordinate) + minimum_MAF (float): minimum allele frequency for a SNPs to be retain in the panel + region_to_mask (dict): a list of additional regions to mask + Return: + ref (pandas dataframe): the reference_panel with the specified filter applied """ ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], @@ -21,6 +29,9 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None): if minimum_MAF is not None: ref = ref.loc[ref.MAF > minimum_MAF] + if region_to_mask is not None: + for reg in region_to_mask: + ref = ref.loc[(ref.chr != reg["chr"]) | (ref.pos <reg["start"])|(ref.pos > reg["end"])] return(ref) -- GitLab