diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index e9fc35e19d63056a67cf79b7e9c77a736a00b58b..112b75f532f5a0e0d2d4e545a2237b2dc4bd9e7a 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -39,8 +39,11 @@ def launch_preprocessing(args): mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) - ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF)) + if args.additional_masked_region is None: + ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF)) + else: + ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas) @@ -70,6 +73,7 @@ def add_preprocessing_argument(): parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01) parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False) + parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default=None) parser.set_defaults(func=launch_preprocessing) diff --git a/jass_preprocessing/map_reference.py b/jass_preprocessing/map_reference.py index 088e249b0726b1fac82ad79a394ce4be3ff6fba7..1b0f3953bcb39bcacbb370857c39d458dac44995 100644 --- a/jass_preprocessing/map_reference.py +++ b/jass_preprocessing/map_reference.py @@ -8,9 +8,17 @@ import numpy as np import jass_preprocessing.dna_utils as dna_u import warnings -def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None): +def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, region_to_mask=None): """ helper function to name correctly the column + Args: + gwas_reference_panel (str): path toward the reference panel file + mask_MHC (bool): Whether the MHC region should be masked or not. default is False + Filter the reference panel by minimum allele frequency (hg19 coordinate) + minimum_MAF (float): minimum allele frequency for a SNPs to be retain in the panel + region_to_mask (dict): a list of additional regions to mask + Return: + ref (pandas dataframe): the reference_panel with the specified filter applied """ ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], @@ -21,6 +29,9 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None): if minimum_MAF is not None: ref = ref.loc[ref.MAF > minimum_MAF] + if region_to_mask is not None: + for reg in region_to_mask: + ref = ref.loc[(ref.chr != reg["chr"]) | (ref.pos <reg["start"])|(ref.pos > reg["end"])] return(ref)