diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index ad18b2cec290364e42b9005d428654579499ad08..248bb163e294fdeef57265868073af7532d68575 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -57,15 +57,14 @@ def launch_preprocessing(args): else: gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) - print(bool(args.mask_MHC), float(args.minimum_MAF), args.additional_masked_region) - print(eval(args.additional_masked_region)) ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) - - mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) + print(ref.shape) + print(gw_df.shape) + mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"]) + print(mgwas.shape) mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size) - end = time.time() print("Preprocessing of {0} in {1}s".format(tag, end-start)) @@ -94,6 +93,7 @@ def add_preprocessing_argument(): parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None') parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None') + parser.add_argument('--index-type', required=False, help= "type of index : rsID or chr:postion:ref_allele:alt_allele", default='rsid') parser.set_defaults(func=launch_preprocessing) diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index f67ac3b26480140590ba1a2e52b840c03ee111f4..402b382c83f922e31438c60a0fd1726b278d38f7 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -13,10 +13,8 @@ import numpy as np import gzip import re - def walkfs(startdir, findfile): """ - Go through the folder and subfolder to find the specified file Args: @@ -133,9 +131,11 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): Return: a pandas dataframe with missing value all equal to np.nan """ + print("Reading file:") print(gwas_internal_link) is_gzipped = re.search(r".gz$", gwas_internal_link) + if is_gzipped: compression = 'gzip' else: @@ -153,15 +153,20 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): 'NA', 'NULL', 'NaN', 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float}) print(fullGWAS.head()) + def sorted_alleles(x): + return "".join(sorted(x)) # either rs ID or full position must be available: - if "snpid" not in fullGWAS.columns: - if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns): - fullGWAS["snpid"] = fullGWAS["CHR"].apply(str) + ":" +fullGWAS["POS"].apply(str) + ":"+ fullGWAS["a1"].apply(str)+ ":"+ fullGWAS["a2"].apply(str) - else: - raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link)) + if not(("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns)) and ("snpid" not in fullGWAS.columns): + raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link)) + + if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns): + fullGWAS["positional_index"] = fullGWAS.CHR.apply(str)+fullGWAS.POS.apply(str)+(fullGWAS.a1+fullGWAS.a2).apply(sorted_alleles) + if ("snpid" not in fullGWAS.columns): + fullGWAS["snpid"] = fullGWAS["positional_index"] fullGWAS.set_index("snpid", inplace=True) + print(fullGWAS.head()) if imputation_treshold: fullGWAS = fullGWAS.loc[fullName.imputation_quality > imputation_treshold] diff --git a/jass_preprocessing/map_reference.py b/jass_preprocessing/map_reference.py index bb9dbd55206436bc2baf3dd63e3b8c397cc67369..4a1be77b7715bcf6a7d7d8ecebebefa839e4f0d3 100644 --- a/jass_preprocessing/map_reference.py +++ b/jass_preprocessing/map_reference.py @@ -1,7 +1,5 @@ """ Module of function - - """ import pandas as pd import numpy as np @@ -17,15 +15,21 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio Filter the reference panel by minimum allele frequency (hg19 coordinate) minimum_MAF (float): minimum allele frequency for a SNPs to be retain in the panel region_to_mask (dict): a list of additional regions to mask + type_of_index(str): 'rsid' or 'positional' Return: ref (pandas dataframe): the reference_panel with the specified filter applied """ ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"], index_col="snp_id") - #Filter Strand ambiguous + + def sorted_alleles(x): + return "".join(sorted(x)) + #Filter Strand ambiguous if biallelic ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])] + ref["positional_index"] = ref.chr.apply(str)+ref.pos.apply(str)+(ref.ref+ref.alt).apply(sorted_alleles) + if mask_MHC: ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)] @@ -39,7 +43,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio -def map_on_ref_panel(gw_df , ref_panel): +def map_on_ref_panel(gw_df , ref_panel, index_type="rsid"): """ Merge Gwas dataframe with the reference panel Make sure that the same SNPs are in the reference panel and the gwas @@ -51,30 +55,44 @@ def map_on_ref_panel(gw_df , ref_panel): Return: merge_GWAS (pandas dataframe): merge studies, """ - inter_index = ref_panel.index.intersection(gw_df.index) - print(ref_panel.head()) - print(gw_df.head()) - merge_GWAS = pd.merge(ref_panel, gw_df, - how='inner', indicator=True, left_index=True, right_index=True) + + if index_type=="rsid": + merge_GWAS = pd.merge(ref_panel, gw_df, + how='inner', indicator=True, left_index=True, right_index=True) + print("SNPs {}".format(merge_GWAS.shape[0])) + + # If the GWAS file contains indexes of the kind chr*:position, perform a + # second join + if gw_df.index.map(str).str.contains("^chr*", case=False).any(): + ref_panel['key2'] = "chr"+ref_panel.chr.map(str) + ":" +ref_panel.pos.map(str) + other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, + left_on ='key2', right_index=True) + merge_GWAS = pd.concat([other_snp, merge_GWAS]) + + else: + if index_type=="positional": + print("length of intersection") + print(len(pd.Index(ref_panel.positional_index).intersection(pd.Index(gw_df.positional_index)))) + merge_GWAS = pd.merge(ref_panel.reset_index(), gw_df, + how='inner', indicator=True, on="positional_index") + print(merge_GWAS) + merge_GWAS.set_index("snp_id", inplace=True) + else: + raise ValueError("index_type can take only two values: 'rsid' or 'positional'") if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95): merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)] else: raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?") - - print("SNPs {}".format(merge_GWAS.shape[0])) - - # If the GWAS file contains indexes of the kind chr*:position, perform a - # second join - if gw_df.index.map(str).str.contains("^chr*", case=False).any(): - ref_panel['key2'] = "chr"+ref_panel.chr.map(str) + ":" +ref_panel.pos.map(str) - other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, - left_on ='key2', right_index=True) - merge_GWAS = pd.concat([other_snp, merge_GWAS]) + print("before filter") + print(merge_GWAS.shape) merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')] + print("after filter") + print(merge_GWAS.shape) + merge_GWAS.index.rename("snp_id", inplace=True) return(merge_GWAS)