diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 0fb853450ec16deb81e4e51e377f6b532e084cf0..44da0084c4d4477f23b3bba9d8fa62d7548981c2 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -153,6 +153,12 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): 'NA', 'NULL', 'NaN', 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float}) print(fullGWAS.head()) + # Ensure that alleles are written in upper case: + + fullGWAS.a1 = fullGWAS.a1.str.upper() + fullGWAS.a2 = fullGWAS.a2.str.upper() + + def sorted_alleles(x): return "".join(sorted(x)) # either rs ID or full position must be available: diff --git a/jass_preprocessing/map_reference.py b/jass_preprocessing/map_reference.py index 2edef8dcd90811a61a61091b8a48855ec5356bf1..f6ad23558a96009ad897bd8feab6531181cd3be6 100644 --- a/jass_preprocessing/map_reference.py +++ b/jass_preprocessing/map_reference.py @@ -21,7 +21,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio """ ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"], - dtype = {"chr": str, "snp_id":str, "MAF": np.float, "pos":np.int, "ref":str, "alt":str}, + dtype = {"chr": str, "snp_id":str, "MAF": float, "pos":int, "ref":str, "alt":str}, index_col="snp_id") def sorted_alleles(x): @@ -81,10 +81,12 @@ def map_on_ref_panel(gw_df , ref_panel, index_type="rsid"): merge_GWAS.set_index("snp_id", inplace=True) else: raise ValueError("index_type can take only two values: 'rsid' or 'positional'") - if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95): - merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)] - else: - raise ValueError("SNP positions in reference panel and in Summary statistic are different! 
Different assembly?") + # Compare positions only when both position columns ("pos" and "POS") are present in the merged table + if (("pos" in merge_GWAS.columns) and ("POS" in merge_GWAS.columns)): + if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95): + merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)] + else: + raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?") print("before filter") print(merge_GWAS.shape)