Commit a42199bd authored by hjulienne's avatar hjulienne
Browse files

adapt code to be compatible with positional index

parent 0c5728ea
......@@ -57,15 +57,14 @@ def launch_preprocessing(args):
else:
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
print(bool(args.mask_MHC), float(args.minimum_MAF), args.additional_masked_region)
print(eval(args.additional_masked_region))
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
print(ref.shape)
print(gw_df.shape)
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"])
print(mgwas.shape)
mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas = jp.compute_score.compute_z_score(mgwas)
mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size)
end = time.time()
print("Preprocessing of {0} in {1}s".format(tag, end-start))
......@@ -94,6 +93,7 @@ def add_preprocessing_argument():
parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None')
parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None')
parser.add_argument('--index-type', required=False, help= "type of index : rsID or chr:postion:ref_allele:alt_allele", default='rsid')
parser.set_defaults(func=launch_preprocessing)
......
......@@ -13,10 +13,8 @@ import numpy as np
import gzip
import re
def walkfs(startdir, findfile):
"""
Go through the folder and subfolder to find the specified file
Args:
......@@ -133,9 +131,11 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
Return:
a pandas dataframe with missing value all equal to np.nan
"""
print("Reading file:")
print(gwas_internal_link)
is_gzipped = re.search(r".gz$", gwas_internal_link)
if is_gzipped:
compression = 'gzip'
else:
......@@ -153,15 +153,20 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
'NA', 'NULL', 'NaN',
'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float})
print(fullGWAS.head())
def sorted_alleles(x):
return "".join(sorted(x))
# either rs ID or full position must be available:
if "snpid" not in fullGWAS.columns:
if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns):
fullGWAS["snpid"] = fullGWAS["CHR"].apply(str) + ":" +fullGWAS["POS"].apply(str) + ":"+ fullGWAS["a1"].apply(str)+ ":"+ fullGWAS["a2"].apply(str)
else:
if not(("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns)) and ("snpid" not in fullGWAS.columns):
raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link))
if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns):
fullGWAS["positional_index"] = fullGWAS.CHR.apply(str)+fullGWAS.POS.apply(str)+(fullGWAS.a1+fullGWAS.a2).apply(sorted_alleles)
if ("snpid" not in fullGWAS.columns):
fullGWAS["snpid"] = fullGWAS["positional_index"]
fullGWAS.set_index("snpid", inplace=True)
print(fullGWAS.head())
if imputation_treshold:
fullGWAS = fullGWAS.loc[fullName.imputation_quality > imputation_treshold]
......
"""
Module of function
"""
import pandas as pd
import numpy as np
......@@ -17,15 +15,21 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio
Filter the reference panel by minimum allele frequency (hg19 coordinate)
minimum_MAF (float): minimum allele frequency for a SNPs to be retain in the panel
region_to_mask (dict): a list of additional regions to mask
type_of_index(str): 'rsid' or 'positional'
Return:
ref (pandas dataframe): the reference_panel with the specified filter applied
"""
ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"],
index_col="snp_id")
#Filter Strand ambiguous
def sorted_alleles(x):
return "".join(sorted(x))
#Filter Strand ambiguous if biallelic
ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])]
ref["positional_index"] = ref.chr.apply(str)+ref.pos.apply(str)+(ref.ref+ref.alt).apply(sorted_alleles)
if mask_MHC:
ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)]
......@@ -39,7 +43,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio
def map_on_ref_panel(gw_df , ref_panel):
def map_on_ref_panel(gw_df , ref_panel, index_type="rsid"):
"""
Merge Gwas dataframe with the reference panel
Make sure that the same SNPs are in the reference panel and the gwas
......@@ -51,18 +55,11 @@ def map_on_ref_panel(gw_df , ref_panel):
Return:
merge_GWAS (pandas dataframe): merge studies,
"""
inter_index = ref_panel.index.intersection(gw_df.index)
print(ref_panel.head())
print(gw_df.head())
if index_type=="rsid":
merge_GWAS = pd.merge(ref_panel, gw_df,
how='inner', indicator=True, left_index=True, right_index=True)
if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95):
merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)]
else:
raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?")
print("SNPs {}".format(merge_GWAS.shape[0]))
# If the GWAS file contains indexes of the kind chr*:position, perform a
......@@ -73,8 +70,29 @@ def map_on_ref_panel(gw_df , ref_panel):
left_on ='key2', right_index=True)
merge_GWAS = pd.concat([other_snp, merge_GWAS])
else:
if index_type=="positional":
print("length of intersection")
print(len(pd.Index(ref_panel.positional_index).intersection(pd.Index(gw_df.positional_index))))
merge_GWAS = pd.merge(ref_panel.reset_index(), gw_df,
how='inner', indicator=True, on="positional_index")
print(merge_GWAS)
merge_GWAS.set_index("snp_id", inplace=True)
else:
raise ValueError("index_type can take only two values: 'rsid' or 'positional'")
if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95):
merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)]
else:
raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?")
print("before filter")
print(merge_GWAS.shape)
merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')]
print("after filter")
print(merge_GWAS.shape)
merge_GWAS.index.rename("snp_id", inplace=True)
return(merge_GWAS)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment