Commit a42199bd authored by hjulienne's avatar hjulienne
Browse files

adapt code to be compatible with positional index

parent 0c5728ea
...@@ -57,15 +57,14 @@ def launch_preprocessing(args): ...@@ -57,15 +57,14 @@ def launch_preprocessing(args):
else: else:
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
print(bool(args.mask_MHC), float(args.minimum_MAF), args.additional_masked_region)
print(eval(args.additional_masked_region))
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
print(ref.shape)
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) print(gw_df.shape)
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"])
print(mgwas.shape)
mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas)
mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size) mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, args.percent_sample_size)
end = time.time() end = time.time()
print("Preprocessing of {0} in {1}s".format(tag, end-start)) print("Preprocessing of {0} in {1}s".format(tag, end-start))
...@@ -94,6 +93,7 @@ def add_preprocessing_argument(): ...@@ -94,6 +93,7 @@ def add_preprocessing_argument():
parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None') parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None')
parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None') parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None')
parser.add_argument('--index-type', required=False, help= "type of index : rsID or chr:postion:ref_allele:alt_allele", default='rsid')
parser.set_defaults(func=launch_preprocessing) parser.set_defaults(func=launch_preprocessing)
......
...@@ -13,10 +13,8 @@ import numpy as np ...@@ -13,10 +13,8 @@ import numpy as np
import gzip import gzip
import re import re
def walkfs(startdir, findfile): def walkfs(startdir, findfile):
""" """
Go through the folder and subfolder to find the specified file Go through the folder and subfolder to find the specified file
Args: Args:
...@@ -133,9 +131,11 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): ...@@ -133,9 +131,11 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
Return: Return:
a pandas dataframe with missing value all equal to np.nan a pandas dataframe with missing value all equal to np.nan
""" """
print("Reading file:") print("Reading file:")
print(gwas_internal_link) print(gwas_internal_link)
is_gzipped = re.search(r".gz$", gwas_internal_link) is_gzipped = re.search(r".gz$", gwas_internal_link)
if is_gzipped: if is_gzipped:
compression = 'gzip' compression = 'gzip'
else: else:
...@@ -153,15 +153,20 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): ...@@ -153,15 +153,20 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
'NA', 'NULL', 'NaN', 'NA', 'NULL', 'NaN',
'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float}) 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float})
print(fullGWAS.head()) print(fullGWAS.head())
def sorted_alleles(x):
return "".join(sorted(x))
# either rs ID or full position must be available: # either rs ID or full position must be available:
if "snpid" not in fullGWAS.columns:
if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns):
fullGWAS["snpid"] = fullGWAS["CHR"].apply(str) + ":" +fullGWAS["POS"].apply(str) + ":"+ fullGWAS["a1"].apply(str)+ ":"+ fullGWAS["a2"].apply(str)
else:
raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link))
if not(("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns)) and ("snpid" not in fullGWAS.columns):
raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link))
if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns):
fullGWAS["positional_index"] = fullGWAS.CHR.apply(str)+fullGWAS.POS.apply(str)+(fullGWAS.a1+fullGWAS.a2).apply(sorted_alleles)
if ("snpid" not in fullGWAS.columns):
fullGWAS["snpid"] = fullGWAS["positional_index"]
fullGWAS.set_index("snpid", inplace=True) fullGWAS.set_index("snpid", inplace=True)
print(fullGWAS.head()) print(fullGWAS.head())
if imputation_treshold: if imputation_treshold:
fullGWAS = fullGWAS.loc[fullName.imputation_quality > imputation_treshold] fullGWAS = fullGWAS.loc[fullName.imputation_quality > imputation_treshold]
......
""" """
Module of function Module of function
""" """
import pandas as pd import pandas as pd
import numpy as np import numpy as np
...@@ -17,15 +15,21 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio ...@@ -17,15 +15,21 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio
Filter the reference panel by minimum allele frequency (hg19 coordinate) Filter the reference panel by minimum allele frequency (hg19 coordinate)
minimum_MAF (float): minimum allele frequency for a SNPs to be retain in the panel minimum_MAF (float): minimum allele frequency for a SNPs to be retain in the panel
region_to_mask (dict): a list of additional regions to mask region_to_mask (dict): a list of additional regions to mask
type_of_index(str): 'rsid' or 'positional'
Return: Return:
ref (pandas dataframe): the reference_panel with the specified filter applied ref (pandas dataframe): the reference_panel with the specified filter applied
""" """
ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"], names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"],
index_col="snp_id") index_col="snp_id")
#Filter Strand ambiguous
def sorted_alleles(x):
return "".join(sorted(x))
#Filter Strand ambiguous if biallelic
ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])] ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])]
ref["positional_index"] = ref.chr.apply(str)+ref.pos.apply(str)+(ref.ref+ref.alt).apply(sorted_alleles)
if mask_MHC: if mask_MHC:
ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)] ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)]
...@@ -39,7 +43,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio ...@@ -39,7 +43,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio
def map_on_ref_panel(gw_df , ref_panel): def map_on_ref_panel(gw_df , ref_panel, index_type="rsid"):
""" """
Merge Gwas dataframe with the reference panel Merge Gwas dataframe with the reference panel
Make sure that the same SNPs are in the reference panel and the gwas Make sure that the same SNPs are in the reference panel and the gwas
...@@ -51,30 +55,44 @@ def map_on_ref_panel(gw_df , ref_panel): ...@@ -51,30 +55,44 @@ def map_on_ref_panel(gw_df , ref_panel):
Return: Return:
merge_GWAS (pandas dataframe): merge studies, merge_GWAS (pandas dataframe): merge studies,
""" """
inter_index = ref_panel.index.intersection(gw_df.index)
print(ref_panel.head())
print(gw_df.head()) if index_type=="rsid":
merge_GWAS = pd.merge(ref_panel, gw_df, merge_GWAS = pd.merge(ref_panel, gw_df,
how='inner', indicator=True, left_index=True, right_index=True) how='inner', indicator=True, left_index=True, right_index=True)
print("SNPs {}".format(merge_GWAS.shape[0]))
# If the GWAS file contains indexes of the kind chr*:position, perform a
# second join
if gw_df.index.map(str).str.contains("^chr*", case=False).any():
ref_panel['key2'] = "chr"+ref_panel.chr.map(str) + ":" +ref_panel.pos.map(str)
other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True,
left_on ='key2', right_index=True)
merge_GWAS = pd.concat([other_snp, merge_GWAS])
else:
if index_type=="positional":
print("length of intersection")
print(len(pd.Index(ref_panel.positional_index).intersection(pd.Index(gw_df.positional_index))))
merge_GWAS = pd.merge(ref_panel.reset_index(), gw_df,
how='inner', indicator=True, on="positional_index")
print(merge_GWAS)
merge_GWAS.set_index("snp_id", inplace=True)
else:
raise ValueError("index_type can take only two values: 'rsid' or 'positional'")
if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95): if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95):
merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)] merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)]
else: else:
raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?") raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?")
print("before filter")
print("SNPs {}".format(merge_GWAS.shape[0])) print(merge_GWAS.shape)
# If the GWAS file contains indexes of the kind chr*:position, perform a
# second join
if gw_df.index.map(str).str.contains("^chr*", case=False).any():
ref_panel['key2'] = "chr"+ref_panel.chr.map(str) + ":" +ref_panel.pos.map(str)
other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True,
left_on ='key2', right_index=True)
merge_GWAS = pd.concat([other_snp, merge_GWAS])
merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')] merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')]
print("after filter")
print(merge_GWAS.shape)
merge_GWAS.index.rename("snp_id", inplace=True) merge_GWAS.index.rename("snp_id", inplace=True)
return(merge_GWAS) return(merge_GWAS)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment