"""
Module of function
"""
import pandas as pd
import numpy as np
import jass_preprocessing.dna_utils as dna_u
import warnings
[docs]def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, region_to_mask=None):
"""
helper function to name correctly the column
Args:
gwas_reference_panel (str): path toward the reference panel file
mask_MHC (bool): Whether the MHC region should be masked or not. default is False
Filter the reference panel by minimum allele frequency (hg19 coordinate)
minimum_MAF (float): minimum allele frequency for a SNPs to be retain in the panel
region_to_mask (dict): a list of additional regions to mask
Return:
ref (pandas dataframe): the reference_panel with the specified filter applied
"""
ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
#Filter Strand ambiguous
ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])]
if mask_MHC:
ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)]
if minimum_MAF is not None:
ref = ref.loc[ref.MAF > minimum_MAF]
if region_to_mask is not None:
for reg in region_to_mask:
ref = ref.loc[(ref.chr != reg["chr"]) | (ref.pos <reg["start"])|(ref.pos > reg["end"])]
return(ref)
[docs]def map_on_ref_panel(gw_df , ref_panel):
"""
Merge Gwas dataframe with the reference panel
Make sure that the same SNPs are in the reference panel and the gwas
Args:
gw_df (pandas dataframe): GWAS study dataframe
ref_panel (pandas dataframe): reference panel dataframe
Return:
merge_GWAS (pandas dataframe): merge studies,
"""
inter_index = ref_panel.index.intersection(gw_df.index)
merge_GWAS = pd.merge(ref_panel, gw_df,
how='inner', indicator=True, left_index=True, right_index=True)
print("SNPs {}".format(merge_GWAS.shape[0]))
# If the GWAS file contains indexes of the kind chr*:position, perform a
# second join
if gw_df.index.map(str).str.contains("^chr*", case=False).any():
ref_panel['key2'] = "chr"+ref_panel.chr.map(str) + ":" +ref_panel.pos.map(str)
other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True,
left_on ='key2', right_index=True)
merge_GWAS = pd.concat([other_snp, merge_GWAS])
merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')]
merge_GWAS.index.rename("snp_id", inplace=True)
return(merge_GWAS)
[docs]def compute_is_flipped(mgwas):
"""
Check if the reference panel and the GWAS data have the same reference
allele. return a boolean vector.
Args:
mgwas (pandas dataframe): GWAS study dataframe merged with the reference_panel
Return:
is_flipped (pandas dataframe): merge studies,
"""
flipped = pd.DataFrame({"ref_flipped" : (mgwas.ref == mgwas.a2), "alt_flipped" : (mgwas.alt == mgwas.a1)})
flipped_complement = pd.DataFrame({"ref_flippedc" : (mgwas.ref == mgwas.a2c), "alt_flippedc" : (mgwas.alt == mgwas.a1c)})
is_flipped = pd.DataFrame({"flipped":flipped.all(1), # The allele of the
"flipped_complement":flipped_complement.all(1)}
).any(1)
return is_flipped
[docs]def compute_is_aligned(mgwas):
"""
Check if the reference panel and the GWAS data have the same reference
allele. return a boolean vector.
The function should be the complement of "is_flipped" but we still compute
the two function to eventually detect weird cases (more than two alleles for
instance)
"""
aligned = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1), "alt_ok" : (mgwas.alt == mgwas.a2)})
aligned_complement = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1c), "alt_ok" : (mgwas.alt == mgwas.a2c)})
is_aligned = pd.DataFrame({"aligned":aligned.all(1), # The allele of the
"aligned_complement":aligned_complement.all(1)}
).any(1)
return is_aligned
[docs]def compute_snp_alignement(mgwas):
"""
Add a column to mgwas indicating if the reference and coded
allele is flipped compared to the reference panel.
If it is, the sign of the statistic must be flipped
Args:
mgwas: a pandas dataframe of the GWAS data merged
with the reference panel
"""
#ensure that allele are upper cases:
mgwas['a1'] = mgwas.a1.str.upper()
mgwas['a2'] = mgwas.a2.str.upper()
mgwas['a1c'] = dna_u.dna_complement(mgwas.a1)
mgwas['a2c'] = dna_u.dna_complement(mgwas.a2)
is_aligned = compute_is_aligned(mgwas)
is_flipped = compute_is_flipped(mgwas)
# does some SNP are not "alignable" on reference
not_aligned_at_all = ~pd.DataFrame([is_aligned, is_flipped ]).any() #
if(not_aligned_at_all.any()):
warnings.warn('Some snps are not aligned at all with the reference panel\nand will be filtered:\n{}'.format(mgwas.index[is_flipped]),
DeprecationWarning)
mgwas = mgwas.loc[~not_aligned_at_all]
aligned_everywhere = pd.DataFrame([is_aligned, is_flipped ]).all()
if(aligned_everywhere.any()):
raise ValueError('Some snps are both aligned and flipped:\n{}'.format(mgwas.index[is_flipped]),
DeprecationWarning)
mgwas['sign_flip'] = np.nan
mgwas.loc[is_aligned,"sign_flip"] = 1
mgwas.loc[is_flipped, "sign_flip"] = -1
return(mgwas)