Skip to content
Snippets Groups Projects
Commit b3fb8ef1 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

add module to map gwas on the reference panel

parent a40f95cf
No related branches found
No related tags found
No related merge requests found
import jass_preprocessing.map_gwas.map_gwas as map_gwas
import jass_preprocessing.dna_utils.dna_utils as dna_utils
import jass_preprocessing.map_reference.map_reference as map_reference
import jass_preprocessing.compute_score.compute as compute_score
"""
Few fonction to to compute DNA complement
"""
def dna_complement_base(inputbase):
if (inputbase == 'A'):
return('T')
if (inputbase == 'T'):
return('A')
if (inputbase == 'G'):
return('C')
if (inputbase == 'C'):
return('G')
dna_complement_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
try:
return(dna_complement_dict[inputbase])
except KeyError:
return('Not ATGC')
......
......@@ -33,11 +33,13 @@ def convert_missing_values(df):
"""
Convert all missing value strings to a standart np.nan value
"""
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
'-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN',
'nan', 'na', '.']
nmissing = len(def_missing)
nan_vec = [np.nan] * nmissing
nan_vec = ["NA"] * nmissing
return df.replace(def_missing, nan_vec)
......@@ -58,7 +60,6 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
list_lab = {}
for l in range(0, len(target_lab)):
for h in range(0, len(header)):
if header[h] == target_lab[l]:
list_lab[my_labels.columns.tolist()[l]] = h
list_col[h] = my_labels.columns.tolist()[l]
......@@ -67,30 +68,16 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
def read_gwas( gwas_internal_link, column_dict):
"""
Read gwas and apply minimal qc
Read gwas Chromosome and rename columns in our standart
"""
fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
usecols=column_dict['label_position'].keys(), index_col=0, names=column_dict['label_position'].values(), header=0)
usecols=column_dict['label_position'].keys(),
index_col=0,
names=column_dict['label_position'].values(),
header=0)
fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
fullGWAS = convert_missing_values(fullGWAS)
#fullGWAS = convert_missing_values(fullGWAS)
return fullGWAS
def map_on_ref_panel(gw_df , ref_panel):
"""
Merge Gwas df with the reference panel
"""
def key2(x):
return ('chr' + str(x["chr"]) + ":" + str(x["pos"]))
ref_panel['key2'] = ref_panel.apply(key2,1)
merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True)
other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True)
merge_GWAS.loc[other_snp.index] = other_snp
return(merge_GWAS)
import pandas as pd
import numpy as np
import jass_preprocessing.dna_utils as dna_u
import warnings
def read_reference(gwas_reference_panel):
"""
helper function to name correctly the column
"""
ref = pd.read_csv(REF_filename, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
return(ref)
def map_on_ref_panel(gw_df , ref_panel):
"""
Merge Gwas df with the reference panel
"""
def key2(x):
return ('chr' + str(x["chr"]) + ":" + str(x["pos"]))
ref_panel['key2'] = ref_panel.apply(key2,1)
merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True)
other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True)
merge_GWAS.loc[other_snp.index] = other_snp
return(merge_GWAS)
def compute_is_flipped(mgwas):
flipped = pd.DataFrame({"ref_flipped" : (mgwas.ref == mgwas.a2), "alt_flipped" : (mgwas.alt == mgwas.a1)})
flipped_complement = pd.DataFrame({"ref_flippedc" : (mgwas.ref == mgwas.a2c), "alt_flippedc" : (mgwas.alt == mgwas.a1c)})
is_flipped = pd.DataFrame({"flipped":flipped.all(1), # The allele of the
"flipped_complement":flipped_complement.all(1)}
).any(1)
return is_flipped
def compute_is_aligned(mgwas):
aligned = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1), "alt_ok" : (mgwas.alt == mgwas.a2)})
aligned_complement = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1c), "alt_ok" : (mgwas.alt == mgwas.a2c)})
is_aligned = pd.DataFrame({"aligned":aligned.all(1), # The allele of the
"aligned_complement":aligned_complement.all(1)}
).any(1)
return is_aligned
def compute_snp_alignement(mgwas):
"""
Add a column to mgwas indicating if the reference and coded
allele is flipped compared to the reference panel.
If it is, the sign of the statistic must be flipped
Args:
mgwas: a pandas dataframe of the GWAS data merged
with the reference panel
"""
mgwas['a1c'] = dna_u.dna_complement(mgwas.a1)
mgwas['a2c'] = dna_u.dna_complement(mgwas.a2)
is_aligned = compute_is_aligned(mgwas)
is_flipped = compute_is_flipped(mgwas)
# does some SNP are not "alignable" on reference
not_aligned_at_all = ~pd.DataFrame([is_aligned, is_flipped ]).any() #
if(not_aligned_at_all.any()):
warnings.warn('Some snps are not aligned at all with the reference panel\nand will be filtered:\n{}'.format(mgwas.index[is_flipped]),
DeprecationWarning)
mgwas = mgwas.loc[~not_aligned_at_all]
aligned_everywhere = pd.DataFrame([is_aligned, is_flipped ]).all()
if(aligned_everywhere.any()):
raise ValueError('Some snps are both aligned and flipped:\n{}'.format(mgwas.index[is_flipped]),
DeprecationWarning)
mgwas['sign_flip'] = np.nan
mgwas.loc[is_aligned,"sign_flip"] = 1
mgwas.loc[is_flipped, "sign_flip"] = -1
return(mgwas)
......@@ -4,10 +4,12 @@ setup(name='jass_preprocessing',
version='0.1',
description='Preprocess GWAS summary statistic for JASS',
url='http:https://gitlab.pasteur.fr/statistical-genetics/JASS_Pre-processing',
author='Hugues Aschard, Vincent Laville, Hanna Julienne',
author='Hugues Aschard, Hanna Julienne, Vincent Laville',
author_email='hugues.aschard@pasteur.fr',
license='MIT',
#package_dir = {'': 'jass_preprocessing'},
packages= ['jass_preprocessing', "jass_preprocessing.map_gwas",
"jass_preprocessing.dna_utils"],
"jass_preprocessing.dna_utils", "jass_preprocessing.map_reference",
"jass_preprocessing.compute_score"
],
zip_safe=False)
......@@ -2,7 +2,7 @@
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__ = '2018-19-02'
__updated__ = '2018-26-06'
import numpy as np
import scipy.stats as ss
......@@ -18,7 +18,7 @@ import pandas as pd
perSS = 0.7
netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_labels.csv'
GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
......@@ -30,36 +30,36 @@ out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt',
'GIANT_2015_WHR_COMBINED_EUR.txt']
gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0)
GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt"]
gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict.iloc[:2]
gwas.iloc[0,1].split('/')[-1]
column_dict['filename']
my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]]
target_lab = my_labels.values.tolist()[0]
len(target_lab)
column_dict[['freq']]
# READ GWAS
GWAS_filename = GWAS_table[0]
GWAS_filename
GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
GWAS_link
mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id")
gw_df.head()
ref = pd.read_csv(REF_filename, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas.head()
zscore = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * np.sign(mgwas.z) * mgwas["sign_flip"]
dir(jp.map_gwas)
mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref)
np.isinf(ref.head().pos).any()
......@@ -336,7 +336,7 @@ for GWAS in range(0, len(GWAS_table)):
# # SUMMARY + PLOTS
# print("There was %d SNPs available, of which %d were filtered out because of small sample size" % (
# SNP_in, SNP_rem))
#
#np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1))
# # the histogram of the #sample per SNP
# fig = plt.figure()
# fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment