Skip to content
Snippets Groups Projects
Commit 0f14bb66 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

remove NA from diagnostic plots/fixed function for column mapping/ allele to upper case

parent 664bf830
No related branches found
No related tags found
No related merge requests found
# Preprocessing package for cleaning and formating data for JASS
This helper package convert heterogeneous GWAS summary statistic format to
the IMPG input format.
## Installation
*Requirement* :
## Usage
Once installed the preprocessing can be launch in the following manner:
## reference panel format:
The user is expected to provide a reference panel with the following
column naming and column types:
| chr | pos | snp_id | ref | alt | MAF |
|-----|-----|--------|-----|-----|-----|
|1 |13116 |rs62635286 | T| G| 0.0970447|
|1 |13118 |rs200579949 |A |G |0.0970447|
|1 |14604 |rs541940975 |A |G |0.147564|
|1 |14930 |rs75454623 |A |G |0.482228|
***
## Output format
['rsID', 'pos', 'A0', "A1", "Z" ]
......@@ -54,12 +54,14 @@ def compute_sample_size(mgwas, diagnostic_folder, trait):
ss_thres = perSS * myW_thres
mgwas["computed_N"] = myN
plt.clf()
p1 = sns.distplot(mgwas.computed_N)
p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
p1.axvline(x=ss_thres)
fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait)
p1.figure.savefig(fo)
# Filter SNP with a too small sample _SampleSize
print("NSNP before sample size filtering: {}".format(mgwas.shape[0]))
mgwas = mgwas.loc[(myN >= ss_thres)]
mgwas = mgwas.loc[~mgwas.computed_N.isna()]
print("NSNP after sample size filtering: {}".format(mgwas.shape[0]))
return(mgwas)
......@@ -36,7 +36,7 @@ def convert_missing_values(df):
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
'-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN',
'nan', 'na', '.']
'nan', 'na', '.']
nmissing = len(def_missing)
nan_vec = ["NA"] * nmissing
......@@ -47,35 +47,51 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
"""
Find column position for each specific Gwas
"""
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0)
gwas_file = gwas_internal_link.split('/')[-1]
my_labels = column_dict[column_dict['filename'] == gwas_file]
target_lab = my_labels.values.tolist()[0]
f = open(gwas_internal_link)
my_labels = column_dict.loc[gwas_file]
column_dict.head()
#Our standart labels:
reference_label = column_dict.columns.tolist()
# labels in the GWAS files
target_lab = pd.Index(my_labels.values.tolist())
f = open(gwas_internal_link)
count_line = 0
line = f.readline()
header = line.split()
list_col = {}
list_lab = {}
for l in range(0, len(target_lab)):
for h in range(0, len(header)):
if header[h] == target_lab[l]:
list_lab[my_labels.columns.tolist()[l]] = h
list_col[h] = my_labels.columns.tolist()[l]
return {'label_position' : list_col, 'position_label': list_lab}
def read_gwas( gwas_internal_link, column_dict):
print(line)
header = pd.Index(line.split())
def get_position(I,x):
try:
return I.get_loc(x)
except KeyError:
return np.nan
label_position = [get_position(header,i) for i in target_lab]
mapgw = pd.Series(label_position, index=reference_label)
mapgw = mapgw.loc[~mapgw.isna()].astype(int)
mapgw.sort_values(inplace=True)
print(mapgw)
f.close()
return mapgw
def read_gwas( gwas_internal_link, column_map):
"""
Read gwas Chromosome and rename columns in our standart
"""
fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
usecols=column_dict['label_position'].keys(),
usecols = column_map.values, #column_dict['label_position'].keys(),
names= column_map.index,
index_col=0,
names=column_dict['label_position'].values(),
header=0)
header=0, na_values= ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
'-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN',
'nan', 'na', '.'])
fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
#fullGWAS = convert_missing_values(fullGWAS)
......
......@@ -36,7 +36,7 @@ def map_on_ref_panel(gw_df , ref_panel):
merge_GWAS = pd.concat([other_snp, merge_GWAS])
merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')]
merge_GWAS.index.rename("snp_id", inplace=True)
return(merge_GWAS)
......@@ -80,6 +80,10 @@ def compute_snp_alignement(mgwas):
mgwas: a pandas dataframe of the GWAS data merged
with the reference panel
"""
#ensure that allele are upper cases:
mgwas['a1'] = mgwas.a1.str.upper()
mgwas['a2'] = mgwas.a2.str.upper()
mgwas['a1c'] = dna_u.dna_complement(mgwas.a1)
mgwas['a2c'] = dna_u.dna_complement(mgwas.a2)
......
......@@ -26,31 +26,29 @@ ldscore_format="/mnt/atlas/PCMA/1._DATA/ldscore_data/"
REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan',
'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/'
gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0)
gwas_map.index
GWAS_table = gwas_map.index["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"]
gwas_map
#GWAS_table = gwas_map.index[22:]#["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"]
#GWAS_table[5:]
# "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt"
# "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt",
# "GIANT_2015_HIP_COMBINED_EUR.txt",
# ]
for GWAS_filename in GWAS_table.index:
gwas_map
GWAS_table = ["Menopause_HapMap_for_website_18112015.txt"] # gwas_map.index#["gabriel_asthma_dummy.txt"]
for GWAS_filename in GWAS_table:
tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'],
gwas_map.loc[GWAS_filename, 'outcome'])
print('processing GWAS: {}'.format(tag))
start = time.time()
gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
print(mapgw)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
......@@ -58,21 +56,28 @@ for GWAS_filename in GWAS_table.index:
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
if gw_df.index.map(str).str.contains("^chr*", case=False).any():
ref['key2'] = "chr"+ref.chr.map(str) + ":" +ref.pos.map(str)
other_snp = pd.merge(ref, gw_df, how='inner', indicator=True,
left_on ='key2', left_index=False, right_index=True)
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas = jp.compute_score.compute_z_score(mgwas)
mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag)
end = time.time()
print("Preprocessing of {0} in {1}s".format(tag, end-start))
jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag)
jp.save_output.save_output(mgwas, ldscore_format, tag)
mgwas.reset_index(inplace=True)
mgwas.sort_values(['chr', "pos"]).head(100)
mapgw.sort_values(inplace=True)
mgwas.head()
GWAS_labels
pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0)
mapgw.head()
GWAS_path
GWAS_labels
mapgw
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment