Skip to content
Snippets Groups Projects
Commit 0f14bb66 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

remove NA from diagnostic plots/fixed function for column mapping/ allele to upper case

parent 664bf830
No related branches found
No related tags found
No related merge requests found
# Preprocessing package for cleaning and formating data for JASS
This helper package convert heterogeneous GWAS summary statistic format to
the IMPG input format.
## Installation
*Requirement* :
## Usage
Once installed the preprocessing can be launch in the following manner:
## reference panel format:
The user is expected to provide a reference panel with the following
column naming and column types:
| chr | pos | snp_id | ref | alt | MAF |
|-----|-----|--------|-----|-----|-----|
|1 |13116 |rs62635286 | T| G| 0.0970447|
|1 |13118 |rs200579949 |A |G |0.0970447|
|1 |14604 |rs541940975 |A |G |0.147564|
|1 |14930 |rs75454623 |A |G |0.482228|
***
## Output format
['rsID', 'pos', 'A0', "A1", "Z" ]
...@@ -54,12 +54,14 @@ def compute_sample_size(mgwas, diagnostic_folder, trait): ...@@ -54,12 +54,14 @@ def compute_sample_size(mgwas, diagnostic_folder, trait):
ss_thres = perSS * myW_thres ss_thres = perSS * myW_thres
mgwas["computed_N"] = myN mgwas["computed_N"] = myN
plt.clf() plt.clf()
p1 = sns.distplot(mgwas.computed_N) p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
p1.axvline(x=ss_thres) p1.axvline(x=ss_thres)
fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait) fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait)
p1.figure.savefig(fo) p1.figure.savefig(fo)
# Filter SNP with a too small sample _SampleSize # Filter SNP with a too small sample _SampleSize
print("NSNP before sample size filtering: {}".format(mgwas.shape[0])) print("NSNP before sample size filtering: {}".format(mgwas.shape[0]))
mgwas = mgwas.loc[(myN >= ss_thres)] mgwas = mgwas.loc[(myN >= ss_thres)]
mgwas = mgwas.loc[~mgwas.computed_N.isna()]
print("NSNP after sample size filtering: {}".format(mgwas.shape[0])) print("NSNP after sample size filtering: {}".format(mgwas.shape[0]))
return(mgwas) return(mgwas)
...@@ -47,35 +47,51 @@ def map_columns_position(gwas_internal_link, GWAS_labels): ...@@ -47,35 +47,51 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
""" """
Find column position for each specific Gwas Find column position for each specific Gwas
""" """
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0)
gwas_file = gwas_internal_link.split('/')[-1] gwas_file = gwas_internal_link.split('/')[-1]
my_labels = column_dict[column_dict['filename'] == gwas_file]
target_lab = my_labels.values.tolist()[0]
f = open(gwas_internal_link)
my_labels = column_dict.loc[gwas_file]
column_dict.head()
#Our standart labels:
reference_label = column_dict.columns.tolist()
# labels in the GWAS files
target_lab = pd.Index(my_labels.values.tolist())
f = open(gwas_internal_link)
count_line = 0 count_line = 0
line = f.readline() line = f.readline()
header = line.split() print(line)
list_col = {} header = pd.Index(line.split())
list_lab = {}
for l in range(0, len(target_lab)): def get_position(I,x):
for h in range(0, len(header)): try:
if header[h] == target_lab[l]: return I.get_loc(x)
list_lab[my_labels.columns.tolist()[l]] = h except KeyError:
list_col[h] = my_labels.columns.tolist()[l] return np.nan
return {'label_position' : list_col, 'position_label': list_lab} label_position = [get_position(header,i) for i in target_lab]
def read_gwas( gwas_internal_link, column_dict): mapgw = pd.Series(label_position, index=reference_label)
mapgw = mapgw.loc[~mapgw.isna()].astype(int)
mapgw.sort_values(inplace=True)
print(mapgw)
f.close()
return mapgw
def read_gwas( gwas_internal_link, column_map):
""" """
Read gwas Chromosome and rename columns in our standart Read gwas Chromosome and rename columns in our standart
""" """
fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
usecols=column_dict['label_position'].keys(), usecols = column_map.values, #column_dict['label_position'].keys(),
names= column_map.index,
index_col=0, index_col=0,
names=column_dict['label_position'].values(), header=0, na_values= ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
header=0) '-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN',
'nan', 'na', '.'])
fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')] fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
#fullGWAS = convert_missing_values(fullGWAS) #fullGWAS = convert_missing_values(fullGWAS)
......
...@@ -80,6 +80,10 @@ def compute_snp_alignement(mgwas): ...@@ -80,6 +80,10 @@ def compute_snp_alignement(mgwas):
mgwas: a pandas dataframe of the GWAS data merged mgwas: a pandas dataframe of the GWAS data merged
with the reference panel with the reference panel
""" """
#ensure that allele are upper cases:
mgwas['a1'] = mgwas.a1.str.upper()
mgwas['a2'] = mgwas.a2.str.upper()
mgwas['a1c'] = dna_u.dna_complement(mgwas.a1) mgwas['a1c'] = dna_u.dna_complement(mgwas.a1)
mgwas['a2c'] = dna_u.dna_complement(mgwas.a2) mgwas['a2c'] = dna_u.dna_complement(mgwas.a2)
......
...@@ -26,31 +26,29 @@ ldscore_format="/mnt/atlas/PCMA/1._DATA/ldscore_data/" ...@@ -26,31 +26,29 @@ ldscore_format="/mnt/atlas/PCMA/1._DATA/ldscore_data/"
REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out' REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan',
'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/' ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/'
gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0)
gwas_map.index gwas_map
GWAS_table = gwas_map.index["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"] #GWAS_table = gwas_map.index[22:]#["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"]
#GWAS_table[5:]
# "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt" # "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt"
# "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt", # "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt",
# "GIANT_2015_HIP_COMBINED_EUR.txt", # "GIANT_2015_HIP_COMBINED_EUR.txt",
# ] # ]
for GWAS_filename in GWAS_table.index:
gwas_map
GWAS_table = ["Menopause_HapMap_for_website_18112015.txt"] # gwas_map.index#["gabriel_asthma_dummy.txt"]
for GWAS_filename in GWAS_table:
tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'], tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'],
gwas_map.loc[GWAS_filename, 'outcome']) gwas_map.loc[GWAS_filename, 'outcome'])
print('processing GWAS: {}'.format(tag)) print('processing GWAS: {}'.format(tag))
start = time.time() start = time.time()
gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2] GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels) mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
print(mapgw)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
...@@ -58,21 +56,28 @@ for GWAS_filename in GWAS_table.index: ...@@ -58,21 +56,28 @@ for GWAS_filename in GWAS_table.index:
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id") index_col="snp_id")
if gw_df.index.map(str).str.contains("^chr*", case=False).any():
ref['key2'] = "chr"+ref.chr.map(str) + ":" +ref.pos.map(str)
other_snp = pd.merge(ref, gw_df, how='inner', indicator=True,
left_on ='key2', left_index=False, right_index=True)
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas)
mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag) mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag)
end = time.time() end = time.time()
print("Preprocessing of {0} in {1}s".format(tag, end-start)) print("Preprocessing of {0} in {1}s".format(tag, end-start))
jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag) jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag)
jp.save_output.save_output(mgwas, ldscore_format, tag) jp.save_output.save_output(mgwas, ldscore_format, tag)
mgwas.reset_index(inplace=True) mapgw.sort_values(inplace=True)
mgwas.sort_values(['chr', "pos"]).head(100) mgwas.head()
GWAS_labels
pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0)
mapgw.head()
GWAS_path
GWAS_labels
mapgw
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment