diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..a062d1a52502392f00d44b645e6535bf3fdcefbb --- /dev/null +++ b/README.md @@ -0,0 +1,33 @@ + + +# Preprocessing package for cleaning and formating data for JASS + +This helper package convert heterogeneous GWAS summary statistic format to +the IMPG input format. + +## Installation + +*Requirement* : + +## Usage + +Once installed the preprocessing can be launch in the following manner: + + + +##Â reference panel format: + +The user is expected to provide a reference panel with the following +column naming and column types: + +| chr | pos | snp_id | ref | alt | MAF | +|-----|-----|--------|-----|-----|-----| +|1 |13116 |rs62635286 | T| G| 0.0970447| +|1 |13118 |rs200579949 |A |G |0.0970447| +|1 |14604 |rs541940975 |A |G |0.147564| +|1 |14930 |rs75454623 |A |G |0.482228| + +*** + +## Output format +['rsID', 'pos', 'A0', "A1", "Z" ] diff --git a/jass_preprocessing/jass_preprocessing/compute_score/compute.py b/jass_preprocessing/jass_preprocessing/compute_score/compute.py index 841eba01dea572d3a506b4651ecf8a47cb1fbc1f..fd29ff24ac0d1c198a0e7812d8f75b1dddf11a4c 100644 --- a/jass_preprocessing/jass_preprocessing/compute_score/compute.py +++ b/jass_preprocessing/jass_preprocessing/compute_score/compute.py @@ -54,12 +54,14 @@ def compute_sample_size(mgwas, diagnostic_folder, trait): ss_thres = perSS * myW_thres mgwas["computed_N"] = myN plt.clf() - p1 = sns.distplot(mgwas.computed_N) + p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()]) p1.axvline(x=ss_thres) fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait) p1.figure.savefig(fo) # Filter SNP with a too small sample _SampleSize print("NSNP before sample size filtering: {}".format(mgwas.shape[0])) mgwas = mgwas.loc[(myN >= ss_thres)] + mgwas = mgwas.loc[~mgwas.computed_N.isna()] + print("NSNP after sample size filtering: {}".format(mgwas.shape[0])) return(mgwas) diff --git a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py index aca29dd1cbd22786f8ebd1da67203bcd4db5bf91..b906d03c807a6d92a05ceda24a81c25d131fcfa2 100644 --- a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py +++ b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py @@ -36,7 +36,7 @@ def convert_missing_values(df): def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', - 'nan', 'na', '.'] + 'nan', 'na', '.'] nmissing = len(def_missing) nan_vec = ["NA"] * nmissing @@ -47,35 +47,51 @@ def map_columns_position(gwas_internal_link, GWAS_labels): """ Find column position for each specific Gwas """ - column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') + column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0) + gwas_file = gwas_internal_link.split('/')[-1] - my_labels = column_dict[column_dict['filename'] == gwas_file] - target_lab = my_labels.values.tolist()[0] - f = open(gwas_internal_link) + my_labels = column_dict.loc[gwas_file] + column_dict.head() + #Our standart labels: + reference_label = column_dict.columns.tolist() + # labels in the GWAS files + target_lab = pd.Index(my_labels.values.tolist()) + + f = open(gwas_internal_link) count_line = 0 line = f.readline() - header = line.split() - list_col = {} - list_lab = {} - for l in range(0, len(target_lab)): - for h in range(0, len(header)): - if header[h] == target_lab[l]: - list_lab[my_labels.columns.tolist()[l]] = h - list_col[h] = my_labels.columns.tolist()[l] - return {'label_position' : list_col, 'position_label': list_lab} - - -def read_gwas( gwas_internal_link, column_dict): + print(line) + header = pd.Index(line.split()) + + def get_position(I,x): + try: + return I.get_loc(x) + except KeyError: + return np.nan + label_position = [get_position(header,i) for i in target_lab] + + + mapgw = pd.Series(label_position, index=reference_label) + mapgw = mapgw.loc[~mapgw.isna()].astype(int) + mapgw.sort_values(inplace=True) + print(mapgw) + f.close() + return mapgw + +def read_gwas( gwas_internal_link, column_map): """ Read gwas Chromosome and rename columns in our standart """ fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, - usecols=column_dict['label_position'].keys(), + usecols = column_map.values, #column_dict['label_position'].keys(), + names= column_map.index, index_col=0, - names=column_dict['label_position'].values(), - header=0) + header=0, na_values= ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', + '-NaN', + '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', + 'nan', 'na', '.']) fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')] #fullGWAS = convert_missing_values(fullGWAS) diff --git a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py index 26f1ae2829f72cfe6e9ee460e64efd4675ee84f7..bb049bcab99a7eb3eb2c6c1feed0a55379ce90e3 100644 --- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py +++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py @@ -36,7 +36,7 @@ def map_on_ref_panel(gw_df , ref_panel): merge_GWAS = pd.concat([other_snp, merge_GWAS]) merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')] - + merge_GWAS.index.rename("snp_id", inplace=True) return(merge_GWAS) @@ -80,6 +80,10 @@ def compute_snp_alignement(mgwas): mgwas: a pandas dataframe of the GWAS data merged with the reference panel """ + #ensure that allele are upper cases: + + mgwas['a1'] = mgwas.a1.str.upper() + mgwas['a2'] = mgwas.a2.str.upper() mgwas['a1c'] = dna_u.dna_complement(mgwas.a1) mgwas['a2c'] = dna_u.dna_complement(mgwas.a2) diff --git a/main_preprocessing.py b/main_preprocessing.py index 8b719f8e4437a66a0484d466d079153e3d33a570..e4248c2ae2c652334a9fd78f27888731aa9dce28 100644 --- a/main_preprocessing.py +++ b/main_preprocessing.py @@ -26,31 +26,29 @@ ldscore_format="/mnt/atlas/PCMA/1._DATA/ldscore_data/" REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out' pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' -outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5' -def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', - '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', - 'na', '.'] - -out_summary = "summary_GWAS.csv" ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/' gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) -gwas_map.index -GWAS_table = gwas_map.index["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"] +gwas_map +#GWAS_table = gwas_map.index[22:]#["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"] +#GWAS_table[5:] # "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt" # "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt", # "GIANT_2015_HIP_COMBINED_EUR.txt", # ] -for GWAS_filename in GWAS_table.index: + +gwas_map +GWAS_table = ["Menopause_HapMap_for_website_18112015.txt"] # gwas_map.index#["gabriel_asthma_dummy.txt"] +for GWAS_filename in GWAS_table: tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'], gwas_map.loc[GWAS_filename, 'outcome']) print('processing GWAS: {}'.format(tag)) start = time.time() gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) - column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2] mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels) + print(mapgw) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) @@ -58,21 +56,28 @@ for GWAS_filename in GWAS_table.index: names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id") - if gw_df.index.map(str).str.contains("^chr*", case=False).any(): - ref['key2'] = "chr"+ref.chr.map(str) + ":" +ref.pos.map(str) - other_snp = pd.merge(ref, gw_df, how='inner', indicator=True, - left_on ='key2', left_index=False, right_index=True) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) + mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag) end = time.time() + print("Preprocessing of {0} in {1}s".format(tag, end-start)) jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag) jp.save_output.save_output(mgwas, ldscore_format, tag) -mgwas.reset_index(inplace=True) -mgwas.sort_values(['chr', "pos"]).head(100) +mapgw.sort_values(inplace=True) +mgwas.head() + +GWAS_labels +pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0) + +mapgw.head() + +GWAS_path +GWAS_labels +mapgw