diff --git a/jass_preprocessing/jass_preprocessing/__init__.py b/jass_preprocessing/jass_preprocessing/__init__.py index 8ea525bc7d94f21a7f590c143594e58b757b0829..617bb474fa9e0a259367fa9b6c28d5b38684d1d7 100644 --- a/jass_preprocessing/jass_preprocessing/__init__.py +++ b/jass_preprocessing/jass_preprocessing/__init__.py @@ -1,2 +1,4 @@ import jass_preprocessing.map_gwas.map_gwas as map_gwas import jass_preprocessing.dna_utils.dna_utils as dna_utils +import jass_preprocessing.map_reference.map_reference as map_reference +import jass_preprocessing.compute_score.compute as compute_score diff --git a/jass_preprocessing/jass_preprocessing/dna_utils/dna_utils.py b/jass_preprocessing/jass_preprocessing/dna_utils/dna_utils.py index a0524300aefc5713f7a72a75ede66a50c19c1ff5..2f61e7279664bfa76f628b7ea6ac5931592547a0 100644 --- a/jass_preprocessing/jass_preprocessing/dna_utils/dna_utils.py +++ b/jass_preprocessing/jass_preprocessing/dna_utils/dna_utils.py @@ -1,14 +1,14 @@ +""" + Few fonction to to compute DNA complement +""" + def dna_complement_base(inputbase): - if (inputbase == 'A'): - return('T') - if (inputbase == 'T'): - return('A') - if (inputbase == 'G'): - return('C') - if (inputbase == 'C'): - return('G') - return('Not ATGC') + dna_complement_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G'} + try: + return(dna_complement_dict[inputbase]) + except KeyError: + return('Not ATGC') def dna_complement(input): diff --git a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py index d9523fe0365f91bdab605856e8bd3ebb13ba15f9..aca29dd1cbd22786f8ebd1da67203bcd4db5bf91 100644 --- a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py +++ b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py @@ -33,11 +33,13 @@ def convert_missing_values(df): """ Convert all missing value strings to a standart np.nan value """ - def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', - '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.'] + def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', + '-NaN', + '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', + 'nan', 'na', '.'] nmissing = len(def_missing) - nan_vec = [np.nan] * nmissing + nan_vec = ["NA"] * nmissing return df.replace(def_missing, nan_vec) @@ -58,7 +60,6 @@ def map_columns_position(gwas_internal_link, GWAS_labels): list_lab = {} for l in range(0, len(target_lab)): for h in range(0, len(header)): - if header[h] == target_lab[l]: list_lab[my_labels.columns.tolist()[l]] = h list_col[h] = my_labels.columns.tolist()[l] @@ -67,30 +68,16 @@ def map_columns_position(gwas_internal_link, GWAS_labels): def read_gwas( gwas_internal_link, column_dict): """ - Read gwas and apply minimal qc + Read gwas Chromosome and rename columns in our standart """ fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, - usecols=column_dict['label_position'].keys(), index_col=0, names=column_dict['label_position'].values(), header=0) + usecols=column_dict['label_position'].keys(), + index_col=0, + names=column_dict['label_position'].values(), + header=0) fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')] - fullGWAS = convert_missing_values(fullGWAS) + #fullGWAS = convert_missing_values(fullGWAS) return fullGWAS - - - -def map_on_ref_panel(gw_df , ref_panel): - """ - Merge Gwas df with the reference panel - """ - def key2(x): - return ('chr' + str(x["chr"]) + ":" + str(x["pos"])) - - ref_panel['key2'] = ref_panel.apply(key2,1) - - merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True) - other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True) - - merge_GWAS.loc[other_snp.index] = other_snp - return(merge_GWAS) diff --git a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py new file mode 100644 index 0000000000000000000000000000000000000000..3b40c888a195168779fcd0ce8aa8072de51199b8 --- /dev/null +++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py @@ -0,0 +1,87 @@ +import pandas as pd +import numpy as np +import jass_preprocessing.dna_utils as dna_u +import warnings + + +def read_reference(gwas_reference_panel): + """ + helper function to name correctly the column + """ + ref = pd.read_csv(REF_filename, header=None, sep= "\t", + names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], + index_col="snp_id") + return(ref) + + +def map_on_ref_panel(gw_df , ref_panel): + """ + Merge Gwas df with the reference panel + """ + def key2(x): + return ('chr' + str(x["chr"]) + ":" + str(x["pos"])) + + ref_panel['key2'] = ref_panel.apply(key2,1) + + merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True) + other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True) + + merge_GWAS.loc[other_snp.index] = other_snp + return(merge_GWAS) + + +def compute_is_flipped(mgwas): + flipped = pd.DataFrame({"ref_flipped" : (mgwas.ref == mgwas.a2), "alt_flipped" : (mgwas.alt == mgwas.a1)}) + flipped_complement = pd.DataFrame({"ref_flippedc" : (mgwas.ref == mgwas.a2c), "alt_flippedc" : (mgwas.alt == mgwas.a1c)}) + + is_flipped = pd.DataFrame({"flipped":flipped.all(1), # The allele of the + "flipped_complement":flipped_complement.all(1)} + ).any(1) + return is_flipped + +def compute_is_aligned(mgwas): + aligned = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1), "alt_ok" : (mgwas.alt == mgwas.a2)}) + aligned_complement = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1c), "alt_ok" : (mgwas.alt == mgwas.a2c)}) + + is_aligned = pd.DataFrame({"aligned":aligned.all(1), # The allele of the + "aligned_complement":aligned_complement.all(1)} + ).any(1) + return is_aligned + +def compute_snp_alignement(mgwas): + + """ + Add a column to mgwas indicating if the reference and coded + allele is flipped compared to the reference panel. + + If it is, the sign of the statistic must be flipped + + Args: + mgwas: a pandas dataframe of the GWAS data merged + with the reference panel + + """ + + mgwas['a1c'] = dna_u.dna_complement(mgwas.a1) + mgwas['a2c'] = dna_u.dna_complement(mgwas.a2) + + is_aligned = compute_is_aligned(mgwas) + is_flipped = compute_is_flipped(mgwas) + + # does some SNP are not "alignable" on reference + not_aligned_at_all = ~pd.DataFrame([is_aligned, is_flipped ]).any() #Â + if(not_aligned_at_all.any()): + warnings.warn('Some snps are not aligned at all with the reference panel\nand will be filtered:\n{}'.format(mgwas.index[is_flipped]), + DeprecationWarning) + mgwas = mgwas.loc[~not_aligned_at_all] + + aligned_everywhere = pd.DataFrame([is_aligned, is_flipped ]).all() + if(aligned_everywhere.any()): + raise ValueError('Some snps are both aligned and flipped:\n{}'.format(mgwas.index[is_flipped]), + DeprecationWarning) + + mgwas['sign_flip'] = np.nan + mgwas.loc[is_aligned,"sign_flip"] = 1 + mgwas.loc[is_flipped, "sign_flip"] = -1 + + return(mgwas) diff --git a/jass_preprocessing/setup.py b/jass_preprocessing/setup.py index 23e156cc244df1296261925e4181c5c5aaeaa315..9e21834d36159f72690d03aa7aaf42f98ee1ef37 100644 --- a/jass_preprocessing/setup.py +++ b/jass_preprocessing/setup.py @@ -4,10 +4,12 @@ setup(name='jass_preprocessing', version='0.1', description='Preprocess GWAS summary statistic for JASS', url='http:https://gitlab.pasteur.fr/statistical-genetics/JASS_Pre-processing', - author='Hugues Aschard, Vincent Laville, Hanna Julienne', + author='Hugues Aschard, Hanna Julienne, Vincent Laville', author_email='hugues.aschard@pasteur.fr', license='MIT', #package_dir = {'': 'jass_preprocessing'}, packages= ['jass_preprocessing', "jass_preprocessing.map_gwas", - "jass_preprocessing.dna_utils"], + "jass_preprocessing.dna_utils", "jass_preprocessing.map_reference", + "jass_preprocessing.compute_score" + ], zip_safe=False) diff --git a/main_preprocessing.py b/main_preprocessing.py index bb213c0f6782edfb6b21fe8580471dac9638683f..4d94748895ea26b100ca2cd3387ddaffaf859b1d 100644 --- a/main_preprocessing.py +++ b/main_preprocessing.py @@ -2,7 +2,7 @@ Read raw GWAS summary statistics, filter and format Write clean GWAS """ -__updated__ = '2018-19-02' +__updated__ = '2018-26-06' import numpy as np import scipy.stats as ss @@ -18,7 +18,7 @@ import pandas as pd perSS = 0.7 netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/' #netPath = '/pasteur/projets/policy01/' -GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt' +GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_labels.csv' GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/' REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out' pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' @@ -30,36 +30,36 @@ out_summary = "summary_GWAS.csv" ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/' -GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt', - 'SNP_gwas_mc_merge_nogc.tbl.uniq', - 'GIANT_2015_HIP_COMBINED_EUR.txt', - 'GIANT_2015_WC_COMBINED_EUR.txt', - 'GIANT_2015_WHR_COMBINED_EUR.txt'] +gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) +GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt"] gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) - column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') -column_dict.iloc[:2] -gwas.iloc[0,1].split('/')[-1] - -column_dict['filename'] my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]] -target_lab = my_labels.values.tolist()[0] -len(target_lab) - +column_dict[['freq']] # READ GWAS GWAS_filename = GWAS_table[0] -GWAS_filename GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2] GWAS_link mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels) + gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) -ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id") +gw_df.head() + +ref = pd.read_csv(REF_filename, header=None, sep= "\t", + names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], + index_col="snp_id") + +mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) +mgwas = jp.map_reference.compute_snp_alignement(mgwas) + +mgwas.head() +zscore = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * np.sign(mgwas.z) * mgwas["sign_flip"] + -dir(jp.map_gwas) -mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref) +np.isinf(ref.head().pos).any() diff --git a/pyPCMA_1_format_v1.4.py b/pyPCMA_1_format_v1.4.py index 4f9d07c2c5674037b399598d20ecaf0f1e303bfe..d3baa31ad3a76ea44018d86c6a70edbcf1201424 100755 --- a/pyPCMA_1_format_v1.4.py +++ b/pyPCMA_1_format_v1.4.py @@ -336,7 +336,7 @@ for GWAS in range(0, len(GWAS_table)): # # SUMMARY + PLOTS # print("There was %d SNPs available, of which %d were filtered out because of small sample size" % ( # SNP_in, SNP_rem)) - # + #np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) # # the histogram of the #sample per SNP # fig = plt.figure() # fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')