add module to map gwas on the reference panel

b3fb8ef1 · Hanna JULIENNE · a40f95cf · b3fb8ef1 · b3fb8ef1 · b3fb8ef1
Commit b3fb8ef1 authored 7 years ago by Hanna JULIENNE
--- a/jass_preprocessing/jass_preprocessing/__init__.py
+++ b/jass_preprocessing/jass_preprocessing/__init__.py
 import jass_preprocessing.map_gwas.map_gwas as map_gwas
 import jass_preprocessing.dna_utils.dna_utils as dna_utils
+import jass_preprocessing.map_reference.map_reference as map_reference
+import jass_preprocessing.compute_score.compute as compute_score
--- a/jass_preprocessing/jass_preprocessing/dna_utils/dna_utils.py
+++ b/jass_preprocessing/jass_preprocessing/dna_utils/dna_utils.py
+"""
+    Few fonction to to compute DNA complement
+"""
+

 def dna_complement_base(inputbase):
-    if (inputbase == 'A'):
-        return('T')
-    if (inputbase == 'T'):
-        return('A')
-    if (inputbase == 'G'):
-        return('C')
-    if (inputbase == 'C'):
-        return('G')
+    dna_complement_dict = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}
+    try:
+        return(dna_complement_dict[inputbase])
+    except KeyError:
        return('Not ATGC')



--- a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py
+++ b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py
@@ -33,11 +33,13 @@ def convert_missing_values(df):
    """
    Convert all missing value strings to a standart np.nan value
    """
-    def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
-                   '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
+    def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
+                   '-NaN',
+                   '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN',
+                    'nan', 'na', '.']

    nmissing = len(def_missing)
-    nan_vec = [np.nan] * nmissing
+    nan_vec = ["NA"] * nmissing
    return df.replace(def_missing, nan_vec)


@@ -58,7 +60,6 @@ def map_columns_position(gwas_internal_link,  GWAS_labels):
    list_lab = {}
    for l in range(0, len(target_lab)):
        for h in range(0, len(header)):
-
            if header[h] == target_lab[l]:
                list_lab[my_labels.columns.tolist()[l]] = h
                list_col[h] = my_labels.columns.tolist()[l]
@@ -67,30 +68,16 @@ def map_columns_position(gwas_internal_link,  GWAS_labels):

 def read_gwas( gwas_internal_link, column_dict):
    """
-    Read gwas and apply minimal qc
+    Read gwas Chromosome and rename columns in our standart
    """

    fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
-                               usecols=column_dict['label_position'].keys(), index_col=0, names=column_dict['label_position'].values(), header=0)
+                               usecols=column_dict['label_position'].keys(),
+                                index_col=0,
+                                names=column_dict['label_position'].values(),
+                                 header=0)

    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
-    fullGWAS = convert_missing_values(fullGWAS)
+    #fullGWAS = convert_missing_values(fullGWAS)

    return fullGWAS
-
-
-
-def map_on_ref_panel(gw_df , ref_panel):
-    """
-    Merge Gwas df with the reference panel
-    """
-    def key2(x):
-        return ('chr' + str(x["chr"]) + ":" + str(x["pos"]))
-
-    ref_panel['key2'] = ref_panel.apply(key2,1)
-
-    merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True)
-    other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True)
-
-    merge_GWAS.loc[other_snp.index] = other_snp
-    return(merge_GWAS)
--- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py
+++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py
+import pandas as pd
+import numpy as np
+import jass_preprocessing.dna_utils as dna_u
+import warnings
+
+
+def read_reference(gwas_reference_panel):
+    """
+    helper function to name correctly the column
+    """
+    ref = pd.read_csv(REF_filename, header=None, sep= "\t",
+                      names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
+                       index_col="snp_id")
+    return(ref)
+
+
+def map_on_ref_panel(gw_df , ref_panel):
+    """
+    Merge Gwas df with the reference panel
+    """
+    def key2(x):
+        return ('chr' + str(x["chr"]) + ":" + str(x["pos"]))
+
+    ref_panel['key2'] = ref_panel.apply(key2,1)
+
+    merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True)
+    other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True)
+
+    merge_GWAS.loc[other_snp.index] = other_snp
+    return(merge_GWAS)
+
+
+def compute_is_flipped(mgwas):
+    flipped = pd.DataFrame({"ref_flipped" : (mgwas.ref == mgwas.a2), "alt_flipped" : (mgwas.alt == mgwas.a1)})
+    flipped_complement = pd.DataFrame({"ref_flippedc" : (mgwas.ref == mgwas.a2c), "alt_flippedc" : (mgwas.alt == mgwas.a1c)})
+
+    is_flipped = pd.DataFrame({"flipped":flipped.all(1), # The allele of the
+                               "flipped_complement":flipped_complement.all(1)}
+                              ).any(1)
+    return is_flipped
+
+def compute_is_aligned(mgwas):
+    aligned = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1), "alt_ok" : (mgwas.alt == mgwas.a2)})
+    aligned_complement = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1c), "alt_ok" : (mgwas.alt == mgwas.a2c)})
+
+    is_aligned = pd.DataFrame({"aligned":aligned.all(1), # The allele of the
+                               "aligned_complement":aligned_complement.all(1)}
+                              ).any(1)
+    return is_aligned
+
+def compute_snp_alignement(mgwas):
+
+    """
+    Add a column to mgwas indicating if the reference and coded
+    allele is flipped compared to the reference panel.
+
+    If it is, the sign of the statistic must be flipped
+
+    Args:
+        mgwas: a pandas dataframe of the GWAS data merged
+         with the reference panel
+
+    """
+
+    mgwas['a1c'] = dna_u.dna_complement(mgwas.a1)
+    mgwas['a2c'] = dna_u.dna_complement(mgwas.a2)
+
+    is_aligned = compute_is_aligned(mgwas)
+    is_flipped = compute_is_flipped(mgwas)
+
+    # does some SNP are not "alignable" on reference
+    not_aligned_at_all = ~pd.DataFrame([is_aligned, is_flipped ]).any() # 
+    if(not_aligned_at_all.any()):
+        warnings.warn('Some snps are not aligned at all with the reference panel\nand will be filtered:\n{}'.format(mgwas.index[is_flipped]),
+                  DeprecationWarning)
+        mgwas = mgwas.loc[~not_aligned_at_all]
+
+    aligned_everywhere = pd.DataFrame([is_aligned, is_flipped ]).all()
+    if(aligned_everywhere.any()):
+            raise ValueError('Some snps are both aligned and flipped:\n{}'.format(mgwas.index[is_flipped]),
+                      DeprecationWarning)
+
+    mgwas['sign_flip'] = np.nan
+    mgwas.loc[is_aligned,"sign_flip"] = 1
+    mgwas.loc[is_flipped, "sign_flip"] = -1
+
+    return(mgwas)
--- a/jass_preprocessing/setup.py
+++ b/jass_preprocessing/setup.py
@@ -4,10 +4,12 @@ setup(name='jass_preprocessing',
      version='0.1',
      description='Preprocess GWAS summary statistic for JASS',
      url='http:https://gitlab.pasteur.fr/statistical-genetics/JASS_Pre-processing',
-      author='Hugues Aschard, Vincent Laville, Hanna Julienne',
+      author='Hugues Aschard, Hanna Julienne, Vincent Laville',
      author_email='hugues.aschard@pasteur.fr',
      license='MIT',
      #package_dir = {'': 'jass_preprocessing'},
      packages= ['jass_preprocessing', "jass_preprocessing.map_gwas",
-                "jass_preprocessing.dna_utils"],
+                "jass_preprocessing.dna_utils", "jass_preprocessing.map_reference",
+                "jass_preprocessing.compute_score"
+                ],
      zip_safe=False)
--- a/main_preprocessing.py
+++ b/main_preprocessing.py
@@ -2,7 +2,7 @@
 Read raw GWAS summary statistics, filter and format
 Write clean GWAS
 """
-__updated__ = '2018-19-02'
+__updated__ = '2018-26-06'

 import numpy as np
 import scipy.stats as ss
@@ -18,7 +18,7 @@ import pandas as pd
 perSS = 0.7
 netPath = "/mnt/atlas/"  # '/home/genstat/ATLAS/'
 #netPath       = '/pasteur/projets/policy01/'
-GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
+GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_labels.csv'
 GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
 REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
 pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
@@ -30,36 +30,36 @@ out_summary = "summary_GWAS.csv"
 ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'


-GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
-              'SNP_gwas_mc_merge_nogc.tbl.uniq',
-              'GIANT_2015_HIP_COMBINED_EUR.txt',
-              'GIANT_2015_WC_COMBINED_EUR.txt',
-              'GIANT_2015_WHR_COMBINED_EUR.txt']
+gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0)

+GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt"]

 gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)

-
 column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')

-column_dict.iloc[:2]
-gwas.iloc[0,1].split('/')[-1]
-

-column_dict['filename']
 my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]]
-target_lab = my_labels.values.tolist()[0]
-len(target_lab)
-
+column_dict[['freq']]
    # READ GWAS
 GWAS_filename = GWAS_table[0]
-GWAS_filename

 GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
 GWAS_link
 mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
+
 gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
-ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id")
+gw_df.head()
+
+ref = pd.read_csv(REF_filename, header=None, sep= "\t",
+                  names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
+                   index_col="snp_id")
+
+mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
+mgwas = jp.map_reference.compute_snp_alignement(mgwas)
+
+mgwas.head()
+zscore = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * np.sign(mgwas.z) * mgwas["sign_flip"]
+

-dir(jp.map_gwas)
-mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref)
+np.isinf(ref.head().pos).any()
--- a/pyPCMA_1_format_v1.4.py
+++ b/pyPCMA_1_format_v1.4.py
@@ -336,7 +336,7 @@ for GWAS in range(0, len(GWAS_table)):
    # # SUMMARY + PLOTS
    # print("There was %d SNPs available, of which %d were filtered out because of small sample size" % (
    #     SNP_in, SNP_rem))
-    #
+    #np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1))
    # # the histogram of the #sample per SNP
    # fig = plt.figure()
    # fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')