Skip to content
Snippets Groups Projects
Select Git revision
  • 664bf830bec784ff9502260f2d7ded6d4d862965
  • master default protected
  • P_value_too_small
  • new_sample_size_filter
  • 2.3
  • 2.2
  • 2.1
  • 2.0.1
  • 1.0
9 results

main_preprocessing.py

Blame
  • main_preprocessing.py 2.97 KiB
    """
    Read raw GWAS summary statistics, filter and format
    Write clean GWAS
    """
    __updated__ = '2018-26-06'
    
    import numpy as np
    import scipy.stats as ss
    import sys
    import math
    import os
    import pandas as pd
    import matplotlib.pyplot as plt
    import jass_preprocessing as jp
    import pandas as pd
    import seaborn as sns
    import time
    
    perSS = 0.7
    netPath = "/mnt/atlas/"
    GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_labels.csv'
    GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
    
    diagnostic_folder= "/mnt/atlas/PCMA/1._DATA/sample_size_distribution/"
    ldscore_format="/mnt/atlas/PCMA/1._DATA/ldscore_data/"
    REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
    pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
    
    outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
    def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
                   '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan',
                   'na', '.']
    
    out_summary = "summary_GWAS.csv"
    ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/'
    gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0)
    gwas_map.index
    GWAS_table = gwas_map.index["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"]
    # "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt"
    #              "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt",
    #              "GIANT_2015_HIP_COMBINED_EUR.txt",
    #              ]
    for GWAS_filename in GWAS_table.index:
    
        tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'],
                               gwas_map.loc[GWAS_filename, 'outcome'])
        print('processing GWAS: {}'.format(tag))
        start = time.time()
        gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
        column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
    
        GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
        mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
    
        gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
    
        ref = pd.read_csv(REF_filename, header=None, sep= "\t",
                          names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
                           index_col="snp_id")
    
        if gw_df.index.map(str).str.contains("^chr*", case=False).any():
            ref['key2'] = "chr"+ref.chr.map(str) + ":" +ref.pos.map(str)
            other_snp = pd.merge(ref, gw_df, how='inner', indicator=True,
                                 left_on ='key2', left_index=False, right_index=True)
    
        mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
        mgwas = jp.map_reference.compute_snp_alignement(mgwas)
        mgwas = jp.compute_score.compute_z_score(mgwas)
    
        mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag)
        end = time.time()
        print("Preprocessing of {0} in {1}s".format(tag, end-start))
    
        jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag)
        jp.save_output.save_output(mgwas, ldscore_format, tag)
    
    mgwas.reset_index(inplace=True)
    mgwas.sort_values(['chr', "pos"]).head(100)