Skip to content
Snippets Groups Projects
Select Git revision
  • a40f95cfe66b3968877d368af66bf6148d502a52
  • master default protected
  • P_value_too_small
  • new_sample_size_filter
  • 2.3
  • 2.2
  • 2.1
  • 2.0.1
  • 1.0
9 results

main_preprocessing.py

Blame
  • main_preprocessing.py 1.98 KiB
    """
    Read raw GWAS summary statistics, then filter and format them.
    Write the cleaned GWAS.
    """
    # Last-modified stamp of this script.
    # NOTE(review): '2018-19-02' is not a valid YYYY-MM-DD date (month 19);
    # presumably it was written as YYYY-DD-MM, i.e. 19 Feb 2018 -- confirm.
    __updated__ = '2018-19-02'
    
    import numpy as np
    import scipy.stats as ss
    import sys
    import math
    import os
    import pandas as pd
    import matplotlib.pyplot as plt
    import jass_preprocessing as jp
    import pandas as pd
    
    
    # Numeric setting, not referenced in the visible part of this script.
    # NOTE(review): presumably a sample-size fraction threshold -- confirm.
    perSS = 0.7

    # Root directory from which every path below is derived.  It must keep its
    # trailing slash.  Previously used roots, kept for reference:
    #   '/home/genstat/ATLAS/'
    #   '/pasteur/projets/policy01/'
    netPath = "/mnt/atlas/"

    # Inputs: column-label map, raw GWAS directory and reference panel file.
    GWAS_labels = os.path.join(
        netPath, 'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt')
    GWAS_path = os.path.join(netPath, 'PCMA/1._DATA/RAW.GWAS/')
    REF_filename = os.path.join(
        netPath, 'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out')

    # Output locations.
    pathOUT = os.path.join(netPath, 'PCMA/1._DATA/RAW.summary/')
    outFileName = os.path.join(
        netPath, 'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5')
    out_summary = "summary_GWAS.csv"
    ImpG_output_Folder = os.path.join(netPath, 'PCMA/1._DATA/ImpG_zfiles/')

    # Spellings treated as "missing value" (entries and order kept exactly as
    # they were, duplicates included).
    def_missing = [
        '', '#N/A', '#N/A', 'N/A', '#NA',
        '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN',
        'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.',
    ]

    # File names of the GWAS summary statistics handled by this script.
    GWAS_table = [
        'GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
        'SNP_gwas_mc_merge_nogc.tbl.uniq',
        'GIANT_2015_HIP_COMBINED_EUR.txt',
        'GIANT_2015_WC_COMBINED_EUR.txt',
        'GIANT_2015_WHR_COMBINED_EUR.txt',
    ]
    
    
    # Link the files listed in GWAS_table to their location under GWAS_path.
    # NOTE(review): the result is indexed with .iloc below, so it is presumably
    # a DataFrame whose first column holds the GWAS file name -- confirm in
    # jass_preprocessing.map_gwas.
    gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)

    # Tab-separated label map; the literal string 'na' is read as missing.
    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')

    # Rows of the label map whose 'filename' equals the first GWAS entry.
    my_labels = column_dict[column_dict['filename'] == gwas.iloc[0, 0]]
    if my_labels.empty:
        # Same exception type the bare `[0]` lookup would raise, but with a
        # message that says which file has no entry in the label map.
        raise IndexError(
            "no row for {!r} in the 'filename' column of {}".format(
                gwas.iloc[0, 0], GWAS_labels))
    # First matching row, as a plain list of its values.
    target_lab = my_labels.values.tolist()[0]

    # READ GWAS
    GWAS_filename = GWAS_table[0]

    # Element 2 of whatever walkfs returns is used as the file location.
    # NOTE(review): presumably the full path of the file found under
    # GWAS_path -- confirm against jass_preprocessing.map_gwas.walkfs.
    GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
    mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
    gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)

    # Reference file: headerless, tab-separated, indexed by SNP identifier.
    ref = pd.read_csv(
        REF_filename,
        header=None,
        sep="\t",
        names=['chr', "pos", "snp_id", "ref", "alt", "MAF"],
        index_col="snp_id",
    )

    # Combine the GWAS table with the reference table.
    mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref)