"""
Read raw GWAS summary statistics, filter and format them,
and write clean GWAS datasets split by chromosome.
"""
__updated__ = '2022-03-02'
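
# A sketch of how this module can be invoked (assumptions: the package is run with
# `python -m jass_preprocessing`, and every path below is a placeholder rather than a
# file shipped with the package; the flags are defined in add_preprocessing_argument):
#
#   python -m jass_preprocessing \
#       --gwas-info ./gwas_info.tsv \
#       --ref-path ./reference_panel \
#       --input-folder ./raw_gwas/ \
#       --diagnostic-folder ./diagnostics/ \
#       --output-folder ./preprocessed_gwas/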

import argparse
import time

import numpy as np
import pandas as pd

import jass_preprocessing as jp


def raise_duplicated_index(tag):
    """Raise an error listing the duplicated 'Consortium_Outcome' tags."""
    duplicated_tags = tag[tag.duplicated()].unique()
    raise ValueError("'Consortium_Outcome' tags are duplicated for: {0}".format(duplicated_tags))
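
# Example (hypothetical values): if the gwas-info table contained two rows with
# Consortium "GIANT" and Outcome "HEIGHT", both would map to the tag "GIANT_HEIGHT",
# and the duplicate check in launch_preprocessing would call raise_duplicated_index.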


def launch_preprocessing(args):
    """
    Preprocess each GWAS dataset listed in the gwas-info file.
    """
    gwas_map = pd.read_csv(args.gwas_info, sep="\t")
    # define a unique tag per study: Consortium_Outcome
    gwas_map['tag'] = gwas_map.Consortium + "_" + gwas_map.Outcome
    if gwas_map.tag.duplicated().any():
        raise_duplicated_index(gwas_map.tag)
    gwas_map.set_index("tag", inplace=True)
    print(gwas_map)

    for tag in gwas_map.index:
        gwas_filename = gwas_map.loc[tag, "filename"]
        print('Processing GWAS: {}'.format(tag))
        start = time.time()
        print(args.input_folder)

        # locate the raw GWAS file in the input folder
        GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
        print('GWAS file found at: {}'.format(GWAS_link))

        # map the file's columns to the standard column set and read it,
        # optionally filtering on imputation quality
        mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
        if args.imputation_quality_treshold != 'None':
            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw,
                                          imputation_treshold=eval(args.imputation_quality_treshold))
        else:
            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
        print("#SNPs in GWAS summary statistic file: {}".format(gw_df.shape[0]))

        # load the reference panel; mask_MHC is passed as the string 'True'/'False',
        # so evaluate it before casting to a boolean
        ref = jp.map_reference.read_reference(args.ref_path,
                                              np.bool_(eval(args.mask_MHC)),
                                              np.double(args.minimum_MAF),
                                              region_to_mask=eval(args.additional_masked_region))
        print("Unique chromosomes in the reference panel:")
        print(ref.chr.unique())

        # map the GWAS SNPs onto the reference panel and harmonize allele orientation
        mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"])
        print("#SNPs mapped to reference panel: {}".format(mgwas.shape[0]))
        mgwas = jp.map_reference.compute_snp_alignement(mgwas)

        # compute z-scores and filter SNPs on sample size
        mgwas = jp.compute_score.compute_z_score(mgwas)
        mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag,
                                                     np.double(args.percent_sample_size))
        end = time.time()
        print("#SNPs remaining after sample size filter: {}".format(mgwas.shape[0]))
        print("Preprocessing of {0} done in {1:.1f}s".format(tag, end - start))

        # write one output file per chromosome, and optionally a single combined file
        jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
        if args.output_folder_1_file:
            jp.save_output.save_output(mgwas, args.output_folder_1_file, tag)


def add_preprocessing_argument():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gwas-info', required=True,
                        help="Path to the file describing the format of the individual GWAS files (with correct header)")
    # parser.add_argument('--gwas-filename', required=True, help="Name of the raw GWAS file to standardize")
    parser.add_argument('--ref-path', required=True,
                        help="Reference panel location (notably used to harmonize reference and alternative alleles across SNPs)")
    parser.add_argument('--input-folder', required=True,
                        help="Path to the folder containing the raw GWAS summary statistic files; must end with '/'")
    parser.add_argument('--diagnostic-folder', required=True,
                        help="Path to the folder for reporting information on the preprocessing, such as the SNP sample size distribution")
    parser.add_argument('--output-folder', required=True,
                        help="Location of the main output folder for preprocessed GWAS files (split by chromosome)")
    parser.add_argument('--output-folder-1-file', required=False,
                        help="Optional location to store the preprocessed data in one tabular file with a chromosome column (useful to compute LDSC correlations, for instance)")
    parser.add_argument('--percent-sample-size', required=False, default=0.7,
                        help="Proportion (between 0 and 1) of the 90th percentile of the sample size used to filter SNPs")
    parser.add_argument('--minimum-MAF', required=False, default='0.01',
                        help="Filter the reference panel by minimum allele frequency")
    parser.add_argument('--mask-MHC', required=False, default='False',
                        help="Whether the MHC region should be masked. Default is False")
    parser.add_argument('--additional-masked-region', required=False, default='None',
                        help="List of dictionaries containing the coordinates of regions to mask. For example: [{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]")
    parser.add_argument('--imputation-quality-treshold', required=False, default='None',
                        help="Minimum imputation quality of SNPs kept in the summary statistics")
    parser.set_defaults(func=launch_preprocessing)
    return parser


def main():
    parser = add_preprocessing_argument()
    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()