Skip to content
Snippets Groups Projects
Select Git revision
  • a27a77c61196288a160ea65489bfeeb89559d9d2
  • master default protected
  • P_value_too_small
  • new_sample_size_filter
  • 2.2
  • 2.1
  • 2.0.1
  • 1.0
8 results

__main__.py

Blame
  • Forked from Statistical-Genetics / jass_preprocessing
    6 commits behind the upstream repository.
    __main__.py 4.73 KiB
    """
    Read raw GWAS summary statistics, filter and format
    Write clean GWAS datasets by chromosome
    """
    __updated__ = '2022-03-02'
    
    import pandas as pd
    import numpy as np
    import jass_preprocessing as jp
    import time
    import argparse
    
    
    def raise_duplicated_index(tag):
        duplicated_index = tag.duplicated()
        raise ValueError("'Consortium_Outcome' are duplicated for: {0}".format(duplicated_index))
    
    def launch_preprocessing(args):
        """
        Preprocessing GWAS dataset
        """
        gwas_map = pd.read_csv(args.gwas_info, sep="\t")
    
        #define an unique
        gwas_map['tag'] = gwas_map.Consortium+ "_" + gwas_map.Outcome
    
        if gwas_map.tag.duplicated().any():
            raise_duplicated_index(gwas_map.tag)
    
        gwas_map.set_index("tag", inplace=True)
        print(gwas_map)
        for tag in gwas_map.index:
            gwas_filename = gwas_map.loc[tag, "filename"]
    
            print('processing GWAS: {}'.format(tag))
            start = time.time()
            print(args.input_folder)
            GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
            print('GWAS file found at: {}'.format(GWAS_link))
            mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
    
            if args.imputation_quality_treshold is not 'None':
                gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold))
            else:
                gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
    
            print("#SNPs in GWAS summary statistic file: {}".format(gw_df.shape[0]))
            ref = jp.map_reference.read_reference(args.ref_path, np.bool_(args.mask_MHC), np.double(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
    
            print("Unique chromosome in reference")
            print(ref.chr.unique())
    
            mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"])
    
            print("#SNPs mapped to reference panel: {}".format(mgwas.shape[0]))
            mgwas = jp.map_reference.compute_snp_alignement(mgwas)
            mgwas = jp.compute_score.compute_z_score(mgwas)
            mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag, np.double(args.percent_sample_size))
            end = time.time()
    
            print("#SNPs remaining after sample size filter: {}".format(mgwas.shape[0]))
            print("Preprocessing of {0} in {1}s".format(tag, end-start))
            jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
    
            if(args.output_folder_1_file):
                jp.save_output.save_output(mgwas, args.output_folder_1_file, tag)
    
    
    def add_preprocessing_argument():
    
        parser = argparse.ArgumentParser()
        parser.add_argument('--gwas-info', required=True, help= "Path to the file describing the format of the individual GWASs files with correct header")
        #parser.add_argument('--gwas-filename', required=True, help= "Name of the raw GWAS file to standardize")
        parser.add_argument('--ref-path', required=True, help= "reference panel location (notably used to harmonize reference and alternative allele accross SNPs")
        parser.add_argument('--input-folder', required=True, help= "Path to the folder containing the Raw GWASs summary statistic files, must end by '/'")
        parser.add_argument('--diagnostic-folder', required=True, help= "Path to the reporting information on the PreProcessing such as the SNPs sample size distribution")
    
        parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
        parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
        parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
    
        parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by  minimum allele frequency", default='0.01')
        parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default='False')
        parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None')
    
        parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None')
        parser.set_defaults(func=launch_preprocessing)
    
        return parser
    
    def main():
    
        parser = add_preprocessing_argument()
        args = parser.parse_args()
        args.func(args)
    
    
    if __name__=="__main__":
        main()