"""
Read raw GWAS summary statistics, filter and format them,
and write clean GWAS datasets split by chromosome.
"""
__updated__ = '2022-03-02'
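
# A sketch of how this module can be invoked (assumptions: the package is run with
# `python -m jass_preprocessing`, and every path below is a placeholder rather than a
# file shipped with the package; the flags are defined in add_preprocessing_argument):
#
#   python -m jass_preprocessing \
#       --gwas-info ./gwas_info.tsv \
#       --ref-path ./reference_panel \
#       --input-folder ./raw_gwas/ \
#       --diagnostic-folder ./diagnostics/ \
#       --output-folder ./preprocessed_gwas/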

import argparse
import time

import numpy as np
import pandas as pd

import jass_preprocessing as jp


def raise_duplicated_index(tag):
    """Raise an error listing the duplicated 'Consortium_Outcome' tags."""
    duplicated_tags = tag[tag.duplicated()].unique()
    raise ValueError("'Consortium_Outcome' tags are duplicated for: {0}".format(duplicated_tags))
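
# Example (hypothetical values): if the gwas-info table contained two rows with
# Consortium "GIANT" and Outcome "HEIGHT", both would map to the tag "GIANT_HEIGHT",
# and the duplicate check in launch_preprocessing would call raise_duplicated_index.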


def launch_preprocessing(args):
    """
    Preprocess each GWAS dataset listed in the gwas-info file.
    """
    gwas_map = pd.read_csv(args.gwas_info, sep="\t")
    # define a unique tag per study: Consortium_Outcome
    gwas_map['tag'] = gwas_map.Consortium + "_" + gwas_map.Outcome
    if gwas_map.tag.duplicated().any():
        raise_duplicated_index(gwas_map.tag)
    gwas_map.set_index("tag", inplace=True)
    print(gwas_map)

    for tag in gwas_map.index:
        gwas_filename = gwas_map.loc[tag, "filename"]
        print('Processing GWAS: {}'.format(tag))
        start = time.time()
        print(args.input_folder)

        # locate the raw GWAS file in the input folder
        GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
        print('GWAS file found at: {}'.format(GWAS_link))

        # map the file's columns to the standard column set and read it,
        # optionally filtering on imputation quality
        mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
        if args.imputation_quality_treshold != 'None':
            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw,
                                          imputation_treshold=eval(args.imputation_quality_treshold))
        else:
            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
        print("#SNPs in GWAS summary statistic file: {}".format(gw_df.shape[0]))

        # load the reference panel; mask_MHC is passed as the string 'True'/'False',
        # so evaluate it before casting to a boolean
        ref = jp.map_reference.read_reference(args.ref_path,
                                              np.bool_(eval(args.mask_MHC)),
                                              np.double(args.minimum_MAF),
                                              region_to_mask=eval(args.additional_masked_region))
        print("Unique chromosomes in the reference panel:")
        print(ref.chr.unique())

        # map the GWAS SNPs onto the reference panel and harmonize allele orientation
        mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref, gwas_map.loc[tag, "index_type"])
        print("#SNPs mapped to reference panel: {}".format(mgwas.shape[0]))
        mgwas = jp.map_reference.compute_snp_alignement(mgwas)

        # compute z-scores and filter SNPs on sample size
        mgwas = jp.compute_score.compute_z_score(mgwas)
        mgwas = jp.compute_score.compute_sample_size(mgwas, args.diagnostic_folder, tag,
                                                     np.double(args.percent_sample_size))
        end = time.time()
        print("#SNPs remaining after sample size filter: {}".format(mgwas.shape[0]))
        print("Preprocessing of {0} done in {1:.1f}s".format(tag, end - start))

        # write one output file per chromosome, and optionally a single combined file
        jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
        if args.output_folder_1_file:
            jp.save_output.save_output(mgwas, args.output_folder_1_file, tag)


def add_preprocessing_argument():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gwas-info', required=True,
                        help="Path to the file describing the format of the individual GWAS files (with correct header)")
    # parser.add_argument('--gwas-filename', required=True, help="Name of the raw GWAS file to standardize")
    parser.add_argument('--ref-path', required=True,
                        help="Reference panel location (notably used to harmonize reference and alternative alleles across SNPs)")
    parser.add_argument('--input-folder', required=True,
                        help="Path to the folder containing the raw GWAS summary statistic files; must end with '/'")
    parser.add_argument('--diagnostic-folder', required=True,
                        help="Path to the folder for reporting information on the preprocessing, such as the SNP sample size distribution")
    parser.add_argument('--output-folder', required=True,
                        help="Location of the main output folder for preprocessed GWAS files (split by chromosome)")
    parser.add_argument('--output-folder-1-file', required=False,
                        help="Optional location to store the preprocessed data in one tabular file with a chromosome column (useful to compute LDSC correlations, for instance)")
    parser.add_argument('--percent-sample-size', required=False, default=0.7,
                        help="Proportion (between 0 and 1) of the 90th percentile of the sample size used to filter SNPs")
    parser.add_argument('--minimum-MAF', required=False, default='0.01',
                        help="Filter the reference panel by minimum allele frequency")
    parser.add_argument('--mask-MHC', required=False, default='False',
                        help="Whether the MHC region should be masked. Default is False")
    parser.add_argument('--additional-masked-region', required=False, default='None',
                        help="List of dictionaries containing the coordinates of regions to mask. For example: [{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]")
    parser.add_argument('--imputation-quality-treshold', required=False, default='None',
                        help="Minimum imputation quality of SNPs kept in the summary statistics")
    parser.set_defaults(func=launch_preprocessing)
    return parser


def main():
    parser = add_preprocessing_argument()
    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()