diff --git a/doc/source/index.rst b/doc/source/index.rst index dd1bb930964cd9b8e6229d13359e34eab54d00d4..cae2a933ad3eb48b1c6feb59726ffcd381001f59 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -56,19 +56,21 @@ In a terminal, execute the following lines: Input ====== -* A reference panel (1000 genome format). The user is expected to provide a reference panel in tsv format with the following columns in that order, without header: - -+-----+-----+------------+-----+-----+---------+ -| chr | pos | snp_id | ref | alt | MAF | -+=====+=====+============+=====+=====+=========+ -| 1 |13116| rs62635286 | T | G |0.0970447| -+-----+-----+------------+-----+-----+---------+ -| 1 |13118| rs200579949| A | G |0.0970447| -+-----+-----+------------+-----+-----+---------+ -| 1 |14604| rs541940975| A | G | 0.147564| -+-----+-----+------------+-----+-----+---------+ -| 1 |14930| rs75454623 | A | G | 0.482228| -+-----+-----+------------+-----+-----+---------+ +* A reference panel (1000 genome format). The user is expected to provide a reference panel + in tsv format with the following columns in the following order, without header: + ++-----+------------+---------+-------+-----+-----+ +| chr | snp_id | MAF | pos | ref | alt | ++=====+============+=========+=======+=====+=====+ +| 1 | rs62635286 |0.0970447| 13116 | T | G | ++-----+------------+---------+-------+-----+-----+ +| 1 | rs63125786 |0.0970447| 15116 | T | A | ++-----+------------+---------+-------+-----+-----+ +| 1 | rs5686 |0.1970447| 17116 | A | G | ++-----+------------+---------+-------+-----+-----+ +| 1 | rs892586 |0.7670447| 23116 | C | G | ++-----+------------+---------+-------+-----+-----+ + * Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?) * a list containing the name of GWAS file to the string format. diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index b9ca5f386b3bd6a3288af332e2bf11b3e718d5ef..ad18b2cec290364e42b9005d428654579499ad08 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -38,22 +38,29 @@ def launch_preprocessing(args): raise_duplicated_index(gwas_map.tag) gwas_map.set_index("tag", inplace=True) - + print(gwas_map) for tag in gwas_map.index: gwas_filename = gwas_map.loc[tag, "filename"] print('processing GWAS: {}'.format(tag)) start = time.time() + print(args.input_folder) GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2] + print(GWAS_link) mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag]) + print(mapgw) + print(args.imputation_quality_treshold) - gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold)) - - if args.additional_masked_region is None: - ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF)) + if args.imputation_quality_treshold is not 'None': + gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold)) else: - ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) + gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) + + print(bool(args.mask_MHC), float(args.minimum_MAF), args.additional_masked_region) + print(eval(args.additional_masked_region)) + ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) + mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas) @@ -62,6 +69,7 @@ def launch_preprocessing(args): end = time.time() print("Preprocessing of {0} in {1}s".format(tag, end-start)) + print(mgwas.head()) jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag) if(args.output_folder_1_file): @@ -81,11 +89,11 @@ def add_preprocessing_argument(): parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)") parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7) - parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01) - parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False) - parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default=None) + parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default='0.01') + parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default='False') + parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None') - parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default=None) + parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None') parser.set_defaults(func=launch_preprocessing) diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 19a0d3cfb134718ea92d728bab9feb0367027d7d..f67ac3b26480140590ba1a2e52b840c03ee111f4 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -153,6 +153,14 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): 'NA', 'NULL', 'NaN', 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float}) print(fullGWAS.head()) + # either rs ID or full position must be available: + if "snpid" not in fullGWAS.columns: + if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns): + fullGWAS["snpid"] = fullGWAS["CHR"].apply(str) + ":" +fullGWAS["POS"].apply(str) + ":"+ fullGWAS["a1"].apply(str)+ ":"+ fullGWAS["a2"].apply(str) + else: + raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link)) + + fullGWAS.set_index("snpid", inplace=True) print(fullGWAS.head()) if imputation_treshold: diff --git a/jass_preprocessing/map_reference.py b/jass_preprocessing/map_reference.py index c630149d8e7b4e844f85f349bded523153c68c74..bb9dbd55206436bc2baf3dd63e3b8c397cc67369 100644 --- a/jass_preprocessing/map_reference.py +++ b/jass_preprocessing/map_reference.py @@ -21,7 +21,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio ref (pandas dataframe): the reference_panel with the specified filter applied """ ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", - names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], + names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"], index_col="snp_id") #Filter Strand ambiguous ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])] @@ -53,8 +53,15 @@ def map_on_ref_panel(gw_df , ref_panel): """ inter_index = ref_panel.index.intersection(gw_df.index) + print(ref_panel.head()) + print(gw_df.head()) merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True) + if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95): + merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)] + else: + raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?") + print("SNPs {}".format(merge_GWAS.shape[0])) diff --git a/jass_preprocessing/save_output.py b/jass_preprocessing/save_output.py index c00fef4e56f60c445c4b89660e383e0be750a1aa..92952d04e4b8a52fa2be429876f48167268dfa3c 100644 --- a/jass_preprocessing/save_output.py +++ b/jass_preprocessing/save_output.py @@ -10,6 +10,7 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): mgwas_copy.set_index("chr", inplace=True) mgwas_copy.dropna(subset=["computed_z"], how="any", inplace=True) + print(mgwas_copy.index.unique()) for chrom in mgwas_copy.index.unique(): mgwas_chr = pd.DataFrame({