Skip to content
Snippets Groups Projects
Commit 0c5728ea authored by hjulienne's avatar hjulienne
Browse files

adapt code to SAIGE output

parent 92a461de
No related branches found
No related tags found
No related merge requests found
Pipeline #42616 failed
...@@ -56,19 +56,21 @@ In a terminal, execute the following lines: ...@@ -56,19 +56,21 @@ In a terminal, execute the following lines:
Input Input
====== ======
* A reference panel (1000 genome format). The user is expected to provide a reference panel in tsv format with the following columns in that order, without header: * A reference panel (1000 genome format). The user is expected to provide a reference panel
in tsv format with the following columns in the following order, without header:
+-----+-----+------------+-----+-----+---------+
| chr | pos | snp_id | ref | alt | MAF | +-----+------------+---------+-------+-----+-----+
+=====+=====+============+=====+=====+=========+ | chr | snp_id | MAF | pos | ref | alt |
| 1 |13116| rs62635286 | T | G |0.0970447| +=====+============+=========+=======+=====+=====+
+-----+-----+------------+-----+-----+---------+ | 1 | rs62635286 |0.0970447| 13116 | T | G |
| 1 |13118| rs200579949| A | G |0.0970447| +-----+------------+---------+-------+-----+-----+
+-----+-----+------------+-----+-----+---------+ | 1 | rs63125786 |0.0970447| 15116 | T | A |
| 1 |14604| rs541940975| A | G | 0.147564| +-----+------------+---------+-------+-----+-----+
+-----+-----+------------+-----+-----+---------+ | 1 | rs5686 |0.1970447| 17116 | A | G |
| 1 |14930| rs75454623 | A | G | 0.482228| +-----+------------+---------+-------+-----+-----+
+-----+-----+------------+-----+-----+---------+ | 1 | rs892586 |0.7670447| 23116 | C | G |
+-----+------------+---------+-------+-----+-----+
* Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?) * Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?)
* a list containing the name of GWAS file to the string format. * a list containing the name of GWAS file to the string format.
......
...@@ -38,22 +38,29 @@ def launch_preprocessing(args): ...@@ -38,22 +38,29 @@ def launch_preprocessing(args):
raise_duplicated_index(gwas_map.tag) raise_duplicated_index(gwas_map.tag)
gwas_map.set_index("tag", inplace=True) gwas_map.set_index("tag", inplace=True)
print(gwas_map)
for tag in gwas_map.index: for tag in gwas_map.index:
gwas_filename = gwas_map.loc[tag, "filename"] gwas_filename = gwas_map.loc[tag, "filename"]
print('processing GWAS: {}'.format(tag)) print('processing GWAS: {}'.format(tag))
start = time.time() start = time.time()
print(args.input_folder)
GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2] GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
print(GWAS_link)
mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag]) mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
print(mapgw)
print(args.imputation_quality_treshold)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold)) if args.imputation_quality_treshold is not 'None':
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold))
if args.additional_masked_region is None:
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
else: else:
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region)) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
print(bool(args.mask_MHC), float(args.minimum_MAF), args.additional_masked_region)
print(eval(args.additional_masked_region))
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_z_score(mgwas)
...@@ -62,6 +69,7 @@ def launch_preprocessing(args): ...@@ -62,6 +69,7 @@ def launch_preprocessing(args):
end = time.time() end = time.time()
print("Preprocessing of {0} in {1}s".format(tag, end-start)) print("Preprocessing of {0} in {1}s".format(tag, end-start))
print(mgwas.head())
jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag) jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
if(args.output_folder_1_file): if(args.output_folder_1_file):
...@@ -81,11 +89,11 @@ def add_preprocessing_argument(): ...@@ -81,11 +89,11 @@ def add_preprocessing_argument():
parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)") parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7) parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01) parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default='0.01')
parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False) parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default='False')
parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default=None) parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None')
parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default=None) parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None')
parser.set_defaults(func=launch_preprocessing) parser.set_defaults(func=launch_preprocessing)
......
...@@ -153,6 +153,14 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): ...@@ -153,6 +153,14 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
'NA', 'NULL', 'NaN', 'NA', 'NULL', 'NaN',
'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float}) 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float})
print(fullGWAS.head()) print(fullGWAS.head())
# either rs ID or full position must be available:
if "snpid" not in fullGWAS.columns:
if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns):
fullGWAS["snpid"] = fullGWAS["CHR"].apply(str) + ":" +fullGWAS["POS"].apply(str) + ":"+ fullGWAS["a1"].apply(str)+ ":"+ fullGWAS["a2"].apply(str)
else:
raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link))
fullGWAS.set_index("snpid", inplace=True) fullGWAS.set_index("snpid", inplace=True)
print(fullGWAS.head()) print(fullGWAS.head())
if imputation_treshold: if imputation_treshold:
......
...@@ -21,7 +21,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio ...@@ -21,7 +21,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio
ref (pandas dataframe): the reference_panel with the specified filter applied ref (pandas dataframe): the reference_panel with the specified filter applied
""" """
ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t", ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"],
index_col="snp_id") index_col="snp_id")
#Filter Strand ambiguous #Filter Strand ambiguous
ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])] ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])]
...@@ -53,8 +53,15 @@ def map_on_ref_panel(gw_df , ref_panel): ...@@ -53,8 +53,15 @@ def map_on_ref_panel(gw_df , ref_panel):
""" """
inter_index = ref_panel.index.intersection(gw_df.index) inter_index = ref_panel.index.intersection(gw_df.index)
print(ref_panel.head())
print(gw_df.head())
merge_GWAS = pd.merge(ref_panel, gw_df, merge_GWAS = pd.merge(ref_panel, gw_df,
how='inner', indicator=True, left_index=True, right_index=True) how='inner', indicator=True, left_index=True, right_index=True)
if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95):
merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)]
else:
raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?")
print("SNPs {}".format(merge_GWAS.shape[0])) print("SNPs {}".format(merge_GWAS.shape[0]))
......
...@@ -10,6 +10,7 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): ...@@ -10,6 +10,7 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
mgwas_copy.set_index("chr", inplace=True) mgwas_copy.set_index("chr", inplace=True)
mgwas_copy.dropna(subset=["computed_z"], how="any", inplace=True) mgwas_copy.dropna(subset=["computed_z"], how="any", inplace=True)
print(mgwas_copy.index.unique())
for chrom in mgwas_copy.index.unique(): for chrom in mgwas_copy.index.unique():
mgwas_chr = pd.DataFrame({ mgwas_chr = pd.DataFrame({
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment