Commit 0c5728ea authored by hjulienne's avatar hjulienne
Browse files

adapt code to SAIGE output

parent 92a461de
Pipeline #42616 failed with stage
in 17 seconds
......@@ -56,19 +56,21 @@ In a terminal, execute the following lines:
Input
======
* A reference panel (1000 genome format). The user is expected to provide a reference panel in tsv format with the following columns in that order, without header:
+-----+-----+------------+-----+-----+---------+
| chr | pos | snp_id | ref | alt | MAF |
+=====+=====+============+=====+=====+=========+
| 1 |13116| rs62635286 | T | G |0.0970447|
+-----+-----+------------+-----+-----+---------+
| 1 |13118| rs200579949| A | G |0.0970447|
+-----+-----+------------+-----+-----+---------+
| 1 |14604| rs541940975| A | G | 0.147564|
+-----+-----+------------+-----+-----+---------+
| 1 |14930| rs75454623 | A | G | 0.482228|
+-----+-----+------------+-----+-----+---------+
* A reference panel (1000 genome format). The user is expected to provide a reference panel
in tsv format with the following columns in the following order, without header:
+-----+------------+---------+-------+-----+-----+
| chr | snp_id | MAF | pos | ref | alt |
+=====+============+=========+=======+=====+=====+
| 1 | rs62635286 |0.0970447| 13116 | T | G |
+-----+------------+---------+-------+-----+-----+
| 1 | rs63125786 |0.0970447| 15116 | T | A |
+-----+------------+---------+-------+-----+-----+
| 1 | rs5686 |0.1970447| 17116 | A | G |
+-----+------------+---------+-------+-----+-----+
| 1 | rs892586 |0.7670447| 23116 | C | G |
+-----+------------+---------+-------+-----+-----+
* Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?)
* a list containing the name of GWAS file to the string format.
......
......@@ -38,22 +38,29 @@ def launch_preprocessing(args):
raise_duplicated_index(gwas_map.tag)
gwas_map.set_index("tag", inplace=True)
print(gwas_map)
for tag in gwas_map.index:
gwas_filename = gwas_map.loc[tag, "filename"]
print('processing GWAS: {}'.format(tag))
start = time.time()
print(args.input_folder)
GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
print(GWAS_link)
mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
print(mapgw)
print(args.imputation_quality_treshold)
if args.imputation_quality_treshold is not 'None':
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold))
if args.additional_masked_region is None:
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
else:
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
print(bool(args.mask_MHC), float(args.minimum_MAF), args.additional_masked_region)
print(eval(args.additional_masked_region))
ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas)
mgwas = jp.compute_score.compute_z_score(mgwas)
......@@ -62,6 +69,7 @@ def launch_preprocessing(args):
end = time.time()
print("Preprocessing of {0} in {1}s".format(tag, end-start))
print(mgwas.head())
jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
if(args.output_folder_1_file):
......@@ -81,11 +89,11 @@ def add_preprocessing_argument():
parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01)
parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False)
parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default=None)
parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default='0.01')
parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default='False')
parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None')
parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default=None)
parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None')
parser.set_defaults(func=launch_preprocessing)
......
......@@ -153,6 +153,14 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
'NA', 'NULL', 'NaN',
'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float})
print(fullGWAS.head())
# either rs ID or full position must be available:
if "snpid" not in fullGWAS.columns:
if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns):
fullGWAS["snpid"] = fullGWAS["CHR"].apply(str) + ":" +fullGWAS["POS"].apply(str) + ":"+ fullGWAS["a1"].apply(str)+ ":"+ fullGWAS["a2"].apply(str)
else:
raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link))
fullGWAS.set_index("snpid", inplace=True)
print(fullGWAS.head())
if imputation_treshold:
......
......@@ -21,7 +21,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio
ref (pandas dataframe): the reference_panel with the specified filter applied
"""
ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
names =[ 'chr', "snp_id", "MAF","pos", "ref", "alt"],
index_col="snp_id")
#Filter Strand ambiguous
ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])]
......@@ -53,8 +53,15 @@ def map_on_ref_panel(gw_df , ref_panel):
"""
inter_index = ref_panel.index.intersection(gw_df.index)
print(ref_panel.head())
print(gw_df.head())
merge_GWAS = pd.merge(ref_panel, gw_df,
how='inner', indicator=True, left_index=True, right_index=True)
if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95):
merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)]
else:
raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?")
print("SNPs {}".format(merge_GWAS.shape[0]))
......
......@@ -10,6 +10,7 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
mgwas_copy.set_index("chr", inplace=True)
mgwas_copy.dropna(subset=["computed_z"], how="any", inplace=True)
print(mgwas_copy.index.unique())
for chrom in mgwas_copy.index.unique():
mgwas_chr = pd.DataFrame({
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment