Commit cc9337de authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

add filter on reference panel

parent 849b8ee4
...@@ -39,9 +39,7 @@ def launch_preprocessing(args): ...@@ -39,9 +39,7 @@ def launch_preprocessing(args):
mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info) mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
ref = pd.read_csv(args.ref_path, header=None, sep= "\t", ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas) mgwas = jp.map_reference.compute_snp_alignement(mgwas)
...@@ -69,6 +67,10 @@ def add_preprocessing_argument(): ...@@ -69,6 +67,10 @@ def add_preprocessing_argument():
parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)") parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)") parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7) parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01)
parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False)
parser.set_defaults(func=launch_preprocessing) parser.set_defaults(func=launch_preprocessing)
return parser return parser
......
...@@ -88,7 +88,7 @@ def map_columns_position(gwas_internal_link, GWAS_labels): ...@@ -88,7 +88,7 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict.set_index("filename", inplace=True) column_dict.set_index("filename", inplace=True)
print(gwas_internal_link)
gwas_file = gwas_internal_link.split('/')[-1] gwas_file = gwas_internal_link.split('/')[-1]
my_labels = column_dict.loc[gwas_file] my_labels = column_dict.loc[gwas_file]
...@@ -133,8 +133,7 @@ def read_gwas( gwas_internal_link, column_map): ...@@ -133,8 +133,7 @@ def read_gwas( gwas_internal_link, column_map):
usecols = column_map.values, #column_dict['label_position'].keys(), usecols = column_map.values, #column_dict['label_position'].keys(),
names= column_map.index, names= column_map.index,
index_col=0, index_col=0,
header=0, na_values= ['', '#N/A', '#N/A', 'N/A', header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
'#NA', '-1.#IND', '-1.#QNAN',
'-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
'NA', 'NULL', 'NaN', 'NA', 'NULL', 'NaN',
'nan', 'na', '.']) 'nan', 'na', '.'])
......
...@@ -8,17 +8,23 @@ import numpy as np ...@@ -8,17 +8,23 @@ import numpy as np
import jass_preprocessing.dna_utils as dna_u import jass_preprocessing.dna_utils as dna_u
import warnings import warnings
def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None):
def read_reference(gwas_reference_panel):
""" """
helper function to name correctly the column helper function to name correctly the column
""" """
ref = pd.read_csv(REF_filename, header=None, sep= "\t", ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id") index_col="snp_id")
if mask_MHC:
ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)]
if minimum_MAF is not None:
ref = ref.loc[ref.MAF > minimum_MAF]
return(ref) return(ref)
def map_on_ref_panel(gw_df , ref_panel): def map_on_ref_panel(gw_df , ref_panel):
""" """
Merge Gwas dataframe with the reference panel Merge Gwas dataframe with the reference panel
......
...@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): ...@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
mgwas_chr = pd.DataFrame({ mgwas_chr = pd.DataFrame({
'rsID': mgwas_copy.loc[chrom].snp_id, 'rsID': mgwas_copy.loc[chrom].snp_id,
'pos': mgwas_copy.loc[chrom].pos, 'pos': mgwas_copy.loc[chrom].pos,
'A0': mgwas_copy.loc[chrom].ref, 'A1': mgwas_copy.loc[chrom].ref,
'A1':mgwas_copy.loc[chrom].alt, 'A2':mgwas_copy.loc[chrom].alt,
'Z': mgwas_copy.loc[chrom].computed_z, 'Z': mgwas_copy.loc[chrom].computed_z,
'P': mgwas_copy.loc[chrom].pval 'P': mgwas_copy.loc[chrom].pval
}, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ]) }, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ])
impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt" impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder)) print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment