Skip to content
Snippets Groups Projects
Commit cc9337de authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

add filter on reference panel

parent 849b8ee4
No related branches found
No related tags found
No related merge requests found
......@@ -39,9 +39,7 @@ def launch_preprocessing(args):
mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
ref = pd.read_csv(args.ref_path, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas)
......@@ -69,6 +67,10 @@ def add_preprocessing_argument():
parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01)
parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False)
parser.set_defaults(func=launch_preprocessing)
return parser
......
......@@ -88,7 +88,7 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict.set_index("filename", inplace=True)
print(gwas_internal_link)
gwas_file = gwas_internal_link.split('/')[-1]
my_labels = column_dict.loc[gwas_file]
......@@ -133,8 +133,7 @@ def read_gwas( gwas_internal_link, column_map):
usecols = column_map.values, #column_dict['label_position'].keys(),
names= column_map.index,
index_col=0,
header=0, na_values= ['', '#N/A', '#N/A', 'N/A',
'#NA', '-1.#IND', '-1.#QNAN',
header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
'-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
'NA', 'NULL', 'NaN',
'nan', 'na', '.'])
......
......@@ -8,17 +8,23 @@ import numpy as np
import jass_preprocessing.dna_utils as dna_u
import warnings
def read_reference(gwas_reference_panel):
def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None):
"""
helper function to name correctly the column
"""
ref = pd.read_csv(REF_filename, header=None, sep= "\t",
ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
if mask_MHC:
ref = ref.loc[(ref.chr !=6)|(ref.pos < 28477797)|(ref.pos > 33448354)]
if minimum_MAF is not None:
ref = ref.loc[ref.MAF > minimum_MAF]
return(ref)
def map_on_ref_panel(gw_df , ref_panel):
"""
Merge Gwas dataframe with the reference panel
......
......@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
mgwas_chr = pd.DataFrame({
'rsID': mgwas_copy.loc[chrom].snp_id,
'pos': mgwas_copy.loc[chrom].pos,
'A0': mgwas_copy.loc[chrom].ref,
'A1':mgwas_copy.loc[chrom].alt,
'A1': mgwas_copy.loc[chrom].ref,
'A2':mgwas_copy.loc[chrom].alt,
'Z': mgwas_copy.loc[chrom].computed_z,
'P': mgwas_copy.loc[chrom].pval
}, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])
}, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ])
impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment