Commit cc9337de authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

add filter on reference panel

parent 849b8ee4
......@@ -39,9 +39,7 @@ def launch_preprocessing(args):
mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
ref = pd.read_csv(args.ref_path, header=None, sep= "\t",
names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
index_col="snp_id")
ref = read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
mgwas = jp.map_reference.compute_snp_alignement(mgwas)
......@@ -69,6 +67,10 @@ def add_preprocessing_argument():
parser.add_argument('--output-folder', required=True, help= "Location of main ouput folder for preprocessed GWAS files (splitted by chromosome)")
parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by minimum allele frequency", default=0.01)
parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False)
parser.set_defaults(func=launch_preprocessing)
return parser
......
......@@ -88,7 +88,7 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict.set_index("filename", inplace=True)
print(gwas_internal_link)
gwas_file = gwas_internal_link.split('/')[-1]
my_labels = column_dict.loc[gwas_file]
......@@ -133,8 +133,7 @@ def read_gwas( gwas_internal_link, column_map):
usecols = column_map.values, #column_dict['label_position'].keys(),
names= column_map.index,
index_col=0,
header=0, na_values= ['', '#N/A', '#N/A', 'N/A',
'#NA', '-1.#IND', '-1.#QNAN',
header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
'-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
'NA', 'NULL', 'NaN',
'nan', 'na', '.'])
......
......@@ -8,17 +8,23 @@ import numpy as np
import jass_preprocessing.dna_utils as dna_u
import warnings
def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None):
    """
    Load the reference panel and optionally filter it.

    The panel is read as a tab-separated file without a header row; its six
    columns are named 'chr', 'pos', 'snp_id', 'ref', 'alt' and 'MAF', and
    'snp_id' becomes the index of the returned DataFrame.

    Parameters
    ----------
    gwas_reference_panel : str or file-like
        Path (or buffer) handed to ``pandas.read_csv``.
    mask_MHC : bool, default False
        If true, drop the SNPs on chromosome 6 whose position lies between
        28477797 and 33448354 (both bounds included).
    minimum_MAF : float or None, default None
        If given, keep only the SNPs whose 'MAF' is strictly greater than
        this value. ``None`` disables the filter.

    Returns
    -------
    pandas.DataFrame
        The (possibly filtered) reference panel indexed by 'snp_id'.
    """
    ref = pd.read_csv(gwas_reference_panel, header=None, sep="\t",
                      names=['chr', "pos", "snp_id", "ref", "alt", "MAF"],
                      index_col="snp_id")

    if mask_MHC:
        # NOTE(review): coordinates presumably match the genome build of the
        # reference panel -- confirm before using another panel.
        mhc_start, mhc_end = 28477797, 33448354
        # Compare chromosome labels as strings: when the panel contains
        # non-numeric labels (e.g. "X") pandas reads the whole column as
        # text, and an integer comparison with 6 would never match.
        not_chr6 = ref["chr"].astype(str) != "6"
        outside_mhc = (ref["pos"] < mhc_start) | (ref["pos"] > mhc_end)
        ref = ref.loc[not_chr6 | outside_mhc]

    if minimum_MAF is not None:
        ref = ref.loc[ref["MAF"] > minimum_MAF]

    return ref
def map_on_ref_panel(gw_df , ref_panel):
"""
Merge Gwas dataframe with the reference panel
......
......@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
mgwas_chr = pd.DataFrame({
'rsID': mgwas_copy.loc[chrom].snp_id,
'pos': mgwas_copy.loc[chrom].pos,
'A0': mgwas_copy.loc[chrom].ref,
'A1':mgwas_copy.loc[chrom].alt,
'A1': mgwas_copy.loc[chrom].ref,
'A2':mgwas_copy.loc[chrom].alt,
'Z': mgwas_copy.loc[chrom].computed_z,
'P': mgwas_copy.loc[chrom].pval
}, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])
}, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ])
impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment