adapt code to SAIGE output

0c5728ea · hjulienne · 92a461de · 0c5728ea · 0c5728ea · 0c5728ea
Commit 0c5728ea authored 4 years ago by hjulienne
--- a/doc/source/index.rst
+++ b/doc/source/index.rst
@@ -56,19 +56,21 @@ In a terminal, execute the following lines:
 Input
 ======
-* A reference panel (1000 genome format). The user is expected to provide a reference panel in tsv format with the following columns in that order, without header:
+* A reference panel (1000 genome format). The user is expected to provide a reference panel
+ in tsv format with the following columns in the following order, without header:
-+-----+-----+------------+-----+-----+---------+
-| chr | pos |   snp_id   | ref | alt |   MAF   |
+-----+------------+---------+-------+-----+-----+
-+=====+=====+============+=====+=====+=========+
+| chr |   snp_id   |   MAF   |  pos  | ref | alt |
-|  1  |13116| rs62635286 |  T  |  G  |0.0970447|
+=====+============+=========+=======+=====+=====+
-+-----+-----+------------+-----+-----+---------+
+|  1  | rs62635286 |0.0970447| 13116 |  T  |  G  |
-|  1  |13118| rs200579949|  A  |  G  |0.0970447|
+-----+------------+---------+-------+-----+-----+
-+-----+-----+------------+-----+-----+---------+
+|  1  | rs63125786 |0.0970447| 15116 |  T  |  A  |
-|  1  |14604| rs541940975|  A  |  G  | 0.147564|
+-----+------------+---------+-------+-----+-----+
-+-----+-----+------------+-----+-----+---------+
+|  1  | rs5686     |0.1970447| 17116 |  A  |  G  |
-|  1  |14930| rs75454623 |  A  |  G  | 0.482228|
+-----+------------+---------+-------+-----+-----+
-+-----+-----+------------+-----+-----+---------+
+|  1  | rs892586   |0.7670447| 23116 |  C  |  G  |
+-----+------------+---------+-------+-----+-----+
 * Folder containing all raw gwas data : (all chromosomes in one file) (minimal conditions?? tab separated?)
 * a list containing the name of GWAS file to the string format.

--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -38,22 +38,29 @@ def launch_preprocessing(args):
        raise_duplicated_index(gwas_map.tag)
    gwas_map.set_index("tag", inplace=True)
+    print(gwas_map)
    for tag in gwas_map.index:
        gwas_filename = gwas_map.loc[tag, "filename"]
        print('processing GWAS: {}'.format(tag))
        start = time.time()
+        print(args.input_folder)
        GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2]
+        print(GWAS_link)
        mapgw = jp.map_gwas.map_columns_position(GWAS_link, gwas_map.loc[tag])
+        print(mapgw)
+        print(args.imputation_quality_treshold)
-        gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold))
+        if args.imputation_quality_treshold is not 'None':
+            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw, imputation_treshold=eval(args.imputation_quality_treshold))
-        if args.additional_masked_region is None:
-            ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF))
        else:
-            ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
+            gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
+        print(bool(args.mask_MHC), float(args.minimum_MAF), args.additional_masked_region)
+        print(eval(args.additional_masked_region))
+        ref = jp.map_reference.read_reference(args.ref_path, bool(args.mask_MHC), float(args.minimum_MAF), region_to_mask=eval(args.additional_masked_region))
        mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
        mgwas = jp.map_reference.compute_snp_alignement(mgwas)
        mgwas = jp.compute_score.compute_z_score(mgwas)
@@ -62,6 +69,7 @@ def launch_preprocessing(args):
        end = time.time()
        print("Preprocessing of {0} in {1}s".format(tag, end-start))
+        print(mgwas.head())
        jp.save_output.save_output_by_chromosome(mgwas, args.output_folder, tag)
        if(args.output_folder_1_file):
@@ -81,11 +89,11 @@ def add_preprocessing_argument():
    parser.add_argument('--output-folder-1-file', required=False, help= "optional location to store the preprocessing in one tabular file with one chromosome columns (useful to compute LDSC correlation for instance)")
    parser.add_argument('--percent-sample-size', required=False, help= "the proportion (between 0 and 1) of the 90th percentile of the sample size used to filter the SNPs", default=0.7)
-    parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by  minimum allele frequency", default=0.01)
+    parser.add_argument('--minimum-MAF', required=False, help= "Filter the reference panel by  minimum allele frequency", default='0.01')
-    parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default=False)
+    parser.add_argument('--mask-MHC', required=False, help= "Whether the MHC region should be masked or not. default is False", default='False')
-    parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default=None)
+    parser.add_argument('--additional-masked-region', required=False, help= "List of dictionary containing coordinate of region to mask. For example :[{'chr':6, 'start':50000000, 'end': 70000000}, {'chr':6, 'start':100000000, 'end': 120000000}]", default='None')
-    parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default=None)
+    parser.add_argument('--imputation-quality-treshold', required=False, help= "minimum imputation quality in summary statistics", default='None')
    parser.set_defaults(func=launch_preprocessing)

--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -153,6 +153,14 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
                                                 'NA', 'NULL', 'NaN',
                                                 'nan', 'na', '.', '-'], dtype={"snpid":str, "a1":str,"a2":str,"freq":float, "z":float,"se":float, "pval":float})
    print(fullGWAS.head())
+    # either rs ID or full position must be available:
+    if "snpid" not in fullGWAS.columns:
+        if ("CHR" in fullGWAS.columns) and ("POS" in fullGWAS.columns):
+            fullGWAS["snpid"] = fullGWAS["CHR"].apply(str) + ":" +fullGWAS["POS"].apply(str) + ":"+ fullGWAS["a1"].apply(str)+ ":"+ fullGWAS["a2"].apply(str)
+        else:
+            raise ValueError("Summary statistic file {0} doesn't contain rsID or full SNP position".format(gwas_internal_link))
    fullGWAS.set_index("snpid", inplace=True)
    print(fullGWAS.head())
    if imputation_treshold:

--- a/jass_preprocessing/map_reference.py
+++ b/jass_preprocessing/map_reference.py
@@ -21,7 +21,7 @@ def read_reference(gwas_reference_panel, mask_MHC=False, minimum_MAF=None, regio
        ref (pandas dataframe): the reference_panel with the specified filter applied
    """
    ref = pd.read_csv(gwas_reference_panel, header=None, sep= "\t",
-                      names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
+                      names =[ 'chr', "snp_id", "MAF","pos",  "ref", "alt"],
                       index_col="snp_id")
    #Filter Strand ambiguous
    ref = ref.loc[~(ref.ref+ref.alt).isin(["AT", "TA", 'CG','GC'])]
@@ -53,8 +53,15 @@ def map_on_ref_panel(gw_df , ref_panel):
    """
    inter_index = ref_panel.index.intersection(gw_df.index)
+    print(ref_panel.head())
+    print(gw_df.head())
    merge_GWAS = pd.merge(ref_panel, gw_df,
                          how='inner', indicator=True, left_index=True, right_index=True)
+    if ((merge_GWAS.pos == merge_GWAS.POS).mean()> 0.95):
+        merge_GWAS = merge_GWAS.loc[(merge_GWAS.pos == merge_GWAS.POS)]
+    else:
+        raise ValueError("SNP positions in reference panel and in Summary statistic are different! Different assembly?")
    print("SNPs {}".format(merge_GWAS.shape[0]))

--- a/jass_preprocessing/save_output.py
+++ b/jass_preprocessing/save_output.py
@@ -10,6 +10,7 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
    mgwas_copy.set_index("chr", inplace=True)
    mgwas_copy.dropna(subset=["computed_z"], how="any", inplace=True)
+    print(mgwas_copy.index.unique())
    for chrom in mgwas_copy.index.unique():
        mgwas_chr = pd.DataFrame({