remove NA from diagnostic plots/fixed function for column mapping/ allele to upper case

0f14bb66 · Hanna JULIENNE · 664bf830 · 0f14bb66 · 0f14bb66 · 0f14bb66
Commit 0f14bb66 authored 7 years ago by Hanna JULIENNE
--- a/README.md
+++ b/README.md
+
+
+# Preprocessing package for cleaning and formatting data for JASS
+
+This helper package converts heterogeneous GWAS summary statistics formats to
+the IMPG input format.
+
+## Installation
+
+*Requirement* :
+
+## Usage
+
+Once installed, the preprocessing can be launched in the following manner:
+
+
+
+## Reference panel format
+
+The user is expected to provide a reference panel with the following
+column naming and column types:
+
+| chr | pos | snp_id | ref | alt | MAF |
+|-----|-----|--------|-----|-----|-----|
+|1	|13116	|rs62635286 |	T|	G|	0.0970447|
+|1	|13118	|rs200579949 |A	|G	|0.0970447|
+|1	|14604	|rs541940975 |A	|G	|0.147564|
+|1	|14930	|rs75454623	|A	|G	|0.482228|
+
+***
+
+## Output format
+['rsID', 'pos', 'A0', "A1", "Z" ]
--- a/jass_preprocessing/jass_preprocessing/compute_score/compute.py
+++ b/jass_preprocessing/jass_preprocessing/compute_score/compute.py
@@ -54,12 +54,14 @@ def compute_sample_size(mgwas, diagnostic_folder, trait):
    ss_thres = perSS * myW_thres
    mgwas["computed_N"] = myN
    plt.clf()
-    p1 = sns.distplot(mgwas.computed_N)
+    p1 = sns.distplot(mgwas.computed_N[~mgwas.computed_N.isna()])
    p1.axvline(x=ss_thres)
    fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait)
    p1.figure.savefig(fo)
    # Filter SNP with a too small sample _SampleSize
    print("NSNP before sample size filtering: {}".format(mgwas.shape[0]))
    mgwas = mgwas.loc[(myN >= ss_thres)]
+    mgwas = mgwas.loc[~mgwas.computed_N.isna()]
+
    print("NSNP after sample size filtering: {}".format(mgwas.shape[0]))
    return(mgwas)
--- a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py
+++ b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py
@@ -47,35 +47,51 @@ def map_columns_position(gwas_internal_link,  GWAS_labels):
    """
    Find column position for each specific Gwas
    """
-    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
+    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0)
+
    gwas_file = gwas_internal_link.split('/')[-1]
-    my_labels = column_dict[column_dict['filename'] == gwas_file]
-    target_lab = my_labels.values.tolist()[0]
-    f = open(gwas_internal_link)

+    my_labels = column_dict.loc[gwas_file]
+    column_dict.head()
+    #Our standart labels:
+    reference_label = column_dict.columns.tolist()
+    # labels in the GWAS files
+    target_lab = pd.Index(my_labels.values.tolist())
+
+    f = open(gwas_internal_link)
    count_line = 0
    line = f.readline()
-    header = line.split()
-    list_col = {}
-    list_lab = {}
-    for l in range(0, len(target_lab)):
-        for h in range(0, len(header)):
-            if header[h] == target_lab[l]:
-                list_lab[my_labels.columns.tolist()[l]] = h
-                list_col[h] = my_labels.columns.tolist()[l]
-    return {'label_position' : list_col, 'position_label': list_lab}
-
-
-def read_gwas( gwas_internal_link, column_dict):
+    print(line)
+    header = pd.Index(line.split())
+
+    def get_position(I,x):
+        try:
+            return I.get_loc(x)
+        except KeyError:
+            return np.nan
+    label_position = [get_position(header,i) for i in target_lab]
+
+
+    mapgw = pd.Series(label_position, index=reference_label)
+    mapgw = mapgw.loc[~mapgw.isna()].astype(int)
+    mapgw.sort_values(inplace=True)
+    print(mapgw)
+    f.close()
+    return mapgw
+
+def read_gwas( gwas_internal_link, column_map):
    """
    Read gwas Chromosome and rename columns in our standart
    """

    fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
-                               usecols=column_dict['label_position'].keys(),
+                               usecols = column_map.values, #column_dict['label_position'].keys(),
+                               names= column_map.index,
                                index_col=0,
-                                names=column_dict['label_position'].values(),
-                                 header=0)
+                                 header=0, na_values= ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
+                                                 '-NaN',
+                                                 '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN',
+                                                 'nan', 'na', '.'])

    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
    #fullGWAS = convert_missing_values(fullGWAS)

--- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py
+++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py
@@ -80,6 +80,10 @@ def compute_snp_alignement(mgwas):
        mgwas: a pandas dataframe of the GWAS data merged
         with the reference panel
    """
+    #ensure that allele are upper cases:
+
+    mgwas['a1'] = mgwas.a1.str.upper()
+    mgwas['a2'] = mgwas.a2.str.upper()

    mgwas['a1c'] = dna_u.dna_complement(mgwas.a1)
    mgwas['a2c'] = dna_u.dna_complement(mgwas.a2)

--- a/main_preprocessing.py
+++ b/main_preprocessing.py
@@ -26,31 +26,29 @@ ldscore_format="/mnt/atlas/PCMA/1._DATA/ldscore_data/"
 REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
 pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'

-outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
-def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
-               '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan',
-               'na', '.']
-
-out_summary = "summary_GWAS.csv"
 ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/'
 gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0)
-gwas_map.index
-GWAS_table = gwas_map.index["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"]
+gwas_map
+#GWAS_table = gwas_map.index[22:]#["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"]
+#GWAS_table[5:]
 # "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt"
 #              "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt",
 #              "GIANT_2015_HIP_COMBINED_EUR.txt",
 #              ]
-for GWAS_filename in GWAS_table.index:
+
+gwas_map
+GWAS_table = ["Menopause_HapMap_for_website_18112015.txt"] # gwas_map.index#["gabriel_asthma_dummy.txt"]
+for GWAS_filename in GWAS_table:

    tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'],
                           gwas_map.loc[GWAS_filename, 'outcome'])
    print('processing GWAS: {}'.format(tag))
    start = time.time()
    gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
-    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')

    GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
    mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
+    print(mapgw)

    gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)

@@ -58,21 +56,28 @@ for GWAS_filename in GWAS_table.index:
                      names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
                       index_col="snp_id")

-    if gw_df.index.map(str).str.contains("^chr*", case=False).any():
-        ref['key2'] = "chr"+ref.chr.map(str) + ":" +ref.pos.map(str)
-        other_snp = pd.merge(ref, gw_df, how='inner', indicator=True,
-                             left_on ='key2', left_index=False, right_index=True)

    mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
+
    mgwas = jp.map_reference.compute_snp_alignement(mgwas)
    mgwas = jp.compute_score.compute_z_score(mgwas)

    mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag)
    end = time.time()
+
    print("Preprocessing of {0} in {1}s".format(tag, end-start))

    jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag)
    jp.save_output.save_output(mgwas, ldscore_format, tag)

-mgwas.reset_index(inplace=True)
-mgwas.sort_values(['chr', "pos"]).head(100)
+mapgw.sort_values(inplace=True)
+mgwas.head()
+
+GWAS_labels
+pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0)
+
+mapgw.head()
+
+GWAS_path
+GWAS_labels
+mapgw