diff --git a/jass/models/plots.py b/jass/models/plots.py index c5f12b9ff09e3a6e4da03473fb605a346e998358..599c1f71e7bbe1d9e65d15774a85f5e0bc8ddbdc 100644 --- a/jass/models/plots.py +++ b/jass/models/plots.py @@ -1,326 +1,329 @@ -# -*- coding: utf-8 -*-t -""" -This software allows to plot and store graphs which can be displayed on the web interface. - -@author: vguillem, hmenager, hjulienn, clasry -""" - -import logging -import numpy as np -import math - -# Keep the following order: 1) importing matplotlib, (2) configuring it to use AGG, (3) importing matplotlib submodules -import matplotlib -matplotlib.use("AGG") -import matplotlib.pyplot as plt -from matplotlib import colors -import matplotlib.patches as mpatches -from scipy.stats import norm, chi2 -import seaborn as sns -import os -from pandas import DataFrame, read_hdf -import pandas as pd - - -def replaceZeroes(df): - """ - replaceZeroes - replace null values of df with the smallest non-zero value - """ - ids = np.where((df != 0) & np.isfinite(df)) - min_nonzero = np.min(df.values[ids]) - df.values[df.values == 0] = min_nonzero - return df - -def create_global_plot(work_file_path: str, global_plot_path: str): - """ - create_global_plot - generate genome-wide manhattan plot for a given set of phenotypes - """ - - regions = read_hdf(work_file_path, "Regions") - chr_length = regions.groupby('CHR').max().position - N_reg= regions.Region.max() - maxy = 0 - - fig = plt.figure(figsize=(30, 12)) - ax = fig.add_subplot(111) - - chunk_size = 50 - colors = [ - '#4287f5', - 'orangered' - ] - binf=regions.Region.iloc[0] - bsup= binf+chunk_size - chr_considered= regions.CHR.unique() - length_chr = regions.groupby("CHR").MiddlePosition.max() / 10**6 - length_chr.loc[0] = 0 - label = "Chr"+length_chr.loc[chr_considered].index.astype("str") - - lab_pos = length_chr.loc[chr_considered]/2 - pos_shift = length_chr.cumsum() - pos_shift.index = pos_shift.index +1 - pos_shift.loc[chr_considered[0]] = 0 - lab_pos = lab_pos + [pos_shift.loc[i] for i in chr_considered] - - while binf < N_reg: - df = read_hdf(work_file_path, "SumStatTab", columns=["CHR","position", 'JASS_PVAL', "Region"], where = "Region >= {0} and Region < {1}".format(binf, bsup)) - binf+= chunk_size - bsup= binf+ chunk_size - df["plot_position"] = pos_shift.loc[df.CHR].values + df.position/ 10**6 - df["-log10(Joint p-value)"] = -np.log10(df.JASS_PVAL) - max_reg = np.max(df["-log10(Joint p-value)"]) - if maxy < max_reg: - maxy = max_reg - plt.scatter(df['plot_position'],df["-log10(Joint p-value)"] ,s=3,color=[colors[i%2] for i in df.CHR.astype(int)]) - - - ax.set_xticks(lab_pos) - ax.set_xticklabels(label, rotation =90) - ax.set_xlabel("", fontsize=16) - ax.set_ylabel("-log10(p-value)", fontsize=16) - # add line for significant SNPs - plt.axhline(y=-np.log10(5*10**-8), color='darkgrey') - # add line for suggestive SNPs - plt.axhline(y=-np.log10(5*10**-6), color='darkgrey', ls="--") - - ax.set_title("Joint test association results by region", fontsize=18) - - fig.savefig(global_plot_path, dpi=300) - fig.clf() - print("------ Manhattan plot -----") - - -def create_local_plot(work_file_path: str, local_plot_path: str): - """ - create_local_plot - generate a zoom plot for a given set of phenotypes - """ - # The colors associated with each chromosome are alternately red and blue - colors = ['blue', 'red'] - - df = read_hdf(work_file_path, "SumStatTab") - chr = np.int64(df["CHR"].iloc[0]) - Num_color = chr % 2 - - df['JASS_PVAL'] = replaceZeroes(df['JASS_PVAL']) - df["-log10(Joint p-value)"] = -np.log10(df.JASS_PVAL) - - Columns_to_keep = ["position", "-log10(Joint p-value)"] - df = df[Columns_to_keep] - df.sort_values(by="position", inplace=True) - - Nb_values = df.shape[0] - if(Nb_values == 0): - # ERROR: the worktable is empty - Message = "The worktable is empty: it is impossible to draw the zoom plot" - raise NameError(Message) - - start = df["position"].iloc[0] - end = df["position"].iloc[Nb_values - 1] - - significance_treshold = 0.05 / min(Nb_values, 10**6) - Line_value = - math.log10(significance_treshold) - - Max_value = max(Line_value, df["-log10(Joint p-value)"].max()) + 0.5 - - fig = plt.figure(figsize=(30, 12)) - ax = fig.add_subplot(111) - plt.scatter(df["position"], df["-log10(Joint p-value)"], color = colors[Num_color]) - plt.axhline(y=Line_value, color='green') - plt.ylim(0, Max_value) - ax.set_xlabel("position", fontsize=16) - ax.set_ylabel("-log10(p-value)", fontsize=16) - Titre = "Joint test association results by position for chromosome{} ({} SNPs in [{} bp ; {} bp])" \ - .format(chr, Nb_values, start, end) - ax.set_title(Titre, fontsize=18) - - fig.savefig(local_plot_path, dpi=600) - fig.clf() - - print("------ zoom plot -----") - - -def create_quadrant_plot(work_file_path: str, - quadrant_plot_path: str, significance_treshold=5*10**-8): - """ - create_quadrant_plot - Create a "quadrant" plot that represent the joint test pvalue versus the univariate \ - test pvalue for the most significant SNPs by genomic region. The plot use a logarithmic scale. - - :param work_file_path: path to the worktable - :type work_file_path: str - - :param quadrant_plot_path: path to the file where to store the plot results - :type quadrant_plot_path: str - """ - - df = read_hdf(work_file_path, "Regions") - df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes( - df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) - - log10_pval = -np.log10(df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) - - pv_t = DataFrame(log10_pval, columns=['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']) - - n = df.shape[0] - transparency = 0.6 - pv_t["color"] = "grey" - - # blue: significant pvalues for and univariate tests - cond = df.signif_status == "Both" - pv_t.loc[cond.values, "color"] = "#3ba3ec" - b = cond.sum() - # red: significant pvalues for test only - cond = df.signif_status == "Joint" - pv_t.loc[cond.values, "color"] = "#f77189" - r = cond.sum() - # green: significant pvalues for univariate test only - cond = df.signif_status == "Univariate" - pv_t.loc[cond.values, "color"] = "#50b131" - c = cond.sum() - - # grey: non significant pvalues - cond = df.signif_status == "None" - a = cond.sum() - - fig, ax = plt.subplots(figsize=(10, 5)) - - plt.subplot(121) - plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, - c=pv_t.color.tolist(), alpha=0.6, s=10) - - plt.axis([0, pv_t.UNIVARIATE_MIN_PVAL.max(), 0, pv_t.JASS_PVAL.max()]) - # axes abcisse et ordonnée à 8 - treshold = -np.log10(significance_treshold) - plt.axvline(treshold, color="grey", linestyle="--") - plt.axhline(treshold, color="grey", linestyle="--") - - # légendes abcisse et ordonnée - plt.xlabel('-log10(P) for univariate tests', fontsize=12) - plt.ylabel('-log10(P) for joint test', fontsize=12) - - # légendes différents groupes - # green: significant pvalues for univariate test only - green_patch = mpatches.Patch( - color='#50b131', label='{} Significant pvalues for univariate test only'.format(c)) - # blue: significant pvalues for and univariate tests - blue_patch = mpatches.Patch( - color='#3ba3ec', label='{} Significant pvalues for joint and univariate tests'.format(b)) - # red: significant pvalues for test only - red_patch = mpatches.Patch( - color='#f77189', label='{} Significant pvalues for joint test only'.format(r)) - # grey: non significant pvalues - grey_patch = mpatches.Patch( - color='grey', label='{} Non significant pvalues'.format(a)) - - lgd = plt.legend(handles=[red_patch, blue_patch, green_patch, grey_patch], - bbox_to_anchor=(0.95, -0.25), loc='lower center', - ncol=2, mode="expand", borderaxespad=0.) - - plt.subplot(122) - plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, - c=pv_t.color.tolist(), alpha=0.6, s=10) - # axes abcisse et ordonnee à 8 - plt.axvline(treshold, color="grey", linestyle="--") - plt.axhline(treshold, color="grey", linestyle="--") - # zoom on the square of region detected by jass: - alim = np.ceil(pv_t.JASS_PVAL.loc[pv_t.color == "#f77189"].max() + 2) - if np.isnan(alim): - alim = 10 - plt.axis([0, alim, 0, alim]) - # légendes abcisse et ordonnee - plt.xlabel('-log10(P) for univariate tests', fontsize=12) - # plt.show() - plt.savefig(quadrant_plot_path, dpi=600, - bbox_extra_artists=(lgd,), bbox_inches='tight') - plt.clf() - - nb_omnibus = r - nb_total = r + b + c - - - print("------ quadrant plot -----") - - #return (nb_omnibus, nb_total) - - -def create_qq_plot(work_file_path: str, qq_plot_path: str): - df = read_hdf(work_file_path, "SumStatTab", columns=['JASS_PVAL', 'UNIVARIATE_MIN_PVAL', 'UNIVARIATE_MIN_QVAL']) - - df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL', "UNIVARIATE_MIN_QVAL"]] = replaceZeroes( - df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL','UNIVARIATE_MIN_QVAL']]) - - pvalue = -np.log10(df.JASS_PVAL) - pvalue_univ = -np.log10(df.UNIVARIATE_MIN_PVAL) - qvalue_univ = -np.log10(df.UNIVARIATE_MIN_QVAL) - # compute_expected pvalue - pvalue.sort_values().values - - QQ_pval = pd.DataFrame({"JASS p-value" : pvalue.sort_values().values, - "Univariate p-value" : pvalue_univ.sort_values().values, - "Univariate q-value" : qvalue_univ.sort_values().values, - }) - QQ_pval = QQ_pval.iloc[::20,].dropna() - exp_val = np.flip(- np.log10((QQ_pval.index.values+1) / (QQ_pval.index.max()+1))) - - exp_val[-1] = QQ_pval.max().max() - QQ_pval.index = exp_val - QQ_pval = QQ_pval.iloc[:-1,] - pval_median = df.JASS_PVAL.median() - pval_median_univ = df.UNIVARIATE_MIN_PVAL.median() - pval_median_quniv = df.UNIVARIATE_MIN_QVAL.median() - - print("median pval") - print(pval_median) - lambda_value_jass = chi2.sf(pval_median, df=1) / chi2.sf(0.5, df=1) - lambda_value_jass - lambda_value_univ = chi2.sf(pval_median_univ, df=1) / chi2.sf(0.5, df=1) - lambda_value_quniv = chi2.sf(pval_median_quniv, df=1) / chi2.sf(0.5, df=1) - - p = sns.lineplot(data=QQ_pval) - alim = exp_val[-2] - plt.plot([0, alim],[0, alim], c="red", linewidth=0.5) - p.set_title("QQ plot\n λ JASS = {:.2f}\n λ univariate p-values = {:.2f} λ univariate q-values = {:.2f}".format(lambda_value_jass, lambda_value_univ, lambda_value_quniv), fontsize = 11) - p.set_xlabel("Expected -log10(p-values)", fontsize = 13) - p.set_ylabel("Observed -log10(p-values)", fontsize = 13) - - plt.savefig(qq_plot_path) - plt.clf() - print("------ QQ plot -----") - - -def create_qq_plot_by_GWAS(init_file_path: str, qq_plot_folder: str): - df = read_hdf(init_file_path, "SumStatTab", where="Region < {0}".format(2)) - uni_var = [i for i in df.columns if i[:2]=="z_"] - - for gwas in uni_var: - print(gwas) - df = read_hdf( - init_file_path, - "SumStatTab", - columns=[gwas]) - - qval_obs = df[gwas].quantile(np.linspace(0,1, 1000)) - qval_exp = norm.ppf(np.linspace(0.001,0.999, 1000)) - - # Cast values between 0 and 1, 0 and 1 excluded - x_1 = np.linspace(min(qval_obs), max(qval_obs)) - y_1 = 1.1 * x_1 - - plt.plot(x_1, y_1, c="red") - plt.scatter(qval_exp, qval_obs, s=5) - lambda_value = np.nanmedian(df[gwas]) - if np.abs(lambda_value) > 0.5: - print("Strong deviation for") - print("Zscore median {} for {}".format(lambda_value, gwas)) - - plt.title("median Zscore = {:.2f}".format(lambda_value)) - plt.xlabel("expected quantile") - plt.ylabel("observed quantile") - output_fi = "{0}/{1}.png".format(qq_plot_folder, gwas) - plt.savefig(output_fi, dpi=600) - plt.clf() - - print("------ QQ plot -----") +# -*- coding: utf-8 -*-t +""" +This software allows to plot and store graphs which can be displayed on the web interface. + +@author: vguillem, hmenager, hjulienn, clasry +""" + +import logging +import numpy as np +import math + +# Keep the following order: 1) importing matplotlib, (2) configuring it to use AGG, (3) importing matplotlib submodules +import matplotlib +matplotlib.use("AGG") +import matplotlib.pyplot as plt +from matplotlib import colors +import matplotlib.patches as mpatches +from scipy.stats import norm, chi2 +import seaborn as sns +import os +from pandas import DataFrame, read_hdf +import pandas as pd + + +def replaceZeroes(df): + """ + replaceZeroes + replace null values of df with the smallest non-zero value + """ + ids = np.where((df != 0) & np.isfinite(df)) + min_nonzero = np.min(df.values[ids]) + df.values[df.values == 0] = min_nonzero + return df + +def create_global_plot(work_file_path: str, global_plot_path: str): + """ + create_global_plot + generate genome-wide manhattan plot for a given set of phenotypes + """ + + regions = read_hdf(work_file_path, "Regions") + chr_length = regions.groupby('CHR').max().position + N_reg= regions.Region.max() + maxy = 0 + + fig = plt.figure(figsize=(30, 12)) + ax = fig.add_subplot(111) + + chunk_size = 50 + colors = [ + '#4287f5', + 'orangered' + ] + binf=regions.Region.iloc[0] + bsup= binf+chunk_size + chr_considered= regions.CHR.unique() + length_chr = regions.groupby("CHR").MiddlePosition.max() / 10**6 + length_chr.loc[0] = 0 + label = "Chr"+length_chr.loc[chr_considered].index.astype("str") + + lab_pos = length_chr.loc[chr_considered]/2 + pos_shift = length_chr.cumsum() + pos_shift.index = pos_shift.index +1 + pos_shift.loc[chr_considered[0]] = 0 + lab_pos = lab_pos + [pos_shift.loc[i] for i in chr_considered] + + while binf < N_reg: + df = read_hdf(work_file_path, "SumStatTab", columns=["CHR","position", 'JASS_PVAL', "Region"], where = "Region >= {0} and Region < {1}".format(binf, bsup)) + binf+= chunk_size + bsup= binf+ chunk_size + df["plot_position"] = pos_shift.loc[df.CHR].values + df.position/ 10**6 + df["-log10(Joint p-value)"] = -np.log10(df.JASS_PVAL) + max_reg = np.max(df["-log10(Joint p-value)"]) + if maxy < max_reg: + maxy = max_reg + plt.scatter(df['plot_position'],df["-log10(Joint p-value)"] ,s=3,color=[colors[i%2] for i in df.CHR.astype(int)]) + + + ax.set_xticks(lab_pos) + ax.set_xticklabels(label, rotation =90) + ax.set_xlabel("", fontsize=16) + ax.set_ylabel("-log10(p-value)", fontsize=16) + # add line for significant SNPs + plt.axhline(y=-np.log10(5*10**-8), color='darkgrey') + # add line for suggestive SNPs + plt.axhline(y=-np.log10(5*10**-6), color='darkgrey', ls="--") + + ax.set_title("Joint test association results by region", fontsize=18) + + fig.savefig(global_plot_path, dpi=300) + fig.clf() + print("------ Manhattan plot -----") + + +def create_local_plot(work_file_path: str, local_plot_path: str): + """ + create_local_plot + generate a zoom plot for a given set of phenotypes + """ + # The colors associated with each chromosome are alternately red and blue + colors = ['blue', 'red'] + + df = read_hdf(work_file_path, "SumStatTab") + chr = np.int64(df["CHR"].iloc[0]) + Num_color = chr % 2 + + df['JASS_PVAL'] = replaceZeroes(df['JASS_PVAL']) + df["-log10(Joint p-value)"] = -np.log10(df.JASS_PVAL) + + Columns_to_keep = ["position", "-log10(Joint p-value)"] + df = df[Columns_to_keep] + df.sort_values(by="position", inplace=True) + + Nb_values = df.shape[0] + if(Nb_values == 0): + # ERROR: the worktable is empty + Message = "The worktable is empty: it is impossible to draw the zoom plot" + raise NameError(Message) + + start = df["position"].iloc[0] + end = df["position"].iloc[Nb_values - 1] + + significance_treshold = 0.05 / min(Nb_values, 10**6) + Line_value = - math.log10(significance_treshold) + + Max_value = max(Line_value, df["-log10(Joint p-value)"].max()) + 0.5 + + fig = plt.figure(figsize=(30, 12)) + ax = fig.add_subplot(111) + plt.scatter(df["position"], df["-log10(Joint p-value)"], color = colors[Num_color]) + plt.axhline(y=Line_value, color='green') + plt.ylim(0, Max_value) + ax.set_xlabel("position", fontsize=16) + ax.set_ylabel("-log10(p-value)", fontsize=16) + Titre = "Joint test association results by position for chromosome{} ({} SNPs in [{} bp ; {} bp])" \ + .format(chr, Nb_values, start, end) + ax.set_title(Titre, fontsize=18) + + fig.savefig(local_plot_path, dpi=600) + fig.clf() + + print("------ zoom plot -----") + + +def create_quadrant_plot(work_file_path: str, + quadrant_plot_path: str, significance_treshold=5*10**-8): + """ + create_quadrant_plot + Create a "quadrant" plot that represent the joint test pvalue versus the univariate \ + test pvalue for the most significant SNPs by genomic region. The plot use a logarithmic scale. + + :param work_file_path: path to the worktable + :type work_file_path: str + + :param quadrant_plot_path: path to the file where to store the plot results + :type quadrant_plot_path: str + """ + + df = read_hdf(work_file_path, "Regions") + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes( + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) + + log10_pval = -np.log10(df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) + + pv_t = DataFrame(log10_pval, columns=['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']) + + n = df.shape[0] + transparency = 0.6 + pv_t["color"] = "grey" + + # blue: significant pvalues for and univariate tests + cond = df.signif_status == "Both" + pv_t.loc[cond.values, "color"] = "#3ba3ec" + b = cond.sum() + # red: significant pvalues for test only + cond = df.signif_status == "Joint" + pv_t.loc[cond.values, "color"] = "#f77189" + r = cond.sum() + # green: significant pvalues for univariate test only + cond = df.signif_status == "Univariate" + pv_t.loc[cond.values, "color"] = "#50b131" + c = cond.sum() + + # grey: non significant pvalues + cond = df.signif_status == "None" + a = cond.sum() + + fig, ax = plt.subplots(figsize=(10, 5)) + + plt.subplot(121) + plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, + c=pv_t.color.tolist(), alpha=0.6, s=10) + + plt.axis([0, pv_t.UNIVARIATE_MIN_PVAL.max(), 0, pv_t.JASS_PVAL.max()]) + # axes abcisse et ordonnée à 8 + treshold = -np.log10(significance_treshold) + plt.axvline(treshold, color="grey", linestyle="--") + plt.axhline(treshold, color="grey", linestyle="--") + + # légendes abcisse et ordonnée + plt.xlabel('-log10(P) for univariate tests', fontsize=12) + plt.ylabel('-log10(P) for joint test', fontsize=12) + + # légendes différents groupes + # green: significant pvalues for univariate test only + green_patch = mpatches.Patch( + color='#50b131', label='{} Significant pvalues for univariate test only'.format(c)) + # blue: significant pvalues for and univariate tests + blue_patch = mpatches.Patch( + color='#3ba3ec', label='{} Significant pvalues for joint and univariate tests'.format(b)) + # red: significant pvalues for test only + red_patch = mpatches.Patch( + color='#f77189', label='{} Significant pvalues for joint test only'.format(r)) + # grey: non significant pvalues + grey_patch = mpatches.Patch( + color='grey', label='{} Non significant pvalues'.format(a)) + + lgd = plt.legend(handles=[red_patch, blue_patch, green_patch, grey_patch], + bbox_to_anchor=(0.95, -0.25), loc='lower center', + ncol=2, mode="expand", borderaxespad=0.) + + plt.subplot(122) + plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, + c=pv_t.color.tolist(), alpha=0.6, s=10) + # axes abcisse et ordonnee à 8 + plt.axvline(treshold, color="grey", linestyle="--") + plt.axhline(treshold, color="grey", linestyle="--") + # zoom on the square of region detected by jass: + alim = np.ceil(pv_t.JASS_PVAL.loc[pv_t.color == "#f77189"].max() + 2) + if np.isnan(alim): + alim = 10 + plt.axis([0, alim, 0, alim]) + # légendes abcisse et ordonnee + plt.xlabel('-log10(P) for univariate tests', fontsize=12) + # plt.show() + plt.savefig(quadrant_plot_path, dpi=600, + bbox_extra_artists=(lgd,), bbox_inches='tight') + plt.clf() + + nb_omnibus = r + nb_total = r + b + c + + + print("------ quadrant plot -----") + + #return (nb_omnibus, nb_total) + + +def create_qq_plot(work_file_path: str, qq_plot_path: str): + df = read_hdf(work_file_path, "SumStatTab", columns=['JASS_PVAL', 'UNIVARIATE_MIN_PVAL', 'UNIVARIATE_MIN_QVAL']) + + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL', "UNIVARIATE_MIN_QVAL"]] = replaceZeroes( + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL','UNIVARIATE_MIN_QVAL']]) + + + dfr = read_hdf(work_file_path, "SumStatTab", where="Region==1")) + k= len([c for c in dfr.columns if c[:2]=="z_"]) + pvalue = -np.log10(df.JASS_PVAL) + pvalue_univ = -np.log10(df.UNIVARIATE_MIN_PVAL) + qvalue_univ = -np.log10(df.UNIVARIATE_MIN_QVAL) + # compute_expected pvalue + pvalue.sort_values().values + + QQ_pval = pd.DataFrame({"JASS p-value" : pvalue.sort_values().values, + "Univariate p-value" : pvalue_univ.sort_values().values, + "Univariate q-value" : qvalue_univ.sort_values().values, + }) + QQ_pval = QQ_pval.iloc[::20,].dropna() + exp_val = np.flip(- np.log10((QQ_pval.index.values+1) / (QQ_pval.index.max()+1))) + + exp_val[-1] = QQ_pval.max().max() + QQ_pval.index = exp_val + QQ_pval = QQ_pval.iloc[:-1,] + pval_median = df.JASS_PVAL.median() + pval_median_univ = df.UNIVARIATE_MIN_PVAL.median() + pval_median_quniv = df.UNIVARIATE_MIN_QVAL.median() + + print("median pval") + + lambda_value_jass = chi2.sf(pval_median, df=k) / chi2.sf(0.5, df=k) + + lambda_value_univ = chi2.sf(pval_median_univ, df=1) / chi2.sf(0.5, df=1) + lambda_value_quniv = chi2.sf(pval_median_quniv, df=1) / chi2.sf(0.5, df=1) + + p = sns.lineplot(data=QQ_pval) + alim = exp_val[-2] + plt.plot([0, alim],[0, alim], c="red", linewidth=0.5) + p.set_title("QQ plot\n λ JASS = {:.2f}\n λ univariate p-values = {:.2f} λ univariate q-values = {:.2f}".format(lambda_value_jass, lambda_value_univ, lambda_value_quniv), fontsize = 11) + p.set_xlabel("Expected -log10(p-values)", fontsize = 13) + p.set_ylabel("Observed -log10(p-values)", fontsize = 13) + + plt.savefig(qq_plot_path) + plt.clf() + print("------ QQ plot -----") + + +def create_qq_plot_by_GWAS(init_file_path: str, qq_plot_folder: str): + df = read_hdf(init_file_path, "SumStatTab", where="Region < {0}".format(2)) + uni_var = [i for i in df.columns if i[:2]=="z_"] + + for gwas in uni_var: + print(gwas) + df = read_hdf( + init_file_path, + "SumStatTab", + columns=[gwas]) + + qval_obs = df[gwas].quantile(np.linspace(0,1, 1000)) + qval_exp = norm.ppf(np.linspace(0.001,0.999, 1000)) + + # Cast values between 0 and 1, 0 and 1 excluded + x_1 = np.linspace(min(qval_obs), max(qval_obs)) + y_1 = 1.1 * x_1 + + plt.plot(x_1, y_1, c="red") + plt.scatter(qval_exp, qval_obs, s=5) + lambda_value = np.nanmedian(df[gwas]) + if np.abs(lambda_value) > 0.5: + print("Strong deviation for") + print("Zscore median {} for {}".format(lambda_value, gwas)) + + plt.title("median Zscore = {:.2f}".format(lambda_value)) + plt.xlabel("expected quantile") + plt.ylabel("observed quantile") + output_fi = "{0}/{1}.png".format(qq_plot_folder, gwas) + plt.savefig(output_fi, dpi=600) + plt.clf() + + print("------ QQ plot -----")