diff --git a/jass/controllers/default_controller.py b/jass/controllers/default_controller.py index af5997fbb22d9264ad3b15368b4fd3386ee36bc8..6bbdb3d77484ac10d82353b2ebc2c87c4e16212d 100644 --- a/jass/controllers/default_controller.py +++ b/jass/controllers/default_controller.py @@ -1,13 +1,17 @@ -from typing import List, Dict -import os +# -*- coding: utf-8 -*- -from six import iteritems -from flask import send_file, abort -import connexion +""" +default_controller ensures the connection between the web interface and the Python JASS-analysis module +""" -from jass.models.phenotype import Phenotype, get_available_phenotypes -from jass.models.project import Project, create_project from jass.config import config +from jass.models.project import Project, create_project +from jass.models.phenotype import Phenotype, get_available_phenotypes +import connexion +from flask import send_file, abort +from six import iteritems +import os +from typing import List, Dict PHENOTYPES = get_available_phenotypes( os.path.join(config["DATA_DIR"], "initTable.hdf5") @@ -57,6 +61,10 @@ def projects_project_id_genome_get(projectID, threshold=None): def projects_project_id_global_manhattan_plot_get(projectID): + """ + projects_project_id_global_manhattan_plot_get + Gets the global Manhattan plot stored in the Project folder to display it on the Web interface + """ try: return send_file( Project(id=projectID).get_global_manhattan_plot_path(), mimetype="image/png" @@ -76,6 +84,10 @@ def projects_project_id_global_manhattan_plot_get(projectID): def projects_project_id_quadrant_plot_get(projectID): + """ + projects_project_id_quadrant_plot_get + Gets the quadrant plot stored in the Project folder to display it on the Web interface + """ try: return send_file( Project(id=projectID).get_quadrant_plot_path(), mimetype="image/png" @@ -95,9 +107,16 @@ def projects_project_id_quadrant_plot_get(projectID): def projects_project_id_genome_full_get(projectID): + """ + projects_project_id_genome_full_get + Downloads the file genome_full.csv stored in the Project folder + """ try: return send_file( - Project(id=projectID).get_csv_path(), mimetype="text/csv" + Project(id=projectID).get_csv_path(), + mimetype="text/csv", + as_attachment=True, + attachment_filename="genome_full.csv" ) except FileNotFoundError: status = Project(id=projectID).status @@ -114,10 +133,18 @@ def projects_project_id_genome_full_get(projectID): def projects_project_id_local_manhattan_data_get(projectID, chromosome, region): + """ + projects_project_id_local_manhattan_data_get + Return the SumStatTab dataframe of the Project for a given chromosome and region for the Manhattan plot + """ return Project(id=projectID).get_project_local_manhattan_data(chromosome, region) def projects_project_id_local_heatmap_data_get(projectID, chromosome, region): + """ + projects_project_id_local_heatmap_data_get + Return the SumStatTab dataframe of the Project for a given chromosome and region for the Heatmap plot + """ return Project(id=projectID).get_project_local_heatmap_data(chromosome, region) diff --git a/jass/models/plots.py b/jass/models/plots.py index 4e46050cb37ca981af7bb5586446f09edb664bc4..719f606351c87acf37585825bb30106e4a8aab03 100755 --- a/jass/models/plots.py +++ b/jass/models/plots.py @@ -1,33 +1,43 @@ # -*- coding: utf-8 -*- """ -Created on Tue Mar 28 09:57:33 2017 +This software allows to plot and store graphs which can be displayed on the web interface. -@author: vguillem -@author: hmenager -@author: hjulienn -@author: clasry +@author: vguillem, hmenager, hjulienn, clasry """ -from pandas import DataFrame, read_hdf -# create (or open) an hdf5 file and opens in append mode +import logging import numpy as np -import matplotlib +# Keep the following order: 1) importing matplotlib, (2) configuring it to use AGG, (3) importing matplotlib submodules +import matplotlib matplotlib.use("AGG") import matplotlib.pyplot as plt from matplotlib import colors import matplotlib.patches as mpatches +import os +from pandas import DataFrame, read_hdf + + def replaceZeroes(df): - ids = np.where((df !=0) & np.isfinite(df)) + """ + replaceZeroes + replace null values of df with the smallest non-zero value + """ + ids = np.where((df != 0) & np.isfinite(df)) min_nonzero = np.min(df.values[ids]) df.values[df.values == 0] = min_nonzero return df def create_global_plot(work_file_path: str, global_plot_path: str): + """ + create_global_plot + generate genome-wide manhattan plot for a given set of phenotypes + """ df = read_hdf(work_file_path, "SumStatTab") - df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes(df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes( + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) df["-log10(Joint p-value)"] = -np.log10(df.JASS_PVAL) df["ind"] = range(len(df)) @@ -63,8 +73,9 @@ def create_global_plot(work_file_path: str, global_plot_path: str): "#bcbd22", "#17becf", "#1f77b4", - "#ff7f0e", + "#ff7f0e" ] + for num, (name, group) in enumerate(df_grouped): group = DataFrame(group) group["absolute_position"] = group["position"] + m @@ -99,21 +110,33 @@ def create_global_plot(work_file_path: str, global_plot_path: str): fig.savefig(global_plot_path, dpi=300) fig.clf() + # Update Jass_progress + progress_path = os.path.join(os.path.dirname( + work_file_path), "JASS_progress.txt") + JASS_progress = 99 + file_progress = open(progress_path, "w") + file_progress.write(str(JASS_progress)) + file_progress.close() + print("------ progress -----") def create_quadrant_plot(work_file_path: str, - quadrant_plot_path: str, significance_treshold=5*10**-8): + quadrant_plot_path: str, significance_treshold=5*10**-8): """ - Create a "quadrant" plot that represent the joint test pvalue versus the univariate test pvalue for the most significant SNPs by genomic region. The plot use a logarithmic scale. + create_quadrant_plot + Create a "quadrant" plot that represent the joint test pvalue versus the univariate \ + test pvalue for the most significant SNPs by genomic region. The plot use a logarithmic scale. :param work_file_path: path to the worktable :type work_file_path: str + :param quadrant_plot_path: path to the file where to store the plot results :type quadrant_plot_path: str """ df = read_hdf(work_file_path, "Regions") - df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes(df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes( + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) #df["-log10(Joint p-value)"] = -np.log10(df.JASS_PVAL) @@ -126,26 +149,27 @@ def create_quadrant_plot(work_file_path: str, pv_t["color"] = "grey" # blue: significant pvalues for and univariate tests - cond = df.signif_status=="Both" + cond = df.signif_status == "Both" pv_t.loc[cond.values, "color"] = "#3ba3ec" b = cond.sum() # red: significant pvalues for test only - cond = df.signif_status=="Joint" + cond = df.signif_status == "Joint" pv_t.loc[cond.values, "color"] = "#f77189" r = cond.sum() # green: significant pvalues for univariate test only - cond = df.signif_status=="Univariate" + cond = df.signif_status == "Univariate" pv_t.loc[cond.values, "color"] = "#50b131" c = cond.sum() # grey: non significant pvalues - cond = df.signif_status=="None" + cond = df.signif_status == "None" a = cond.sum() fig, ax = plt.subplots(figsize=(10, 5)) plt.subplot(121) - plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, c=pv_t.color.tolist(), alpha=0.6, s=10) + plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, + c=pv_t.color.tolist(), alpha=0.6, s=10) plt.axis([0, pv_t.UNIVARIATE_MIN_PVAL.max(), 0, pv_t.JASS_PVAL.max()]) # axes abcisse et ordonnée à 8 @@ -168,13 +192,16 @@ def create_quadrant_plot(work_file_path: str, red_patch = mpatches.Patch( color='#f77189', label='{} Significant pvalues for joint test only'.format(r)) # grey: non significant pvalues - grey_patch = mpatches.Patch(color='grey', label='{} Non significant pvalues'.format(a)) + grey_patch = mpatches.Patch( + color='grey', label='{} Non significant pvalues'.format(a)) lgd = plt.legend(handles=[red_patch, blue_patch, green_patch, grey_patch], - bbox_to_anchor=(0.95, -0.25), loc='lower center', ncol=2, mode="expand", borderaxespad=0.) + bbox_to_anchor=(0.95, -0.25), loc='lower center', + ncol=2, mode="expand", borderaxespad=0.) plt.subplot(122) - plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, c=pv_t.color.tolist(), alpha=0.6, s=10) + plt.scatter(pv_t.UNIVARIATE_MIN_PVAL, pv_t.JASS_PVAL, + c=pv_t.color.tolist(), alpha=0.6, s=10) # axes abcisse et ordonnee à 8 plt.axvline(treshold, color="grey", linestyle="--") plt.axhline(treshold, color="grey", linestyle="--") @@ -185,24 +212,35 @@ def create_quadrant_plot(work_file_path: str, plt.axis([0, alim, 0, alim]) # légendes abcisse et ordonnee plt.xlabel('-log10(P) for univariate tests', fontsize=12) - #plt.show() - plt.savefig(quadrant_plot_path, dpi=600, bbox_extra_artists=(lgd,), bbox_inches='tight') + # plt.show() + plt.savefig(quadrant_plot_path, dpi=600, + bbox_extra_artists=(lgd,), bbox_inches='tight') plt.clf() nb_omnibus = r nb_total = r + b + c + # Update Jass_progress + progress_path = os.path.join(os.path.dirname( + work_file_path), "JASS_progress.txt") + JASS_progress = 100 + file_progress = open(progress_path, "w") + file_progress.write(str(JASS_progress)) + file_progress.close() + print("------ progress -----") + return (nb_omnibus, nb_total) def create_qq_plot(work_file_path: str, qq_plot_path: str): df = read_hdf(work_file_path, "SumStatTab") - df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes(df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']] = replaceZeroes( + df[['JASS_PVAL', 'UNIVARIATE_MIN_PVAL']]) pvalue = -np.log10(df.JASS_PVAL) # Cast values between 0 and 1, 0 and 1 excluded x = -np.log10(np.arange(1, pvalue.shape[0] + 1) / (pvalue.shape[0] + 2)) - y = np.sort(pvalue[:, 0]) + y = pvalue.sort_values() plt.scatter(x[::-1], y, s=5) lambda_value = np.median(y) / np.median(x) x_1 = np.linspace(0, 6) diff --git a/jass/models/project.py b/jass/models/project.py index 775c40e457df2c5ff3344cb53b6d638526ade1a3..7fa11a5c2e026be3cecdacb06c5ae42ad5332607 100644 --- a/jass/models/project.py +++ b/jass/models/project.py @@ -1,5 +1,7 @@ -# coding: utf-8 - +# -*- coding: utf-8 -*- +""" +compute joint statistics and generate plots for a given set of phenotypes +""" from __future__ import absolute_import from typing import List, Dict import os, sys @@ -8,17 +10,17 @@ import traceback from celery import Celery -from .base_model_ import Model -from ..util import deserialize_model -from .phenotype import Phenotype -from .worktable import ( +from jass.models.base_model_ import Model +from jass.util import deserialize_model +from jass.models.phenotype import Phenotype +from jass.models.worktable import ( create_worktable_file, get_worktable_summary, get_worktable_genomedata, get_worktable_local_manhattan_data, get_worktable_local_heatmap_data, ) -from .plots import create_global_plot, create_quadrant_plot +from jass.models.plots import create_global_plot, create_quadrant_plot from jass.config import config app = Celery("tasks", broker="pyamqp://guest@localhost//") @@ -41,12 +43,13 @@ class Project(Model): :param id: project ID. :type id: str """ - self.swagger_types = {"id": str, "status": str, "phenotypes": List[Phenotype]} + self.swagger_types = {"id": str, "status": str, "phenotypes": List[Phenotype], "progress": str} self.attribute_map = { "id": "id", "status": "status", "phenotypes": "phenotypes", + "progress": "progress" } self._id = id @@ -109,14 +112,34 @@ class Project(Model): self._phenotypes = phenotypes def get_folder_path(self): + """ + get_folder_path + Gets the path of the folder where the project data are stored + """ return os.path.join(config["DATA_DIR"], "project_{}".format(self.id)) def get_worktable_path(self): + """ + get_worktable_path + Gets the path of the file workTable.hdf5 + """ return os.path.join(self.get_folder_path(), "workTable.hdf5") def get_csv_path(self): + """ + get_csv_path + Gets the path of the file genome_full.csv + """ return os.path.join(self.get_folder_path(), "workTable.csv") + def get_progress_path(self): + """ + get_progress_path + Gets the path of the file containing the current progress percentage of \ + the analysis performed within the project + """ + return os.path.join(self.get_folder_path(), "JASS_progress.txt") + def get_project_summary_statistics(self): return get_worktable_summary(self.get_worktable_path()) @@ -147,6 +170,10 @@ class Project(Model): @property def status(self): + """ + status + Gets the status of the project + """ if not os.path.exists(self.get_folder_path()): return Project.DOES_NOT_EXIST else: @@ -166,6 +193,19 @@ class Project(Model): "quadrant_plot_status": quadrant_plot_status, } + @property + def progress(self): + """ + progress + Gets the percentage of completion of the phenotype analysis + """ + JASS_progress = 0 + progress_path = self.get_progress_path() + if os.path.exists(progress_path): + file_progress = open(progress_path, "r") + JASS_progress = file_progress.read() + file_progress.close() + return JASS_progress def get_file_building_tb_path(file_path): return file_path + ".log" diff --git a/jass/models/worktable.py b/jass/models/worktable.py index 15fab78b71f7d440e593b418bbf59f0ec1362959..e83ce2300edf6c5ac3eef162d4b960bc91f3ed95 100755 --- a/jass/models/worktable.py +++ b/jass/models/worktable.py @@ -2,18 +2,24 @@ """ This contains all functions for accessing the "worktable" hdf5 file. All functions either create or read a worktable at a specific path location. + @author: vguillem, hmenager, hjulienne """ +import math +from jass.models.stats import ( + make_stat_computer_nopattern, + make_stat_computer_pattern, + make_stat_computer_nan_dumb, +) +from jass.config import config import logging import os import importlib from typing import List -# from dask.dataframe import read_hdf as dask_read_hdf from pandas import HDFStore, DataFrame, concat, read_hdf, read_csv, Series, Index -# create (or open) an hdf5 file and opens in append mode import numpy as np import scipy.stats as spst import tables @@ -21,81 +27,123 @@ import warnings warnings.filterwarnings("ignore", category=tables.NaturalNameWarning) -from ..config import config -from .stats import ( - make_stat_computer_nopattern, - make_stat_computer_pattern, - make_stat_computer_nan_dumb, -) + +def signif(x, digit): + """ + signif + Round a number x to represent it with <digit> digits + + :param x: the number to round + :type x: float + + :param digit: the number of digits + :type digit: int + + :return: the rounded number + :rtype: float + + example: + >>> signif(1.2345678, 1) + 1.0 + + >>> signif(1.2345678, 3) + 1.23 + + >>> signif(1.2345678, 5) + 1.2346 + """ + if x == 0: + return 0 + + return round(x, digit - int(math.floor(math.log10(abs(x)))) - 1) + def choose_stat_function(smart_na_computation, optim_na, function_name, stat_function, sub_cov, **kwargs): if smart_na_computation: - #If stat is sumz use normal computer even with na + # If stat is sumz use normal computer even with na if function_name == "omnibus_stat": if optim_na: - stat_compute = make_stat_computer_pattern(sub_cov, stat_function) + stat_compute = make_stat_computer_pattern( + sub_cov, stat_function) else: - stat_compute = make_stat_computer_nan_dumb(sub_cov, stat_function) + stat_compute = make_stat_computer_nan_dumb( + sub_cov, stat_function) else: if function_name == "meta_analysis": - stat_compute = make_stat_computer_nopattern(sub_cov, stat_function, **kwargs) - elif function_name=="sumz_stat": + stat_compute = make_stat_computer_nopattern( + sub_cov, stat_function, **kwargs) + elif function_name == "sumz_stat": loading_file = kwargs.get('loadings', None) if loading_file is None: - #Default loadings would be one for every phenotypes - stat_compute = make_stat_computer_nopattern(sub_cov, stat_function) + # Default loadings would be one for every phenotypes + stat_compute = make_stat_computer_nopattern( + sub_cov, stat_function) else: loadings = read_csv(loading_file, index_col=0) - loadings = loadings.iloc[:,0] - stat_compute = make_stat_computer_nopattern(sub_cov, stat_function,loadings=loadings ) + loadings = loadings.iloc[:, 0] + stat_compute = make_stat_computer_nopattern( + sub_cov, stat_function, loadings=loadings) else: - stat_compute = make_stat_computer_nopattern(sub_cov, stat_function) + stat_compute = make_stat_computer_nopattern( + sub_cov, stat_function) else: stat_compute = make_stat_computer_nopattern(sub_cov, stat_function) - return stat_compute -def add_signif_status_column(region_sub_tab, significance_treshold=5*10**-8): +def add_signif_status_column(region_sub_tab, significance_treshold=5*10**-8): region_sub_tab["signif_status"] = "" + # blue: significant pvalues for omnibus and univariate tests - cond = np.where((region_sub_tab.JASS_PVAL < significance_treshold) & (region_sub_tab.UNIVARIATE_MIN_PVAL < significance_treshold))[0] + cond = np.where((region_sub_tab.JASS_PVAL < significance_treshold) & ( + region_sub_tab.UNIVARIATE_MIN_PVAL < significance_treshold))[0] region_sub_tab.loc[region_sub_tab.index[cond], "signif_status"] = "Both" # red: significant pvalues for omnibus test only - cond = np.where((region_sub_tab.JASS_PVAL < significance_treshold ) & (region_sub_tab.UNIVARIATE_MIN_PVAL > significance_treshold))[0] + cond = np.where((region_sub_tab.JASS_PVAL < significance_treshold) & ( + region_sub_tab.UNIVARIATE_MIN_PVAL > significance_treshold))[0] region_sub_tab.loc[region_sub_tab.index[cond], "signif_status"] = "Joint" # green: significant pvalues for univariate test only - cond = np.where((region_sub_tab.JASS_PVAL > significance_treshold) & (region_sub_tab.UNIVARIATE_MIN_PVAL < significance_treshold))[0] - region_sub_tab.loc[region_sub_tab.index[cond], "signif_status"] = "Univariate" + cond = np.where((region_sub_tab.JASS_PVAL > significance_treshold) & ( + region_sub_tab.UNIVARIATE_MIN_PVAL < significance_treshold))[0] + region_sub_tab.loc[region_sub_tab.index[cond], + "signif_status"] = "Univariate" # grey: non significant pvalues - cond = np.where((region_sub_tab.JASS_PVAL > significance_treshold) & (region_sub_tab.UNIVARIATE_MIN_PVAL > significance_treshold))[0] + cond = np.where((region_sub_tab.JASS_PVAL > significance_treshold) & ( + region_sub_tab.UNIVARIATE_MIN_PVAL > significance_treshold))[0] region_sub_tab.loc[region_sub_tab.index[cond], "signif_status"] = "None" return region_sub_tab + def get_region_summary(sum_stat_tab, phenotype_ids, significance_treshold=5*10**-8): # Select the most significant SNP for the joint test for each region - region_sub_tab = sum_stat_tab.sort_values("JASS_PVAL").groupby("Region").first()#.reset_index() + region_sub_tab = sum_stat_tab.sort_values( + "JASS_PVAL").groupby("Region").first() # .reset_index() # add minimum univariate p-value univar = sum_stat_tab.groupby("Region").min().UNIVARIATE_MIN_PVAL region_sub_tab.loc[univar.index, "UNIVARIATE_MIN_PVAL"] = univar.values + # Tag SNPs depending on which test is significant region_sub_tab.reset_index(inplace=True) - region_sub_tab = add_signif_status_column(region_sub_tab, significance_treshold) + region_sub_tab = add_signif_status_column( + region_sub_tab, significance_treshold) # reorder columns - region_sub_tab = region_sub_tab[['Region', "MiddlePosition", "snp_ids","CHR", "position", "Ref_allele", "Alt_allele", "JASS_PVAL", "UNIVARIATE_MIN_PVAL", "signif_status"] + phenotype_ids] + region_sub_tab = region_sub_tab[['Region', "MiddlePosition", "snp_ids", "CHR", "position", + "Ref_allele", "Alt_allele", "JASS_PVAL", "UNIVARIATE_MIN_PVAL", + "signif_status"] + phenotype_ids] return region_sub_tab -def post_computation_filtering(worktable_chunk, significant_treshold = 5*10**-8): + +def post_computation_filtering(worktable_chunk, significant_treshold=5*10**-8): """ Remove SNPs that seems aberrant: SNPs with a very low p-value that are isolated in their region @@ -113,23 +161,27 @@ def post_computation_filtering(worktable_chunk, significant_treshold = 5*10**-8) # select region with only one SNP that is significant which is # suspect - reg = res.loc[res==1].index + reg = res.loc[res == 1].index for reg_aberant in reg: - aberant_SNP = worktable_chunk.loc[worktable_chunk.Region==reg_aberant].sort_values("JASS_PVAL").index[0] + aberant_SNP = worktable_chunk.loc[worktable_chunk.Region == reg_aberant].sort_values( + "JASS_PVAL").index[0] worktable_chunk.drop(aberant_SNP, inplace=True) return worktable_chunk -def compute_pleiotropy_index(W,significance_treshold): - N_significatif = (2.0 * spst.norm.sf(W.fillna(0, inplace=False).abs())<significance_treshold).sum(1) +def compute_pleiotropy_index(W, significance_treshold): + + N_significatif = (2.0 * spst.norm.sf(W.fillna(0, + inplace=False).abs()) < significance_treshold).sum(1) N_pheno = (~W.isnull()).sum(1) - #pleiotropy index is not meaningful for too few phenotype + # pleiotropy index is not meaningful for too few phenotype S = N_significatif/N_pheno S.loc[N_pheno < 4] = np.nan return S + def create_worktable_file( phenotype_ids: List[str], init_file_path: str, @@ -139,7 +191,7 @@ def create_worktable_file( optim_na: bool = True, csv_file: str = None, chunk_size: int = 50, - significance_treshold = 5*10**-8, + significance_treshold=5*10**-8, post_filtering=True, **kwargs ): @@ -171,18 +223,30 @@ def create_worktable_file( :type significant_treshold: float : """ + # Initialization of Jass_progress + progress_path = os.path.join(os.path.dirname( + project_hdf_path), "JASS_progress.txt") + JASS_progress = 0 + file_progress = open(progress_path, "w") + file_progress.write(str(JASS_progress)) + file_progress.close() + print("------ progress -----") + # read data by chunks to optimize memory usage # select only rows (SNPs) where there are no missing data how_dropna = "any" if remove_nan else "all" if os.path.exists(project_hdf_path): os.remove(project_hdf_path) hdf_work = HDFStore(project_hdf_path) + if csv_file is not None: + if os.path.exists(csv_file): + os.remove(csv_file) # subset of phenotypes that have been selected phenolist = read_hdf(init_file_path, "PhenoList") phenolist = phenolist.loc[phenotype_ids] hdf_work.put( - "PhenoList", phenolist + "PhenoList", phenolist ) # subset of covariance matrix for the selected phenotypes @@ -194,15 +258,16 @@ def create_worktable_file( regions = read_hdf(init_file_path, "Regions").index.tolist() sum_stat_tab_min_itemsizes = {"snp_ids": 50, "Region": 10, "CHR": 5} - #['Region', "MiddlePosition", "snp_ids","CHR", "position", "Ref_allele", "Alt_allele", "JASS_PVAL", "UNIVARIATE_MIN_PVAL", "signif_status"] - region_sub_table_min_itemsizes = {"Region": 10, "index": 10, "CHR": 5, "snp_ids": 50, "signif_status":20} + region_sub_table_min_itemsizes = { + "Region": 10, "index": 10, "CHR": 5, "snp_ids": 50, "signif_status": 20} smart_na_computation = not (remove_nan) module_name, function_name = stat.split(":") stat_module = importlib.import_module(module_name) stat_fn = getattr(stat_module, function_name) - stat_compute = choose_stat_function(smart_na_computation, optim_na, function_name, stat_fn, sub_cov, samp_size=phenolist['Effective_sample_size'], **kwargs) + stat_compute = choose_stat_function(smart_na_computation, optim_na, function_name, + stat_fn, sub_cov, samp_size=phenolist['Effective_sample_size'], **kwargs) Nchunk = len(regions) // chunk_size + 1 @@ -210,11 +275,17 @@ def create_worktable_file( Nsnp_jassed = 0 for chunk in range(Nchunk): + + # the chunk index starts at zero and we take into account the 2 plot stages + JASS_progress = round((chunk + 1) * 100 / (Nchunk + 2)) + binf = chunk * chunk_size bsup = (chunk+1) * chunk_size + sum_stat_tab = read_hdf(init_file_path, 'SumStatTab', columns=[ - 'Region', 'CHR', 'position', 'snp_ids', 'Ref_allele', 'Alt_allele', 'MiddlePosition'] + phenotype_ids, - where='Region >= {0} and Region < {1}'.format(binf, bsup)) + 'Region', 'CHR', 'position', 'snp_ids', 'Ref_allele', 'Alt_allele', 'MiddlePosition'] + phenotype_ids, + where='Region >= {0} and Region < {1}'.format(binf, bsup)) + print("Regions {0} to {1}\r".format(binf, bsup)) # Remake row index unique: IMPORTANT for assignation with .loc at line 98 @@ -223,18 +294,21 @@ def create_worktable_file( ) sum_stat_tab.reset_index(drop=True, inplace=True) - if sum_stat_tab.shape[0]==0: - print("No data available for region {0} to region {1}".format(binf, bsup)) - continue # skip region if no data are available + if sum_stat_tab.shape[0] == 0: + print( + "No data available for region {0} to region {1}".format(binf, bsup)) + continue # skip region if no data are available N_pheno = len(phenotype_ids) Nsnp_total = Nsnp_total + sum_stat_tab.shape[0] if remove_nan or stat.split(":")[-1] != "omnibus_stat": - sum_stat_tab['JASS_PVAL'] = stat_compute(sum_stat_tab[phenotype_ids]) + sum_stat_tab['JASS_PVAL'] = stat_compute( + sum_stat_tab[phenotype_ids]) else: # Sort SumStatTab by missing patterns - patterns_missing, frequent_pattern = compute_frequent_missing_pattern(sum_stat_tab[phenotype_ids]) + patterns_missing, frequent_pattern = compute_frequent_missing_pattern( + sum_stat_tab[phenotype_ids]) # Apply the statistic computation by missing patterns for pattern in frequent_pattern: @@ -256,18 +330,24 @@ def create_worktable_file( sum_stat_tab.sort_values(by=["Region", "CHR"], inplace=True) sum_stat_tab["UNIVARIATE_MIN_PVAL"] = DataFrame( - 2.0 * spst.norm.sf(sum_stat_tab[phenotype_ids].fillna(0, inplace=False).abs()), - index=sum_stat_tab.index, - ).min(axis=1) + 2.0 * + spst.norm.sf(sum_stat_tab[phenotype_ids].fillna( + 0, inplace=False).abs()), + index=sum_stat_tab.index, + ).min(axis=1) - sum_stat_tab["UNIVARIATE_MIN_QVAL"] = sum_stat_tab["UNIVARIATE_MIN_PVAL"]*(1-np.isnan(sum_stat_tab[phenotype_ids]).astype(int)).sum(1) - sum_stat_tab.loc[sum_stat_tab.UNIVARIATE_MIN_QVAL>1 , "UNIVARIATE_MIN_QVAL"] = 1 + sum_stat_tab["UNIVARIATE_MIN_QVAL"] = sum_stat_tab["UNIVARIATE_MIN_PVAL"] * \ + (1-np.isnan(sum_stat_tab[phenotype_ids]).astype(int)).sum(1) + sum_stat_tab.loc[sum_stat_tab.UNIVARIATE_MIN_QVAL > + 1, "UNIVARIATE_MIN_QVAL"] = 1 - #Computing pleiotropy - sum_stat_tab["PLEIOTROPY_INDEX"] = compute_pleiotropy_index(sum_stat_tab[phenotype_ids], significance_treshold) + # Computing pleiotropy + sum_stat_tab["PLEIOTROPY_INDEX"] = compute_pleiotropy_index( + sum_stat_tab[phenotype_ids], significance_treshold) sum_stat_tab = sum_stat_tab[ - ["Region", "CHR", "snp_ids", "position", 'Ref_allele', 'Alt_allele', "MiddlePosition", "JASS_PVAL", "UNIVARIATE_MIN_PVAL", "UNIVARIATE_MIN_QVAL","PLEIOTROPY_INDEX" ] + ["Region", "CHR", "snp_ids", "position", 'Ref_allele', 'Alt_allele', "MiddlePosition", + "JASS_PVAL", "UNIVARIATE_MIN_PVAL", "UNIVARIATE_MIN_QVAL", "PLEIOTROPY_INDEX"] + phenotype_ids] if post_filtering: @@ -279,15 +359,21 @@ def create_worktable_file( if csv_file is not None: with open(csv_file, 'a') as f: - sum_stat_tab.to_csv(f, header=f.tell()==0) + sum_stat_tab.to_csv(f, header=f.tell() == 0) - region_sub_table = get_region_summary(sum_stat_tab, phenotype_ids, significance_treshold=significance_treshold) + region_sub_table = get_region_summary( + sum_stat_tab, phenotype_ids, significance_treshold=significance_treshold) hdf_work.append( "Regions", - region_sub_table - ,min_itemsize=region_sub_table_min_itemsizes + region_sub_table, min_itemsize=region_sub_table_min_itemsizes ) + + file_progress = open(progress_path, "w") + file_progress.write(str(JASS_progress)) + file_progress.close() + print("------ progress -----") + hdf_work.close() print("{1} SNPs treated on {0} SNPs".format(Nsnp_jassed, Nsnp_total)) @@ -300,12 +386,16 @@ def create_worktable_file( np.array( [ [ - sum((jost_min < significance_treshold) & (pval_min < significance_treshold)), - sum((jost_min < significance_treshold) & (pval_min > significance_treshold)), + sum((jost_min < significance_treshold) & + (pval_min < significance_treshold)), + sum((jost_min < significance_treshold) & + (pval_min > significance_treshold)), ], [ - sum((jost_min > significance_treshold) & (pval_min < significance_treshold)), - sum((jost_min > significance_treshold) & (pval_min > significance_treshold)), + sum((jost_min > significance_treshold) & + (pval_min < significance_treshold)), + sum((jost_min > significance_treshold) & + (pval_min > significance_treshold)), ], ] ) @@ -327,13 +417,15 @@ def compute_frequent_missing_pattern(sum_stat_tab): cover 99 percent of the snps """ N_pheno = sum_stat_tab.shape[1] - patterns_missing = Series(np.dot((1- sum_stat_tab.isnull()), 10**np.arange((N_pheno-1), -1, -1))) + patterns_missing = Series( + np.dot((1 - sum_stat_tab.isnull()), 10**np.arange((N_pheno-1), -1, -1))) pattern_frequency = patterns_missing.value_counts() / len(patterns_missing) - n_pattern = pattern_frequency.shape[0]#(pattern_frequency.cumsum() < 0.99).sum() + 1 + n_pattern = pattern_frequency.shape[0] + # (pattern_frequency.cumsum() < 0.99).sum() + 1 #n_pattern = min(5000, n_pattern) print("Number of pattern {}".format(n_pattern)) #print("Covering {}/1 of snps ".format(pattern_frequency.iloc[:n_pattern].sum())) - frequent_pattern = pattern_frequency.index.tolist() #[:n_pattern] + frequent_pattern = pattern_frequency.index.tolist() # [:n_pattern] return patterns_missing, frequent_pattern @@ -350,8 +442,12 @@ def stringize_dataframe_region_chr(dataframe: DataFrame): :return: The dataframe with converted Region and CHR columns :rtype: pandas.DataFrame """ - dataframe["Region"] = dataframe["Region"].apply(lambda x: "Region" + str(x)) + dataframe["Region"] = dataframe["Region"].apply( + lambda x: "Region" + str(x)) dataframe["CHR"] = dataframe["CHR"].apply(lambda x: "chr" + str(x)) + dataframe["JASS_PVAL"] = dataframe["JASS_PVAL"].apply( + lambda x: str(signif(x, 4))) + return dataframe @@ -383,8 +479,16 @@ def get_worktable_genomedata(project_hdf_path: str): :return: The dataframe as a CSV formatted text :rtype: str """ - region_subtable = stringize_dataframe_region_chr(read_hdf(project_hdf_path, "Regions")) - region_subtable.rename(index=str, columns={'JASS_PVAL':'JOSTmin'}, inplace=True) + region_subtable = stringize_dataframe_region_chr( + read_hdf(project_hdf_path, "Regions")) + + region_subtable.rename(index=str, columns={ + 'JASS_PVAL': 'JOSTmin'}, inplace=True) + + region_subtable['PVALmin'] = region_subtable['UNIVARIATE_MIN_PVAL'] + region_subtable['PVALmin'] = region_subtable['PVALmin']. apply( + lambda x: str(signif(x, 4))) + return region_subtable.to_csv(index=False) @@ -407,8 +511,9 @@ def get_worktable_local_manhattan_data( region_int = region[6:] chromosome_int = chromosome[3:] dataframe = read_hdf(project_hdf_path, "SumStatTab", - columns=["Region", "CHR", "position", "snp_ids", "JASS_PVAL"], - where=['Region='+str(region_int), 'CHR='+str(chromosome_int)]) + columns=["Region", "CHR", "position", + "snp_ids", "JASS_PVAL"], + where=['Region='+str(region_int), 'CHR='+str(chromosome_int)]) dataframe = stringize_dataframe_region_chr(dataframe) dataframe = dataframe.sort_values("position") return dataframe.to_csv(index=False) @@ -434,13 +539,14 @@ def get_worktable_local_heatmap_data( region_int = region[6:] chromosome_int = chromosome[3:] dataframe = read_hdf(project_hdf_path, "SumStatTab", - where=['Region='+str(region_int), 'CHR='+str(chromosome_int)]) + where=['Region='+str(region_int), 'CHR='+str(chromosome_int)]) dataframe = stringize_dataframe_region_chr(dataframe) dataframe = dataframe.sort_values("position") - dataframe.drop( ["Region", "CHR", "position", "JASS_PVAL", "MiddlePosition", "UNIVARIATE_MIN_PVAL", "UNIVARIATE_MIN_QVAL", "PLEIOTROPY_INDEX"], - axis=1, - inplace=True, - ) + dataframe.drop(["Region", "CHR", "position", "JASS_PVAL", "MiddlePosition", "UNIVARIATE_MIN_PVAL", + "UNIVARIATE_MIN_QVAL", "PLEIOTROPY_INDEX"], + axis=1, + inplace=True, + ) dataframe.rename(columns={"snp_ids": "ID"}, inplace=True) column_order = list(dataframe.ID) pivoted_dataframe = dataframe.pivot_table(columns="ID") diff --git a/jass/static/export.html b/jass/static/export.html index 25a6b90f3f58f5c12ece45243c3bcc0e9e1e1d7a..23f4666b72eb6bf6d26d8426d51d766dd5402908 100644 --- a/jass/static/export.html +++ b/jass/static/export.html @@ -124,22 +124,11 @@ var tr = document.createElement("tr"); //var val = data[i]['ID']; //var linkRef = data[i]['linkRef']; + for (var j=0; j<columns.length; j++) { - var td = document.createElement("td"); td.innerHTML=data[i][columns[j]]; - if(columns[j] == "JASS_PVAL"){ - var val= -Math.log10(data[i][columns[j]]); - td.innerHTML= val; - //idCounter ++; - tr.append(td); - } - - else { - td.innerHTML=data[i][columns[j]]; - tr.append(td); - } - + tr.append(td); } //var td = document.createElement("td"); //td.innerHTML='-'; diff --git a/jass/static/index.html b/jass/static/index.html index cb851f2ed187eae57156675b5f228c87f83d8ea3..376dd7668e2d284ba7027f377ac7b656c3e52f08 100644 --- a/jass/static/index.html +++ b/jass/static/index.html @@ -53,7 +53,12 @@ <p>Jass is devellopped by the <a href="https://research.pasteur.fr/en/team/statistical-genetics/" target="_blank">Statistical Genetics</a> group in collaboration with the <a href="https://research.pasteur.fr/en/team/bioinformatics-and-biostatistics-hub/">HUB</a> </p> </div> <div id="tabs-3"> - <p>Publication pending !</p> + <p><b>JASS: command line and web interface for the joint analysis of GWAS results</b><br /> + Hanna Julienne, Pierre Lechat, Vincent Guillemot, Carla Lasry, Chunzi Yao, Robinson Araud, Vincent Laville, Bjarni Vilhjalmsson, Hervé Ménager, Hugues Aschard<br /> + in: NAR Genomics and Bioinformatics, Volume 2, Issue 1, March 2020, lqaa003, <a href="https://doi.org/10.1093/nargab/lqaa003"> <FONT color=#0000FF>https://doi.org/10.1093/nargab/lqaa003</FONT></a></p> + <p><b>Multitrait genetic-phenotype associations to connect disease variants and biological mechanisms</b><br /> + Hanna Julienne, Vincent Laville, Zachary R. McCaw, Zihuai He, Vincent Guillemot, Carla Lasry, Andrey Ziyatdinov, Amaury Vaysse, Pierre Lechat, Hervé Ménager, Wilfried Le Goff, Marie-Pierre Dube, Peter Kraft, Iuliana Ionita-Laza, Bjarni J. Vilhjálmsson, Hugues Aschard<br /> + preprint in: biorxiv, <a href=https://www.biorxiv.org/content/10.1101/2020.06.26.172999v1.full> <FONT color=#0000FF>https://www.biorxiv.org/content/10.1101/2020.06.26.172999v1.full</FONT></a></p> </div> </div> </body> diff --git a/jass/static/selectPhenotypes.html b/jass/static/selectPhenotypes.html index ed804854b747480f7146737064d8825f60af0bc4..243fd59485a5e532e7411ca611a636fdf4a8939a 100644 --- a/jass/static/selectPhenotypes.html +++ b/jass/static/selectPhenotypes.html @@ -21,6 +21,10 @@ background-repeat:no-repeat; } #image-top {height:80!important} + + div.blockMe { padding: 30px; margin: 30px; border: 10px solid #ccc; background-color: #ffd } + #question { background-color: #ffc; padding: 10px; } + #question input { width: 4em } </style> <script> @@ -97,70 +101,104 @@ } $(function(){ + function avancement() { + var ava = document.getElementById("avancement"); + var prc = document.getElementById("pourcentage"); + prc.innerHTML = ava.value + "%"; + } + + function modif(val) { + var ava = document.getElementById("avancement"); + if((ava.value+val)<=ava.max && (ava.value+val)>0) { + ava.value += val; + } + avancement(); + } + + // Variable to get ids for the checkboxes var idCounter=1; $("#btn1").click(function(){ - var selectedString = getSelected().join(','); - if(selectedString != ''){ - var phe = {}; - phe['phenotypeID'] = selectedString; - console.log("!!! "+selectedString); - console.log("!!! "+phe['phenotypeID']); - $.blockUI({ css: { - border: 'none', - padding: '15px', - backgroundColor: '#000', - '-webkit-border-radius': '10px', - '-moz-border-radius': '10px', - opacity: .5, - color: '#fff'} }); + var selectedNumber = getSelected().length; + var toApply = 1; + if (selectedNumber == 1){ + // A confirmation window is displayed + var r = confirm("ð–ð€ð‘ððˆðð†: ð²ð¨ð® ð¡ðšð¯ðž ð¬ðžð¥ðžðœððžð ð¨ð§ð¥ð² ð¨ð§ðž ð©ð¡ðžð§ð¨ðð²ð©ðž!\nIt isn't the way JASS normally works.\nDo you want to continue?"); + if (r == true) { // Button OK is selected + toApply = 1; + } else { // Button CANCEL is selected + toApply = 0; + } + } + if (toApply == 1){ + var selectedString = getSelected().join(','); + if(selectedString != ''){ + var phe = {}; + phe['phenotypeID'] = selectedString; + console.log("!!! "+selectedString); + console.log("!!! "+phe['phenotypeID']); + + $.blockUI({ message: $('#question'), css: { width: '275px' }}); + avancement(); //Initialisation + - var status="-1"; - var getProjectStatus = function(){ - $.post( "/api/projects",phe).done(function( dataJson ) { - let data = JSON.parse(JSON.stringify(dataJson)); - status = data.status.worktable; - console.log("!! status "+status); - if(status =="READY"){ - $.unblockUI(); - console.log( data ); - sessionStorage.setItem("id",data.id); - console.log(data.phenotypes); - //var monobjet_json = JSON.stringify(data.phenotypes[0]); - var monobjet_json = JSON.stringify(data.phenotypes); - sessionStorage.setItem("phenotypes",monobjet_json); - console.log(data.phenotypes[0]["cohort"]); - //location.href = 'http://hub17.hosting.pasteur.fr/getVar.html'; - location.href = 'chromo_heatmap_manhattan.html'; - } - else if(status =="CREATING"){ - console.log("CREATING"); - setTimeout(getProjectStatus, 10000); - } - }); + var status="-1"; + var JASS_progress = 0; + var Old_progress = 0; + var getProjectStatus = function(){ + $.post( "/api/projects",phe).done(function( data ) { + status = data.status.worktable; + console.log("!! status "+status); + JASS_progress = data.progress; + console.log(">>>>>>> progress "+JASS_progress); + var deltaProgress = JASS_progress - Old_progress; + Old_progress = JASS_progress; + modif(deltaProgress); + + + if(status =="READY"){ + $.unblockUI(); + console.log( data ); + sessionStorage.setItem("id",data.id); + console.log(data.phenotypes); + //var monobjet_json = JSON.stringify(data.phenotypes[0]); + var monobjet_json = JSON.stringify(data.phenotypes); + sessionStorage.setItem("phenotypes",monobjet_json); + console.log(data.phenotypes[0]["cohort"]); + //location.href = 'http://hub17.hosting.pasteur.fr/getVar.html'; + location.href = 'chromo_heatmap_manhattan.html'; + } + else if(status =="CREATING"){ + console.log("CREATING"); + + + setTimeout(getProjectStatus, 10000); + } + }); - }; - getProjectStatus(); - - - } - else{ - console.log( "rien"); - $.blockUI({ message: '<h2><img src="img/busy.gif" /> Please choose an array of Phenotypes...</h2>' , - css: { - border: 'none', - padding: '15px', - backgroundColor: '#000', - '-webkit-border-radius': '10px', - '-moz-border-radius': '10px', - opacity: .5, - color: '#fff'}}); - setTimeout($.unblockUI, 2000); - } - }); + }; + getProjectStatus(); - }); + + } + else{ + console.log( "rien"); + $.blockUI({ message: '<h2><img src="img/busy.gif" /> Please choose an array of Phenotypes...</h2>' , + css: { + border: 'none', + padding: '15px', + backgroundColor: '#000', + '-webkit-border-radius': '10px', + '-moz-border-radius': '10px', + opacity: .5, + color: '#fff'}}); + setTimeout($.unblockUI, 2000); + } + } + }); + + }); @@ -176,6 +214,13 @@ <div id='divContainer'> <table id ="pheTable" style="width: 100%" class="display dataTable"></table> </div> + +<div id="question" style="display:none; cursor: default"> + <H3>Analysis in progress ...</H3> + <progress id="avancement" value="0" max="100"></progress> + <span id="pourcentage"></span> +</div> + <button id="btn1">Select Phenotypes</button> </body> diff --git a/jass/swagger/swagger.yaml b/jass/swagger/swagger.yaml index e0a2a9f0e3ccd651a900c86c429d60fc5f2b55ad..ed2d8ff239af35abe4da0466a081b19d09fa8098 100644 --- a/jass/swagger/swagger.yaml +++ b/jass/swagger/swagger.yaml @@ -89,6 +89,8 @@ paths: "global_manhattan": "CREATING" "quadrant_plot_status": "CREATING" "worktable": "CREATING" + progress": + "progress": "0" Ready: value: id: "bca9d414e0f9a67b9e0d2131a47c316c" @@ -113,6 +115,8 @@ paths: "global_manhattan": "READY" "quadrant_plot_status": "READY" "worktable": "READY" + progress": + "progress": "100" x-openapi-router-controller: jass.controllers.default_controller "/projects/{projectID}": get: @@ -415,6 +419,8 @@ components: type: string status: type: string + progress: + type: string outcome: type: array items: diff --git a/jass/test/test_plots.py b/jass/test/test_plots.py new file mode 100644 index 0000000000000000000000000000000000000000..9bdbdf0fb7dd42ff4e24493dd763c2ae0a1ad3f9 --- /dev/null +++ b/jass/test/test_plots.py @@ -0,0 +1,38 @@ +# coding: utf-8 + +from __future__ import absolute_import +import os, shutil, tempfile + +from pandas import read_hdf +from pandas.testing import assert_frame_equal + +from jass.models.plots import create_global_plot + +from . import JassTestCase + + +class TestPlots(JassTestCase): + + test_folder = "data_test1" + + def setUp(self): + # Create a temporary directory + self.test_dir = tempfile.mkdtemp() + self.worktable_hdf_path = self.get_file_path_fn("worktable.hdf5") + self.global_plot_path = os.path.join(self.test_dir, "global_manhattan.png") + + def tearDown(self): + # Remove the directory after the test + shutil.rmtree(self.test_dir) + pass + + def test_create_global_plot(self): + """ + Compare result and expected SumStatJostTab + """ + create_global_plot(self.worktable_hdf_path, self.global_plot_path) + +if __name__ == "__main__": + import unittest + + unittest.main()