diff --git a/ippisite/ippidb/admin.py b/ippisite/ippidb/admin.py
index 935d70556b2954f2a080686abf6529d6fab93b13..badec17a76a15260f971e5ff5714ddf9244e6ce2 100644
--- a/ippisite/ippidb/admin.py
+++ b/ippisite/ippidb/admin.py
@@ -30,6 +30,7 @@ from .models import (
     Contribution,
     update_compound_cached_properties,
 )
+from .tasks import validate_compounds
 
 
 class ViewOnSiteModelAdmin(admin.ModelAdmin):
@@ -118,35 +119,14 @@ class CompoundModelAdmin(ViewOnSiteModelAdmin):
     )
     search_fields = ("id", "iupac_name", "common_name", "canonical_smile")
 
-    actions = ["compute_properties", "cache_properties", "validate_contributions"]
-
-    def compute_properties(self, request, queryset):
-        ids = [id for id in queryset.values_list("id", flat=True)]
-        call_command("compute_compound_properties", "update", "--ids", *ids)
-        self.message_user(
-            request,
-            f"properties have been computed for compound(s) {','.join(str(id) for id in queryset.values_list('id', flat=True))}.",
-        )
-
-    def cache_properties(self, request, queryset):
-        update_compound_cached_properties(queryset)
-        self.message_user(
-            request,
-            f"properties have been cached for compound(s) {','.join(str(id) for id in queryset.values_list('id', flat=True))}.",
-        )
+    actions = ["validate_contributions"]
 
     def validate_contributions(self, request, queryset):
-        count = 0
-        for compound in queryset.all():
-            for ca in compound.compoundaction_set.all():
-                for contrib in ca.ppi.contribution_set.all():
-                    if contrib.validated is False:
-                        contrib.validated = True
-                        contrib.save()
-                        count += 1
+        ids = [id for id in queryset.values_list("id", flat=True)]
+        validate_compounds.delay(ids)
         self.message_user(
             request,
-            f"{count} contribution(s) on compound(s) {','.join(str(id) for id in queryset.values_list('id', flat=True))} validated.",
+            f"validation started for compound(s) {','.join(str(id) for id in queryset.values_list('id', flat=True))} (this might take a while).",
         )
 
 
diff --git a/ippisite/ippidb/gx.py b/ippisite/ippidb/gx.py
index 32acef6ba0cf3c61ff6db0caeefd4e38af19dc2f..47287df182c2c3a661f3f1bbf96338d39a5ddce2 100644
--- a/ippisite/ippidb/gx.py
+++ b/ippisite/ippidb/gx.py
@@ -1,20 +1,104 @@
 """
 iPPI-DB-Galaxy communication module
 """
+from datetime import datetime
+import time
+
 from bioblend.galaxy import GalaxyInstance
+from django.conf import settings
+import requests
+import urllib3
+
+BASE_URL = settings.GALAXY_BASE_URL
+KEY = settings.GALAXY_APIKEY
+WORKFLOW_ID = settings.GALAXY_COMPOUNDPROPERTIES_WORKFLOWID
+
+
+urllib3.disable_warnings()
 
-workflow_id = "dad6103ff71ca4fe"
-galaxy_url = "https://galaxy-dev.web.pasteur.fr"
-api_key = "21c2ce387688b1a785040762f7c9c331"
 
+class GalaxyCompoundPropertiesRunner(object):
+    def __init__(self):
+        self.galaxy_instance = GalaxyInstance(url=BASE_URL, key=KEY, verify=False)
+        self.galaxy_instance.nocache = True
 
-def run_workflow_and_get_results(input_file):
-    gi = GalaxyInstance(galaxy_url, key=api_key)
-    gi.verify = False
-    history_id = gi.histories.create_history("ippidb_history")["id"]
-    dataset_id = gi.tools.upload_file(input_file, history_id)["outputs"][0]["id"]
-    inputs = {"0": {"id": dataset_id, "src": "hda"}}
-    workflow_run = gi.workflows.invoke_workflow(
-        workflow_id, inputs=inputs, history_id=history_id
-    )
-    print(workflow_run)
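+    # Run the compound-properties workflow synchronously on the configured
+    # Galaxy server: upload the SDF file, poll the upload job, invoke the
+    # workflow, poll the invocation, then locate the JSON result dataset
+    # in the run history and download its contents.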
+    def compute_properties_for_sdf_file(self, sdf_file_path):
+        # create a history to store the workflow results
+        now = datetime.now()
+        date_time = now.strftime("%Y/%m/%d-%H:%M:%S")
+        history_name = "compoundpropertiesjobrun_%s" % date_time
+        history = self.galaxy_instance.histories.create_history(name=history_name)
+        history_id = history["id"]
+        if history["state"] not in ["new", "ok"]:
+            raise Exception(
+                f'Error creating history "{history_name}" (id {history_id})'
+            )
+        # launch data upload job
+        upload_response = self.galaxy_instance.tools.upload_file(
+            path=sdf_file_path, file_type="sdf", history_id=history_id
+        )
+        upload_data_id = upload_response["outputs"][0]["id"]
+        upload_job = upload_response["jobs"][0]
+        upload_job_id = upload_job["id"]
+        # monitor data upload until completed or on error
+        while upload_job["state"] not in ["ok"]:
+            time.sleep(2)
+            upload_job = self.galaxy_instance.jobs.show_job(upload_job_id)
+            if upload_job["state"] in ["error", "deleted", "discarded"]:
+                data = self.galaxy_instance.datasets.show_dataset(upload_data_id)
+                raise Exception(
+                    f"Error during Galaxy data upload job - name : "
+                    f"{data['name']}, id : {upload_data_id}, "
+                    f"error : {data['misc_info']}"
+                )
+        # check uploaded dataset status
+        data = self.galaxy_instance.datasets.show_dataset(upload_data_id)
+        if data["state"] not in ["ok"]:
+            raise Exception(
+                f"Error during Galaxy data upload result - name : "
+                f"{data['name']}, id : {upload_data_id}, "
+                f"error : {data['misc_info']}"
+            )
+        # submit compound properties computation job
+        dataset_map = {"0": {"src": "hda", "id": upload_data_id}}
+        workflow_job = self.galaxy_instance.workflows.invoke_workflow(
+            WORKFLOW_ID, inputs=dataset_map, history_id=history_id
+        )
+        workflow_job_id = workflow_job["id"]
+        # monitor the workflow invocation until scheduled or on error
+        while workflow_job["state"] not in ["ok", "scheduled"]:
+            time.sleep(2)
+            workflow_job = self.galaxy_instance.workflows.show_invocation(
+                WORKFLOW_ID, workflow_job_id
+            )
+            if workflow_job["state"] in ["error", "deleted", "discarded"]:
+                raise Exception(
+                    f"Error during Galaxy workflow job - "
+                    f"invocation id : {workflow_job_id}"
+                )
+        # the workflow result is the JSON dataset produced in the run history
+        datasets = self.galaxy_instance.histories.show_history(
+            history_id, contents=True
+        )
+        actual_result_dataset = None
+        for dataset in datasets:
+            if dataset["extension"] == "json":
+                actual_result_dataset = dataset
+        if actual_result_dataset is None:
+            raise Exception(
+                f"Result for galaxy workflow invocation {workflow_job_id} not found in"
+                f" history {history_id}"
+            )
+        dataset = self.galaxy_instance.datasets.show_dataset(
+            actual_result_dataset["id"]
+        )
+        # wait for the result dataset to be ready, then download it
+        while dataset["state"] not in ["ok"]:
+            time.sleep(2)
+            try:
+                dataset = self.galaxy_instance.datasets.show_dataset(
+                    actual_result_dataset["id"]
+                )
+            except requests.exceptions.ChunkedEncodingError:
+                continue
+        download_url = dataset["download_url"]
+        contents_resp = requests.get(BASE_URL + download_url, verify=False)
+        contents = contents_resp.json()
+        return contents
diff --git a/ippisite/ippidb/tasks.py b/ippisite/ippidb/tasks.py
new file mode 100644
index 0000000000000000000000000000000000000000..904484f7e11e8fed7223e2a7f0a89ccd428c03f6
--- /dev/null
+++ b/ippisite/ippidb/tasks.py
@@ -0,0 +1,308 @@
+from __future__ import absolute_import, unicode_literals
+import json
+import tempfile
+import io
+import base64
+import itertools
+
+
+from celery import shared_task
+import seaborn as sns
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+
+# TODO send email from django.core.mail import mail_managers
+from django.forms.models import model_to_dict
+
+from .models import (
+    Compound,
+    update_compound_cached_properties,
+    LeLleBiplotData,
+    PcaBiplotData,
+)
+from .utils import smi2sdf
+from .gx import GalaxyCompoundPropertiesRunner
+
+
+def dec(decimal_places):
+    def func(number):
+        return round(float(number), decimal_places)
+
+    return func
+
+
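+# Compute physico-chemical properties for the given compounds: build an
+# SDF file from their canonical SMILES, run the Galaxy workflow on it and
+# map the returned values onto Compound fields (decimals rounded via dec).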
+def compute_compound_properties(compound_ids):
+    compounds = Compound.objects.filter(id__in=compound_ids)
+    runner = GalaxyCompoundPropertiesRunner()
+    smiles_dict = {}
+    for c in compounds:
+        smiles_dict[c.id] = c.canonical_smile
+    # create SDF file for the selection
+    sdf_string = smi2sdf(smiles_dict)
+    fh = tempfile.NamedTemporaryFile(mode="w", delete=False)
+    fh.write(sdf_string)
+    fh.close()
+    print(f"Galaxy input SDF file for compounds {smiles_dict.keys()}: {fh.name}")
+    # run computations on Galaxy
+    pc_properties = runner.compute_properties_for_sdf_file(fh.name)
+    pc_properties_dict = {compound["Name"]: compound for compound in pc_properties}
+    fh = tempfile.NamedTemporaryFile(mode="w", delete=False)
+    json.dump(pc_properties_dict, fh, indent=4)
+    fh.close()
+    print(
+        f"Properties added for compounds {smiles_dict.keys()} in JSON file: {fh.name}"
+    )
+    # report and update database
+    property_mapping = {
+        "CanonicalSmile": ("canonical_smile", str),
+        "IUPAC": ("iupac_name", str),
+        "TPSA": ("tpsa", dec(2)),
+        "NbMultBonds": ("nb_multiple_bonds", int),
+        "BalabanIndex": ("balaban_index", dec(2)),
+        "NbDoubleBonds": ("nb_double_bonds", int),
+        "RDF070m": ("rdf070m", dec(2)),
+        "SumAtomPolar": ("sum_atom_polar", dec(2)),
+        "SumAtomVolVdW": ("sum_atom_vol_vdw", dec(2)),
+        "MolecularWeight": ("molecular_weight", dec(2)),
+        "NbCircuits": ("nb_circuits", int),
+        "NbAromaticsSSSR": ("nb_aromatic_sssr", int),
+        "NbAcceptorH": ("nb_acceptor_h", int),
+        "NbF": ("nb_f", int),
+        "Ui": ("ui", dec(2)),
+        "NbO": ("nb_o", int),
+        "NbCl": ("nb_cl", int),
+        "NbBonds": ("nb_bonds", int),
+        "LogP": ("a_log_p", dec(2)),
+        "RandicIndex": ("randic_index", dec(2)),
+        "NbBondsNonH": ("nb_bonds_non_h", int),
+        "NbAromaticsEther": ("nb_aromatic_ether", int),
+        "NbChiralCenters": ("nb_chiral_centers", int),
+        "NbBenzLikeRings": ("nb_benzene_like_rings", int),
+        "RotatableBondFraction": ("rotatable_bond_fraction", dec(2)),
+        "LogD": ("log_d", dec(2)),
+        "WienerIndex": ("wiener_index", int),
+        "NbN": ("nb_n", int),
+        "NbC": ("nb_c", int),
+        "NbAtom": ("nb_atom", int),
+        "NbAromaticsBonds": ("nb_aromatic_bonds", int),
+        "MeanAtomVolVdW": ("mean_atom_vol_vdw", dec(2)),
+        "AromaticRatio": ("aromatic_ratio", dec(2)),
+        "NbAtomNonH": ("nb_atom_non_h", int),
+        "NbDonorH": ("nb_donor_h", int),
+        "NbI": ("nb_i", int),
+        "NbRotatableBonds": ("nb_rotatable_bonds", int),
+        "NbRings": ("nb_rings", int),
+        "NbCsp2": ("nb_csp2", int),
+        "NbCsp3": ("nb_csp3", int),
+        "NbBr": ("nb_br", int),
+        "GCMolarRefractivity": ("gc_molar_refractivity", dec(2)),
+        "NbAliphaticsAmines": ("nb_aliphatic_amines", int),
+    }
+    ippidb_convs = {value[0]: value[1] for key, value in property_mapping.items()}
+    ippidb_convs["id"] = int
+    for cid, item in pc_properties_dict.items():
+        compound = Compound.objects.get(id=cid)
+        updated_properties = {}
+        for galaxy_prop, prop in property_mapping.items():
+            ippidb_prop = prop[0]
+            ippidb_conv = prop[1]
+            try:
+                updated_properties[ippidb_prop] = ippidb_conv(item[galaxy_prop])
+            except ValueError as ve:
+                print(
+                    f"Error setting property {ippidb_prop} to {item[galaxy_prop]} in compound {compound.id} \ndetails:{ve}"
+                )
+        for key, value in updated_properties.items():
+            setattr(compound, key, value)
+        compound.save()
+
+
+def compute_drugbank_similarity(compound_ids):
+    compounds = Compound.objects.filter(id__in=compound_ids)
+    for c in compounds:
+        # saving with autofill=True triggers recomputation of the
+        # DrugBank similarity
+        c.save(autofill=True)
+
+
+def validate(compound_ids):
+    compounds = Compound.objects.filter(id__in=compound_ids)
+    for c in compounds:
+        for ca in c.compoundaction_set.all():
+            for contribution in ca.ppi.contribution_set.filter(validated=False):
+                contribution.validated = True
+                contribution.save()
+
+
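+# Rebuild the LE vs. LLE biplot: flush LeLleBiplotData, then store one
+# JSON point ({x, y, id, family_name, smiles}) per validated compound.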
+def generate_le_lle_plot():
+    print("Generating the LE vs. LLE biplot...")
+    le_lle_data = []
+    LeLleBiplotData.objects.all().delete()
+    print("Successfully flushed LE-LLE biplot data")
+    for comp in Compound.objects.validated():
+        if comp.le is not None:
+            le = round(comp.le, 7)
+            lle = round(comp.lle, 7)
+            le_lle_data.append(
+                {
+                    "x": le,
+                    "y": lle,
+                    "id": comp.id,
+                    "family_name": comp.best_activity_ppi_family_name,
+                    "smiles": comp.canonical_smile,
+                }
+            )
+        else:
+            print("compound %s has no LE" % comp.id)
+    le_lle_json = json.dumps(le_lle_data, separators=(",", ":"))
+    new = LeLleBiplotData()
+    new.le_lle_biplot_data = le_lle_json
+    new.save()
+    print("Successfully generated LE-LLE biplot data")
+
+
+def plot_circle():
+    theta = np.linspace(0, 2 * np.pi, 100)
+    r = np.sqrt(1.0)
+    x1 = r * np.cos(theta)
+    x2 = r * np.sin(theta)
+    return x1, x2
+
+
+def generate_pca_plot():
+    print("Generating the PCA biplot...")
+    pca_data = []
+    features = [
+        "molecular_weight",
+        "a_log_p",
+        "nb_donor_h",
+        "nb_acceptor_h",
+        "tpsa",
+        "nb_rotatable_bonds",
+        "nb_benzene_like_rings",
+        "fsp3",
+        "nb_chiral_centers",
+        "nb_csp3",
+        "nb_atom",
+        "nb_bonds",
+        "nb_atom_non_h",
+        "nb_rings",
+        "nb_multiple_bonds",
+        "nb_aromatic_bonds",
+        "aromatic_ratio",
+    ]
+    PcaBiplotData.objects.all().delete()
+    print("Successfully flushed PCA biplot data")
+    values_list = []
+    for comp in Compound.objects.validated():
+        values = model_to_dict(comp, fields=features + ["id", "family"])
+        values["family"] = comp.best_activity_ppi_family_name
+        values_list.append(values)
+    df = pd.DataFrame(values_list)
+    # drop compounds with undefined property values
+    df.dropna(how="any", inplace=True)
+    # prepare correlation circle figure
+    plt.switch_backend("Agg")
+    fig_, ax = plt.subplots(figsize=(6, 6))
+    x1, x2 = plot_circle()
+    plt.plot(x1, x2)
+    ax.set_aspect(1)
+    ax.yaxis.set_label_coords(-0.1, 0.5)
+    ax.xaxis.set_label_coords(0.5, -0.08)
+    # do not process the data unless there are compounds in the dataframe
+    if len(df.index) > 0:
+        x = df.loc[:, features].values
+        y = df.loc[:, ["family"]].values
+        x = StandardScaler().fit_transform(x)
+        n = x.shape[0]  # retrieve number of individuals
+        p = x.shape[1]  # retrieve number of variables
+        pca = PCA(n_components=p)
+        principal_components = pca.fit_transform(x)
+        # compute correlation circle
+        variance_ratio = pd.Series(pca.explained_variance_ratio_)
+        coef = np.transpose(pca.components_)
+        cols = ["PC-" + str(x) for x in range(len(variance_ratio))]
+        pc_infos = pd.DataFrame(coef, columns=cols, index=features)
+        # we might remove the line below if the PCA remains grayscale
+        pal = itertools.cycle(sns.color_palette("dark", len(features)))  # noqa: F841
+        # compute the length of each feature arrow in the correlation circle
+        pc_infos["DIST"] = pc_infos[["PC-0", "PC-1"]].pow(2).sum(1).pow(0.5)
+        # store the maximal length for normalization purposes
+        best_distance = max(pc_infos["DIST"])
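+        # The correlation circle plots variable/component correlations:
+        # scaling each loading by the square root of its eigenvalue
+        # (rescaled by (n - 1) / n below) gives the correlation between an
+        # original feature and a principal component.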
+        # compute corvar for the correlation circle
+        eigval = (float(n) - 1) / float(n) * pca.explained_variance_
+        sqrt_eigval = np.sqrt(eigval)
+        corvar = np.zeros((p, p))
+        for k in range(p):
+            corvar[:, k] = pca.components_[k, :] * sqrt_eigval[k]
+        for idx in range(len(pc_infos["PC-0"])):
+            x = corvar[idx, 0]  # use corvar to plot the variable map
+            y = corvar[idx, 1]  # use corvar to plot the variable map
+            color = "black"
+            # alpha is the feature length normalized
+            # to the longest feature's length
+            alpha = pc_infos["DIST"][idx] / best_distance
+            plt.arrow(0.0, 0.0, x, y, head_width=0.02, color="black", alpha=alpha)
+            plt.annotate(
+                Compound._meta.get_field(pc_infos.index[idx]).verbose_name,
+                xy=(x, y),
+                xycoords="data",
+                xytext=np.asarray((x, y)) + (0.02, -0.02),
+                fontsize=6,
+                color=color,
+                alpha=alpha,
+            )
+        plt.xlabel("PC-1 (%.1f%%)" % (variance_ratio[0] * 100))
+        plt.ylabel("PC-2 (%.1f%%)" % (variance_ratio[1] * 100))
+        plt.xlim((-1, 1))
+        plt.ylim((-1, 1))
+        principal_df = pd.DataFrame(data=principal_components)
+        # only select the two first dimensions for the plot, and rename them to x and y
+        principal_df = principal_df.iloc[:, 0:2]
+        principal_df = principal_df.rename(columns={0: "x", 1: "y"})
+        final_df = pd.concat([principal_df, df[["family", "id"]]], axis=1)
+        for index, row in final_df.iterrows():
+            smiles = Compound.objects.get(id=row.id).canonical_smile
+            pca_data.append(
+                {
+                    "x": row.x,
+                    "y": row.y,
+                    "id": row.id,
+                    "family_name": row.family,
+                    "smiles": smiles,
+                }
+            )
+    else:
+        pca_data = []
+    # save correlation circle PNG
+    my_string_io_bytes = io.BytesIO()
+    plt.savefig(my_string_io_bytes, format="png", dpi=600, bbox_inches="tight")
+    my_string_io_bytes.seek(0)
+    # figdata_png is the correlation circle figure, base 64-encoded
+    figdata_png = base64.b64encode(my_string_io_bytes.read())
+    pca_data_cc = {
+        "data": pca_data,
+        "correlation_circle": figdata_png.decode("utf-8"),
+    }
+    pca_json = json.dumps(pca_data_cc, separators=(",", ":"))
+    new = PcaBiplotData()
+    new.pca_biplot_data = pca_json
+    new.save()
+    print("Successfully generated PCA biplot data")
+
+
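+# Celery entry point, queued from the admin "validate_contributions"
+# action via validate_compounds.delay(ids); it runs asynchronously on a
+# worker since the Galaxy round-trip can take a while.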
+@shared_task
+def validate_compounds(compound_ids):
+    """
+    This task performs all computations and validates the compounds.
+    It also regenerates the LE-LLE and PCA plots.
+    """
+    compute_compound_properties(compound_ids)
+    update_compound_cached_properties(Compound.objects.filter(id__in=compound_ids))
+    compute_drugbank_similarity(compound_ids)
+    validate(compound_ids)
+    generate_le_lle_plot()
+    generate_pca_plot()
diff --git a/ippisite/ippisite/__init__.py b/ippisite/ippisite/__init__.py
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d128d39cd37a95ea64f18231661de44f1b7c0cb3 100644
--- a/ippisite/ippisite/__init__.py
+++ b/ippisite/ippisite/__init__.py
@@ -0,0 +1,7 @@
+from __future__ import absolute_import, unicode_literals
+
+# This will make sure the app is always imported when
+# Django starts so that shared_task will use this app.
+from .celery import app as celery_app
+
+__all__ = ('celery_app',)
\ No newline at end of file
diff --git a/ippisite/ippisite/celery.py b/ippisite/ippisite/celery.py
new file mode 100644
index 0000000000000000000000000000000000000000..670db8cece5d3f696b6bc6a41effabc1f319acb0
--- /dev/null
+++ b/ippisite/ippisite/celery.py
@@ -0,0 +1,22 @@
+from __future__ import absolute_import, unicode_literals
+import os
+from celery import Celery
+
+# set the default Django settings module for the 'celery' program.
+os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'ippisite.settings')
+
+app = Celery('ippidb')
+
+# Using a string here means the worker doesn't have to serialize
+# the configuration object to child processes.
+# - namespace='CELERY' means all celery-related configuration keys
+# should have a `CELERY_` prefix.
+app.config_from_object('django.conf:settings', namespace='CELERY')
+
+# Load task modules from all registered Django app configs.
+app.autodiscover_tasks()
+
+
+@app.task(bind=True)
+def debug_task(self):
+    print('Request: {0!r}'.format(self.request))
\ No newline at end of file