diff --git a/ippisite/ippidb/management/commands/compute_compound_properties.py b/ippisite/ippidb/management/commands/compute_compound_properties.py
deleted file mode 100644
index 22153cacb941501cc32c490117c2e4f4201edaee..0000000000000000000000000000000000000000
--- a/ippisite/ippidb/management/commands/compute_compound_properties.py
+++ /dev/null
@@ -1,321 +0,0 @@
-import argparse
-from datetime import datetime
-from itertools import islice
-import json
-import re
-import time
-import tempfile
-
-from bioblend.galaxy import GalaxyInstance
-from django.conf import settings
-from django.core.management import BaseCommand
-from django.forms.models import model_to_dict
-import pandas as pd
-import requests
-
-from ippidb.models import Compound
-from ippidb.utils import smi2sdf
-
-# disable insecure HTTP request warnings (used by bioblend)
-import urllib3
-
-urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
-
-BASE_URL = settings.GALAXY_BASE_URL
-KEY = settings.GALAXY_APIKEY
-WORKFLOW_ID = settings.GALAXY_COMPOUNDPROPERTIES_WORKFLOWID
-
-
-class GalaxyCompoundPropertiesRunner(object):
-    def __init__(self, galaxy_instance):
-        self.galaxy_instance = galaxy_instance
-
-    def compute_properties_for_sdf_file(self, sdf_file_path):
-        # create a history to store the workflow results
-        now = datetime.now()
-        date_time = now.strftime("%Y/%m/%d-%H:%M:%S")
-        history_name = "compoundpropertiesjobrun_%s" % date_time
-        history = self.galaxy_instance.histories.create_history(name=history_name)
-        history_id = history["id"]
-        if history["state"] not in ["new", "ok"]:
-            raise Exception(
-                f'Error creating history "{history_name}" (id {history_id})'
-            )
-        # launch data upload job
-        upload_response = self.galaxy_instance.tools.upload_file(
-            path=sdf_file_path, file_type="sdf", history_id=history_id
-        )
-        upload_data_id = upload_response["outputs"][0]["id"]
-        upload_job = upload_response["jobs"][0]
-        upload_job_id = upload_job["id"]
-        # monitor data upload until completed or on error
-        while upload_job["state"] not in ["ok"]:
-            time.sleep(2)
-            upload_job = self.galaxy_instance.jobs.show_job(upload_job_id)
-            if upload_job["state"] in ["error", "deleted", "discarded"]:
-                data = self.galaxy_instance.datasets.show_dataset(upload_data_id)
-                raise Exception(
-                    f"Error during Galaxy data upload job - name : "
-                    f"{data['name']}, id : {upload_data_id}, "
-                    f"error : {data['misc_info']}"
-                )
-        # check uploaded dataset status
-        data = self.galaxy_instance.datasets.show_dataset(upload_data_id)
-        if data["state"] not in ["ok"]:
-            raise Exception(
-                f"Error during Galaxy data upload result - name : "
-                f"{data['name']}, id : {upload_data_id}, "
-                f"error : {data['misc_info']}"
-            )
-        # submit compound properties computation job
-        dataset_map = {"0": {"src": "hda", "id": upload_data_id}}
-        workflow_job = self.galaxy_instance.workflows.invoke_workflow(
-            WORKFLOW_ID, inputs=dataset_map, history_id=history_id
-        )
-        workflow_job_id = workflow_job["id"]
-        while workflow_job["state"] not in ["ok", "scheduled"]:
-            time.sleep(2)
-            workflow_job = self.galaxy_instance.workflows.show_invocation(
-                "dad6103ff71ca4fe", workflow_job_id
-            )
-            if workflow_job["state"] in ["error", "deleted", "discarded"]:
-                raise Exception(
-                    f"Error during Galaxy workflow job - name : "
-                    f"id : {workflow_job_id}, "
-                )
-        datasets = self.galaxy_instance.histories.show_history(
-            history_id, contents=True
-        )
-        actual_result_dataset = None
-        for dataset in datasets:
-            if dataset["extension"] == "json":
-                actual_result_dataset = dataset
-        if actual_result_dataset is None:
-            raise Exception(
-                f"Result for galaxy workflow invocation {workflow_job_id} not found in"
-                f" history {history_id}"
-            )
-        dataset = self.galaxy_instance.datasets.show_dataset(
-            actual_result_dataset["id"]
-        )
-        while dataset["state"] not in ["ok"]:
-            time.sleep(2)
-            dataset = self.galaxy_instance.datasets.show_dataset(
-                actual_result_dataset["id"]
-            )
-        download_url = dataset["download_url"]
-        contents_resp = requests.get(BASE_URL + download_url, verify=False)
-        contents = contents_resp.json()
-        return contents
-
-
-def idrange_type(s, pat=re.compile(r"^(\d+)-(\d+)$")):
-    m = pat.match(s)
-    if not m:
-        raise argparse.ArgumentTypeError(
-            "please specify ID range as [start number]-[endnumber]"
-        )
-    return (int(m.groups()[0]), int(m.groups()[1]))
-
-
-def dec(decimal_places):
-    def func(number):
-        return round(float(number), decimal_places)
-
-    return func
-
-
-def chunks(data, size=10):
-    it = iter(data)
-    for i in range(0, len(data), size):
-        yield {k: data[k] for k in islice(it, size)}
-
-
-class Command(BaseCommand):
-
-    help = "Compute compound physicochemical properties"
-
-    def add_arguments(self, parser):
-        parser.add_argument(
-            "mode", choices=["update", "compare", "print"], default="update"
-        )
-        selection = parser.add_mutually_exclusive_group(required=True)
-        selection.add_argument(
-            "--all", action="store_true", help="Process all compounds in the database"
-        )
-        selection.add_argument(
-            "--ids",
-            nargs="+",
-            type=int,
-            help="Process the compounds for the specified IDs",
-        )
-        selection.add_argument(
-            "--idrange",
-            type=idrange_type,
-            help="Process the compounds for the specified ID range",
-        )
-        parser.add_argument(
-            "--json",
-            type=argparse.FileType("r"),
-            help="Process precomputed results stored in a JSON file",
-        )
-        parser.add_argument(
-            "--xls", type=argparse.FileType("w"), help="Store results in Excel file"
-        )
-
-    def handle(self, *args, **options):
-        # select the compounds that need to be processed
-        smiles_dict = {}
-        compounds = []
-        pc_properties = {}
-        already_done_ids = []
-        if options["json"] is not None:
-            pc_properties_dict = json.load(open(options["json"].name, "r"))
-            ids = [
-                int(key)
-                for key, item in pc_properties_dict.items()
-                if "IUPAC" not in item
-            ]
-            already_done_ids = [
-                int(key) for key, item in pc_properties_dict.items() if "IUPAC" in item
-            ]
-        if options["all"] is True:
-            ids = Compound.objects.all().values("id")
-        elif options["ids"]:
-            ids = Compound.objects.filter(id__in=options["ids"]).values("id")
-        elif options["idrange"]:
-            ids = Compound.objects.filter(
-                id__gte=options["idrange"][0], id__lte=options["idrange"][1]
-            ).values("id")
-        else:
-            compounds = Compound.objects.filter(iupac_name__isnull=True).values("id")
-        ids = [row["id"] for row in ids]
-        ids = list(set(ids) - set(already_done_ids))
-        compounds = Compound.objects.filter(id__in=ids)
-        for c in compounds:
-            smiles_dict[c.id] = c.canonical_smile
-        # create or reuse existing JSON file to save new results
-        if options["json"]:
-            json_file = options["json"].name
-        else:
-            json_fh = tempfile.NamedTemporaryFile(mode="w", delete=False)
-            json.dump({c.id: {} for c in compounds}, json_fh)
-            json_file = json_fh.name
-            json_fh.close()
-        self.stderr.write(self.style.SUCCESS(f"Compound properties file: {json_file}"))
-        if len(compounds) > 0:
-            self.stderr.write(f"Now processing {len(compounds)} compounds")
-            # set up Galaxy computation environment
-            gi = GalaxyInstance(url=BASE_URL, key=KEY, verify=False)
-            gi.nocache = True
-            runner = GalaxyCompoundPropertiesRunner(gi)
-            chunk_size = 3
-            for smiles_dict_chunk in chunks(smiles_dict, chunk_size):
-                # create SDF file for the selection
-                sdf_string = smi2sdf(smiles_dict_chunk)
-                fh = tempfile.NamedTemporaryFile(mode="w", delete=False)
-                fh.write(sdf_string)
-                fh.close()
-                self.stderr.write(
-                    self.style.SUCCESS(
-                        f"Galaxy input SDF file for compounds {smiles_dict_chunk.keys()}: {fh.name}"
-                    )
-                )
-                # run computations on Galaxy
-                pc_properties = runner.compute_properties_for_sdf_file(fh.name)
-                new_pc_properties_dict = {
-                    compound["Name"]: compound for compound in pc_properties
-                }
-                pc_properties_dict = json.load(open(json_file, "r"))
-                pc_properties_dict.update(new_pc_properties_dict)
-                fh = open(json_file, "w")
-                json.dump(pc_properties_dict, fh, indent=4)
-                fh.close()
-                self.stderr.write(
-                    self.style.SUCCESS(
-                        f"Properties added for compounds {smiles_dict_chunk.keys()} in JSON file: {json_file}"
-                    )
-                )
-        # report and update database
-        property_mapping = {
-            "CanonicalSmile": ("canonical_smile", str),
-            "IUPAC": ("iupac_name", str),
-            "TPSA": ("tpsa", dec(2)),
-            "NbMultBonds": ("nb_multiple_bonds", int),
-            "BalabanIndex": ("balaban_index", dec(2)),
-            "NbDoubleBonds": ("nb_double_bonds", int),
-            "RDF070m": ("rdf070m", dec(2)),
-            "SumAtomPolar": ("sum_atom_polar", dec(2)),
-            "SumAtomVolVdW": ("sum_atom_vol_vdw", dec(2)),
-            "MolecularWeight": ("molecular_weight", dec(2)),
-            "NbCircuits": ("nb_circuits", int),
-            "NbAromaticsSSSR": ("nb_aromatic_sssr", int),
-            "NbAcceptorH": ("nb_acceptor_h", int),
-            "NbF": ("nb_f", int),
-            "Ui": ("ui", dec(2)),
-            "NbO": ("nb_o", int),
-            "NbCl": ("nb_cl", int),
-            "NbBonds": ("nb_bonds", int),
-            "LogP": ("a_log_p", dec(2)),
-            "RandicIndex": ("randic_index", dec(2)),
-            "NbBondsNonH": ("nb_bonds_non_h", int),
-            "NbAromaticsEther": ("nb_aromatic_ether", int),
-            "NbChiralCenters": ("nb_chiral_centers", int),
-            "NbBenzLikeRings": ("nb_benzene_like_rings", int),
-            "RotatableBondFraction": ("rotatable_bond_fraction", dec(2)),
-            "LogD": ("log_d", dec(2)),
-            "WienerIndex": ("wiener_index", int),
-            "NbN": ("nb_n", int),
-            "NbC": ("nb_c", int),
-            "NbAtom": ("nb_atom", int),
-            "NbAromaticsBonds": ("nb_aromatic_bonds", int),
-            "MeanAtomVolVdW": ("mean_atom_vol_vdw", dec(2)),
-            "AromaticRatio": ("aromatic_ratio", dec(2)),
-            "NbAtomNonH": ("nb_atom_non_h", int),
-            "NbDonorH": ("nb_donor_h", int),
-            "NbI": ("nb_i", int),
-            "NbRotatableBonds": ("nb_rotatable_bonds", int),
-            "NbRings": ("nb_rings", int),
-            "NbCsp2": ("nb_csp2", int),
-            "NbCsp3": ("nb_csp3", int),
-            "NbBr": ("nb_br", int),
-            "GCMolarRefractivity": ("gc_molar_refractivity", dec(2)),
-            "NbAliphaticsAmines": ("nb_aliphatic_amines", int),
-        }
-        properties_list = []
-        computed_properties = ["id"] + [
-            value[0] for key, value in property_mapping.items()
-        ]
-        ippidb_convs = {value[0]: value[1] for key, value in property_mapping.items()}
-        ippidb_convs["id"] = int
-        pc_properties_dict = json.load(open(json_file))
-        for cid, item in pc_properties_dict.items():
-            compound = Compound.objects.get(id=cid)
-            ippidb_dict = model_to_dict(Compound.objects.get(id=cid))
-            ippidb_dict = {
-                key: ippidb_convs[key](value)
-                for key, value in ippidb_dict.items()
-                if key in computed_properties and value is not None
-            }
-            ippidb_dict["source"] = "iPPI-DB"
-            galaxy_dict = {"id": int(cid), "source": "Galaxy"}
-            for galaxy_prop, prop in property_mapping.items():
-                ippidb_prop = prop[0]
-                ippidb_conv = prop[1]
-                try:
-                    galaxy_dict[ippidb_prop] = ippidb_conv(item[galaxy_prop])
-                except ValueError as ve:
-                    self.stderr.write(
-                        self.style.ERROR(
-                            f"Error setting property {ippidb_prop} to {item[galaxy_prop]}"
-                            f" in compound {compound.id} \ndetails:{ve}"
-                        )
-                    )
-            properties_list.append(ippidb_dict)
-            properties_list.append(galaxy_dict)
-        properties_df = pd.DataFrame(properties_list)
-        properties_df.set_index(["id", "source"], inplace=True)
-        properties_df.sort_index(inplace=True)
-        if options["xls"]:
-            properties_df.to_excel(options["xls"].name)
-        self.stderr.write(self.style.SUCCESS("All done!"))
diff --git a/ippisite/ippidb/management/commands/lle_le.py b/ippisite/ippidb/management/commands/lle_le.py
index 0ac9cbad35a80f4c91e14cb65d06b45a1df30251..73aac5064dd9e80689d8c4fa32d46084ae9d5955 100644
--- a/ippisite/ippidb/management/commands/lle_le.py
+++ b/ippisite/ippidb/management/commands/lle_le.py
@@ -1,8 +1,6 @@
-import json
-
 from django.core.management import BaseCommand
 
-from ippidb.models import Compound, LeLleBiplotData
+from ippidb.tasks import generate_le_lle_plot
 
 
 class Command(BaseCommand):
@@ -11,28 +9,7 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
         self.stdout.write(self.style.SUCCESS("Generating the LE vs. LLE biplot..."))
-        le_lle_data = []
-        LeLleBiplotData.objects.all().delete()
-        self.stdout.write(self.style.SUCCESS("Successfully flushed LE-LLE biplot data"))
-        for comp in Compound.objects.validated():
-            if comp.le is not None:
-                le = round(comp.le, 7)
-                lle = round(comp.lle, 7)
-                le_lle_data.append(
-                    {
-                        "x": le,
-                        "y": lle,
-                        "id": comp.id,
-                        "family_name": comp.best_activity_ppi_family_name,
-                        "smiles": comp.canonical_smile,
-                    }
-                )
-            else:
-                self.stdout.write(self.style.WARNING("compound %s has no LE" % comp.id))
-        le_lle_json = json.dumps(le_lle_data, separators=(",", ":"))
-        new = LeLleBiplotData()
-        new.le_lle_biplot_data = le_lle_json
-        new.save()
+        generate_le_lle_plot()
         self.stdout.write(
             self.style.SUCCESS("Successfully generated LE-LLE biplot data")
         )
diff --git a/ippisite/ippidb/management/commands/pca.py b/ippisite/ippidb/management/commands/pca.py
index 330d4452dc7b08414105f05d80c2cc9a671dc315..89beebb30b1b7542a0f93674e75b20196ec0db22 100644
--- a/ippisite/ippidb/management/commands/pca.py
+++ b/ippisite/ippidb/management/commands/pca.py
@@ -1,26 +1,6 @@
-import json
-import io
-import base64
-import itertools
-
 from django.core.management import BaseCommand
-from django.forms.models import model_to_dict
-import seaborn as sns
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
 
-from ippidb.models import Compound, PcaBiplotData
-
-
-def plot_circle():
-    theta = np.linspace(0, 2 * np.pi, 100)
-    r = np.sqrt(1.0)
-    x1 = r * np.cos(theta)
-    x2 = r * np.sin(theta)
-    return x1, x2
+from ippidb.tasks import generate_pca_plot
 
 
 class Command(BaseCommand):
@@ -29,124 +9,5 @@ class Command(BaseCommand):
 
     def handle(self, *args, **options):
        self.stdout.write(self.style.SUCCESS("Generating the PCA biplot..."))
-        pca_data = []
-        features = [
-            "molecular_weight",
-            "a_log_p",
-            "nb_donor_h",
-            "nb_acceptor_h",
-            "tpsa",
-            "nb_rotatable_bonds",
-            "nb_benzene_like_rings",
-            "fsp3",
-            "nb_chiral_centers",
-            "nb_csp3",
-            "nb_atom",
-            "nb_bonds",
-            "nb_atom_non_h",
-            "nb_rings",
-            "nb_multiple_bonds",
-            "nb_aromatic_bonds",
-            "aromatic_ratio",
-        ]
-        PcaBiplotData.objects.all().delete()
-        self.stdout.write(self.style.SUCCESS("Successfully flushed PCA biplot data"))
-        values_list = []
-        for comp in Compound.objects.validated():
-            values = model_to_dict(comp, fields=features + ["id", "family"])
-            values["family"] = comp.best_activity_ppi_family_name
-            values_list.append(values)
-        df = pd.DataFrame(values_list)
-        # drop compounds with undefined property values
-        df.dropna(how="any", inplace=True)
-        # prepare correlation circle figure
-        plt.switch_backend("Agg")
-        fig_, ax = plt.subplots(figsize=(6, 6))
-        x1, x2 = plot_circle()
-        plt.plot(x1, x2)
-        ax.set_aspect(1)
-        ax.yaxis.set_label_coords(-0.1, 0.5)
-        ax.xaxis.set_label_coords(0.5, -0.08)
-        # do not process the data unless there are compounds in the dataframe
-        if len(df.index) > 0:
-            x = df.loc[:, features].values
-            y = df.loc[:, ["family"]].values
-            x = StandardScaler().fit_transform(x)
-            n = x.shape[0]  # retrieve number of individuals
-            p = x.shape[1]  # retrieve number of variables
-            pca = PCA(n_components=p)
-            principal_components = pca.fit_transform(x)
-            # compute correlation circle
-            variance_ratio = pd.Series(pca.explained_variance_ratio_)
-            coef = np.transpose(pca.components_)
-            cols = ["PC-" + str(x) for x in range(len(variance_ratio))]
-            pc_infos = pd.DataFrame(coef, columns=cols, index=features)
-            # we might remove the line below if the PCA remains grayscale
-            pal = itertools.cycle(  # noqa: F841
-                sns.color_palette("dark", len(features))
-            )
-            # compute the length of each feature arrow in the correlation circle
-            pc_infos["DIST"] = pc_infos[["PC-0", "PC-1"]].pow(2).sum(1).pow(0.5)
-            # store the maximal length for normalization purposes
-            best_distance = max(pc_infos["DIST"])
-            # compute corvar for the correlation circle
-            eigval = (float(n) - 1) / float(n) * pca.explained_variance_
-            sqrt_eigval = np.sqrt(eigval)
-            sqrt_eigval = np.sqrt(eigval)
-            corvar = np.zeros((p, p))
-            for k in range(p):
-                corvar[:, k] = pca.components_[k, :] * sqrt_eigval[k]
-            for idx in range(len(pc_infos["PC-0"])):
-                x = corvar[idx, 0]  # use corvar to plot the variable map
-                y = corvar[idx, 1]  # use corvar to plot the variable map
-                color = "black"
-                # alpha is the feature length normalized
-                # to the longest feature's length
-                alpha = pc_infos["DIST"][idx] / best_distance
-                plt.arrow(0.0, 0.0, x, y, head_width=0.02, color="black", alpha=alpha)
-                plt.annotate(
-                    Compound._meta.get_field(pc_infos.index[idx]).verbose_name,
-                    xy=(x, y),
-                    xycoords="data",
-                    xytext=np.asarray((x, y)) + (0.02, -0.02),
-                    fontsize=6,
-                    color=color,
-                    alpha=alpha,
-                )
-            plt.xlabel("PC-1 (%s%%)" % str(variance_ratio[0])[:4].lstrip("0."))
-            plt.ylabel("PC-2 (%s%%)" % str(variance_ratio[1])[:4].lstrip("0."))
-            plt.xlim((-1, 1))
-            plt.ylim((-1, 1))
-            principal_df = pd.DataFrame(data=principal_components)
-            # only select the two first dimensions for the plot, and rename them to x and y
-            principal_df = principal_df.iloc[:, 0:2]
-            principal_df = principal_df.rename(columns={0: "x", 1: "y"})
-            final_df = pd.concat([principal_df, df[["family", "id"]]], axis=1)
-            for index, row in final_df.iterrows():
-                smiles = Compound.objects.get(id=row.id).canonical_smile
-                pca_data.append(
-                    {
-                        "x": row.x,
-                        "y": row.y,
-                        "id": row.id,
-                        "family_name": row.family,
-                        "smiles": smiles,
-                    }
-                )
-        else:
-            pca_data = []
-        # save correlation circle PNG
-        my_string_io_bytes = io.BytesIO()
-        plt.savefig(my_string_io_bytes, format="png", dpi=600, bbox_inches="tight")
-        my_string_io_bytes.seek(0)
-        # figdata_png is the correlation circle figure, base 64-encoded
-        figdata_png = base64.b64encode(my_string_io_bytes.read())
-        pca_data_cc = {
-            "data": pca_data,
-            "correlation_circle": figdata_png.decode("utf-8"),
-        }
-        pca_json = json.dumps(pca_data_cc, separators=(",", ":"))
-        new = PcaBiplotData()
-        new.pca_biplot_data = pca_json
-        new.save()
+        generate_pca_plot()
         self.stdout.write(self.style.SUCCESS("Successfully generated PCA biplot data"))
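
Note: both commands now delegate to functions in ippidb.tasks, a module this
patch does not touch. For reference, below is a minimal sketch of what
generate_le_lle_plot presumably contains, assuming the task function simply
wraps the logic removed from lle_le.py above (generate_pca_plot would wrap the
removed pca.py body the same way). It restates the deleted command code and is
not the actual content of ippidb/tasks.py:

    # ippisite/ippidb/tasks.py -- assumed sketch, not part of this patch
    import json

    from ippidb.models import Compound, LeLleBiplotData


    def generate_le_lle_plot():
        """Regenerate the LE vs. LLE biplot data (logic moved out of the command)."""
        le_lle_data = []
        # flush the previously stored biplot data before recomputing it
        LeLleBiplotData.objects.all().delete()
        for comp in Compound.objects.validated():
            # the command's per-compound "has no LE" warning is omitted here,
            # since a task has no self.stdout to write to
            if comp.le is not None:
                le_lle_data.append(
                    {
                        "x": round(comp.le, 7),
                        "y": round(comp.lle, 7),
                        "id": comp.id,
                        "family_name": comp.best_activity_ppi_family_name,
                        "smiles": comp.canonical_smile,
                    }
                )
        # store the biplot data as compact JSON, as the command used to do
        new = LeLleBiplotData()
        new.le_lle_biplot_data = json.dumps(le_lle_data, separators=(",", ":"))
        new.save()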