Commit 1c95e9d6 authored by Hervé MENAGER
Django commands modified to use the newer code from tasks

parent e7bba3ed
1 merge request: !13 Master
import argparse
from datetime import datetime
from itertools import islice
import json
import re
import time
import tempfile
from bioblend.galaxy import GalaxyInstance
from django.conf import settings
from django.core.management import BaseCommand
from django.forms.models import model_to_dict
import pandas as pd
import requests
from ippidb.models import Compound
from ippidb.utils import smi2sdf
# disable insecure HTTP request warnings (used by bioblend)
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
BASE_URL = settings.GALAXY_BASE_URL
KEY = settings.GALAXY_APIKEY
WORKFLOW_ID = settings.GALAXY_COMPOUNDPROPERTIES_WORKFLOWID
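# The three settings read above must be provided by the Django configuration;
# a placeholder sketch (actual values depend on the deployment):
#   GALAXY_BASE_URL = "https://galaxy.example.org"
#   GALAXY_APIKEY = "<api key>"
#   GALAXY_COMPOUNDPROPERTIES_WORKFLOWID = "<workflow id>"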
class GalaxyCompoundPropertiesRunner(object):

    def __init__(self, galaxy_instance):
        self.galaxy_instance = galaxy_instance

    def compute_properties_for_sdf_file(self, sdf_file_path):
        # create a history to store the workflow results
        now = datetime.now()
        date_time = now.strftime("%Y/%m/%d-%H:%M:%S")
        history_name = "compoundpropertiesjobrun_%s" % date_time
        history = self.galaxy_instance.histories.create_history(name=history_name)
        history_id = history["id"]
        if history["state"] not in ["new", "ok"]:
            raise Exception(
                f'Error creating history "{history_name}" (id {history_id})'
            )
        # launch data upload job
        upload_response = self.galaxy_instance.tools.upload_file(
            path=sdf_file_path, file_type="sdf", history_id=history_id
        )
        upload_data_id = upload_response["outputs"][0]["id"]
        upload_job = upload_response["jobs"][0]
        upload_job_id = upload_job["id"]
        # monitor data upload until completed or on error
        while upload_job["state"] not in ["ok"]:
            time.sleep(2)
            upload_job = self.galaxy_instance.jobs.show_job(upload_job_id)
            if upload_job["state"] in ["error", "deleted", "discarded"]:
                data = self.galaxy_instance.datasets.show_dataset(upload_data_id)
                raise Exception(
                    f"Error during Galaxy data upload job - name : "
                    f"{data['name']}, id : {upload_data_id}, "
                    f"error : {data['misc_info']}"
                )
        # check uploaded dataset status
        data = self.galaxy_instance.datasets.show_dataset(upload_data_id)
        if data["state"] not in ["ok"]:
            raise Exception(
                f"Error in uploaded Galaxy dataset - name : "
                f"{data['name']}, id : {upload_data_id}, "
                f"error : {data['misc_info']}"
            )
        # submit compound properties computation job
        dataset_map = {"0": {"src": "hda", "id": upload_data_id}}
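        # In the mapping above, "0" is the index of the workflow input and
        # "hda" marks the source as a history dataset (HistoryDatasetAssociation)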
        workflow_job = self.galaxy_instance.workflows.invoke_workflow(
            WORKFLOW_ID, inputs=dataset_map, history_id=history_id
        )
        workflow_job_id = workflow_job["id"]
        while workflow_job["state"] not in ["ok", "scheduled"]:
            time.sleep(2)
            workflow_job = self.galaxy_instance.workflows.show_invocation(
                WORKFLOW_ID, workflow_job_id
            )
            if workflow_job["state"] in ["error", "deleted", "discarded"]:
                raise Exception(
                    f"Error during Galaxy workflow job - "
                    f"id : {workflow_job_id}, state : {workflow_job['state']}"
                )
        # locate the JSON result dataset in the history
        datasets = self.galaxy_instance.histories.show_history(
            history_id, contents=True
        )
        actual_result_dataset = None
        for dataset in datasets:
            if dataset["extension"] == "json":
                actual_result_dataset = dataset
        if actual_result_dataset is None:
            raise Exception(
                f"Result for galaxy workflow invocation {workflow_job_id} not found in"
                f" history {history_id}"
            )
        # wait for the result dataset to be ready
        dataset = self.galaxy_instance.datasets.show_dataset(
            actual_result_dataset["id"]
        )
        while dataset["state"] not in ["ok"]:
            # guard against polling forever on a failed dataset
            if dataset["state"] in ["error", "deleted", "discarded"]:
                raise Exception(
                    f"Error in Galaxy result dataset {actual_result_dataset['id']}"
                )
            time.sleep(2)
            dataset = self.galaxy_instance.datasets.show_dataset(
                actual_result_dataset["id"]
            )
        # download and parse the JSON results
        download_url = dataset["download_url"]
        contents_resp = requests.get(BASE_URL + download_url, verify=False)
        contents = contents_resp.json()
        return contents
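# Typical use of the runner (sketch; mirrors what handle() does below):
#   gi = GalaxyInstance(url=BASE_URL, key=KEY, verify=False)
#   runner = GalaxyCompoundPropertiesRunner(gi)
#   properties = runner.compute_properties_for_sdf_file("/tmp/compounds.sdf")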
def idrange_type(s, pat=re.compile(r"^(\d+)-(\d+)$")):
    m = pat.match(s)
    if not m:
        raise argparse.ArgumentTypeError(
            "please specify ID range as [start number]-[end number]"
        )
    return (int(m.groups()[0]), int(m.groups()[1]))
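# e.g. idrange_type("10-20") -> (10, 20); used as an argparse "type" below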
def dec(decimal_places):
    def func(number):
        return round(float(number), decimal_places)
    return func
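# e.g. dec(2)("3.14159") -> 3.14: a rounding converter with fixed precision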
def chunks(data, size=10):
    it = iter(data)
    for i in range(0, len(data), size):
        yield {k: data[k] for k in islice(it, size)}
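# e.g. list(chunks({"a": 1, "b": 2, "c": 3}, size=2))
#   -> [{"a": 1, "b": 2}, {"c": 3}]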
class Command(BaseCommand):
    help = "Compute compound physicochemical properties"

    def add_arguments(self, parser):
        parser.add_argument(
            "mode", choices=["update", "compare", "print"], default="update"
        )
        selection = parser.add_mutually_exclusive_group(required=True)
        selection.add_argument(
            "--all", action="store_true", help="Process all compounds in the database"
        )
        selection.add_argument(
            "--ids",
            nargs="+",
            type=int,
            help="Process the compounds for the specified IDs",
        )
        selection.add_argument(
            "--idrange",
            type=idrange_type,
            help="Process the compounds for the specified ID range",
        )
        parser.add_argument(
            "--json",
            type=argparse.FileType("r"),
            help="Process precomputed results stored in a JSON file",
        )
        parser.add_argument(
            "--xls", type=argparse.FileType("w"), help="Store results in Excel file"
        )
    def handle(self, *args, **options):
        # select the compounds that need to be processed
        smiles_dict = {}
        compounds = []
        pc_properties = {}
        already_done_ids = []
        if options["json"] is not None:
            with open(options["json"].name, "r") as json_fh:
                pc_properties_dict = json.load(json_fh)
            ids = [
                int(key)
                for key, item in pc_properties_dict.items()
                if "IUPAC" not in item
            ]
            already_done_ids = [
                int(key) for key, item in pc_properties_dict.items() if "IUPAC" in item
            ]
        if options["all"] is True:
            ids = Compound.objects.all().values("id")
        elif options["ids"]:
            ids = Compound.objects.filter(id__in=options["ids"]).values("id")
        elif options["idrange"]:
            ids = Compound.objects.filter(
                id__gte=options["idrange"][0], id__lte=options["idrange"][1]
            ).values("id")
        else:
            ids = Compound.objects.filter(iupac_name__isnull=True).values("id")
        ids = [row["id"] for row in ids]
        ids = list(set(ids) - set(already_done_ids))
        compounds = Compound.objects.filter(id__in=ids)
        for c in compounds:
            smiles_dict[c.id] = c.canonical_smile
        # create or reuse existing JSON file to save new results
        if options["json"]:
            json_file = options["json"].name
        else:
            json_fh = tempfile.NamedTemporaryFile(mode="w", delete=False)
            json.dump({c.id: {} for c in compounds}, json_fh)
            json_file = json_fh.name
            json_fh.close()
        self.stderr.write(self.style.SUCCESS(f"Compound properties file: {json_file}"))
        if len(compounds) > 0:
            self.stderr.write(f"Now processing {len(compounds)} compounds")
            # set up Galaxy computation environment
            gi = GalaxyInstance(url=BASE_URL, key=KEY, verify=False)
            gi.nocache = True
            runner = GalaxyCompoundPropertiesRunner(gi)
            chunk_size = 3
            for smiles_dict_chunk in chunks(smiles_dict, chunk_size):
                # create SDF file for the selection
                sdf_string = smi2sdf(smiles_dict_chunk)
                fh = tempfile.NamedTemporaryFile(mode="w", delete=False)
                fh.write(sdf_string)
                fh.close()
                self.stderr.write(
                    self.style.SUCCESS(
                        f"Galaxy input SDF file for compounds "
                        f"{smiles_dict_chunk.keys()}: {fh.name}"
                    )
                )
                # run computations on Galaxy
                pc_properties = runner.compute_properties_for_sdf_file(fh.name)
                new_pc_properties_dict = {
                    compound["Name"]: compound for compound in pc_properties
                }
                with open(json_file, "r") as fh:
                    pc_properties_dict = json.load(fh)
                pc_properties_dict.update(new_pc_properties_dict)
                with open(json_file, "w") as fh:
                    json.dump(pc_properties_dict, fh, indent=4)
                self.stderr.write(
                    self.style.SUCCESS(
                        f"Properties added for compounds "
                        f"{smiles_dict_chunk.keys()} in JSON file: {json_file}"
                    )
                )
        # report and update database
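        # Map each Galaxy-reported property name to the corresponding
        # Compound model field and a conversion/rounding callable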
        property_mapping = {
            "CanonicalSmile": ("canonical_smile", str),
            "IUPAC": ("iupac_name", str),
            "TPSA": ("tpsa", dec(2)),
            "NbMultBonds": ("nb_multiple_bonds", int),
            "BalabanIndex": ("balaban_index", dec(2)),
            "NbDoubleBonds": ("nb_double_bonds", int),
            "RDF070m": ("rdf070m", dec(2)),
            "SumAtomPolar": ("sum_atom_polar", dec(2)),
            "SumAtomVolVdW": ("sum_atom_vol_vdw", dec(2)),
            "MolecularWeight": ("molecular_weight", dec(2)),
            "NbCircuits": ("nb_circuits", int),
            "NbAromaticsSSSR": ("nb_aromatic_sssr", int),
            "NbAcceptorH": ("nb_acceptor_h", int),
            "NbF": ("nb_f", int),
            "Ui": ("ui", dec(2)),
            "NbO": ("nb_o", int),
            "NbCl": ("nb_cl", int),
            "NbBonds": ("nb_bonds", int),
            "LogP": ("a_log_p", dec(2)),
            "RandicIndex": ("randic_index", dec(2)),
            "NbBondsNonH": ("nb_bonds_non_h", int),
            "NbAromaticsEther": ("nb_aromatic_ether", int),
            "NbChiralCenters": ("nb_chiral_centers", int),
            "NbBenzLikeRings": ("nb_benzene_like_rings", int),
            "RotatableBondFraction": ("rotatable_bond_fraction", dec(2)),
            "LogD": ("log_d", dec(2)),
            "WienerIndex": ("wiener_index", int),
            "NbN": ("nb_n", int),
            "NbC": ("nb_c", int),
            "NbAtom": ("nb_atom", int),
            "NbAromaticsBonds": ("nb_aromatic_bonds", int),
            "MeanAtomVolVdW": ("mean_atom_vol_vdw", dec(2)),
            "AromaticRatio": ("aromatic_ratio", dec(2)),
            "NbAtomNonH": ("nb_atom_non_h", int),
            "NbDonorH": ("nb_donor_h", int),
            "NbI": ("nb_i", int),
            "NbRotatableBonds": ("nb_rotatable_bonds", int),
            "NbRings": ("nb_rings", int),
            "NbCsp2": ("nb_csp2", int),
            "NbCsp3": ("nb_csp3", int),
            "NbBr": ("nb_br", int),
            "GCMolarRefractivity": ("gc_molar_refractivity", dec(2)),
            "NbAliphaticsAmines": ("nb_aliphatic_amines", int),
        }
        properties_list = []
        computed_properties = ["id"] + [
            value[0] for key, value in property_mapping.items()
        ]
        ippidb_convs = {value[0]: value[1] for key, value in property_mapping.items()}
        ippidb_convs["id"] = int
        with open(json_file) as fh:
            pc_properties_dict = json.load(fh)
        for cid, item in pc_properties_dict.items():
            compound = Compound.objects.get(id=cid)
            ippidb_dict = model_to_dict(compound)
            ippidb_dict = {
                key: ippidb_convs[key](value)
                for key, value in ippidb_dict.items()
                if key in computed_properties and value is not None
            }
            ippidb_dict["source"] = "iPPI-DB"
            galaxy_dict = {"id": int(cid), "source": "Galaxy"}
            for galaxy_prop, prop in property_mapping.items():
                ippidb_prop = prop[0]
                ippidb_conv = prop[1]
                try:
                    galaxy_dict[ippidb_prop] = ippidb_conv(item[galaxy_prop])
                except ValueError as ve:
                    self.stderr.write(
                        self.style.ERROR(
                            f"Error setting property {ippidb_prop} to {item[galaxy_prop]}"
                            f" in compound {compound.id} \ndetails:{ve}"
                        )
                    )
            properties_list.append(ippidb_dict)
            properties_list.append(galaxy_dict)
        properties_df = pd.DataFrame(properties_list)
        properties_df.set_index(["id", "source"], inplace=True)
        properties_df.sort_index(inplace=True)
        if options["xls"]:
            properties_df.to_excel(options["xls"].name)
        self.stderr.write(self.style.SUCCESS("All done!"))
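# Example invocations (sketch; the actual command name depends on this
# module's file name under management/commands/ and is assumed here):
#   python manage.py compound_properties update --all
#   python manage.py compound_properties compare --ids 12 42 --xls report.xlsx
#   python manage.py compound_properties update --idrange 100-200 --json results.json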
-import json
-
 from django.core.management import BaseCommand
 
-from ippidb.models import Compound, LeLleBiplotData
+from ippidb.tasks import generate_le_lle_plot
 
 
 class Command(BaseCommand):
@@ -11,28 +9,7 @@ class Command(BaseCommand):
     def handle(self, *args, **options):
         self.stdout.write(self.style.SUCCESS("Generating the LE vs. LLE biplot..."))
-        le_lle_data = []
-        LeLleBiplotData.objects.all().delete()
-        self.stdout.write(self.style.SUCCESS("Successfully flushed LE-LLE biplot data"))
-        for comp in Compound.objects.validated():
-            if comp.le is not None:
-                le = round(comp.le, 7)
-                lle = round(comp.lle, 7)
-                le_lle_data.append(
-                    {
-                        "x": le,
-                        "y": lle,
-                        "id": comp.id,
-                        "family_name": comp.best_activity_ppi_family_name,
-                        "smiles": comp.canonical_smile,
-                    }
-                )
-            else:
-                self.stdout.write(self.style.WARNING("compound %s has no LE" % comp.id))
-        le_lle_json = json.dumps(le_lle_data, separators=(",", ":"))
-        new = LeLleBiplotData()
-        new.le_lle_biplot_data = le_lle_json
-        new.save()
+        generate_le_lle_plot()
         self.stdout.write(
             self.style.SUCCESS("Successfully generated LE-LLE biplot data")
         )
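The removed logic now lives in ippidb.tasks.generate_le_lle_plot. A minimal
sketch of what that task is assumed to do, reconstructed from the deleted
command body above (the actual task code is not part of this commit):

import json
from ippidb.models import Compound, LeLleBiplotData

def generate_le_lle_plot():
    # rebuild the LE vs. LLE biplot data from validated compounds
    le_lle_data = []
    LeLleBiplotData.objects.all().delete()
    for comp in Compound.objects.validated():
        if comp.le is not None:
            le_lle_data.append(
                {
                    "x": round(comp.le, 7),
                    "y": round(comp.lle, 7),
                    "id": comp.id,
                    "family_name": comp.best_activity_ppi_family_name,
                    "smiles": comp.canonical_smile,
                }
            )
    # store the serialized plot data in a fresh singleton row
    new = LeLleBiplotData()
    new.le_lle_biplot_data = json.dumps(le_lle_data, separators=(",", ":"))
    new.save()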
-import json
-import io
-import base64
-import itertools
-
 from django.core.management import BaseCommand
-from django.forms.models import model_to_dict
-import seaborn as sns
-import matplotlib.pyplot as plt
-import numpy as np
-import pandas as pd
-from sklearn.decomposition import PCA
-from sklearn.preprocessing import StandardScaler
 
-from ippidb.models import Compound, PcaBiplotData
+from ippidb.tasks import generate_pca_plot
 
-
-def plot_circle():
-    theta = np.linspace(0, 2 * np.pi, 100)
-    r = np.sqrt(1.0)
-    x1 = r * np.cos(theta)
-    x2 = r * np.sin(theta)
-    return x1, x2
 
 
 class Command(BaseCommand):
@@ -29,124 +9,5 @@ class Command(BaseCommand):
     def handle(self, *args, **options):
         self.stdout.write(self.style.SUCCESS("Generating the PCA biplot..."))
-        pca_data = []
-        features = [
-            "molecular_weight",
-            "a_log_p",
-            "nb_donor_h",
-            "nb_acceptor_h",
-            "tpsa",
-            "nb_rotatable_bonds",
-            "nb_benzene_like_rings",
-            "fsp3",
-            "nb_chiral_centers",
-            "nb_csp3",
-            "nb_atom",
-            "nb_bonds",
-            "nb_atom_non_h",
-            "nb_rings",
-            "nb_multiple_bonds",
-            "nb_aromatic_bonds",
-            "aromatic_ratio",
-        ]
-        PcaBiplotData.objects.all().delete()
-        self.stdout.write(self.style.SUCCESS("Successfully flushed PCA biplot data"))
-        values_list = []
-        for comp in Compound.objects.validated():
-            values = model_to_dict(comp, fields=features + ["id", "family"])
-            values["family"] = comp.best_activity_ppi_family_name
-            values_list.append(values)
-        df = pd.DataFrame(values_list)
-        # drop compounds with undefined property values
-        df.dropna(how="any", inplace=True)
-        # prepare correlation circle figure
-        plt.switch_backend("Agg")
-        fig_, ax = plt.subplots(figsize=(6, 6))
-        x1, x2 = plot_circle()
-        plt.plot(x1, x2)
-        ax.set_aspect(1)
-        ax.yaxis.set_label_coords(-0.1, 0.5)
-        ax.xaxis.set_label_coords(0.5, -0.08)
-        # do not process the data unless there are compounds in the dataframe
-        if len(df.index) > 0:
-            x = df.loc[:, features].values
-            y = df.loc[:, ["family"]].values
-            x = StandardScaler().fit_transform(x)
-            n = x.shape[0]  # retrieve number of individuals
-            p = x.shape[1]  # retrieve number of variables
-            pca = PCA(n_components=p)
-            principal_components = pca.fit_transform(x)
-            # compute correlation circle
-            variance_ratio = pd.Series(pca.explained_variance_ratio_)
-            coef = np.transpose(pca.components_)
-            cols = ["PC-" + str(x) for x in range(len(variance_ratio))]
-            pc_infos = pd.DataFrame(coef, columns=cols, index=features)
-            # we might remove the line below if the PCA remains grayscale
-            pal = itertools.cycle(  # noqa: F841
-                sns.color_palette("dark", len(features))
-            )
-            # compute the length of each feature arrow in the correlation circle
-            pc_infos["DIST"] = pc_infos[["PC-0", "PC-1"]].pow(2).sum(1).pow(0.5)
-            # store the maximal length for normalization purposes
-            best_distance = max(pc_infos["DIST"])
-            # compute corvar for the correlation circle
-            eigval = (float(n) - 1) / float(n) * pca.explained_variance_
-            sqrt_eigval = np.sqrt(eigval)
-            corvar = np.zeros((p, p))
-            for k in range(p):
-                corvar[:, k] = pca.components_[k, :] * sqrt_eigval[k]
-            for idx in range(len(pc_infos["PC-0"])):
-                x = corvar[idx, 0]  # use corvar to plot the variable map
-                y = corvar[idx, 1]  # use corvar to plot the variable map
-                color = "black"
-                # alpha is the feature length normalized
-                # to the longest feature's length
-                alpha = pc_infos["DIST"][idx] / best_distance
-                plt.arrow(0.0, 0.0, x, y, head_width=0.02, color="black", alpha=alpha)
-                plt.annotate(
-                    Compound._meta.get_field(pc_infos.index[idx]).verbose_name,
-                    xy=(x, y),
-                    xycoords="data",
-                    xytext=np.asarray((x, y)) + (0.02, -0.02),
-                    fontsize=6,
-                    color=color,
-                    alpha=alpha,
-                )
-            plt.xlabel("PC-1 (%s%%)" % str(variance_ratio[0])[:4].lstrip("0."))
-            plt.ylabel("PC-2 (%s%%)" % str(variance_ratio[1])[:4].lstrip("0."))
-            plt.xlim((-1, 1))
-            plt.ylim((-1, 1))
-            principal_df = pd.DataFrame(data=principal_components)
-            # only select the two first dimensions for the plot,
-            # and rename them to x and y
-            principal_df = principal_df.iloc[:, 0:2]
-            principal_df = principal_df.rename(columns={0: "x", 1: "y"})
-            final_df = pd.concat([principal_df, df[["family", "id"]]], axis=1)
-            for index, row in final_df.iterrows():
-                smiles = Compound.objects.get(id=row.id).canonical_smile
-                pca_data.append(
-                    {
-                        "x": row.x,
-                        "y": row.y,
-                        "id": row.id,
-                        "family_name": row.family,
-                        "smiles": smiles,
-                    }
-                )
-        else:
-            pca_data = []
-        # save correlation circle PNG
-        my_string_io_bytes = io.BytesIO()
-        plt.savefig(my_string_io_bytes, format="png", dpi=600, bbox_inches="tight")
-        my_string_io_bytes.seek(0)
-        # figdata_png is the correlation circle figure, base 64-encoded
-        figdata_png = base64.b64encode(my_string_io_bytes.read())
-        pca_data_cc = {
-            "data": pca_data,
-            "correlation_circle": figdata_png.decode("utf-8"),
-        }
-        pca_json = json.dumps(pca_data_cc, separators=(",", ":"))
-        new = PcaBiplotData()
-        new.pca_biplot_data = pca_json
-        new.save()
+        generate_pca_plot()
         self.stdout.write(self.style.SUCCESS("Successfully generated PCA biplot data"))
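As with the LE-LLE command, the PCA computation now lives in
ippidb.tasks.generate_pca_plot. A condensed sketch of what that task is
assumed to do, based on the deleted command body above (feature list
abridged, correlation circle omitted; the actual task code is not part of
this commit):

import json
import pandas as pd
from django.forms.models import model_to_dict
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from ippidb.models import Compound, PcaBiplotData

def generate_pca_plot():
    # collect physicochemical descriptors for validated compounds
    features = ["molecular_weight", "a_log_p", "tpsa"]  # abridged list
    values_list = [
        model_to_dict(comp, fields=features + ["id"])
        for comp in Compound.objects.validated()
    ]
    # drop compounds with undefined property values
    df = pd.DataFrame(values_list).dropna(how="any")
    # standardize, then project onto the first two principal components
    x = StandardScaler().fit_transform(df.loc[:, features].values)
    pcs = PCA(n_components=2).fit_transform(x)
    pca_data = [
        {"x": float(px), "y": float(py), "id": int(cid)}
        for (px, py), cid in zip(pcs, df["id"])
    ]
    # replace the stored biplot data with the new serialization
    PcaBiplotData.objects.all().delete()
    new = PcaBiplotData()
    new.pca_biplot_data = json.dumps({"data": pca_data}, separators=(",", ":"))
    new.save()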