Commit cdf55d3e authored by Hervé MENAGER
Browse files

deduplicate compounds during contribution validation

WIP on #212
parent ce49a077
Pipeline #29050 passed with stages
in 13 minutes and 33 seconds
......@@ -930,6 +930,34 @@ class Compound(AutoFillableModel):
def get_jobs(self):
    """
    Return the CompoundJob records whose ``compound`` is this instance.
    """
    jobs = CompoundJob.objects.filter(compound=self)
    return jobs
def replace_compound_references(self, replacing_compound):
    """
    Re-point every record that references this compound so that it
    references ``replacing_compound`` instead. Used to deal with
    duplicates in the database.

    Each referencing row is re-assigned and saved individually,
    one model after the other, in the order listed below.
    """
    # Models holding a ``compound`` reference that must follow the merge.
    referencing_models = (
        RefCompoundBiblio,
        CompoundAction,
        CompoundActivityResult,
        CompoundCytotoxicityResult,
        CompoundPKResult,
        CompoundTanimoto,
        DrugbankCompoundTanimoto,
    )
    for model in referencing_models:
        for record in model.objects.filter(compound=self):
            record.compound = replacing_compound
            record.save()
class CompoundTanimoto(models.Model):
canonical_smiles = models.TextField("Canonical Smile")
......
......@@ -112,8 +112,20 @@ def compute_compound_properties(compound_ids):
}
ippidb_convs = {value[0]: value[1] for key, value in property_mapping.items()}
ippidb_convs["id"] = int
output_ids = []
for cid, item in pc_properties_dict.items():
compound = Compound.objects.get(id=cid)
duplicate_compounds = Compound.objects.filter(
canonical_smile=property_mapping["CanonicalSmile"]
)
if len(duplicate_compounds) > 0:
duplicate_compound = duplicate_compounds[0]
print(
f"Replacing references to compound {compound.id}"
f" with existing and validated compound {duplicate_compound.id}"
)
compound.replace_compound_references(duplicate_compound)
compound = duplicate_compound
updated_properties = {}
for galaxy_prop, prop in property_mapping.items():
ippidb_prop = prop[0]
......@@ -129,6 +141,8 @@ def compute_compound_properties(compound_ids):
setattr(compound, key, value)
compound.compute_fsp3()
compound.save()
output_ids.append(compound.id)
return output_ids
def compute_drugbank_similarity(qs):
......@@ -329,8 +343,9 @@ def run_compute_compound_properties(self, compound_id):
cj.job = Job.objects.get(task_result__task_id=self.task_id)
cj.save()
self.write(std_out=f"Starting computation of compound properties for {compound_id}")
compute_compound_properties([compound_id])
result_compound_ids = compute_compound_properties([compound_id])
self.write(std_out=f"Finished computation of compound properties for {compound_id}")
return result_compound_ids[0]
@task(base=MonitorTask, bind=True)
......@@ -347,6 +362,7 @@ def run_update_compound_cached_properties(self, compound_ids=None):
self.write(
std_out=f"Finished caching of compound properties for {compound_ids or 'all compounds'}"
)
return compound_ids
@task(base=MonitorTask, bind=True)
......@@ -363,6 +379,7 @@ def run_compute_drugbank_similarity(self, compound_ids=None):
self.write(
std_out=f"Finished computing Drugbank similarity for {compound_ids or 'all compounds'}"
)
return compound_ids
@task(base=MonitorTask, bind=True)
......@@ -371,6 +388,7 @@ def run_validate(self, compound_ids):
self.write(std_out=f"Starting validation of compounds {compound_ids}")
validate(compound_ids)
self.write(std_out=f"Finished validation of compounds {compound_ids}")
return compound_ids
@task(base=MonitorTask, bind=True)
......@@ -406,9 +424,9 @@ def launch_validate_contributions(contribution_ids):
# build the "main" job
compounds_properties_computation_group = chain(
run_compounds_properties_computation_group,
run_update_compound_cached_properties.si(compound_ids),
run_compute_drugbank_similarity.si(compound_ids),
run_validate.si(compound_ids),
run_update_compound_cached_properties.s(),
run_compute_drugbank_similarity.s(),
run_validate.s(),
)
contribution_jobs.append(compounds_properties_computation_group)
# compounds_properties_computation_group.delay()
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment