From 5c1b38833ee7fc7acd56d906e3b322fbc8516554 Mon Sep 17 00:00:00 2001
From: Bryan Brancotte <bryan.brancotte@pasteur.fr>
Date: Fri, 30 Sep 2022 18:07:14 +0200
Subject: [PATCH] keep track in the index of occurrences of words, rework
 business_process so de-duplication is done before saving

---
 autocomplete_multi_models/business_process.py | 67 ++++++++++++++++---
 .../migrations/0002_add_count.py              | 16 +++++
 autocomplete_multi_models/models.py           |  4 ++
 autocomplete_multi_models/signals.py          |  1 -
 .../tests/test_business_process.py            | 42 +++++++++++-
 setup.cfg                                     |  2 +-
 6 files changed, 119 insertions(+), 13 deletions(-)
 create mode 100644 autocomplete_multi_models/migrations/0002_add_count.py

diff --git a/autocomplete_multi_models/business_process.py b/autocomplete_multi_models/business_process.py
index 3f3ac19..e33e483 100644
--- a/autocomplete_multi_models/business_process.py
+++ b/autocomplete_multi_models/business_process.py
@@ -31,11 +31,18 @@ def split_string(value):
 
 @atomic
 def rebuild_index():
+    """
+    Rebuild the whole index; this is faster than per-instance updates when a large number of instances have changed.
+    :return: None
+    """
     models.IndexedWord.objects.all().delete()
-    for model, field_names in utils.get_indexed_fields().items():
-        for instance in model.objects.only(*field_names):
-            add_instance_to_index(instance, field_names)
-    clean_duplicate()
+    objects = dict()
+    with connection.cursor() as cursor:
+        for model, field_names in utils.get_indexed_fields().items():
+            for instance in model.objects.only(*field_names):
+                _add_instance_to_index(instance, field_names, objects, cursor)
+    models.IndexedWord.objects.bulk_create(objects.values())
+
 
 
 def clean_duplicate():
@@ -50,23 +57,67 @@ def clean_duplicate():
 
 
 def add_instance_to_index(instance, field_names: List[str]):
+    """
+    Index all words from the specified fields of the instance, then update the index.
+
+    Warning: should only be used for a few instances as updating the index is slow; for large insertions, prefer rebuild_index.
+    :param instance: the instance to index
+    :param field_names: the fields to consider
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_instance_to_index(instance, field_names, objects, cursor)
+    _update_in_index(objects)
+
+
+def _add_instance_to_index(instance, field_names: List[str], objects: dict, cursor):
     f = getattr(instance, _CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME, None)
     if f and not f():
         return
     for field_name in field_names:
-        add_text_to_index(getattr(instance, field_name))
+        _add_text_to_index(getattr(instance, field_name), objects, cursor)
 
 
 def add_text_to_index(value: str):
+    """
+    Index all words from the string, then update the index.
+
+    Warning: should only be used for a few strings as updating the index is slow; for large insertions, prefer rebuild_index.
+
+    :param value: the string to tokenize and add to the index
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_text_to_index(value, objects, cursor)
+    _update_in_index(objects)
+
+
+def _update_in_index(objects: dict):
+    objects_to_create = []
+    for ac_word_upper, o in objects.items():
+        changed = (
+            models.IndexedWord.objects.annotate(ac_word=Unaccent('word'))
+            .filter(ac_word__iexact=ac_word_upper)
+            .update(occurrence=F('occurrence') + Value(o.occurrence))
+        )
+        if changed == 0:
+            objects_to_create.append(o)
+    models.IndexedWord.objects.bulk_create(objects_to_create)
+
+
+def _add_text_to_index(value: str, objects: dict, cursor):
     if value is None or value == '':
         return
-    objects = []
     for word in split_string(value):
         len_word = len(word)
         if len_word < _AUTOCOMPLETE_MIN_LENGTH or word.isdecimal() or len_word > 64:
             continue
-        objects.append(models.IndexedWord(word=word))
-    models.IndexedWord.objects.bulk_create(objects)
+        cursor.execute("SELECT UPPER(UNACCENT(%s)) as value", [word])
+        ac_word = cursor.fetchone()[0]
+        try:
+            objects[ac_word].occurrence += 1
+        except KeyError:
+            objects[ac_word] = models.IndexedWord(word=word, occurrence=1)
 
 
 def get_closest_matching_words(word: str, limit: Optional[int] = None, min_similarity: Optional[float] = None):
diff --git a/autocomplete_multi_models/migrations/0002_add_count.py b/autocomplete_multi_models/migrations/0002_add_count.py
new file mode 100644
index 0000000..5addef4
--- /dev/null
+++ b/autocomplete_multi_models/migrations/0002_add_count.py
@@ -0,0 +1,16 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('autocomplete_multi_models', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='IndexedWord',
+            name='occurrence',
+            field=models.IntegerField(default=None, null=True, blank=True),
+        ),
+    ]
diff --git a/autocomplete_multi_models/models.py b/autocomplete_multi_models/models.py
index 679c0a1..0a50c2a 100644
--- a/autocomplete_multi_models/models.py
+++ b/autocomplete_multi_models/models.py
@@ -12,6 +12,10 @@ class IndexedWord(django.db.models.Model):
         db_index=True,
         null=False,
     )
+    occurrence = django.db.models.IntegerField(
+        null=True,
+        blank=True,
+    )
 
     def __str__(self):
         return self.word
diff --git a/autocomplete_multi_models/signals.py b/autocomplete_multi_models/signals.py
index 49299c4..42d6b1d 100644
--- a/autocomplete_multi_models/signals.py
+++ b/autocomplete_multi_models/signals.py
@@ -3,7 +3,6 @@ from autocomplete_multi_models import business_process, utils
 
 def instance_update(sender, instance, field_names, **kwargs):
     business_process.add_instance_to_index(instance, field_names)
-    business_process.clean_duplicate()
 
 
 def instance_delete(sender, instance, field_names, **kwargs):
diff --git a/autocomplete_multi_models/tests/test_business_process.py b/autocomplete_multi_models/tests/test_business_process.py
index 722fa2b..6fc28ee 100644
--- a/autocomplete_multi_models/tests/test_business_process.py
+++ b/autocomplete_multi_models/tests/test_business_process.py
@@ -11,19 +11,55 @@ logger = logging.getLogger(__name__)
 
 
 class AutoCompleteTestCase(TestCase):
+    def test_count(self):
+        business_process.add_text_to_index("nous nous sommes promené")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'nous': 2,
+                'sommes': 1,
+                'promené': 1,
+            },
+        )
business_process.add_text_to_index("nous") + self.assertEqual(models.IndexedWord.objects.get(word='nous').occurrence, 3) + business_process.add_text_to_index("test") + self.assertEqual(models.IndexedWord.objects.get(word='test').occurrence, 1) + + def test_count_case(self): + business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés") + self.assertDictEqual( + dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]), + { + 'Nous': 3, + 'sommes': 2, + 'promené': 1, + 'rentrés': 1, + }, + ) + + def test_count_case_accent(self): + business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés; nous sommés là ") + self.assertDictEqual( + dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]), + { + 'Nous': 4, + 'sommes': 3, + 'promené': 1, + 'rentrés': 1, + }, + ) + def test_unaccent_ok(self): business_process.add_text_to_index("azertyêazerty azertyaezerty") - business_process.clean_duplicate() self.assertEqual(models.IndexedWord.objects.count(), 2) def test_split_ok(self): business_process.add_text_to_index("abcd (abcd) abcd|abcd,,,,]]]abcd") - business_process.clean_duplicate() self.assertEqual(models.IndexedWord.objects.count(), 1) def test_case_ignored(self): business_process.add_text_to_index("Nous nous") - business_process.clean_duplicate() self.assertEqual(models.IndexedWord.objects.count(), 1) def test_init_from_settings_fails(self): diff --git a/setup.cfg b/setup.cfg index a171f93..6a9b154 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = autocomplete-multi-models -version = 0.1 +version = 0.2 description = An app that index fields across multiple models, and expose an api to query for word similar to the query. long_description = file: README.md author = Bryan Brancotte -- GitLab