From 5c1b38833ee7fc7acd56d906e3b322fbc8516554 Mon Sep 17 00:00:00 2001
From: Bryan Brancotte <bryan.brancotte@pasteur.fr>
Date: Fri, 30 Sep 2022 18:07:14 +0200
Subject: [PATCH] keep track in the index of occurrences of words, rework
 business_process so de-duplication is done before saving

---
 autocomplete_multi_models/business_process.py | 67 ++++++++++++++++---
 .../migrations/0002_add_count.py              | 16 +++++
 autocomplete_multi_models/models.py           |  4 ++
 autocomplete_multi_models/signals.py          |  1 -
 .../tests/test_business_process.py            | 42 +++++++++++-
 setup.cfg                                     |  2 +-
 6 files changed, 119 insertions(+), 13 deletions(-)
 create mode 100644 autocomplete_multi_models/migrations/0002_add_count.py

diff --git a/autocomplete_multi_models/business_process.py b/autocomplete_multi_models/business_process.py
index 3f3ac19..e33e483 100644
--- a/autocomplete_multi_models/business_process.py
+++ b/autocomplete_multi_models/business_process.py
@@ -31,11 +31,18 @@ def split_string(value):
 
 @atomic
 def rebuild_index():
+    """
+    Rebuild the whole index; this is faster than per-instance updates when a large number of instances have changed.
+    :return: None
+    """
     models.IndexedWord.objects.all().delete()
-    for model, field_names in utils.get_indexed_fields().items():
-        for instance in model.objects.only(*field_names):
-            add_instance_to_index(instance, field_names)
-    clean_duplicate()
+    objects = dict()
+    with connection.cursor() as cursor:
+        for model, field_names in utils.get_indexed_fields().items():
+            for instance in model.objects.only(*field_names):
+                _add_instance_to_index(instance, field_names, objects, cursor)
+    models.IndexedWord.objects.bulk_create(objects.values())
+
 
 
 def clean_duplicate():
@@ -50,23 +57,67 @@ def clean_duplicate():
 
 
 def add_instance_to_index(instance, field_names: List[str]):
+    """
+    Index all words from the specified fields of the instance, then update the index.
+
+    Warning: should only be used for a few instances as updating the index is slow; for large insertions, prefer rebuild_index.
+    :param instance: the instance to index
+    :param field_names: the fields to consider
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_instance_to_index(instance, field_names, objects, cursor)
+    _update_in_index(objects)
+
+
+def _add_instance_to_index(instance, field_names: List[str], objects: dict, cursor):
     f = getattr(instance, _CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME, None)
     if f and not f():
         return
     for field_name in field_names:
-        add_text_to_index(getattr(instance, field_name))
+        _add_text_to_index(getattr(instance, field_name), objects, cursor)
 
 
 def add_text_to_index(value: str):
+    """
+    Index all words from the string, then update the index.
+
+    Warning: should only be used for a few strings as updating the index is slow; for large insertions, prefer rebuild_index.
+
+    :param value: the string to tokenize and add to the index
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_text_to_index(value, objects, cursor)
+    _update_in_index(objects)
+
+
+def _update_in_index(objects: dict):
+    objects_to_create = []
+    for ac_word_upper, o in objects.items():
+        changed = (
+            models.IndexedWord.objects.annotate(ac_word=Unaccent('word'))
+            .filter(ac_word__iexact=ac_word_upper)
+            .update(occurrence=F('occurrence') + Value(o.occurrence))
+        )
+        if changed == 0:
+            objects_to_create.append(o)
+    models.IndexedWord.objects.bulk_create(objects_to_create)
+
+
+def _add_text_to_index(value: str, objects: dict, cursor):
     if value is None or value == '':
         return
-    objects = []
     for word in split_string(value):
         len_word = len(word)
         if len_word < _AUTOCOMPLETE_MIN_LENGTH or word.isdecimal() or len_word > 64:
             continue
-        objects.append(models.IndexedWord(word=word))
-    models.IndexedWord.objects.bulk_create(objects)
+        cursor.execute("SELECT UPPER(UNACCENT(%s)) as value", [word])
+        ac_word = cursor.fetchone()[0]
+        try:
+            objects[ac_word].occurrence += 1
+        except KeyError:
+            objects[ac_word] = models.IndexedWord(word=word, occurrence=1)
 
 
 def get_closest_matching_words(word: str, limit: Optional[int] = None, min_similarity: Optional[float] = None):
diff --git a/autocomplete_multi_models/migrations/0002_add_count.py b/autocomplete_multi_models/migrations/0002_add_count.py
new file mode 100644
index 0000000..5addef4
--- /dev/null
+++ b/autocomplete_multi_models/migrations/0002_add_count.py
@@ -0,0 +1,16 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('autocomplete_multi_models', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='IndexedWord',
+            name='occurrence',
+            field=models.IntegerField(default=None, null=True, blank=True),
+        ),
+    ]
diff --git a/autocomplete_multi_models/models.py b/autocomplete_multi_models/models.py
index 679c0a1..0a50c2a 100644
--- a/autocomplete_multi_models/models.py
+++ b/autocomplete_multi_models/models.py
@@ -12,6 +12,10 @@ class IndexedWord(django.db.models.Model):
         db_index=True,
         null=False,
     )
+    occurrence = django.db.models.IntegerField(
+        null=True,
+        blank=True,
+    )
 
     def __str__(self):
         return self.word
diff --git a/autocomplete_multi_models/signals.py b/autocomplete_multi_models/signals.py
index 49299c4..42d6b1d 100644
--- a/autocomplete_multi_models/signals.py
+++ b/autocomplete_multi_models/signals.py
@@ -3,7 +3,6 @@ from autocomplete_multi_models import business_process, utils
 
 def instance_update(sender, instance, field_names, **kwargs):
     business_process.add_instance_to_index(instance, field_names)
-    business_process.clean_duplicate()
 
 
 def instance_delete(sender, instance, field_names, **kwargs):
diff --git a/autocomplete_multi_models/tests/test_business_process.py b/autocomplete_multi_models/tests/test_business_process.py
index 722fa2b..6fc28ee 100644
--- a/autocomplete_multi_models/tests/test_business_process.py
+++ b/autocomplete_multi_models/tests/test_business_process.py
@@ -11,19 +11,55 @@ logger = logging.getLogger(__name__)
 
 
 class AutoCompleteTestCase(TestCase):
+    def test_count(self):
+        business_process.add_text_to_index("nous nous sommes promené")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'nous': 2,
+                'sommes': 1,
+                'promené': 1,
+            },
+        )
business_process.add_text_to_index("nous") + self.assertEqual(models.IndexedWord.objects.get(word='nous').occurrence, 3) + business_process.add_text_to_index("test") + self.assertEqual(models.IndexedWord.objects.get(word='test').occurrence, 1) + + def test_count_case(self): + business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés") + self.assertDictEqual( + dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]), + { + 'Nous': 3, + 'sommes': 2, + 'promené': 1, + 'rentrés': 1, + }, + ) + + def test_count_case_accent(self): + business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés; nous sommés là ") + self.assertDictEqual( + dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]), + { + 'Nous': 4, + 'sommes': 3, + 'promené': 1, + 'rentrés': 1, + }, + ) + def test_unaccent_ok(self): business_process.add_text_to_index("azertyêazerty azertyaezerty") - business_process.clean_duplicate() self.assertEqual(models.IndexedWord.objects.count(), 2) def test_split_ok(self): business_process.add_text_to_index("abcd (abcd) abcd|abcd,,,,]]]abcd") - business_process.clean_duplicate() self.assertEqual(models.IndexedWord.objects.count(), 1) def test_case_ignored(self): business_process.add_text_to_index("Nous nous") - business_process.clean_duplicate() self.assertEqual(models.IndexedWord.objects.count(), 1) def test_init_from_settings_fails(self): diff --git a/setup.cfg b/setup.cfg index a171f93..6a9b154 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = autocomplete-multi-models -version = 0.1 +version = 0.2 description = An app that index fields across multiple models, and expose an api to query for word similar to the query. long_description = file: README.md author = Bryan Brancotte -- GitLab