Merge branch 'add-count' into 'main'

Keep track, in the index, of the number of occurrences of each word.

See merge request hub/django-autocomplete-multi-models!1

Merge branch 'add-count' into 'main'
9a767c45 · Bryan BRANCOTTE · 7ec96183 · e845180a · 9a767c45 · 9a767c45
Commit 9a767c45 authored 2 years ago by Bryan BRANCOTTE
--- a/autocomplete_multi_models/business_process.py
+++ b/autocomplete_multi_models/business_process.py
@@ -31,42 +31,92 @@ def split_string(value):

 @atomic
 def rebuild_index():
+    """
+    Rebuild the whole index. This is a faster way of updating the index when a large number of instances have changed.
+    :return:
+    """
    models.IndexedWord.objects.all().delete()
-    for model, field_names in utils.get_indexed_fields().items():
-        for instance in model.objects.only(*field_names):
-            add_instance_to_index(instance, field_names)
-    clean_duplicate()
-
-
-def clean_duplicate():
-    models.IndexedWord.objects.annotate(
-        is_duplicate=Exists(
-            models.IndexedWord.objects.filter(
-                word__iexact=OuterRef('word'),
-                pk__gt=OuterRef('pk'),
-            )
-        )
-    ).filter(is_duplicate=True).delete()
+    objects = dict()
+    with connection.cursor() as cursor:
+        for model, field_names in utils.get_indexed_fields().items():
+            for instance in model.objects.only(*field_names):
+                _add_instance_to_index(instance, field_names, objects, cursor)
+    models.IndexedWord.objects.bulk_create(objects.values())
+
+
+# def clean_duplicate():
+#     models.IndexedWord.objects.annotate(
+#         is_duplicate=Exists(
+#             models.IndexedWord.objects.filter(
+#                 word__iexact=OuterRef('word'),
+#                 pk__gt=OuterRef('pk'),
+#             )
+#         )
+#     ).filter(is_duplicate=True).delete()


 def add_instance_to_index(instance, field_names: List[str]):
+    """
+    Index all words from the specified fields of the instance, and then update the index.
+
+    Warning: this should only be used for a few instances, as updating the index is slow; for large insertions, prefer rebuild_index.
+    :param instance: the instance to study
+    :param field_names: fields to consider
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_instance_to_index(instance, field_names, objects, cursor)
+    _update_in_index(objects)
+
+
+def _add_instance_to_index(instance, field_names: List[str], objects: list, cursor):
    f = getattr(instance, _CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME, None)
    if f and not f():
        return
    for field_name in field_names:
-        add_text_to_index(getattr(instance, field_name))
+        _add_text_to_index(getattr(instance, field_name), objects, cursor)


 def add_text_to_index(value: str):
+    """
+    Index all words from the string, and then update the index.
+
+    Warning: this should only be used for a few instances, as updating the index is slow; for large insertions, prefer rebuild_index.
+
+    :param value: the string to tokenize and add to the index
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_text_to_index(value, objects, cursor)
+    _update_in_index(objects)
+
+
+def _update_in_index(objects: dict):
+    objects_to_create = []
+    for ac_word_upper, o in objects.items():
+        changed = (
+            models.IndexedWord.objects.annotate(ac_word=Unaccent('word'))
+            .filter(ac_word__iexact=ac_word_upper)
+            .update(occurrence=F('occurrence') + Value(o.occurrence))
+        )
+        if changed == 0:
+            objects_to_create.append(o)
+    models.IndexedWord.objects.bulk_create(objects_to_create)
+
+
+def _add_text_to_index(value: str, objects: list, cursor):
    if value is None or value == '':
        return
-    objects = []
    for word in split_string(value):
        len_word = len(word)
        if len_word < _AUTOCOMPLETE_MIN_LENGTH or word.isdecimal() or len_word > 64:
            continue
-        objects.append(models.IndexedWord(word=word))
-    models.IndexedWord.objects.bulk_create(objects)
+        cursor.execute("SELECT UPPER(UNACCENT(%s)) as value", [word])
+        ac_word = cursor.fetchone()[0]
+        try:
+            objects[ac_word].occurrence += 1
+        except KeyError:
+            objects[ac_word] = models.IndexedWord(word=word, occurrence=1)


 def get_closest_matching_words(word: str, limit: Optional[int] = None, min_similarity: Optional[float] = None):

--- a/autocomplete_multi_models/migrations/0002_add_count.py
+++ b/autocomplete_multi_models/migrations/0002_add_count.py
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('autocomplete_multi_models', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='IndexedWord',
+            name='occurrence',
+            field=models.IntegerField(default=None, null=True, blank=True),
+        ),
+    ]
--- a/autocomplete_multi_models/models.py
+++ b/autocomplete_multi_models/models.py
@@ -12,6 +12,10 @@ class IndexedWord(django.db.models.Model):
        db_index=True,
        null=False,
    )
+    occurrence = django.db.models.IntegerField(
+        null=True,
+        blank=True,
+    )

    def __str__(self):
        return self.word
--- a/autocomplete_multi_models/signals.py
+++ b/autocomplete_multi_models/signals.py
@@ -3,7 +3,6 @@ from autocomplete_multi_models import business_process, utils

 def instance_update(sender, instance, field_names, **kwargs):
    business_process.add_instance_to_index(instance, field_names)
-    business_process.clean_duplicate()


 def instance_delete(sender, instance, field_names, **kwargs):

--- a/autocomplete_multi_models/tests/test_business_process.py
+++ b/autocomplete_multi_models/tests/test_business_process.py
@@ -11,19 +11,55 @@ logger = logging.getLogger(__name__)


 class AutoCompleteTestCase(TestCase):
+    def test_count(self):
+        business_process.add_text_to_index("nous nous sommes promené")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'nous': 2,
+                'sommes': 1,
+                'promené': 1,
+            },
+        )
+        business_process.add_text_to_index("nous")
+        self.assertEqual(models.IndexedWord.objects.get(word='nous').occurrence, 3)
+        business_process.add_text_to_index("test")
+        self.assertEqual(models.IndexedWord.objects.get(word='test').occurrence, 1)
+
+    def test_count_case(self):
+        business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'Nous': 3,
+                'sommes': 2,
+                'promené': 1,
+                'rentrés': 1,
+            },
+        )
+
+    def test_count_case_accent(self):
+        business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés; nous sommés là")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'Nous': 4,
+                'sommes': 3,
+                'promené': 1,
+                'rentrés': 1,
+            },
+        )
+
    def test_unaccent_ok(self):
        business_process.add_text_to_index("azertyêazerty azertyaezerty")
-        business_process.clean_duplicate()
        self.assertEqual(models.IndexedWord.objects.count(), 2)

    def test_split_ok(self):
        business_process.add_text_to_index("abcd (abcd) abcd|abcd,,,,]]]abcd")
-        business_process.clean_duplicate()
        self.assertEqual(models.IndexedWord.objects.count(), 1)

    def test_case_ignored(self):
        business_process.add_text_to_index("Nous nous")
-        business_process.clean_duplicate()
        self.assertEqual(models.IndexedWord.objects.count(), 1)

    def test_init_from_settings_fails(self):
@@ -63,6 +99,13 @@ class AutoCompleteTestCase(TestCase):
        qs = business_process.get_closest_matching_words("ÄRNtoto", limit=-1, min_similarity=-1)
        self.assertGreater(qs.get(word="ARNtoto").similarity, qs.get(word="RNtoto").similarity)

+    def test_search_without_accent_find_accent(self):
+        models.IndexedWord.objects.create(word="azerty")
+        models.IndexedWord.objects.create(word="azérty")
+
+        qs = business_process.get_closest_matching_words("azerty", limit=-1, min_similarity=-1)
+        self.assertEqual(qs.get(word="azérty").similarity, qs.get(word="azerty").similarity)
+

 @override_settings(AUTOCOMPLETE_MIN_LENGTH=1)
 class MinLength1(test_helpers.ChangeAutoCompleteSettingsTestCase):

--- a/requirements-test.txt
+++ b/requirements-test.txt
 python-decouple
 pre-commit
 Black==20.8b1
+click~=8.0.4
 coverage
--- a/setup.cfg
+++ b/setup.cfg
 [metadata]
 name = autocomplete-multi-models
-version = 0.1
+version = 0.2
 description = An app that index fields across multiple models, and expose an api to query for word similar to the query.
 long_description = file: README.md
 author = Bryan Brancotte