From b0672bec96f7b4ebd01870c04535599bb541ad62 Mon Sep 17 00:00:00 2001
From: Bryan Brancotte <bryan.brancotte@pasteur.fr>
Date: Mon, 19 Dec 2022 15:03:46 +0100
Subject: [PATCH] Merge plural with singular form, ability to toggle this new
 feature

---
 autocomplete_multi_models/business_process.py | 25 ++++++++++++++++++-
 .../tests/test_business_process.py            | 14 +++++++++++
 autocomplete_multi_models/utils.py            |  6 +++++
 setup.cfg                                     |  2 +-
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/autocomplete_multi_models/business_process.py b/autocomplete_multi_models/business_process.py
index a2aa29b..bec655a 100644
--- a/autocomplete_multi_models/business_process.py
+++ b/autocomplete_multi_models/business_process.py
@@ -5,7 +5,7 @@ from django.contrib.postgres.lookups import Unaccent
 from django.contrib.postgres.search import TrigramSimilarity
 from django.db import connection
 from django.db.models import Exists, OuterRef, Case, When, Value, F
-from django.db.models.functions import Upper
+from django.db.models.functions import Upper, Substr, Length
 from django.db.transaction import atomic
 
 from autocomplete_multi_models import utils, models
@@ -15,6 +15,7 @@ _pattern = re.compile("[^\\w\\d]")
 _AUTOCOMPLETE_MIN_LENGTH = utils.DEFAULT_AUTOCOMPLETE_MIN_LENGTH
 _AUTOCOMPLETE_MIN_SIMILARITY = utils.DEFAULT_AUTOCOMPLETE_MIN_SIMILARITY
 _AUTOCOMPLETE_LIMIT = utils.DEFAULT_AUTOCOMPLETE_LIMIT
+_SHOULD_MERGE_PLURAL_INTO_SINGULAR = utils.DEFAULT_SHOULD_MERGE_PLURAL_INTO_SINGULAR
 _CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME = utils.DEFAULT_CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME
 __in_mem_storage = {
     utils.AUTO_UPDATE_ENABLED: True,
@@ -48,6 +49,7 @@ def rebuild_index():
                 _add_instance_to_index(instance, field_names, objects, cursor)
     models.IndexedWord.objects.bulk_create(objects.values())
     _purge_banned_words()
+    _merge_plural_into_singular()
 
 
 # def clean_duplicate():
@@ -109,6 +111,7 @@ def _update_in_index(objects: dict):
             objects_to_create.append(o)
     models.IndexedWord.objects.bulk_create(objects_to_create)
     _purge_banned_words()
+    _merge_plural_into_singular()
 
 
 def _purge_banned_words():
@@ -117,6 +120,26 @@ def _purge_banned_words():
     ).filter(banned=True).delete()
 
 
+def _merge_plural_into_singular():
+    if not _SHOULD_MERGE_PLURAL_INTO_SINGULAR:
+        return
+    for w in (
+        models.IndexedWord.objects.filter(word__endswith="s")
+        .annotate(
+            has_singular=Exists(
+                models.IndexedWord.objects.filter(
+                    word__iexact=Substr(OuterRef('word'), 1, Length(OuterRef('word')) - Value(1))
+                )
+            )
+        )
+        .filter(has_singular=True)
+    ):
+        # Words differing only by case are not supposed to coexist, so one singular row is expected to match.
+        # If that uniqueness is not ensured, every case variant of the singular gets the plural's occurrences.
+        models.IndexedWord.objects.filter(word__iexact=w.word[:-1]).update(occurrence=F('occurrence') + w.occurrence)
+        w.delete()
+
+
 def _add_text_to_index(value: str, objects: list, cursor):
     if value is None or value == '':
         return
diff --git a/autocomplete_multi_models/tests/test_business_process.py b/autocomplete_multi_models/tests/test_business_process.py
index ef1dbab..cf044ba 100644
--- a/autocomplete_multi_models/tests/test_business_process.py
+++ b/autocomplete_multi_models/tests/test_business_process.py
@@ -162,6 +162,20 @@ class NeedRebuildDefaultBehaviorTestCase(test_helpers.ChangeAutoCompleteSettings
         self.assertEqual(models.IndexedWord.objects.count(), 0)
 
 
+@override_settings(SHOULD_MERGE_PLURAL_INTO_SINGULAR=True)
+class MergePlural1(test_helpers.ChangeAutoCompleteSettingsTestCase):
+    def test_it(self):
+        business_process.add_text_to_index("Gene genes gene Genes")
+        self.assertEqual(models.IndexedWord.objects.count(), 1)
+
+
+@override_settings(SHOULD_MERGE_PLURAL_INTO_SINGULAR=False)
+class MergePlural2(test_helpers.ChangeAutoCompleteSettingsTestCase):
+    def test_it(self):
+        business_process.add_text_to_index("Gene genes gene genes")
+        self.assertEqual(models.IndexedWord.objects.count(), 2)
+
+
 @override_settings(
     AUTOCOMPLETE_PERSISTENT_VARIABLE_GETTER_SETTER=(
         "autocomplete_multi_models.tests.settings_storage_file_based.get_fcn",
diff --git a/autocomplete_multi_models/utils.py b/autocomplete_multi_models/utils.py
index e01972e..6aaaec9 100644
--- a/autocomplete_multi_models/utils.py
+++ b/autocomplete_multi_models/utils.py
@@ -5,6 +5,7 @@ from django.apps import apps
 DEFAULT_AUTOCOMPLETE_MIN_LENGTH = 4
 DEFAULT_AUTOCOMPLETE_MIN_SIMILARITY = 0.3
 DEFAULT_AUTOCOMPLETE_LIMIT = 10
+DEFAULT_SHOULD_MERGE_PLURAL_INTO_SINGULAR = False
 REBUILD_NEEDED = "is_autocomplete_multi_models_rebuild_needed"
 DEFAULT_CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME = "can_be_indexed_by_autocomplete"
 AUTO_UPDATE_ENABLED = "is_autocomplete_auto_update_on_save_enabled"
@@ -35,6 +36,11 @@ def init_from_settings():
         'AUTOCOMPLETE_LIMIT',
         DEFAULT_AUTOCOMPLETE_LIMIT,
     )
+    business_process._SHOULD_MERGE_PLURAL_INTO_SINGULAR = getattr(
+        settings,
+        'SHOULD_MERGE_PLURAL_INTO_SINGULAR',
+        DEFAULT_SHOULD_MERGE_PLURAL_INTO_SINGULAR,
+    )
     business_process._CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME = getattr(
         settings,
         'CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME',
diff --git a/setup.cfg b/setup.cfg
index b146079..18eec64 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = autocomplete-multi-models
-version = 0.4.3.1
+version = 0.5
 description = An app that index fields across multiple models, and expose an api to query for word similar to the query.
 long_description = file: README.md
 author = Bryan Brancotte
-- 
GitLab