diff --git a/autocomplete_multi_models/business_process.py b/autocomplete_multi_models/business_process.py
index 3f3ac196ef5ed17dabfcb06e64c098619e036d5e..b86026af350bf7ccbdfda1fb767a722b134053d2 100644
--- a/autocomplete_multi_models/business_process.py
+++ b/autocomplete_multi_models/business_process.py
@@ -31,42 +31,92 @@ def split_string(value):
 
 @atomic
 def rebuild_index():
+    """
+    Rebuild the whole index. This is the faster way to update the index
+    when a large number of instances has changed.
+    """
     models.IndexedWord.objects.all().delete()
-    for model, field_names in utils.get_indexed_fields().items():
-        for instance in model.objects.only(*field_names):
-            add_instance_to_index(instance, field_names)
-    clean_duplicate()
-
-
-def clean_duplicate():
-    models.IndexedWord.objects.annotate(
-        is_duplicate=Exists(
-            models.IndexedWord.objects.filter(
-                word__iexact=OuterRef('word'),
-                pk__gt=OuterRef('pk'),
-            )
-        )
-    ).filter(is_duplicate=True).delete()
+    objects = dict()
+    with connection.cursor() as cursor:
+        for model, field_names in utils.get_indexed_fields().items():
+            for instance in model.objects.only(*field_names):
+                _add_instance_to_index(instance, field_names, objects, cursor)
+    models.IndexedWord.objects.bulk_create(objects.values())
+
+
+# def clean_duplicate():
+#     models.IndexedWord.objects.annotate(
+#         is_duplicate=Exists(
+#             models.IndexedWord.objects.filter(
+#                 word__iexact=OuterRef('word'),
+#                 pk__gt=OuterRef('pk'),
+#             )
+#         )
+#     ).filter(is_duplicate=True).delete()
 
 
 def add_instance_to_index(instance, field_names: List[str]):
+    """
+    Index all words from the specified fields of the instance, then update the index.
+
+    Warning: should only be used for a few instances as index updates are slow; for large insertions, prefer rebuild_index.
+    :param instance: the instance to index
+    :param field_names: fields to consider
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_instance_to_index(instance, field_names, objects, cursor)
+    _update_in_index(objects)
+
+
+def _add_instance_to_index(instance, field_names: List[str], objects: dict, cursor):
     f = getattr(instance, _CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME, None)
     if f and not f():
         return
     for field_name in field_names:
-        add_text_to_index(getattr(instance, field_name))
+        _add_text_to_index(getattr(instance, field_name), objects, cursor)
 
 
 def add_text_to_index(value: str):
+    """
+    Index all words from the string, then update the index.
+
+    Warning: should only be used for a few strings as index updates are slow; for large insertions, prefer rebuild_index.
+
+    :param value: the string to tokenize and add to the index
+    """
+    objects = dict()
+    with connection.cursor() as cursor:
+        _add_text_to_index(value, objects, cursor)
+    _update_in_index(objects)
+
+
+def _update_in_index(objects: dict):
+    objects_to_create = []
+    for ac_word_upper, o in objects.items():
+        changed = (
+            models.IndexedWord.objects.annotate(ac_word=Unaccent('word'))
+            .filter(ac_word__iexact=ac_word_upper)
+            .update(occurrence=F('occurrence') + Value(o.occurrence))
+        )
+        if changed == 0:
+            objects_to_create.append(o)
+    models.IndexedWord.objects.bulk_create(objects_to_create)
+
+
+def _add_text_to_index(value: str, objects: dict, cursor):
     if value is None or value == '':
         return
-    objects = []
     for word in split_string(value):
         len_word = len(word)
         if len_word < _AUTOCOMPLETE_MIN_LENGTH or word.isdecimal() or len_word > 64:
             continue
-        objects.append(models.IndexedWord(word=word))
-    models.IndexedWord.objects.bulk_create(objects)
+        cursor.execute("SELECT UPPER(UNACCENT(%s)) as value", [word])
+        ac_word = cursor.fetchone()[0]
+        try:
+            objects[ac_word].occurrence += 1
+        except KeyError:
+            objects[ac_word] = models.IndexedWord(word=word, occurrence=1)
 
 
 def get_closest_matching_words(word: str, limit: Optional[int] = None, min_similarity: Optional[float] = None):
diff --git a/autocomplete_multi_models/migrations/0002_add_count.py b/autocomplete_multi_models/migrations/0002_add_count.py
new file mode 100644
index 0000000000000000000000000000000000000000..5addef4ccb4f4b11489c9955a71d99ca51162643
--- /dev/null
+++ b/autocomplete_multi_models/migrations/0002_add_count.py
@@ -0,0 +1,16 @@
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('autocomplete_multi_models', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='IndexedWord',
+            name='occurrence',
+            field=models.IntegerField(default=None, null=True, blank=True),
+        ),
+    ]
diff --git a/autocomplete_multi_models/models.py b/autocomplete_multi_models/models.py
index 679c0a1a484731ce4c9958fc93d3810041412d03..0a50c2a92162c9a2cc310b87630a8a75f11f5f67 100644
--- a/autocomplete_multi_models/models.py
+++ b/autocomplete_multi_models/models.py
@@ -12,6 +12,10 @@ class IndexedWord(django.db.models.Model):
         db_index=True,
         null=False,
     )
+    occurrence = django.db.models.IntegerField(
+        null=True,
+        blank=True,
+    )
 
     def __str__(self):
         return self.word
diff --git a/autocomplete_multi_models/signals.py b/autocomplete_multi_models/signals.py
index 49299c48f665cdae0b8fb8f1c0eaf56d04eedccf..42d6b1d298117bf6d7a9b4a02fc52e395ed1936c 100644
--- a/autocomplete_multi_models/signals.py
+++ b/autocomplete_multi_models/signals.py
@@ -3,7 +3,6 @@ from autocomplete_multi_models import business_process, utils
 
 def instance_update(sender, instance, field_names, **kwargs):
     business_process.add_instance_to_index(instance, field_names)
-    business_process.clean_duplicate()
 
 
 def instance_delete(sender, instance, field_names, **kwargs):
diff --git a/autocomplete_multi_models/tests/test_business_process.py b/autocomplete_multi_models/tests/test_business_process.py
index 722fa2bd6bc3ebdbdcc53b9427a5f511ff3a478e..c3ed3911736f0b1952eeafa6ed3c8c84e2b40c6c 100644
--- a/autocomplete_multi_models/tests/test_business_process.py
+++ b/autocomplete_multi_models/tests/test_business_process.py
@@ -11,19 +11,55 @@ logger = logging.getLogger(__name__)
 
 
 class AutoCompleteTestCase(TestCase):
+    def test_count(self):
+        business_process.add_text_to_index("nous nous sommes promené")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'nous': 2,
+                'sommes': 1,
+                'promené': 1,
+            },
+        )
+        business_process.add_text_to_index("nous")
+        self.assertEqual(models.IndexedWord.objects.get(word='nous').occurrence, 3)
+        business_process.add_text_to_index("test")
+        self.assertEqual(models.IndexedWord.objects.get(word='test').occurrence, 1)
+
+    def test_count_case(self):
+        business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'Nous': 3,
+                'sommes': 2,
+                'promené': 1,
+                'rentrés': 1,
+            },
+        )
+
+    def test_count_case_accent(self):
+        business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés; nous sommés là ")
+        self.assertDictEqual(
+            dict([(o.word, o.occurrence) for o in models.IndexedWord.objects.all()]),
+            {
+                'Nous': 4,
+                'sommes': 3,
+                'promené': 1,
+                'rentrés': 1,
+            },
+        )
+
     def test_unaccent_ok(self):
         business_process.add_text_to_index("azertyêazerty azertyaezerty")
-        business_process.clean_duplicate()
         self.assertEqual(models.IndexedWord.objects.count(), 2)
 
     def test_split_ok(self):
         business_process.add_text_to_index("abcd (abcd) abcd|abcd,,,,]]]abcd")
-        business_process.clean_duplicate()
         self.assertEqual(models.IndexedWord.objects.count(), 1)
 
     def test_case_ignored(self):
         business_process.add_text_to_index("Nous nous")
-        business_process.clean_duplicate()
         self.assertEqual(models.IndexedWord.objects.count(), 1)
 
     def test_init_from_settings_fails(self):
@@ -63,6 +99,13 @@ class AutoCompleteTestCase(TestCase):
         qs = business_process.get_closest_matching_words("ÄRNtoto", limit=-1, min_similarity=-1)
         self.assertGreater(qs.get(word="ARNtoto").similarity, qs.get(word="RNtoto").similarity)
 
+    def test_search_without_accent_find_accent(self):
+        models.IndexedWord.objects.create(word="azerty")
+        models.IndexedWord.objects.create(word="azérty")
+
+        qs = business_process.get_closest_matching_words("azerty", limit=-1, min_similarity=-1)
+        self.assertEqual(qs.get(word="azérty").similarity, qs.get(word="azerty").similarity)
+
 
 @override_settings(AUTOCOMPLETE_MIN_LENGTH=1)
 class MinLength1(test_helpers.ChangeAutoCompleteSettingsTestCase):
diff --git a/requirements-test.txt b/requirements-test.txt
index c0377762a3b876cfc36dcd2f4072b2e487943428..24392f5d685dd46292fb66f9855939fe640500a2 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,4 +1,5 @@
 python-decouple
 pre-commit
 Black==20.8b1
+click~=8.0.4
 coverage
diff --git a/setup.cfg b/setup.cfg
index a171f9356ee862f84dfd3ef344ee26bffde63701..6a9b154063523767739aafa61798502de9ba43b8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = autocomplete-multi-models
-version = 0.1
+version = 0.2
 description = An app that index fields across multiple models, and expose an api to query for word similar to the query.
 long_description = file: README.md
 author = Bryan Brancotte
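
Reviewer note, not part of the patch: a minimal sketch of how the reworked indexing API behaves after this change, mirroring the tests above. It assumes a configured Django project with autocomplete_multi_models installed and a PostgreSQL backend with the unaccent extension available; the similarity threshold used below is illustrative only.

# Illustrative sketch only -- assumes a configured Django project with
# autocomplete_multi_models installed and PostgreSQL providing UNACCENT.
from autocomplete_multi_models import business_process, models

# Repeated words now increment `occurrence` instead of creating duplicate rows,
# so no separate clean_duplicate() pass is needed after indexing.
business_process.add_text_to_index("nous nous sommes promené")
assert models.IndexedWord.objects.get(word='nous').occurrence == 2

# After bulk changes to the registered models, rebuild the whole index in one pass.
business_process.rebuild_index()

# Matching is case- and accent-insensitive; each hit carries a similarity annotation.
for hit in business_process.get_closest_matching_words("promene", limit=5, min_similarity=0.2):
    print(hit.word, hit.occurrence, hit.similarity)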