Skip to content
Snippets Groups Projects
Commit 9a767c45 authored by Bryan BRANCOTTE's avatar Bryan BRANCOTTE
Browse files

Merge branch 'add-count' into 'main'

keep track in the index of occurrences of word

See merge request hub/django-autocomplete-multi-models!1
parents 7ec96183 e845180a
No related branches found
No related tags found
No related merge requests found
......@@ -31,42 +31,92 @@ def split_string(value):
@atomic
def rebuild_index():
    """
    Rebuild the whole index from scratch.

    Every existing ``IndexedWord`` row is deleted, then all indexed fields of all
    indexed models are tokenized in memory and the resulting words are written
    with a single ``bulk_create``. Prefer this over ``add_instance_to_index``
    when a large amount of instances has changed.
    :return: None
    """
    models.IndexedWord.objects.all().delete()
    # Pending IndexedWord objects, keyed by the upper-cased unaccented form of the word,
    # so that case/accent variants are counted together before anything is written.
    objects = dict()
    with connection.cursor() as cursor:
        for model, field_names in utils.get_indexed_fields().items():
            for instance in model.objects.only(*field_names):
                _add_instance_to_index(instance, field_names, objects, cursor)
    models.IndexedWord.objects.bulk_create(objects.values())


def clean_duplicate():
    """
    Delete every ``IndexedWord`` for which another row exists with a
    case-insensitively equal word and a greater primary key.

    NOTE(review): deleting a duplicate row also discards whatever ``occurrence``
    value it carried; the count is not merged into the surviving row.
    """
    models.IndexedWord.objects.annotate(
        is_duplicate=Exists(
            models.IndexedWord.objects.filter(
                word__iexact=OuterRef('word'),
                pk__gt=OuterRef('pk'),
            )
        )
    ).filter(is_duplicate=True).delete()
# def clean_duplicate():
# models.IndexedWord.objects.annotate(
# is_duplicate=Exists(
# models.IndexedWord.objects.filter(
# word__iexact=OuterRef('word'),
# pk__gt=OuterRef('pk'),
# )
# )
# ).filter(is_duplicate=True).delete()
def add_instance_to_index(instance, field_names: List[str]):
    """
    Tokenize the requested fields of a single instance and merge the words into the index.

    Warning: updating the index is slow, so this is meant for a few instances only;
    for large insertions prefer rebuild_index.
    :param instance: the instance whose fields are read
    :param field_names: names of the fields to index
    """
    pending_words = {}
    with connection.cursor() as cursor:
        _add_instance_to_index(instance, field_names, pending_words, cursor)
    # Persist the counts gathered above (update existing rows, create missing ones).
    _update_in_index(pending_words)
def _add_instance_to_index(instance, field_names: List[str], objects: dict, cursor):
    """
    Collect the words of the given fields of ``instance`` into ``objects``.

    Nothing is written to the database here; the caller decides how ``objects``
    is persisted.
    :param instance: the instance whose fields are read
    :param field_names: names of the fields to index
    :param objects: dict of pending IndexedWord objects, mutated in place
    :param cursor: open database cursor, passed on to _add_text_to_index
    """
    # An instance may expose an opt-out hook; when present and falsy, skip the instance.
    can_be_indexed = getattr(instance, _CAN_BE_INDEXED_BY_AUTOCOMPLETE_FUNCTION_NAME, None)
    if can_be_indexed and not can_be_indexed():
        return
    for field_name in field_names:
        # Only accumulate: calling the public add_text_to_index here as well would
        # index every field a second time and write to the database per field.
        _add_text_to_index(getattr(instance, field_name), objects, cursor)
def add_text_to_index(value: str):
    """
    Tokenize a string and merge its words into the index.

    Warning: updating the index is slow, so this is meant for small inputs only;
    for large insertions prefer rebuild_index.
    :param value: the string to tokenize and add to the index
    """
    pending_words = {}
    with connection.cursor() as cursor:
        _add_text_to_index(value, pending_words, cursor)
    # Persist the counts gathered above (update existing rows, create missing ones).
    _update_in_index(pending_words)
def _update_in_index(objects: dict):
    """
    Merge freshly counted words into the persisted index.

    For each entry, rows whose unaccented word matches the key case-insensitively
    get their ``occurrence`` increased by the new count; entries matching no row
    are created in one ``bulk_create`` at the end.
    :param objects: dict mapping the upper-cased unaccented word to an unsaved
        IndexedWord carrying the number of occurrences to add
    """
    # Local import so this block does not depend on the file-level import list.
    from django.db.models.functions import Coalesce

    objects_to_create = []
    for ac_word_upper, o in objects.items():
        changed = (
            models.IndexedWord.objects.annotate(ac_word=Unaccent('word'))
            .filter(ac_word__iexact=ac_word_upper)
            # occurrence is nullable: NULL + n is NULL in SQL, so treat a missing
            # count as 0 instead of leaving the row stuck at NULL forever.
            .update(occurrence=Coalesce(F('occurrence'), Value(0)) + Value(o.occurrence))
        )
        if changed == 0:
            objects_to_create.append(o)
    models.IndexedWord.objects.bulk_create(objects_to_create)
def _add_text_to_index(value: str, objects: dict, cursor):
    """
    Tokenize ``value`` and count its words into ``objects``.

    Words shorter than ``_AUTOCOMPLETE_MIN_LENGTH``, longer than 64 characters or
    made only of decimal digits are ignored. Nothing is written to the database.
    :param value: the string to tokenize; None and '' are ignored
    :param objects: dict of pending IndexedWord objects keyed by the upper-cased
        unaccented word, mutated in place
    :param cursor: open database cursor used to normalise each word
    """
    if value is None or value == '':
        return
    for word in split_string(value):
        len_word = len(word)
        if len_word < _AUTOCOMPLETE_MIN_LENGTH or word.isdecimal() or len_word > 64:
            continue
        # Let the database compute the key so it matches what Unaccent/iexact
        # will compare against when the index is updated.
        cursor.execute("SELECT UPPER(UNACCENT(%s)) as value", [word])
        ac_word = cursor.fetchone()[0]
        try:
            objects[ac_word].occurrence += 1
        except KeyError:
            # First time this word is seen: keep the spelling of its first occurrence.
            objects[ac_word] = models.IndexedWord(word=word, occurrence=1)
def get_closest_matching_words(word: str, limit: Optional[int] = None, min_similarity: Optional[float] = None):
......
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the nullable integer field ``occurrence`` to the ``IndexedWord`` model."""

    # Must run after the initial migration of this app.
    dependencies = [
        ('autocomplete_multi_models', '0001_initial'),
    ]
    operations = [
        migrations.AddField(
            model_name='IndexedWord',
            name='occurrence',
            # default=None with null=True: rows that already exist get NULL as their count.
            field=models.IntegerField(default=None, null=True, blank=True),
        ),
    ]
......@@ -12,6 +12,10 @@ class IndexedWord(django.db.models.Model):
db_index=True,
null=False,
)
# Counter attached to the word; nullable, so a row may carry no count at all (NULL).
occurrence = django.db.models.IntegerField(
    null=True,
    blank=True,
)
def __str__(self):
    """Return the indexed word itself as the string representation."""
    return self.word
......@@ -3,7 +3,6 @@ from autocomplete_multi_models import business_process, utils
def instance_update(sender, instance, field_names, **kwargs):
    """
    Index the given fields of ``instance``.

    NOTE(review): the (sender, instance, **kwargs) signature suggests this is
    connected as a Django signal receiver; confirm against the wiring code.
    :param sender: unused here
    :param instance: the instance to index
    :param field_names: names of the fields to index
    """
    # add_instance_to_index already merges case/accent variants into existing rows,
    # so no duplicate clean-up pass over the whole table is needed afterwards.
    business_process.add_instance_to_index(instance, field_names)
def instance_delete(sender, instance, field_names, **kwargs):
......
......@@ -11,19 +11,55 @@ logger = logging.getLogger(__name__)
class AutoCompleteTestCase(TestCase):
def test_count(self):
    """Occurrences are counted within one text and accumulated across later calls."""
    business_process.add_text_to_index("nous nous sommes promené")
    counts = {entry.word: entry.occurrence for entry in models.IndexedWord.objects.all()}
    expected = {
        'nous': 2,
        'sommes': 1,
        'promené': 1,
    }
    self.assertDictEqual(counts, expected)

    # A word that is already indexed gets its counter incremented.
    business_process.add_text_to_index("nous")
    nous = models.IndexedWord.objects.get(word='nous')
    self.assertEqual(nous.occurrence, 3)

    # A word that is not indexed yet starts at one.
    business_process.add_text_to_index("test")
    test_word = models.IndexedWord.objects.get(word='test')
    self.assertEqual(test_word.occurrence, 1)
def test_count_case(self):
    """Case variants of a word share one entry, stored under the first spelling seen."""
    business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés")
    counts = {entry.word: entry.occurrence for entry in models.IndexedWord.objects.all()}
    expected = {
        'Nous': 3,
        'sommes': 2,
        'promené': 1,
        'rentrés': 1,
    }
    self.assertDictEqual(counts, expected)
def test_count_case_accent(self):
    """Case and accent variants of a word share one entry and one counter."""
    business_process.add_text_to_index("Nous nous sommes promené, et nous soMMes rentrés; nous sommés là")
    counts = {entry.word: entry.occurrence for entry in models.IndexedWord.objects.all()}
    expected = {
        'Nous': 4,
        'sommes': 3,
        'promené': 1,
        'rentrés': 1,
    }
    self.assertDictEqual(counts, expected)
def test_unaccent_ok(self):
    """An accented letter inside a word does not split it: two words are indexed."""
    business_process.add_text_to_index("azertyêazerty azertyaezerty")
    # No clean-up pass: the count must be right directly after indexing.
    self.assertEqual(models.IndexedWord.objects.count(), 2)
def test_split_ok(self):
    """Punctuation separates words, and the repeated word ends up as a single entry."""
    business_process.add_text_to_index("abcd (abcd) abcd|abcd,,,,]]]abcd")
    # No clean-up pass: the count must be right directly after indexing.
    self.assertEqual(models.IndexedWord.objects.count(), 1)
def test_case_ignored(self):
    """Two spellings differing only by case end up as a single entry."""
    business_process.add_text_to_index("Nous nous")
    # No clean-up pass: the count must be right directly after indexing.
    self.assertEqual(models.IndexedWord.objects.count(), 1)
def test_init_from_settings_fails(self):
......@@ -63,6 +99,13 @@ class AutoCompleteTestCase(TestCase):
qs = business_process.get_closest_matching_words("ÄRNtoto", limit=-1, min_similarity=-1)
self.assertGreater(qs.get(word="ARNtoto").similarity, qs.get(word="RNtoto").similarity)
def test_search_without_accent_find_accent(self):
    """An unaccented query scores the accented and unaccented spellings equally."""
    for spelling in ("azerty", "azérty"):
        models.IndexedWord.objects.create(word=spelling)
    matches = business_process.get_closest_matching_words("azerty", limit=-1, min_similarity=-1)
    accented = matches.get(word="azérty")
    plain = matches.get(word="azerty")
    self.assertEqual(accented.similarity, plain.similarity)
@override_settings(AUTOCOMPLETE_MIN_LENGTH=1)
class MinLength1(test_helpers.ChangeAutoCompleteSettingsTestCase):
......
python-decouple
pre-commit
Black==20.8b1
click~=8.0.4
coverage
[metadata]
name = autocomplete-multi-models
version = 0.1
version = 0.2
description = An app that indexes fields across multiple models and exposes an API to query for words similar to a search term.
long_description = file: README.md
author = Bryan Brancotte
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment