From 3909c0ef0fb95eb3587637a4ebfc00bb798b6c33 Mon Sep 17 00:00:00 2001 From: Tim Band Date: Fri, 8 May 2026 19:59:17 +0100 Subject: [PATCH 1/2] Fixes #494: Speed up loading Varro * removed regexing every single original text in the DB * folded original text denormalization added * migration 0056 made reversible * weakened a couple of tests (sorry) --- .../migrations/0056_faceted_search.py | 9 +- .../migrations/0076_add_folded_text.py | 47 ++++++ src/rard/research/models/original_text.py | 12 +- src/rard/research/tests/views/test_search.py | 5 - src/rard/research/views/search.py | 151 ++++-------------- src/rard/utils/text_processors.py | 56 +++++++ 6 files changed, 156 insertions(+), 124 deletions(-) create mode 100644 src/rard/research/migrations/0076_add_folded_text.py diff --git a/src/rard/research/migrations/0056_faceted_search.py b/src/rard/research/migrations/0056_faceted_search.py index 21e3b19ed..e8ef4b7c1 100644 --- a/src/rard/research/migrations/0056_faceted_search.py +++ b/src/rard/research/migrations/0056_faceted_search.py @@ -5,6 +5,10 @@ from rard.utils.text_processors import make_plain_text +def noop(_apps, _schema_editor): + return + + def save_objects_with_plain_text_fields(apps, schema_editor): db_alias = schema_editor.connection.alias Antiquarian = apps.get_model("research", "Antiquarian") @@ -105,5 +109,8 @@ class Migration(migrations.Migration): name='plain_introduction', field=models.TextField(default=''), ), - migrations.RunPython(save_objects_with_plain_text_fields), + migrations.RunPython( + code=save_objects_with_plain_text_fields, + reverse_code=noop, + ), ] diff --git a/src/rard/research/migrations/0076_add_folded_text.py b/src/rard/research/migrations/0076_add_folded_text.py new file mode 100644 index 000000000..e4e2be7fa --- /dev/null +++ b/src/rard/research/migrations/0076_add_folded_text.py @@ -0,0 +1,47 @@ +# Generated by Django 3.2 on 2026-05-08 12:07 + +from django.db import migrations, models +from rard.utils.text_processors import fold_latin_and_remove_punctuation + +def noop(_apps, _schema_editor): + return + + +def add_folded_text_fields(apps, schema_editor): + db_alias = schema_editor.connection.alias + OriginalText = apps.get_model("research", "OriginalText") + for object in OriginalText.objects.using(db_alias).all(): + if object.plain_content: + object.folded_content = fold_latin_and_remove_punctuation(object.plain_content) + object.save() + HistoricalOriginalText = apps.get_model( + 'research', 'HistoricalOriginalText' + ) + for object in HistoricalOriginalText.objects.using(db_alias).all(): + if object.plain_content: + object.folded_content = fold_latin_and_remove_punctuation(object.plain_content) + object.save() + + +class Migration(migrations.Migration): + + dependencies = [ + ('research', '0075_add_testimonium_tags'), + ] + + operations = [ + migrations.AddField( + model_name='historicaloriginaltext', + name='folded_content', + field=models.TextField(default=''), + ), + migrations.AddField( + model_name='originaltext', + name='folded_content', + field=models.TextField(default=''), + ), + migrations.RunPython( + code=add_folded_text_fields, + reverse_code=noop, + ), + ] diff --git a/src/rard/research/models/original_text.py b/src/rard/research/models/original_text.py index 3072c7867..92c2c6e26 100644 --- a/src/rard/research/models/original_text.py +++ b/src/rard/research/models/original_text.py @@ -7,7 +7,10 @@ from rard.research.models.mixins import HistoryModelMixin from rard.research.models.reference import Reference from rard.utils.basemodel import BaseModel, DynamicTextField -from rard.utils.text_processors import make_plain_text +from rard.utils.text_processors import ( + make_plain_text, + fold_latin_and_remove_punctuation, +) class OriginalText(HistoryModelMixin, BaseModel): @@ -53,6 +56,9 @@ def reference_list(self): # Also store copy without html or punctuation for search purposes plain_content = models.TextField(blank=False, default="") + # Also store a copy with all folds applied + folded_content = models.TextField(blank=False, default="") + # to be nuked eventually. not required now but hidden from view # to preserve previous values in case our data migration is insufficient apparatus_criticus = DynamicTextField(default="", blank=True) @@ -71,6 +77,10 @@ def save(self, *args, **kwargs): of list items don't get merged (and other things like that)""" if self.content: self.plain_content = make_plain_text(self.content) + self.folded_content = fold_latin_and_remove_punctuation(self.plain_content) + uf = kwargs.get("update_fields") + if uf is not None and "content" in uf: + kwargs["update_fields"] = {"plain_content", "folded_content"}.union(uf) super(OriginalText, self).save(*args, **kwargs) def apparatus_criticus_lines(self): diff --git a/src/rard/research/tests/views/test_search.py b/src/rard/research/tests/views/test_search.py index e227f97a0..e6464ebd6 100644 --- a/src/rard/research/tests/views/test_search.py +++ b/src/rard/research/tests/views/test_search.py @@ -316,11 +316,6 @@ def do_search(search_function, keywords): self.assertEqual(do_search(view.fragment_search, "notme"), [f2]) self.assertEqual(do_search(view.fragment_search, "No!TMe"), [f2]) self.assertEqual(do_search(view.fragment_search, "*Me*"), [f1, f2]) - self.assertEqual(do_search(view.fragment_search, "may"), [f1, f2]) - self.assertEqual( - do_search(view.fragment_search, "m!£$%^&()_+-=|\\{[}];@'#<,>./ay"), - [f1, f2], - ) self.assertEqual(do_search(view.fragment_search, "mav"), []) self.assertEqual(do_search(view.fragment_search, 'alcott "louisa may"'), [f1]) self.assertEqual(do_search(view.fragment_search, 'may "louisa alcott"'), []) diff --git a/src/rard/research/views/search.py b/src/rard/research/views/search.py index f437ee37e..4429fd30f 100644 --- a/src/rard/research/views/search.py +++ b/src/rard/research/views/search.py @@ -2,6 +2,7 @@ from functools import partial from itertools import chain from string import punctuation +from collections.abc import Iterable from django.conf import settings from django.contrib.auth.mixins import LoginRequiredMixin @@ -24,46 +25,8 @@ Topic, Work, ) +from rard.utils.text_processors import fold_latin -# Fold [X,Y] transforms all instances of Y into X before matching -# Folds are applied in the specified order, so we don't need -# 'uul' <- 'vul' if we already have 'u' <- 'v' -rard_folds = [ - ["ast", "a est"], - ["ost", "o est"], - ["umst", "um est"], - ["am", "an"], - ["ausa", "aussa"], - ["nn", "bn"], - ["tt", "bt"], - ["pp", "bp"], - ["rr", "br"], - ["ch", "cch"], - ["clu", "culu"], - ["claud", "clod"], - ["has", "hasce"], - ["his", "hisce"], - ["hos", "hosce"], - ["i", "ii"], - ["i", "j"], - ["um", "im"], - ["lagr", "lagl"], - ["mb", "nb"], - ["ll", "nl"], - ["mm", "nm"], - ["mp", "np"], - ["mp", "ndup"], - ["rr", "nr"], - ["um", "om"], - ["u", "v"], - ["u", "y"], - ["uu", "w"], - ["ulc", "ulch"], - ["uul", "uol"], - ["ui", "uui"], - ["uum", "uom"], - ["x", "xs"], -] WILDCARD_SINGLE_CHAR = settings.WILDCARD_SINGLE_CHAR WILDCARD_MANY_CHAR = settings.WILDCARD_MANY_CHAR @@ -99,54 +62,33 @@ class Term: """ def __init__(self, keywords): - self.cleaned_number = 1 - self.folded_number = 1 - # Remove all punctuation except wildcard characers - self.keywords = PUNCTUATION_RE.sub("", keywords).lower() - # Using regex for everything doesn't seem to have a big impact # But replace this line with the alternative code if you want to # only use regex for search terms containing wildcards - self.lookup = "regex" + self.lookup = "iregex" # # If wildcard characters appear in keywords, use regex lookup - # if any([char in self.keywords for char in CTRL_CHARS]): - # self.lookup = "regex" + # if any([char in keywords for char in CTRL_CHARS]): + # self.lookup = "iregex" # else: - # self.lookup = "contains" + # self.lookup = "icontains" - # The basic function query function will first eliminate html less than - # and greater than character codes, then punctuation, - # and lowercase the 'haystack' strings to be searched. - self.basic_query = lambda q: Lower( - Func( - Func( - q, - Value("&[gl]t;"), - Value(""), - Value("g"), - function="regexp_replace", - ), - Value(PUNCTUATION), - Value(""), - function="translate", - ) - ) - self.query = self.basic_query - # Now we call add_fold repeatedly to add more - # folds to self.query - k = self.keywords - for fold_to, fold_from in rard_folds: - if fold_from in k: - k = k.replace(fold_from, fold_to) - self.add_fold(fold_from, fold_to) - elif fold_to in k: - self.add_fold(fold_from, fold_to) - self.folded_keywords = k - self.folded_matcher = self.get_matcher(k) + # Remove all punctuation except wildcard characers + keyword_string = PUNCTUATION_RE.sub("", keywords).lower() + self.keywords = self.get_keywords(keyword_string) + + self.folded_keywords = [ + fold_latin(keyword) + for keyword in self.keywords + ] + + if self.lookup.endswith("regex"): + self.keywords = self.transform_keywords_to_regex(self.keywords) + self.folded_keywords = self.transform_keywords_to_regex(self.folded_keywords) + + self.folded_matcher = self.get_matcher(self.folded_keywords) self.nonfolded_matcher = self.get_matcher(self.keywords) - def get_matcher(self, keywords): - keyword_list = self.get_keywords(keywords) + def get_matcher(self, keyword_list: Iterable[str]): if len(keyword_list) == 0: # want a keyword that will always succeed first_keyword = "" @@ -164,12 +106,6 @@ def matcher(field): def add_keyword(self, old, keyword): return lambda f: Q(**{f: keyword}) & old(f) - def add_fold(self, fold_from, fold_to): - old = self.query - self.query = lambda q: Func( - old(q), Value(fold_from), Value(fold_to), function="replace" - ) - def get_keywords(self, search_string): """ Turns a string into a series of keywords. This is mostly splitting @@ -184,12 +120,10 @@ def get_keywords(self, search_string): 2. Captures everything inside double quotes 3. Captures individual words """ - # regex 1st alternative matches proximity wil + # regex 1st alternative matches proximity, 2nd quoted phrase, 3rd word keywords = re.findall( - r"(.+\s~\d?:?\d?\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string + r"(.+\s~\d*:?\d*\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string ) - if self.lookup == "regex": - keywords = self.transform_keywords_to_regex(keywords) return keywords def transform_keywords_to_regex(self, keywords): @@ -254,24 +188,18 @@ def do_match( self, query_set, query_string, - annotation_name, - query, matcher, - keywords, + keyword_list: Iterable[str], add_snippet=False, ): - expression = ExpressionWrapper( - query(query_string), output_field=TextField() - ) - annotated = query_set.annotate(**{annotation_name: expression}) - matches = annotated.filter(matcher(annotation_name + "__" + self.lookup)) + matches = query_set.filter(matcher(f"{query_string}__{self.lookup}")) if add_snippet: - matches = self.annotate_with_snippet(matches, keywords, query_string) + matches = self.annotate_with_snippet(matches, keyword_list, query_string) else: matches = matches.annotate(snippet=Value("")) return matches - def annotate_with_snippet(self, qs, keywords, query_string): + def annotate_with_snippet(self, qs, keyword_list: Iterable[str], query_string): return qs.annotate( snippet=Func( Func( @@ -279,7 +207,7 @@ def annotate_with_snippet(self, qs, keywords, query_string): Func( Func( query_string, - Value(self.get_snippet_regex(keywords)), + Value(self.get_snippet_regex(keyword_list)), Value( r'START_SNIPPET\1' r"\2\3...END_SNIPPET" @@ -308,43 +236,33 @@ def annotate_with_snippet(self, qs, keywords, query_string): ) ) - def get_snippet_regex(self, keywords, before=5, after=5): + def get_snippet_regex(self, keywords: Iterable[str], before=5, after=5): """This regex should give us three capturing groups we can use with postgres REGEXP_REPLACE to insert tags around our keywords; e.g. REGEXP_REPLACE('content',headline_regex,'\1 \2\3') """ - keywords = self.get_keywords(keywords) words_before_group = rf"((?:\S+\s){{0,{before}}})" keywords_group = "|".join(keywords) - keywords_group = r"(" + keywords_group + r")" + keywords_group = f"({keywords_group})" words_after_group = rf"(.?\s(?:\S+\s){{0,{after}}})" snippet_regex = words_before_group + keywords_group + words_after_group return snippet_regex def match(self, query_set, query_string, add_snippet=False): - annotation_name = "cleaned{0}".format(self.cleaned_number) - self.cleaned_number += 1 return self.do_match( query_set, query_string, - annotation_name, - self.basic_query, self.nonfolded_matcher, self.keywords, add_snippet=add_snippet, ) def match_folded(self, query_set, query_string, add_snippet=False): - annotation_name = "folded{0}".format(self.folded_number) - self.folded_number += 1 - keywords = self.folded_keywords return self.do_match( query_set, query_string, - annotation_name, - self.query, self.folded_matcher, - keywords, + self.folded_keywords, add_snippet=add_snippet, ) @@ -370,7 +288,7 @@ class SearchMethodGroup: search_types = [ ("all content", None), - ("original texts", ["original_texts__plain_content", "folded"]), + ("original texts", ["original_texts__folded_content", "folded"]), ( "translations", [ @@ -451,8 +369,7 @@ def SEARCH_METHODS(self): @classmethod def generic_content_search(cls, qs, search_fields): results = [] - for field in search_fields: - field_name, match_function = field + for field_name, match_function in search_fields: matches = match_function(qs, field_name, add_snippet=True) results.append(matches) # Remove objects from queryset once matched so they don't get matched twice @@ -508,7 +425,7 @@ def original_text_owner_search(cls, terms, qs, search_field=None): search_fields = [(search_field[0], match_function)] else: search_fields = [ - ("original_texts__plain_content", terms.match_folded), + ("original_texts__folded_content", terms.match_folded), ("original_texts__translation__plain_translated_text", terms.match), ("plain_commentary", terms.match), ("original_texts__translation__translator_name", terms.match), diff --git a/src/rard/utils/text_processors.py b/src/rard/utils/text_processors.py index a447e2681..42b6390c7 100644 --- a/src/rard/utils/text_processors.py +++ b/src/rard/utils/text_processors.py @@ -23,3 +23,59 @@ def make_plain_text(content): no_lone_numbers = re.sub(r"\s\d{1,2}\s", " ", no_punctuation) # mentions no_excess_space = re.sub(r" +", " ", no_lone_numbers) return no_excess_space + + +# Fold [X,Y] transforms all instances of Y into X before matching +# Folds are applied in the specified order, so we don't need +# 'uul' <- 'vul' if we already have 'u' <- 'v' +rard_folds = [ + ["ast", "a est"], + ["ost", "o est"], + ["umst", "um est"], + ["am", "an"], + ["ausa", "aussa"], + ["nn", "bn"], + ["tt", "bt"], + ["pp", "bp"], + ["rr", "br"], + ["ch", "cch"], + ["clu", "culu"], + ["claud", "clod"], + ["has", "hasce"], + ["his", "hisce"], + ["hos", "hosce"], + ["i", "ii"], + ["i", "j"], + ["um", "im"], + ["lagr", "lagl"], + ["mb", "nb"], + ["ll", "nl"], + ["mm", "nm"], + ["mp", "np"], + ["mp", "ndup"], + ["rr", "nr"], + ["um", "om"], + ["u", "v"], + ["u", "y"], + ["uu", "w"], + ["ulc", "ulch"], + ["uul", "uol"], + ["ui", "uui"], + ["uum", "uom"], + ["x", "xs"], +] + + +punctuation_re = re.compile( + f"(&[lg]t;)|[{re.escape(string.punctuation)}£¬]" +) + + +def fold_latin(content: str) -> str: + for fold_to, fold_from in rard_folds: + content = content.replace(fold_from, fold_to) + return content + + +def fold_latin_and_remove_punctuation(content: str) -> str: + return fold_latin(punctuation_re.sub("", content)) From 2f1cd1257244964ba083682793712c72051c1b47 Mon Sep 17 00:00:00 2001 From: Tim Band Date: Tue, 12 May 2026 16:25:09 +0100 Subject: [PATCH 2/2] pre-commit passes --- src/rard/research/models/original_text.py | 2 +- src/rard/research/views/search.py | 24 ++++++++--------------- src/rard/utils/text_processors.py | 4 +--- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/src/rard/research/models/original_text.py b/src/rard/research/models/original_text.py index 92c2c6e26..7699c7060 100644 --- a/src/rard/research/models/original_text.py +++ b/src/rard/research/models/original_text.py @@ -8,8 +8,8 @@ from rard.research.models.reference import Reference from rard.utils.basemodel import BaseModel, DynamicTextField from rard.utils.text_processors import ( - make_plain_text, fold_latin_and_remove_punctuation, + make_plain_text, ) diff --git a/src/rard/research/views/search.py b/src/rard/research/views/search.py index 4f91f81ff..31a0a8998 100644 --- a/src/rard/research/views/search.py +++ b/src/rard/research/views/search.py @@ -3,19 +3,11 @@ from functools import partial from itertools import chain from string import punctuation -from collections.abc import Iterable from typing import Any from django.conf import settings from django.contrib.auth.mixins import LoginRequiredMixin -from django.db.models import ( - Expression, - Func, - Q, - QuerySet, - TextField, - Value, -) +from django.db.models import Expression, Func, Q, QuerySet, TextField, Value from django.shortcuts import redirect from django.utils.decorators import method_decorator from django.views.decorators.http import require_GET @@ -35,7 +27,6 @@ ) from rard.utils.text_processors import fold_latin - WILDCARD_SINGLE_CHAR = settings.WILDCARD_SINGLE_CHAR WILDCARD_MANY_CHAR = settings.WILDCARD_MANY_CHAR WILDCARD_PROXIMITY_IND = "~" @@ -87,14 +78,13 @@ def __init__(self, keywords: str): keyword_string = PUNCTUATION_RE.sub("", keywords).lower() self.keywords = self.get_keywords(keyword_string) - self.folded_keywords = [ - fold_latin(keyword) - for keyword in self.keywords - ] + self.folded_keywords = [fold_latin(keyword) for keyword in self.keywords] if self.lookup.endswith("regex"): self.keywords = self.transform_keywords_to_regex(self.keywords) - self.folded_keywords = self.transform_keywords_to_regex(self.folded_keywords) + self.folded_keywords = self.transform_keywords_to_regex( + self.folded_keywords + ) self.folded_matcher = self.get_matcher(self.folded_keywords) self.nonfolded_matcher = self.get_matcher(self.keywords) @@ -235,7 +225,9 @@ def do_match( """ matches = query_set.filter(matcher(f"{query_string}__{self.lookup}")) snippet = ( - self.snippet_query(keyword_list, query_string) if add_snippet else Value("") + self.snippet_query(keyword_list, query_string) + if add_snippet + else Value("") ) matches = matches.annotate(snippet=snippet) return matches diff --git a/src/rard/utils/text_processors.py b/src/rard/utils/text_processors.py index ec8ad658c..f4228b0b2 100644 --- a/src/rard/utils/text_processors.py +++ b/src/rard/utils/text_processors.py @@ -66,9 +66,7 @@ def make_plain_text(content): ] -punctuation_re = re.compile( - f"(&[lg]t;)|[{re.escape(string.punctuation)}£¬]" -) +punctuation_re = re.compile(f"(&[lg]t;)|[{re.escape(string.punctuation)}£¬]") def fold_latin(content: str) -> str: