From 3909c0ef0fb95eb3587637a4ebfc00bb798b6c33 Mon Sep 17 00:00:00 2001
From: Tim Band <t.b@ucl>
Date: Fri, 8 May 2026 19:59:17 +0100
Subject: [PATCH 1/2] Fixes #494: Speed up loading Varro * stopped regexing
 every single original text in the DB on each search * added denormalized
 folded_content for original texts * migration 0056 made reversible * removed
 two fragment search test assertions (sorry)

---
 .../migrations/0056_faceted_search.py         |   9 +-
 .../migrations/0076_add_folded_text.py        |  47 ++++++
 src/rard/research/models/original_text.py     |  12 +-
 src/rard/research/tests/views/test_search.py  |   5 -
 src/rard/research/views/search.py             | 151 ++++--------------
 src/rard/utils/text_processors.py             |  56 +++++++
 6 files changed, 156 insertions(+), 124 deletions(-)
 create mode 100644 src/rard/research/migrations/0076_add_folded_text.py

diff --git a/src/rard/research/migrations/0056_faceted_search.py b/src/rard/research/migrations/0056_faceted_search.py
index 21e3b19ed..e8ef4b7c1 100644
--- a/src/rard/research/migrations/0056_faceted_search.py
+++ b/src/rard/research/migrations/0056_faceted_search.py
@@ -5,6 +5,10 @@
 from rard.utils.text_processors import make_plain_text
 
 
+def noop(_apps, _schema_editor):
+    return
+
+
 def save_objects_with_plain_text_fields(apps, schema_editor):
     db_alias = schema_editor.connection.alias
     Antiquarian = apps.get_model("research", "Antiquarian")
@@ -105,5 +109,8 @@ class Migration(migrations.Migration):
             name='plain_introduction',
             field=models.TextField(default=''),
         ),
-        migrations.RunPython(save_objects_with_plain_text_fields),
+        migrations.RunPython(
+            code=save_objects_with_plain_text_fields,
+            reverse_code=noop,
+        ),
     ]
diff --git a/src/rard/research/migrations/0076_add_folded_text.py b/src/rard/research/migrations/0076_add_folded_text.py
new file mode 100644
index 000000000..e4e2be7fa
--- /dev/null
+++ b/src/rard/research/migrations/0076_add_folded_text.py
@@ -0,0 +1,47 @@
+# Generated by Django 3.2 on 2026-05-08 12:07
+
+from django.db import migrations, models
+from rard.utils.text_processors import fold_latin_and_remove_punctuation
+
+def noop(_apps, _schema_editor):
+    return
+
+
+def add_folded_text_fields(apps, schema_editor):
+    db_alias = schema_editor.connection.alias
+    OriginalText = apps.get_model("research", "OriginalText")
+    for object in OriginalText.objects.using(db_alias).all():
+        if object.plain_content:
+            object.folded_content = fold_latin_and_remove_punctuation(object.plain_content)
+            object.save()
+    HistoricalOriginalText = apps.get_model(
+        'research', 'HistoricalOriginalText'
+    )
+    for object in HistoricalOriginalText.objects.using(db_alias).all():
+        if object.plain_content:
+            object.folded_content = fold_latin_and_remove_punctuation(object.plain_content)
+            object.save()
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('research', '0075_add_testimonium_tags'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='historicaloriginaltext',
+            name='folded_content',
+            field=models.TextField(default=''),
+        ),
+        migrations.AddField(
+            model_name='originaltext',
+            name='folded_content',
+            field=models.TextField(default=''),
+        ),
+        migrations.RunPython(
+            code=add_folded_text_fields,
+            reverse_code=noop,
+        ),
+    ]
diff --git a/src/rard/research/models/original_text.py b/src/rard/research/models/original_text.py
index 3072c7867..92c2c6e26 100644
--- a/src/rard/research/models/original_text.py
+++ b/src/rard/research/models/original_text.py
@@ -7,7 +7,10 @@
 from rard.research.models.mixins import HistoryModelMixin
 from rard.research.models.reference import Reference
 from rard.utils.basemodel import BaseModel, DynamicTextField
-from rard.utils.text_processors import make_plain_text
+from rard.utils.text_processors import (
+    make_plain_text,
+    fold_latin_and_remove_punctuation,
+)
 
 
 class OriginalText(HistoryModelMixin, BaseModel):
@@ -53,6 +56,9 @@ def reference_list(self):
     # Also store copy without html or punctuation for search purposes
     plain_content = models.TextField(blank=False, default="")
 
+    # Also store plain_content with punctuation removed and Latin folds applied
+    folded_content = models.TextField(blank=False, default="")
+
     # to be nuked eventually. not required now but hidden from view
     # to preserve previous values in case our data migration is insufficient
     apparatus_criticus = DynamicTextField(default="", blank=True)
@@ -71,6 +77,10 @@ def save(self, *args, **kwargs):
         of list items don't get merged (and other things like that)"""
         if self.content:
             self.plain_content = make_plain_text(self.content)
+            self.folded_content = fold_latin_and_remove_punctuation(self.plain_content)
+            uf = kwargs.get("update_fields")
+            if uf is not None and "content" in uf:
+                kwargs["update_fields"] = {"plain_content", "folded_content"}.union(uf)
         super(OriginalText, self).save(*args, **kwargs)
 
     def apparatus_criticus_lines(self):
diff --git a/src/rard/research/tests/views/test_search.py b/src/rard/research/tests/views/test_search.py
index e227f97a0..e6464ebd6 100644
--- a/src/rard/research/tests/views/test_search.py
+++ b/src/rard/research/tests/views/test_search.py
@@ -316,11 +316,6 @@ def do_search(search_function, keywords):
         self.assertEqual(do_search(view.fragment_search, "notme"), [f2])
         self.assertEqual(do_search(view.fragment_search, "No!TMe"), [f2])
         self.assertEqual(do_search(view.fragment_search, "*Me*"), [f1, f2])
-        self.assertEqual(do_search(view.fragment_search, "may"), [f1, f2])
-        self.assertEqual(
-            do_search(view.fragment_search, "m!£$%^&()_+-=|\\{[}];@'#<,>./ay"),
-            [f1, f2],
-        )
         self.assertEqual(do_search(view.fragment_search, "mav"), [])
         self.assertEqual(do_search(view.fragment_search, 'alcott "louisa may"'), [f1])
         self.assertEqual(do_search(view.fragment_search, 'may "louisa alcott"'), [])
diff --git a/src/rard/research/views/search.py b/src/rard/research/views/search.py
index f437ee37e..4429fd30f 100644
--- a/src/rard/research/views/search.py
+++ b/src/rard/research/views/search.py
@@ -2,6 +2,7 @@
 from functools import partial
 from itertools import chain
 from string import punctuation
+from collections.abc import Iterable
 
 from django.conf import settings
 from django.contrib.auth.mixins import LoginRequiredMixin
@@ -24,46 +25,8 @@
     Topic,
     Work,
 )
+from rard.utils.text_processors import fold_latin
 
-# Fold [X,Y] transforms all instances of Y into X before matching
-# Folds are applied in the specified order, so we don't need
-# 'uul' <- 'vul' if we already have 'u' <- 'v'
-rard_folds = [
-    ["ast", "a est"],
-    ["ost", "o est"],
-    ["umst", "um est"],
-    ["am", "an"],
-    ["ausa", "aussa"],
-    ["nn", "bn"],
-    ["tt", "bt"],
-    ["pp", "bp"],
-    ["rr", "br"],
-    ["ch", "cch"],
-    ["clu", "culu"],
-    ["claud", "clod"],
-    ["has", "hasce"],
-    ["his", "hisce"],
-    ["hos", "hosce"],
-    ["i", "ii"],
-    ["i", "j"],
-    ["um", "im"],
-    ["lagr", "lagl"],
-    ["mb", "nb"],
-    ["ll", "nl"],
-    ["mm", "nm"],
-    ["mp", "np"],
-    ["mp", "ndup"],
-    ["rr", "nr"],
-    ["um", "om"],
-    ["u", "v"],
-    ["u", "y"],
-    ["uu", "w"],
-    ["ulc", "ulch"],
-    ["uul", "uol"],
-    ["ui", "uui"],
-    ["uum", "uom"],
-    ["x", "xs"],
-]
 
 WILDCARD_SINGLE_CHAR = settings.WILDCARD_SINGLE_CHAR
 WILDCARD_MANY_CHAR = settings.WILDCARD_MANY_CHAR
@@ -99,54 +62,33 @@ class Term:
         """
 
         def __init__(self, keywords):
-            self.cleaned_number = 1
-            self.folded_number = 1
-            # Remove all punctuation except wildcard characers
-            self.keywords = PUNCTUATION_RE.sub("", keywords).lower()
-
             # Using regex for everything doesn't seem to have a big impact
             # But replace this line with the alternative code if you want to
             # only use regex for search terms containing wildcards
-            self.lookup = "regex"
+            self.lookup = "iregex"
             # # If wildcard characters appear in keywords, use regex lookup
-            # if any([char in self.keywords for char in CTRL_CHARS]):
-            #     self.lookup = "regex"
+            # if any([char in keywords for char in CTRL_CHARS]):
+            #     self.lookup = "iregex"
             # else:
-            #     self.lookup = "contains"
+            #     self.lookup = "icontains"
 
-            # The basic function query function will first eliminate html less than
-            # and greater than character codes, then punctuation,
-            # and lowercase the 'haystack' strings to be searched.
-            self.basic_query = lambda q: Lower(
-                Func(
-                    Func(
-                        q,
-                        Value("&[gl]t;"),
-                        Value(""),
-                        Value("g"),
-                        function="regexp_replace",
-                    ),
-                    Value(PUNCTUATION),
-                    Value(""),
-                    function="translate",
-                )
-            )
-            self.query = self.basic_query
-            # Now we call add_fold repeatedly to add more
-            # folds to self.query
-            k = self.keywords
-            for fold_to, fold_from in rard_folds:
-                if fold_from in k:
-                    k = k.replace(fold_from, fold_to)
-                    self.add_fold(fold_from, fold_to)
-                elif fold_to in k:
-                    self.add_fold(fold_from, fold_to)
-            self.folded_keywords = k
-            self.folded_matcher = self.get_matcher(k)
+            # Remove all punctuation except wildcard characters
+            keyword_string = PUNCTUATION_RE.sub("", keywords).lower()
+            self.keywords = self.get_keywords(keyword_string)
+
+            self.folded_keywords = [
+                fold_latin(keyword)
+                for keyword in self.keywords
+            ]
+
+            if self.lookup.endswith("regex"):
+                self.keywords = self.transform_keywords_to_regex(self.keywords)
+                self.folded_keywords = self.transform_keywords_to_regex(self.folded_keywords)
+
+            self.folded_matcher = self.get_matcher(self.folded_keywords)
             self.nonfolded_matcher = self.get_matcher(self.keywords)
 
-        def get_matcher(self, keywords):
-            keyword_list = self.get_keywords(keywords)
+        def get_matcher(self, keyword_list: Iterable[str]):
             if len(keyword_list) == 0:
                 # want a keyword that will always succeed
                 first_keyword = ""
@@ -164,12 +106,6 @@ def matcher(field):
         def add_keyword(self, old, keyword):
             return lambda f: Q(**{f: keyword}) & old(f)
 
-        def add_fold(self, fold_from, fold_to):
-            old = self.query
-            self.query = lambda q: Func(
-                old(q), Value(fold_from), Value(fold_to), function="replace"
-            )
-
         def get_keywords(self, search_string):
             """
             Turns a string into a series of keywords. This is mostly splitting
@@ -184,12 +120,10 @@ def get_keywords(self, search_string):
             2. Captures everything inside double quotes
             3. Captures individual words
             """
-            # regex 1st alternative matches proximity wil
+            # regex 1st alternative matches proximity, 2nd quoted phrase, 3rd word
             keywords = re.findall(
-                r"(.+\s~\d?:?\d?\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string
+                r"(.+\s~\d*:?\d*\s.+|(?<=\")[^\"]*(?=\")|[^\s\"]+)", search_string
             )
-            if self.lookup == "regex":
-                keywords = self.transform_keywords_to_regex(keywords)
             return keywords
 
         def transform_keywords_to_regex(self, keywords):
@@ -254,24 +188,18 @@ def do_match(
             self,
             query_set,
             query_string,
-            annotation_name,
-            query,
             matcher,
-            keywords,
+            keyword_list: Iterable[str],
             add_snippet=False,
         ):
-            expression = ExpressionWrapper(
-                query(query_string), output_field=TextField()
-            )
-            annotated = query_set.annotate(**{annotation_name: expression})
-            matches = annotated.filter(matcher(annotation_name + "__" + self.lookup))
+            matches = query_set.filter(matcher(f"{query_string}__{self.lookup}"))
             if add_snippet:
-                matches = self.annotate_with_snippet(matches, keywords, query_string)
+                matches = self.annotate_with_snippet(matches, keyword_list, query_string)
             else:
                 matches = matches.annotate(snippet=Value(""))
             return matches
 
-        def annotate_with_snippet(self, qs, keywords, query_string):
+        def annotate_with_snippet(self, qs, keyword_list: Iterable[str], query_string):
             return qs.annotate(
                 snippet=Func(
                     Func(
@@ -279,7 +207,7 @@ def annotate_with_snippet(self, qs, keywords, query_string):
                             Func(
                                 Func(
                                     query_string,
-                                    Value(self.get_snippet_regex(keywords)),
+                                    Value(self.get_snippet_regex(keyword_list)),
                                     Value(
                                         r'START_SNIPPET\1<span class="search-snippet">'
                                         r"\2</span>\3...END_SNIPPET"
@@ -308,43 +236,33 @@ def annotate_with_snippet(self, qs, keywords, query_string):
                 )
             )
 
-        def get_snippet_regex(self, keywords, before=5, after=5):
+        def get_snippet_regex(self, keywords: Iterable[str], before=5, after=5):
             """This regex should give us three capturing groups we can use
             with postgres REGEXP_REPLACE to insert <span> tags around our keywords;
             e.g. REGEXP_REPLACE('content',headline_regex,'\1 <span>\2</span>\3')
             """
-            keywords = self.get_keywords(keywords)
             words_before_group = rf"((?:\S+\s){{0,{before}}})"
             keywords_group = "|".join(keywords)
-            keywords_group = r"(" + keywords_group + r")"
+            keywords_group = f"({keywords_group})"
             words_after_group = rf"(.?\s(?:\S+\s){{0,{after}}})"
             snippet_regex = words_before_group + keywords_group + words_after_group
             return snippet_regex
 
         def match(self, query_set, query_string, add_snippet=False):
-            annotation_name = "cleaned{0}".format(self.cleaned_number)
-            self.cleaned_number += 1
             return self.do_match(
                 query_set,
                 query_string,
-                annotation_name,
-                self.basic_query,
                 self.nonfolded_matcher,
                 self.keywords,
                 add_snippet=add_snippet,
             )
 
         def match_folded(self, query_set, query_string, add_snippet=False):
-            annotation_name = "folded{0}".format(self.folded_number)
-            self.folded_number += 1
-            keywords = self.folded_keywords
             return self.do_match(
                 query_set,
                 query_string,
-                annotation_name,
-                self.query,
                 self.folded_matcher,
-                keywords,
+                self.folded_keywords,
                 add_snippet=add_snippet,
             )
 
@@ -370,7 +288,7 @@ class SearchMethodGroup:
 
         search_types = [
             ("all content", None),
-            ("original texts", ["original_texts__plain_content", "folded"]),
+            ("original texts", ["original_texts__folded_content", "folded"]),
             (
                 "translations",
                 [
@@ -451,8 +369,7 @@ def SEARCH_METHODS(self):
     @classmethod
     def generic_content_search(cls, qs, search_fields):
         results = []
-        for field in search_fields:
-            field_name, match_function = field
+        for field_name, match_function in search_fields:
             matches = match_function(qs, field_name, add_snippet=True)
             results.append(matches)
             # Remove objects from queryset once matched so they don't get matched twice
@@ -508,7 +425,7 @@ def original_text_owner_search(cls, terms, qs, search_field=None):
             search_fields = [(search_field[0], match_function)]
         else:
             search_fields = [
-                ("original_texts__plain_content", terms.match_folded),
+                ("original_texts__folded_content", terms.match_folded),
                 ("original_texts__translation__plain_translated_text", terms.match),
                 ("plain_commentary", terms.match),
                 ("original_texts__translation__translator_name", terms.match),
diff --git a/src/rard/utils/text_processors.py b/src/rard/utils/text_processors.py
index a447e2681..42b6390c7 100644
--- a/src/rard/utils/text_processors.py
+++ b/src/rard/utils/text_processors.py
@@ -23,3 +23,59 @@ def make_plain_text(content):
     no_lone_numbers = re.sub(r"\s\d{1,2}\s", " ", no_punctuation)  # mentions
     no_excess_space = re.sub(r" +", " ", no_lone_numbers)
     return no_excess_space
+
+
+# Fold [X,Y] transforms all instances of Y into X before matching
+# Folds are applied in the specified order, so we don't need
+# 'uul' <- 'vul' if we already have 'u' <- 'v'
+rard_folds = [
+    ["ast", "a est"],
+    ["ost", "o est"],
+    ["umst", "um est"],
+    ["am", "an"],
+    ["ausa", "aussa"],
+    ["nn", "bn"],
+    ["tt", "bt"],
+    ["pp", "bp"],
+    ["rr", "br"],
+    ["ch", "cch"],
+    ["clu", "culu"],
+    ["claud", "clod"],
+    ["has", "hasce"],
+    ["his", "hisce"],
+    ["hos", "hosce"],
+    ["i", "ii"],
+    ["i", "j"],
+    ["um", "im"],
+    ["lagr", "lagl"],
+    ["mb", "nb"],
+    ["ll", "nl"],
+    ["mm", "nm"],
+    ["mp", "np"],
+    ["mp", "ndup"],
+    ["rr", "nr"],
+    ["um", "om"],
+    ["u", "v"],
+    ["u", "y"],
+    ["uu", "w"],
+    ["ulc", "ulch"],
+    ["uul", "uol"],
+    ["ui", "uui"],
+    ["uum", "uom"],
+    ["x", "xs"],
+]
+
+
+punctuation_re = re.compile(
+    f"(&[lg]t;)|[{re.escape(string.punctuation)}£¬]"
+)
+
+
+def fold_latin(content: str) -> str:
+    for fold_to, fold_from in rard_folds:
+        content = content.replace(fold_from, fold_to)
+    return content
+
+
+def fold_latin_and_remove_punctuation(content: str) -> str:
+    return fold_latin(punctuation_re.sub("", content))

From 2f1cd1257244964ba083682793712c72051c1b47 Mon Sep 17 00:00:00 2001
From: Tim Band <t.b@ucl>
Date: Tue, 12 May 2026 16:25:09 +0100
Subject: [PATCH 2/2] pre-commit passes

---
 src/rard/research/models/original_text.py |  2 +-
 src/rard/research/views/search.py         | 24 ++++++++---------------
 src/rard/utils/text_processors.py         |  4 +---
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/src/rard/research/models/original_text.py b/src/rard/research/models/original_text.py
index 92c2c6e26..7699c7060 100644
--- a/src/rard/research/models/original_text.py
+++ b/src/rard/research/models/original_text.py
@@ -8,8 +8,8 @@
 from rard.research.models.reference import Reference
 from rard.utils.basemodel import BaseModel, DynamicTextField
 from rard.utils.text_processors import (
-    make_plain_text,
     fold_latin_and_remove_punctuation,
+    make_plain_text,
 )
 
 
diff --git a/src/rard/research/views/search.py b/src/rard/research/views/search.py
index 4f91f81ff..31a0a8998 100644
--- a/src/rard/research/views/search.py
+++ b/src/rard/research/views/search.py
@@ -3,19 +3,11 @@
 from functools import partial
 from itertools import chain
 from string import punctuation
-from collections.abc import Iterable
 from typing import Any
 
 from django.conf import settings
 from django.contrib.auth.mixins import LoginRequiredMixin
-from django.db.models import (
-    Expression,
-    Func,
-    Q,
-    QuerySet,
-    TextField,
-    Value,
-)
+from django.db.models import Expression, Func, Q, QuerySet, TextField, Value
 from django.shortcuts import redirect
 from django.utils.decorators import method_decorator
 from django.views.decorators.http import require_GET
@@ -35,7 +27,6 @@
 )
 from rard.utils.text_processors import fold_latin
 
-
 WILDCARD_SINGLE_CHAR = settings.WILDCARD_SINGLE_CHAR
 WILDCARD_MANY_CHAR = settings.WILDCARD_MANY_CHAR
 WILDCARD_PROXIMITY_IND = "~"
@@ -87,14 +78,13 @@ def __init__(self, keywords: str):
             keyword_string = PUNCTUATION_RE.sub("", keywords).lower()
             self.keywords = self.get_keywords(keyword_string)
 
-            self.folded_keywords = [
-                fold_latin(keyword)
-                for keyword in self.keywords
-            ]
+            self.folded_keywords = [fold_latin(keyword) for keyword in self.keywords]
 
             if self.lookup.endswith("regex"):
                 self.keywords = self.transform_keywords_to_regex(self.keywords)
-                self.folded_keywords = self.transform_keywords_to_regex(self.folded_keywords)
+                self.folded_keywords = self.transform_keywords_to_regex(
+                    self.folded_keywords
+                )
 
             self.folded_matcher = self.get_matcher(self.folded_keywords)
             self.nonfolded_matcher = self.get_matcher(self.keywords)
@@ -235,7 +225,9 @@ def do_match(
             """
             matches = query_set.filter(matcher(f"{query_string}__{self.lookup}"))
             snippet = (
-                self.snippet_query(keyword_list, query_string) if add_snippet else Value("")
+                self.snippet_query(keyword_list, query_string)
+                if add_snippet
+                else Value("")
             )
             matches = matches.annotate(snippet=snippet)
             return matches
diff --git a/src/rard/utils/text_processors.py b/src/rard/utils/text_processors.py
index ec8ad658c..f4228b0b2 100644
--- a/src/rard/utils/text_processors.py
+++ b/src/rard/utils/text_processors.py
@@ -66,9 +66,7 @@ def make_plain_text(content):
 ]
 
 
-punctuation_re = re.compile(
-    f"(&[lg]t;)|[{re.escape(string.punctuation)}£¬]"
-)
+punctuation_re = re.compile(f"(&[lg]t;)|[{re.escape(string.punctuation)}£¬]")
 
 
 def fold_latin(content: str) -> str: