From 6b842fadd526c9205a24ba20ccef1c5e580e9754 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Thu, 26 Mar 2026 14:51:39 +0100 Subject: [PATCH 1/4] Add (failing) serialization roundtrip tests --- test/test_page.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/test/test_page.py b/test/test_page.py index c5f4fd9..23745e0 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -208,6 +208,11 @@ def test_textline_words(tl: TextLine) -> None: assert tl.words() == tl.text.split() +def test_textline_serialization_roundtrip() -> None: + tl = TextLine(id="tl-id", coords=Coords.parse("1,2 3,4"), text="foo bar") + assert TextLine.from_dict(tl.to_dict()) == tl + + ####### Tests for TextRegion ############### @@ -354,6 +359,17 @@ def test_textregion_all_arbitrary_text_and_words(region: TextRegion) -> None: ] +def test_textregion_serialization_roundtrip() -> None: + tr = TextRegion( + id="tr-id", + coords=Coords.parse("1,2 3,4"), + textlines={ + "tl-1": TextLine(id="tl-1", coords=Coords.parse("1,2 3,4"), text="foo") + }, + ) + assert TextRegion.from_dict(tr.to_dict()) == tr + + ############### Tests for Page #################### @@ -787,3 +803,21 @@ def test_page_all_arbitrary_text_and_words(page: Page) -> None: assert list(page.all_words()) == [ w for r in page.regions.values() for w in r.all_words() ] + + +def test_page_serialization_roundtrip() -> None: + pa = Page( + image_filename="a.jpg", + regions={ + "tr-1": TextRegion( + id="tr-1", + coords=Coords.parse("1,2 3,4"), + textlines={ + "tl-1": TextLine( + id="tl-1", coords=Coords.parse("1,2 3,4"), text="foo" + ) + }, + ) + }, + ) + assert Page.from_dict(pa.to_dict()) == pa From 32ed4ebc67265642b18d4ea217abf37d5e4b83a5 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Thu, 26 Mar 2026 15:04:23 +0100 Subject: [PATCH 2/4] Remove ID type alias --- pygexml/page.py | 17 +++++++---------- test/test_page.py | 4 ++-- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pygexml/page.py b/pygexml/page.py index e95c57b..3608851 100644 --- a/pygexml/page.py +++ b/pygexml/page.py @@ -83,12 +83,9 @@ def __str__(self) -> str: return " ".join(str(p) for p in self.polygon.points) -type ID = str - - @dataclass class TextLine(DataClassJsonMixin): - id: ID + id: str coords: Coords text: str @@ -155,9 +152,9 @@ def words(self) -> Iterable[str]: @dataclass class TextRegion(DataClassJsonMixin): - id: ID + id: str coords: Coords - textlines: dict[ID, TextLine] + textlines: dict[str, TextLine] @classmethod def from_xml(cls, element: Element) -> "TextRegion": @@ -200,7 +197,7 @@ def from_alto(cls, element: Element) -> "TextRegion": ) ) - textlines: dict[ID, TextLine] = {} + textlines: dict[str, TextLine] = {} for child in element: if QName(child).localname == "TextLine": tl = TextLine.from_alto(child) @@ -213,7 +210,7 @@ def from_alto(cls, element: Element) -> "TextRegion": id=str(element.attrib["ID"]), coords=coords, textlines=textlines ) - def lookup_textline(self, id: ID) -> TextLine | None: + def lookup_textline(self, id: str) -> TextLine | None: return self.textlines.get(id) def all_text(self) -> Iterable[str]: @@ -226,7 +223,7 @@ def all_words(self) -> Iterable[str]: @dataclass class Page(DataClassJsonMixin): image_filename: str - regions: dict[ID, TextRegion] + regions: dict[str, TextRegion] @classmethod def from_xml(cls, element: Element) -> "Page": @@ -307,7 +304,7 @@ def from_alto_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page": xml_string = path.read_text(encoding=encoding) return Page.from_alto_string(xml_string) - def lookup_region(self, id: ID) -> TextRegion | None: + def lookup_region(self, id: str) -> TextRegion | None: return self.regions.get(id) def all_text(self) -> Iterable[str]: diff --git a/test/test_page.py b/test/test_page.py index 23745e0..e2de1ab 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -8,7 +8,7 @@ from pygexml.strategies import * from pygexml.geometry import Point, Box, Polygon -from pygexml.page import Coords, ID, TextLine, TextRegion, Page +from pygexml.page import Coords, TextLine, TextRegion, Page ############## Tests for Coords #################### @@ -333,7 +333,7 @@ def test_textregion_line_lookup(line: TextLine, region: TextRegion) -> None: @given(st.text(), st_text_regions) -def test_textregion_line_lookup_not_found(id: ID, region: TextRegion) -> None: +def test_textregion_line_lookup_not_found(id: str, region: TextRegion) -> None: assume(not id in region.textlines) assert region.lookup_textline(id) is None From d9a2cb4cdaad860aec971557127ed7ef3adac41c Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Thu, 26 Mar 2026 15:19:29 +0100 Subject: [PATCH 3/4] Revert "Remove ID type alias" This reverts commit 32ed4ebc67265642b18d4ea217abf37d5e4b83a5. --- pygexml/page.py | 17 ++++++++++------- test/test_page.py | 4 ++-- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/pygexml/page.py b/pygexml/page.py index 3608851..e95c57b 100644 --- a/pygexml/page.py +++ b/pygexml/page.py @@ -83,9 +83,12 @@ def __str__(self) -> str: return " ".join(str(p) for p in self.polygon.points) +type ID = str + + @dataclass class TextLine(DataClassJsonMixin): - id: str + id: ID coords: Coords text: str @@ -152,9 +155,9 @@ def words(self) -> Iterable[str]: @dataclass class TextRegion(DataClassJsonMixin): - id: str + id: ID coords: Coords - textlines: dict[str, TextLine] + textlines: dict[ID, TextLine] @classmethod def from_xml(cls, element: Element) -> "TextRegion": @@ -197,7 +200,7 @@ def from_alto(cls, element: Element) -> "TextRegion": ) ) - textlines: dict[str, TextLine] = {} + textlines: dict[ID, TextLine] = {} for child in element: if QName(child).localname == "TextLine": tl = TextLine.from_alto(child) @@ -210,7 +213,7 @@ def from_alto(cls, element: Element) -> "TextRegion": id=str(element.attrib["ID"]), coords=coords, textlines=textlines ) - def lookup_textline(self, id: str) -> TextLine | None: + def lookup_textline(self, id: ID) -> TextLine | None: return self.textlines.get(id) def all_text(self) -> Iterable[str]: @@ -223,7 +226,7 @@ def all_words(self) -> Iterable[str]: @dataclass class Page(DataClassJsonMixin): image_filename: str - regions: dict[str, TextRegion] + regions: dict[ID, TextRegion] @classmethod def from_xml(cls, element: Element) -> "Page": @@ -304,7 +307,7 @@ def from_alto_file(cls, file: Path | str, encoding: str = "utf-8") -> "Page": xml_string = path.read_text(encoding=encoding) return Page.from_alto_string(xml_string) - def lookup_region(self, id: str) -> TextRegion | None: + def lookup_region(self, id: ID) -> TextRegion | None: return self.regions.get(id) def all_text(self) -> Iterable[str]: diff --git a/test/test_page.py b/test/test_page.py index e2de1ab..23745e0 100644 --- a/test/test_page.py +++ b/test/test_page.py @@ -8,7 +8,7 @@ from pygexml.strategies import * from pygexml.geometry import Point, Box, Polygon -from pygexml.page import Coords, TextLine, TextRegion, Page +from pygexml.page import Coords, ID, TextLine, TextRegion, Page ############## Tests for Coords #################### @@ -333,7 +333,7 @@ def test_textregion_line_lookup(line: TextLine, region: TextRegion) -> None: @given(st.text(), st_text_regions) -def test_textregion_line_lookup_not_found(id: str, region: TextRegion) -> None: +def test_textregion_line_lookup_not_found(id: ID, region: TextRegion) -> None: assume(not id in region.textlines) assert region.lookup_textline(id) is None From f017bd41971a6043a2841925e778c0f52dc29768 Mon Sep 17 00:00:00 2001 From: Mirko Westermeier Date: Thu, 26 Mar 2026 15:24:08 +0100 Subject: [PATCH 4/4] Change the ID type alias to pre-PEP695 --- pygexml/page.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pygexml/page.py b/pygexml/page.py index e95c57b..116b1ce 100644 --- a/pygexml/page.py +++ b/pygexml/page.py @@ -3,7 +3,7 @@ from warnings import warn from dataclasses import dataclass from dataclasses_json import DataClassJsonMixin -from typing import ClassVar +from typing import ClassVar, TypeAlias from collections.abc import Iterable from lxml import etree from lxml.etree import _Element as Element, QName @@ -83,7 +83,7 @@ def __str__(self) -> str: return " ".join(str(p) for p in self.polygon.points) -type ID = str +ID: TypeAlias = str @dataclass