-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathformalization.tex
More file actions
167 lines (132 loc) · 11 KB
/
formalization.tex
File metadata and controls
167 lines (132 loc) · 11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
\documentclass{article}
\usepackage{amsmath, amssymb, amsthm}
\usepackage{mathpazo}
\newtheorem{theorem}{Theorem}
\newtheorem{lemma}{Lemma}
\newtheorem{definition}{Definition}
\begin{document}
\section*{Formalization: jq $\to$ SQL Translation Correctness}
We formalize a translation from jq-style queries on JSON records to SQL queries on a columnar relational store, and prove that the translation preserves meaning. The development is mechanized in Lean~4 (\texttt{Main.lean}); Lean names are given in \texttt{typewriter} where they match.
\subsection*{1. Data}
A user has three fields: a name, an age, and a role drawn from $\mathrm{Role} = \{\mathsf{student}, \mathsf{employee}, \mathsf{retired}\}$. The same user is represented two ways:
\begin{itemize}
\item \textbf{JSON} (\texttt{Juser}): a record $\langle\mathrm{name} : \mathbb{S},\ \mathrm{age} : \mathbb{N},\ \mathrm{role} : \mathrm{Role}\rangle$, where $\mathbb{S}$ is the set of strings. A JSON database is a list, $\mathcal{JD} = \mathrm{List}\,\mathcal{J}$ (\texttt{JDB}).
\item \textbf{SQL} (\texttt{Suser}): a tuple in $\mathcal{S} = \mathbb{S} \times \mathbb{N} \times \mathrm{Role}$. An SQL database $SD \in \mathcal{SD}$ (\texttt{SDB}) is a triple of parallel column lists $(\mathrm{names}, \mathrm{ages}, \mathrm{roles})$, with no constraint that the columns have equal length.
\end{itemize}
The conversions $\mathrm{toS} : \mathcal{J} \to \mathcal{S}$ and $\mathrm{toJ} : \mathcal{S} \to \mathcal{J}$ form an isomorphism: $\mathrm{toJ} \circ \mathrm{toS} = \mathrm{id}_{\mathcal{J}}$ and $\mathrm{toS} \circ \mathrm{toJ} = \mathrm{id}_{\mathcal{S}}$.
\begin{definition}[Row projection]
\label{def:torows}
The columnar layout is unfolded into a row list by a 3-way zip:
\[
\mathrm{toRows}(SD) \;\triangleq\; \mathrm{zip}_3(SD.\mathrm{names},\, SD.\mathrm{ages},\, SD.\mathrm{roles}).
\]
The zip terminates at the shortest column, so only triples with all three components contribute rows. All semantics on $\mathcal{SD}$ are mediated through $\mathrm{toRows}$.
\end{definition}
\subsection*{2. Schema, Conditions, and Updates}
The schema is parameterised by a column tag $\mathrm{Col} = \{\mathsf{name}, \mathsf{age}, \mathsf{role}, \mathsf{all}\}$ (with $\mathsf{all}$ as a wildcard) and a tagged value type $\mathrm{Value} = \mathbb{N} \mathbin{\dot\cup} \mathbb{S} \mathbin{\dot\cup} \mathrm{Role}$. Comparison operators range over $\mathrm{Op} = \{=, >, <, \geq, \leq\}$.
\begin{definition}[Conditions]
A condition is generated by the grammar
\[
C \;::=\; \mathsf{always} \;\mid\; \mathsf{cmp}(\mathrm{col}, \mathrm{op}, v) \;\mid\; C \land C \;\mid\; C \lor C,
\]
and interpreted as Boolean-valued functions $\mathrm{eval}_J : \mathrm{Cond} \times \mathcal{J} \to \mathbb{B}$ and $\mathrm{eval}_S : \mathrm{Cond} \times \mathcal{S} \to \mathbb{B}$ by structural recursion. Comparisons between values of distinct payload tags evaluate to $\mathsf{false}$.
\end{definition}
\begin{lemma}[Condition bridge]
\label{lem:cond-bridge}
For every condition $C$ and every $j \in \mathcal{J}$,
\[
\mathrm{eval}_J(C, j) \;=\; \mathrm{eval}_S(C, \mathrm{toS}(j)).
\]
The dual statement on $\mathcal{S}$ holds via $\mathrm{toJ}$.
\end{lemma}
The proof is by induction on $C$, with the atomic case using the column-projection bridge $\mathrm{getVal}_J(\mathrm{col}, j) = \mathrm{getVal}_S(\mathrm{col}, \mathrm{toS}(j))$.
\paragraph{Per-row update.}
The unconditional updater $\mathrm{applyUpdate}_J(\mathrm{col}, v, j)$ writes $v$ into the column \emph{only when the value's payload tag matches the column's type}; otherwise the row passes through unchanged. The conditional version $\mathrm{applyUpdateIf}_J(\mathrm{col}, v, C, j)$ updates iff $\mathrm{eval}_J(C, j) = \mathsf{true}$, with an analogous pair on $\mathcal{S}$.
\begin{lemma}[Update bridge]
\label{lem:update-bridge}
For every $\mathrm{col}, v, C$, and $j \in \mathcal{J}$,
\[
\mathrm{applyUpdateIf}_S(\mathrm{col}, v, C, \mathrm{toS}(j)) \;=\; \mathrm{toS}\big(\mathrm{applyUpdateIf}_J(\mathrm{col}, v, C, j)\big).
\]
\end{lemma}
\subsection*{3. Equivalence}
\begin{definition}[Database equivalence]
\label{def:equiv}
For $JD \in \mathcal{JD}$ and $SD \in \mathcal{SD}$, we write $JD \equiv SD$ when $JD\,\textsf{.map}\,\mathrm{toS}$ is a permutation of $\mathrm{toRows}(SD)$:
\[
JD \equiv SD \;\triangleq\; JD\,\textsf{.map}\,\mathrm{toS} \;\sim\; \mathrm{toRows}(SD).
\]
\end{definition}
Permutation is the appropriate level of granularity for relating a JSON database to a columnar one: row order is irrelevant to the semantics of the considered queries, but multiplicities matter. A weaker notion that ignored multiplicities (set membership) would fail to support queries that observe cardinality (\textsf{length}/\textsf{count}) or that act pointwise on duplicates (\textsf{modify}/\textsf{update}); a stronger notion (list equality) would refuse legitimate row reorderings.
\subsection*{4. Query Languages and Translation}
The two languages each have six constructors. The translation $T$ ($\texttt{jquery\_to\_squery}$) is structure-preserving; note the asymmetric naming for the aggregate (jq's operator is \textsf{length}, SQL's is \textsf{count}).
\begin{center}
\begin{tabular}{lll}
\hline
jq ($\mathcal{Q}_{jq}$) & SQL ($\mathcal{Q}_{sql}$) & Result \\
\hline
$\mathsf{find}(\mathrm{col}, C)$ & $\mathsf{select}(\mathrm{col}, C)$ & database \\
$\mathsf{drop}(C)$ & $\mathsf{delete}(C)$ & database \\
$\mathsf{prepend}(j)$ & $\mathsf{insert}(\mathrm{toS}(j))$ & database \\
$\mathsf{clear}$ & $\mathsf{truncate}$ & database \\
$\mathsf{length}$ & $\mathsf{count}$ & $\mathbb{N}$ \\
$\mathsf{modify}(\mathrm{col}, v, C)$ & $\mathsf{update}(\mathrm{col}, v, C)$ & database \\
\hline
\end{tabular}
\end{center}
The two evaluators implement these clauses as expected: \textsf{find}/\textsf{select} filter; \textsf{drop}/\textsf{delete} filter the negation; \textsf{prepend}/\textsf{insert} cons at the head; \textsf{clear}/\textsf{truncate} return empty; \textsf{length}/\textsf{count} return the list length; \textsf{modify}/\textsf{update} map a per-row updater. On the SQL side, every database-returning query first decomposes the columns into a row list via $\mathrm{toRows}$, then transforms the rows, then rebuilds three columns.
The result equivalence $\equiv_R$ dispatches by tag: two database results compare under $\equiv$, two scalar results compare with $=$, and a tag mismatch is $\bot$.
\subsection*{5. Bridge and Round-Trip Lemmas}
Two bridge lemmas lift the per-row correspondences (Lemmas~\ref{lem:cond-bridge} and \ref{lem:update-bridge}) through $\textsf{.filter}$ and $\textsf{.map}$, so that the JSON-side and SQL-side transformations align after applying $\mathrm{toS}$:
\begin{lemma}[Filter bridges]
\label{lem:filter-bridge}
For every $JD \in \mathcal{JD}$ and condition $C$,
\begin{align*}
(JD\,\textsf{.filter}\,\mathrm{eval}_J(C, \cdot))\,\textsf{.map}\,\mathrm{toS}
&\;=\; (JD\,\textsf{.map}\,\mathrm{toS})\,\textsf{.filter}\,\mathrm{eval}_S(C, \cdot), \\
(JD\,\textsf{.filter}\,(\lnot \mathrm{eval}_J(C, \cdot)))\,\textsf{.map}\,\mathrm{toS}
&\;=\; (JD\,\textsf{.map}\,\mathrm{toS})\,\textsf{.filter}\,(\lnot \mathrm{eval}_S(C, \cdot)).
\end{align*}
\end{lemma}
\begin{lemma}[Map-update bridge]
\label{lem:map-update}
For every $JD$, $\mathrm{col}, v, C$,
\[
(JD\,\textsf{.map}\,\mathrm{applyUpdateIf}_J(\mathrm{col}, v, C))\,\textsf{.map}\,\mathrm{toS}
\;=\; (JD\,\textsf{.map}\,\mathrm{toS})\,\textsf{.map}\,\mathrm{applyUpdateIf}_S(\mathrm{col}, v, C).
\]
\end{lemma}
The decompose-transform-rebuild pattern on the SQL side is invisible to $\mathrm{toRows}$:
\begin{lemma}[Round-trips]
\label{lem:roundtrip}
For every $SD$, predicate $p : \mathcal{S} \to \mathbb{B}$, function $f : \mathcal{S} \to \mathcal{S}$, and $s \in \mathcal{S}$:
\begin{align*}
\mathrm{toRows}\big(\mathrm{rebuild}(\mathrm{filter}(p, \mathrm{toRows}(SD)))\big) &\;=\; \mathrm{filter}(p, \mathrm{toRows}(SD)), \\
\mathrm{toRows}\big(\mathrm{rebuild}(\mathrm{toRows}(SD)\,\textsf{.map}\,f)\big) &\;=\; \mathrm{toRows}(SD)\,\textsf{.map}\,f, \\
\mathrm{toRows}\big(s\ \text{prepended to}\ SD\big) &\;=\; s :: \mathrm{toRows}(SD).
\end{align*}
\end{lemma}
All three reduce to the identity $\mathrm{zip}_3(\ell\,\textsf{.map}\,\pi_1, \ell\,\textsf{.map}\,\pi_{2,1}, \ell\,\textsf{.map}\,\pi_{2,2}) = \ell$, by induction on $\ell$.
\subsection*{6. Main Theorem}
\begin{theorem}[Translation correctness]
\label{thm:main}
For all $JD \in \mathcal{JD}$, $SD \in \mathcal{SD}$, and $q \in \mathcal{Q}_{jq}$,
\[
JD \equiv SD \;\implies\; \mathrm{eval}_{jq}(JD, q) \;\equiv_R\; \mathrm{eval}_{sql}(SD, T(q)).
\]
\end{theorem}
\begin{proof}[Proof sketch]
Case analysis on $q$. Each database-returning case rewrites the goal into a permutation between two row lists, using a bridge lemma (\ref{lem:filter-bridge} or \ref{lem:map-update}) on the JSON side and a round-trip (\ref{lem:roundtrip}) on the SQL side; the goal is then closed by the corresponding $\mathrm{List.Perm}$ congruence applied to the hypothesis $JD \equiv SD$:
\begin{itemize}
\item \emph{\textsf{find}, \textsf{drop}}: rewrite via Lemma~\ref{lem:filter-bridge} and the filter round-trip, then apply $\mathrm{List.Perm.filter}$ to the hypothesis.
\item \emph{\textsf{prepend}}: rewrite via $\textsf{.map}$ on a cons and the insert round-trip, then apply $\mathrm{List.Perm.cons}$.
\item \emph{\textsf{clear}}: both sides reduce to $[\,]$.
\item \emph{\textsf{length}}: scalar case. Length is preserved by $\textsf{.map}$ and by permutation, so $|JD| = |JD\,\textsf{.map}\,\mathrm{toS}| = |\mathrm{toRows}(SD)|$.
\item \emph{\textsf{modify}}: rewrite via Lemma~\ref{lem:map-update} and the map round-trip, then apply $\mathrm{List.Perm.map}$.
\end{itemize}
The proof has the same shape for every database-returning case: \emph{lift the per-row bridge through $\textsf{.map}$, collapse the SQL rebuild via a round-trip, transport the input permutation through the appropriate $\mathrm{List.Perm}$ congruence}.
\end{proof}
\subsection*{7. Mechanization}
Definitions and lemmas correspond to Lean names in \texttt{Main.lean}: $\mathrm{toRows}$ is \texttt{SDB.toRows}; $\mathrm{eval}_J$ and $\mathrm{eval}_S$ are \texttt{Cond.evalJ} and \texttt{Cond.evalS}; the per-row bridges are \texttt{eval\_bridgeJ}, \texttt{eval\_bridgeS}, and \texttt{applyUpdateIf\_bridge}; equivalence is \texttt{equiv}, with the result-level lift \texttt{result\_equiv}; the lifted bridges of Section~5 are \texttt{filter\_eval\_bridge}, \texttt{filter\_neg\_eval\_bridge}, and \texttt{map\_update\_bridge}; the round-trips are \texttt{toRows\_filter\_reconstruct}, \texttt{toRows\_map\_reconstruct}, and \texttt{toRows\_insert}, all reducing to \texttt{zip3\_map\_components}; the main theorem is \texttt{query\_equiv}.
A complementary suite of property-based tests in \texttt{Test.lean} uses Plausible to randomly generate inputs, and a deliberately-bugged variant in \texttt{Error.lean} exhibits minimal counterexamples on which the tests fail.
\end{document}