From 13ae501d2ad31fb23a569b69ba6020bf35dfc764 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sat, 31 Jan 2026 22:25:05 +0100 Subject: [PATCH 01/21] feat: repr module --- libs/@local/hashql/core/src/lib.rs | 1 + libs/@local/hashql/core/src/symbol/mod.rs | 1 + libs/@local/hashql/core/src/symbol/repr.rs | 441 +++++++++++++++++++++ 3 files changed, 443 insertions(+) create mode 100644 libs/@local/hashql/core/src/symbol/repr.rs diff --git a/libs/@local/hashql/core/src/lib.rs b/libs/@local/hashql/core/src/lib.rs index 64671748fd5..e99cb0d6065 100644 --- a/libs/@local/hashql/core/src/lib.rs +++ b/libs/@local/hashql/core/src/lib.rs @@ -32,6 +32,7 @@ step_trait, try_trait_v2, variant_count, + str_from_raw_parts )] extern crate alloc; diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index 94f8d355f83..d26fb291031 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -18,6 +18,7 @@ //! This encapsulation enables future optimizations such as string interning (either through //! the `string_interner` crate or a custom implementation) without requiring API changes. +mod repr; pub mod sym; mod table; diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs new file mode 100644 index 00000000000..3668621b15d --- /dev/null +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -0,0 +1,441 @@ +#![expect(unsafe_code)] +//! Compact symbol representation using tagged pointers. +//! +//! This module provides [`Repr`], a single-word representation for symbols that can be either: +//! +//! - **Runtime symbols**: Heap-allocated on a bump allocator with inline string data +//! - **Constant symbols**: Indices into a static string table, encoded directly in pointer bits +//! +//! # Design Goals +//! +//! - **Compact**: `Repr` is exactly one pointer in size (8 bytes on 64-bit) +//! - **Niche optimization**: `Option` is also one pointer in size +//! 
- **Efficient**: Symbols are frequently created but rarely accessed +//! +//! # Tagged Pointer Scheme +//! +//! Uses the lowest bit as a discriminant tag (possible because allocations are 2-byte aligned): +//! +//! - Bit 0 = `0`: Runtime symbol (pointer to [`RuntimeSymbol`] allocation) +//! - Bit 0 = `1`: Constant symbol (index shifted left by 1, `OR`ed with tag) +//! +//! # Provenance +//! +//! Runtime symbols store a [`NonNull`] rather than a reference to preserve +//! full allocation provenance. Creating `&RuntimeSymbol` would narrow provenance to just the +//! header, causing undefined behavior when accessing the trailing inline bytes under strict +//! provenance / Stacked Borrows. + +use alloc::alloc::handle_alloc_error; +use core::{ + alloc::{AllocError, Layout}, + mem, + num::NonZero, + ptr::{self, NonNull}, +}; + +use crate::heap::BumpAllocator; + +/// Static table of constant symbol strings. +/// +/// Constant symbols encode an index into this table rather than storing string data. +static STRINGS: &[&str] = &["foo", "bar"]; + +/// Header for a runtime-allocated symbol with inline string data. +/// +/// # Memory Layout +/// +/// ```text +/// ┌──────────────┬──────────────────────┐ +/// │ len: usize │ data: [u8; len] │ +/// └──────────────┴──────────────────────┘ +/// ``` +/// +/// The `data` field is a zero-sized array marker; actual bytes are allocated +/// immediately after the header. The struct uses `#[repr(C)]` to guarantee +/// this layout. +/// +/// # Provenance +/// +/// References to this type (`&RuntimeSymbol`) only have provenance for the header, +/// not the trailing bytes. All access must go through [`NonNull`] +/// to preserve full allocation provenance. +#[repr(C, align(2))] +struct RuntimeSymbol { + len: usize, + data: [u8; 0], +} + +impl RuntimeSymbol { + /// Computes the allocation layout for a runtime symbol with `len` bytes of data. 
+ fn layout(len: usize) -> Layout { + Layout::from_size_align( + size_of::().checked_add(len).expect("overflow"), + mem::align_of::(), + ) + .expect("invalid RuntimeSymbol layout") + } + + /// Allocates a runtime symbol containing `value` on the given allocator. + /// + /// Returns a [`NonNull`] pointer with provenance for the entire allocation, + /// including the trailing string bytes. + /// + /// # Panics + /// + /// Panics if allocation fails. + fn alloc(alloc: &A, value: &str) -> NonNull { + let Ok(value) = Self::try_alloc(alloc, value) else { + handle_alloc_error(Self::layout(value.len())) + }; + + value + } + + /// Attempts to allocate a runtime symbol containing `value`. + /// + /// # Errors + /// + /// Returns [`AllocError`] if the allocator cannot satisfy the request. + fn try_alloc(alloc: &A, value: &str) -> Result, AllocError> { + let len = value.len(); + + let layout = Self::layout(value.len()); + + let ptr = alloc.allocate(layout)?.cast::(); + + // SAFETY: `ptr` points to a freshly allocated block of `layout` size. + // We write `len` to the header and copy `len` bytes of string data + // immediately after the header, which fits within the allocation. + unsafe { + ptr.cast::().write(len); + + let buf = ptr.add(1).cast::(); + ptr::copy_nonoverlapping(value.as_ptr(), buf.as_ptr(), len); + } + + Ok(ptr) + } + + /// Returns a pointer to the inline string data. + /// + /// This performs pointer arithmetic without dereferencing, so it is safe. + /// The returned pointer has provenance for the trailing bytes if `this` + /// has provenance for the full allocation. + const fn data_ptr(this: NonNull) -> NonNull { + // SAFETY: `this` points to a valid `RuntimeSymbol` allocation, which + // always has at least `size_of::()` bytes. Adding 1 moves past + // the header to the inline data region. + unsafe { this.add(1) }.cast() + } + + /// Reads the length of the inline string data. 
+ /// + /// # Safety + /// + /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. + /// - The allocation must remain live for the duration of this call. + const unsafe fn len(this: NonNull) -> usize { + // SAFETY: Caller guarantees `this` points to a valid, initialized allocation. + unsafe { this.cast::().read() } + } + + /// Returns the inline data as a byte slice. + /// + /// # Safety + /// + /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. + /// - The allocation must remain live for the lifetime `'a`. + /// - The returned slice must not be mutated for the lifetime `'a`. + const unsafe fn as_bytes<'a>(this: NonNull) -> &'a [u8] { + // SAFETY: Caller guarantees `this` is valid and the allocation outlives `'a`. + // `data_ptr` returns a pointer to the inline bytes, and `len` returns the count. + unsafe { core::slice::from_raw_parts(Self::data_ptr(this).as_ptr(), Self::len(this)) } + } + + /// Returns the inline data as a string slice. + /// + /// # Safety + /// + /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. + /// - The allocation must remain live for the lifetime `'a`. + /// - The returned string must not be mutated for the lifetime `'a`. + const unsafe fn as_str<'a>(this: NonNull) -> &'a str { + // SAFETY: Caller guarantees `this` is valid and the allocation outlives `'a`. + // The bytes are valid UTF-8 because they were copied from a `&str` in `try_alloc`. + unsafe { core::str::from_raw_parts(Self::data_ptr(this).as_ptr(), Self::len(this)) } + } +} + +/// A constant symbol represented as an index into [`STRINGS`]. +#[derive(Copy, Clone)] +struct ConstantSymbol(usize); + +impl ConstantSymbol { + /// Returns the string value for this constant symbol. + fn as_str(self) -> &'static str { + STRINGS[self.0] + } + + /// Returns the string value without bounds checking. + /// + /// # Safety + /// + /// The index must be within bounds of [`STRINGS`]. 
+ unsafe fn as_str_unchecked(self) -> &'static str { + // SAFETY: Caller guarantees the index is in bounds. + unsafe { STRINGS.get_unchecked(self.0) } + } +} + +/// A compact, single-word representation for symbols. +/// +/// Uses a tagged pointer to distinguish between runtime and constant symbols: +/// +/// - **Runtime** (tag = 0): Pointer to a [`RuntimeSymbol`] allocation +/// - **Constant** (tag = 1): Index into [`STRINGS`] encoded in the pointer bits +/// +/// # Size +/// +/// `Repr` is exactly one pointer in size. Thanks to [`NonNull`], `Option` +/// is also one pointer in size (niche optimization). +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +struct Repr { + ptr: NonNull, +} + +impl Repr { + /// Minimum alignment for runtime symbol allocations. + /// + /// Must be at least 2 to ensure the lowest bit is always 0 for valid pointers. + const MIN_ALIGN: usize = 2; + /// Tag value for constant symbols (bit 0 = 1). + const TAG_CONSTANT: usize = 0b1; + /// Bitmask for extracting the tag from a pointer address. + const TAG_MASK: usize = 0b1; + /// Tag value for runtime symbols (bit 0 = 0). + const TAG_RUNTIME: usize = 0b0; + /// Number of bits used for the tag (determines how much to shift indices). + const TAG_SHIFT: u32 = 1; + + /// Returns the tag value (0 for runtime, 1 for constant). + fn tag(self) -> usize { + self.ptr.addr().get() & Self::TAG_MASK + } + + /// Extracts the runtime symbol pointer. + /// + /// # Safety + /// + /// - `self` must have been created via [`Repr::runtime`]. + /// - The underlying allocation must still be live. + unsafe fn as_runtime_symbol(self) -> NonNull { + debug_assert!(self.tag() == Self::TAG_RUNTIME); + + self.ptr + .map_addr(|addr| { + // SAFETY: Runtime symbols are aligned to at least MIN_ALIGN (2), so the + // lowest bit is always 0. Masking it off preserves a valid, non-zero address. + unsafe { NonZero::new_unchecked(addr.get() & !Self::TAG_MASK) } + }) + .cast::() + } + + /// Extracts the constant symbol index. 
+ /// + /// # Safety + /// + /// - `self` must have been created via [`Repr::constant`]. + unsafe fn as_constant_symbol(self) -> ConstantSymbol { + debug_assert!(self.tag() == Self::TAG_CONSTANT); + + let addr = self.ptr.addr().get(); + ConstantSymbol((addr & !Self::TAG_MASK) >> Self::TAG_SHIFT) + } + + /// Returns the string content of this symbol. + /// + /// # Safety + /// + /// - For runtime symbols: the allocation must remain live for lifetime `'str`. + /// - The returned string must not be mutated for lifetime `'str`. + unsafe fn as_str<'str>(self) -> &'str str { + if self.tag() == Self::TAG_RUNTIME { + // SAFETY: Caller guarantees the allocation is live for 'str. + unsafe { RuntimeSymbol::as_str(self.as_runtime_symbol()) } + } else { + // SAFETY: Constant symbols return &'static str, which coerces to &'str. + unsafe { self.as_constant_symbol().as_str_unchecked() } + } + } + + /// Creates a `Repr` for a constant symbol. + /// + /// The index is encoded directly in the pointer bits (shifted to make room for the tag). + const fn constant(constant: ConstantSymbol) -> Self { + const { + assert!( + Self::TAG_CONSTANT != 0, + "Constant symbol tag must be non-zero" + ); + } + + debug_assert!( + (constant.0 << Self::TAG_SHIFT >> Self::TAG_SHIFT) == constant.0, + "constant has set the top most bit" + ); + debug_assert!(constant.0 < STRINGS.len(), "constant is out of range"); + + let addr = (constant.0 << Self::TAG_SHIFT) | Self::TAG_CONSTANT; + let ptr = ptr::without_provenance_mut(addr); + + Self { + // SAFETY: TAG_CONSTANT is non-zero, therefore `addr` is non-null. + ptr: unsafe { NonNull::new_unchecked(ptr) }, + } + } + + /// Creates a `Repr` for a runtime symbol. + /// + /// The pointer is stored directly with its tag bit set to 0 (which is a no-op + /// since runtime allocations are already aligned). 
+ fn runtime(symbol: NonNull) -> Self { + const { + assert!(align_of::() >= Self::MIN_ALIGN); + } + + let ptr = symbol.map_addr(|addr| addr | Self::TAG_RUNTIME).cast(); + + Self { ptr } + } +} + +#[cfg(test)] +mod tests { + #![expect(clippy::non_ascii_literal)] + use core::mem; + + use super::{ConstantSymbol, Repr, RuntimeSymbol, STRINGS}; + use crate::heap::Scratch; + + #[test] + fn repr_size_is_one_pointer() { + assert_eq!(mem::size_of::(), mem::size_of::<*const ()>()); + } + + #[test] + fn option_repr_size_is_one_pointer() { + assert_eq!(mem::size_of::>(), mem::size_of::<*const ()>()); + } + + #[test] + fn runtime_symbol_has_minimum_alignment() { + assert!(mem::align_of::() >= Repr::MIN_ALIGN); + } + + #[test] + fn constant_symbol_first_entry() { + let constant = ConstantSymbol(0); + let repr = Repr::constant(constant); + + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. + assert_eq!(unsafe { repr.as_str() }, STRINGS[0]); + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. + assert_eq!(unsafe { repr.as_str() }, "foo"); + } + + #[test] + fn constant_symbol_second_entry() { + let constant = ConstantSymbol(1); + let repr = Repr::constant(constant); + + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. + assert_eq!(unsafe { repr.as_str() }, STRINGS[1]); + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. + assert_eq!(unsafe { repr.as_str() }, "bar"); + } + + #[test] + fn runtime_symbol_empty_string() { + let heap = Scratch::new(); + let symbol = RuntimeSymbol::alloc(&heap, ""); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. 
+ assert_eq!(unsafe { repr.as_str() }, ""); + } + + #[test] + fn runtime_symbol_simple_string() { + let heap = Scratch::new(); + let symbol = RuntimeSymbol::alloc(&heap, "hello"); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. + assert_eq!(unsafe { repr.as_str() }, "hello"); + } + + #[test] + fn runtime_symbol_unicode() { + let heap = Scratch::new(); + let symbol = RuntimeSymbol::alloc(&heap, "日本語 🎉 émojis"); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. + assert_eq!(unsafe { repr.as_str() }, "日本語 🎉 émojis"); + } + + #[test] + fn runtime_symbol_long_string() { + let heap = Scratch::new(); + let long_string = "a".repeat(10_000); + let symbol = RuntimeSymbol::alloc(&heap, &long_string); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. + assert_eq!(unsafe { repr.as_str() }, long_string); + } + + #[test] + fn multiple_runtime_symbols() { + let heap = Scratch::new(); + + let symbol1 = RuntimeSymbol::alloc(&heap, "first"); + let symbol2 = RuntimeSymbol::alloc(&heap, "second"); + let symbol3 = RuntimeSymbol::alloc(&heap, "third"); + + let repr1 = Repr::runtime(symbol1); + let repr2 = Repr::runtime(symbol2); + let repr3 = Repr::runtime(symbol3); + + // SAFETY: `heap` is live for the duration of these assertions. + assert_eq!(unsafe { repr1.as_str() }, "first"); + // SAFETY: `heap` is live for the duration of these assertions. + assert_eq!(unsafe { repr2.as_str() }, "second"); + // SAFETY: `heap` is live for the duration of these assertions. 
+ assert_eq!(unsafe { repr3.as_str() }, "third"); + } + + #[test] + fn tag_distinguishes_constant_from_runtime() { + let heap = Scratch::new(); + + let constant = Repr::constant(ConstantSymbol(0)); + let runtime = Repr::runtime(RuntimeSymbol::alloc(&heap, "test")); + + assert_eq!(constant.tag(), Repr::TAG_CONSTANT); + assert_eq!(runtime.tag(), Repr::TAG_RUNTIME); + } + + #[test] + fn runtime_symbol_stores_correct_length() { + let heap = Scratch::new(); + let symbol = RuntimeSymbol::alloc(&heap, "hello"); + + // SAFETY: `symbol` points to a valid allocation and `heap` is live. + unsafe { + assert_eq!(RuntimeSymbol::len(symbol), 5); + assert_eq!(RuntimeSymbol::as_str(symbol).len(), 5); + } + } +} From 20ed1479e670cfb04490d5f3ac0fd261a933fcb6 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sat, 31 Jan 2026 22:33:23 +0100 Subject: [PATCH 02/21] feat: sym2 --- Cargo.lock | 1 + libs/@local/hashql/core/Cargo.toml | 1 + libs/@local/hashql/core/src/symbol/mod.rs | 1 + libs/@local/hashql/core/src/symbol/repr.rs | 4 ++++ libs/@local/hashql/core/src/symbol/sym2.rs | 1 + 5 files changed, 8 insertions(+) create mode 100644 libs/@local/hashql/core/src/symbol/sym2.rs diff --git a/Cargo.lock b/Cargo.lock index 1a95ca3e368..4c4f9a6433a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3894,6 +3894,7 @@ dependencies = [ "insta", "lexical", "memchr", + "phf 0.13.1", "pretty", "proptest", "rapidfuzz", diff --git a/libs/@local/hashql/core/Cargo.toml b/libs/@local/hashql/core/Cargo.toml index 038725456eb..923ffa875fb 100644 --- a/libs/@local/hashql/core/Cargo.toml +++ b/libs/@local/hashql/core/Cargo.toml @@ -29,6 +29,7 @@ derive_more = { workspace = true, features = ["debug", "from"] } ena = { workspace = true } lexical = { workspace = true, features = ["parse-integers", "parse-floats", "format"] } memchr = { workspace = true } +phf = { version = "0.13.1", features = ["macros"] } rapidfuzz = { workspace = true } roaring = { workspace = true, features = ["std", "simd"] } rpds = { workspace 
= true, features = ["std"] } diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index d26fb291031..da9230a5805 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -20,6 +20,7 @@ mod repr; pub mod sym; +mod sym2; mod table; use core::{ diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index 3668621b15d..ff9576922f2 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -172,6 +172,10 @@ impl RuntimeSymbol { struct ConstantSymbol(usize); impl ConstantSymbol { + const fn new_unchecked(index: usize) -> Self { + Self(index) + } + /// Returns the string value for this constant symbol. fn as_str(self) -> &'static str { STRINGS[self.0] diff --git a/libs/@local/hashql/core/src/symbol/sym2.rs b/libs/@local/hashql/core/src/symbol/sym2.rs new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/libs/@local/hashql/core/src/symbol/sym2.rs @@ -0,0 +1 @@ + From 09a2e03c80256ff90915e6c67f041dae9771b0a3 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 00:56:46 +0100 Subject: [PATCH 03/21] feat: checkpoint --- SPEC.md | 77 +++++++++ libs/@local/hashql/core/src/symbol/mod.rs | 60 +++++-- libs/@local/hashql/core/src/symbol/repr.rs | 35 ++-- libs/@local/hashql/core/src/symbol/sym.rs | 60 +++---- libs/@local/hashql/core/src/symbol/sym2.rs | 155 ++++++++++++++++++ .../hashql/core/src/value/primitive/string.rs | 2 +- 6 files changed, 325 insertions(+), 64 deletions(-) create mode 100644 SPEC.md diff --git a/SPEC.md b/SPEC.md new file mode 100644 index 00000000000..89e288927e1 --- /dev/null +++ b/SPEC.md @@ -0,0 +1,77 @@ +# sym2.rs symbols! 
macro specification + +## Goal + +Define a declarative `symbols!` macro in `libs/@local/hashql/core/src/symbol/sym2.rs` that expands a compact symbol list into: + +- A single global string table `SYMBOLS` containing every symbol literal exactly once. +- `const` `Symbol` values for each symbol name (including nested modules). +- A `phf::Map` lookup table from string to `Symbol`. + +## Inputs + +The macro is invoked in `sym2.rs` with a mixture of: + +- Bare identifiers (e.g., `access, add`). +- Identifier-to-string pairs using `name: "..."` (e.g., `r#if: "if"`). +- Nested module blocks using `module_name: { ... }` with the same item forms inside. + +## Outputs and behavior + +1. **Global string table** + - Emit `static SYMBOLS: &[&str] = &[ ... ];` that contains **all** symbol string values, in macro traversal order. + - Order is deterministic and mirrors the macro input order, flattening nested blocks in-place. + - Each string appears **exactly once**; duplicates are detected by a runtime test. + +2. **Symbol constants** + - For each symbol item, emit a `const : Symbol = Symbol::constant_unchecked();`. + - The `` is the position of the symbol’s string in `SYMBOLS`. + - When items are inside `module_name: { ... }`, emit a `mod module_name { use super::*; ... }` containing the `const`s for that module’s items. + +3. **Lookup map** + - Emit `static LOOKUP: phf::Map<&'static str, Symbol> = phf_map! { ... };`. + - Each entry maps the **string value** to the corresponding `Symbol` constant. + - The map includes entries for both top-level and nested module items, with the value referencing the correct constant (e.g., `"*" => symbol::asterisk`). + +4. **Uniqueness checks** + - No macro-time checks; uniqueness is enforced by a runtime test that fails if duplicate strings exist in `SYMBOLS`. 
+ +## Expansion details + +Given the existing example in `sym2.rs`, the expansion will: + +- Generate `SYMBOLS` covering: `"access"`, `"add"`, `"and"`, `"archived"`, `"archived_by_id"`, `"bar"`, `"BaseUrl"`, `"bit_and"`, `"bit_not"`, `"bit_or"`, `"if"`, `""`, `"'"`, `"*"`, `"0"`, `"1"`, `"::core::option::Option"`, `"::core::option::Some"`, `"::core::option::None"`, `"::graph::head::entities"`, `"::graph::body::filter"`, `"::graph::tail::collect"`. +- Create `const` bindings for each name in the correct scope, each using the index into `SYMBOLS`. +- Create a `LOOKUP` `phf_map!` with all strings mapped to their corresponding `Symbol` constants. + +## Implementation approach + +1. **Define the macro interface** to accept a comma-separated list of `symbol_item` forms: + - `ident` (string = ident name) + - `ident : literal` (string = literal) + - `module_ident : { ... }` + +2. **Flatten items** into a single sequence of `(string_literal, const_path)` in the exact order of appearance. + - For nested modules, the `const_path` is `module_ident::item_ident`. + +3. **Generate indices** by counting from `0` in the flattened order. + - Use a recursive macro to emit tuples `((string, path), index)` as it walks the input. + +4. **Emit `SYMBOLS`** by collecting the flattened string list. + +5. **Emit consts** + - For top-level items: `const name: Symbol = Symbol::constant_unchecked(index);`. + - For module items: `mod module { use super::*; const name: Symbol = Symbol::constant_unchecked(index); }`. + +6. **Emit `LOOKUP`** by mapping each flattened string to its `const_path`. + +7. **Uniqueness enforcement** + - Add a `#[test]` in `sym2.rs` that inserts every entry from `SYMBOLS` into a `HashSet` and asserts that the set size equals `SYMBOLS.len()`. + - The test is the only enforcement mechanism; compilation is not affected. + +## Definition of done + +- `sym2.rs` contains the new `symbols!` macro that expands as specified. 
+- `SYMBOLS`, all `const` symbols, and `LOOKUP` are generated from the macro invocation. +- Duplicate string values fail the uniqueness test. +- Code builds without additional files or edits outside `sym2.rs` (other than this spec file). diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index da9230a5805..255b5317925 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -27,9 +27,11 @@ use core::{ cmp::Ordering, fmt::{self, Display, Formatter}, hash::{Hash, Hasher}, + marker::PhantomData, ptr, }; +use self::repr::{ConstantSymbol, Repr}; pub use self::table::SymbolTable; use crate::span::SpanId; @@ -46,19 +48,35 @@ use crate::span::SpanId; /// The caller must ensure that the string is unique and interned. The types correctness requires /// relies on these *but it does not enforce it*. #[derive(Debug, Copy, Clone)] -pub struct Symbol<'heap>(&'heap str); +pub struct Symbol<'heap> { + repr: Repr, + _marker: PhantomData<&'heap ()>, +} +#[expect(unsafe_code)] impl<'heap> Symbol<'heap> { /// Creates a new interned symbol from a string slice. /// /// The caller must ensure that the string is unique and interned. pub(crate) const fn new_unchecked(string: &'heap str) -> Self { - Self(string) + Symbol { + repr: Repr::constant(ConstantSymbol::new_unchecked(0)), + _marker: PhantomData, + } + // unimplemented!() + } + + const fn new_constant_unchecked(index: usize) -> Self { + Symbol { + repr: Repr::constant(ConstantSymbol::new_unchecked(index)), + _marker: PhantomData, + } } #[must_use] - pub const fn as_str(&self) -> &str { - self.0 + pub fn as_str(&self) -> &str { + // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. + unsafe { self.repr.as_str() } } /// Returns the string representation of the symbol. 
@@ -67,18 +85,21 @@ impl<'heap> Symbol<'heap> { /// instead of the symbol itself, somewhat circumventing the protections given to the symbol /// itself. Any unwrapped type should be considered no longer unique and interned. #[must_use] - pub const fn unwrap(&self) -> &'heap str { - self.0 + pub fn unwrap(&self) -> &'heap str { + // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. + unsafe { self.repr.as_str() } } #[must_use] pub const fn as_bytes(&self) -> &[u8] { - self.0.as_bytes() + unimplemented!() + // self.0.as_bytes() } #[must_use] pub fn demangle(self) -> &'heap str { - self.0.rsplit_once(':').map_or(self.0, |(name, _)| name) + unimplemented!() + // self.0.rsplit_once(':').map_or(self.0, |(name, _)| name) } } @@ -92,7 +113,8 @@ impl AsRef for Symbol<'_> { impl PartialEq for Symbol<'_> { fn eq(&self, other: &Self) -> bool { // Pointer equality implies string equality (due to the unique contents assumption) - ptr::eq(self.0, other.0) + // ptr::eq(self.0, other.0) + unimplemented!() } } @@ -108,24 +130,27 @@ impl Ord for Symbol<'_> { fn cmp(&self, other: &Self) -> Ordering { // Pointer equality implies string equality (due to the unique contents assumption), but if // not the same the contents must be compared. 
- if self == other { - Ordering::Equal - } else { - self.0.cmp(other.0) - } + // if self == other { + // Ordering::Equal + // } else { + // self.0.cmp(other.0) + // } + unimplemented!() } } impl Hash for Symbol<'_> { fn hash(&self, state: &mut H) { // Pointer hashing is sufficient (due to the unique contents assumption) - ptr::hash(self.0, state); + // ptr::hash(self.0, state); + unimplemented!() } } impl Display for Symbol<'_> { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { - Display::fmt(self.0, fmt) + unimplemented!() + // Display::fmt(self.0, fmt) } } @@ -285,6 +310,7 @@ impl AsRef for Ident<'_> { impl Display for Ident<'_> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - Display::fmt(&self.value.0, fmt) + unimplemented!() + // Display::fmt(&self.value.0, fmt) } } diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index ff9576922f2..a155df9af0f 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -34,13 +34,9 @@ use core::{ ptr::{self, NonNull}, }; +use super::sym2::SYMBOLS; use crate::heap::BumpAllocator; -/// Static table of constant symbol strings. -/// -/// Constant symbols encode an index into this table rather than storing string data. -static STRINGS: &[&str] = &["foo", "bar"]; - /// Header for a runtime-allocated symbol with inline string data. /// /// # Memory Layout @@ -169,16 +165,16 @@ impl RuntimeSymbol { /// A constant symbol represented as an index into [`STRINGS`]. #[derive(Copy, Clone)] -struct ConstantSymbol(usize); +pub(crate) struct ConstantSymbol(usize); impl ConstantSymbol { - const fn new_unchecked(index: usize) -> Self { + pub(crate) const fn new_unchecked(index: usize) -> Self { Self(index) } /// Returns the string value for this constant symbol. fn as_str(self) -> &'static str { - STRINGS[self.0] + SYMBOLS[self.0] } /// Returns the string value without bounds checking. 
@@ -188,7 +184,7 @@ impl ConstantSymbol { /// The index must be within bounds of [`STRINGS`]. unsafe fn as_str_unchecked(self) -> &'static str { // SAFETY: Caller guarantees the index is in bounds. - unsafe { STRINGS.get_unchecked(self.0) } + unsafe { SYMBOLS.get_unchecked(self.0) } } } @@ -204,10 +200,17 @@ impl ConstantSymbol { /// `Repr` is exactly one pointer in size. Thanks to [`NonNull`], `Option` /// is also one pointer in size (niche optimization). #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] -struct Repr { +pub(crate) struct Repr { ptr: NonNull, } +// SAFETY: while NonNull (for niche optimization), the pointer itself is only accessed via `*const` +// ptr and never modified. The underlying data is Send + Sync. +unsafe impl Send for Repr {} +// SAFETY: while NonNull (for niche optimization), the pointer itself is only accessed via `*const` +// ptr and never modified. The underlying data is Send + Sync. +unsafe impl Sync for Repr {} + impl Repr { /// Minimum alignment for runtime symbol allocations. /// @@ -263,7 +266,7 @@ impl Repr { /// /// - For runtime symbols: the allocation must remain live for lifetime `'str`. /// - The returned string must not be mutated for lifetime `'str`. - unsafe fn as_str<'str>(self) -> &'str str { + pub(crate) unsafe fn as_str<'str>(self) -> &'str str { if self.tag() == Self::TAG_RUNTIME { // SAFETY: Caller guarantees the allocation is live for 'str. unsafe { RuntimeSymbol::as_str(self.as_runtime_symbol()) } @@ -276,7 +279,7 @@ impl Repr { /// Creates a `Repr` for a constant symbol. /// /// The index is encoded directly in the pointer bits (shifted to make room for the tag). 
- const fn constant(constant: ConstantSymbol) -> Self { + pub(crate) const fn constant(constant: ConstantSymbol) -> Self { const { assert!( Self::TAG_CONSTANT != 0, @@ -288,7 +291,7 @@ impl Repr { (constant.0 << Self::TAG_SHIFT >> Self::TAG_SHIFT) == constant.0, "constant has set the top most bit" ); - debug_assert!(constant.0 < STRINGS.len(), "constant is out of range"); + debug_assert!(constant.0 < SYMBOLS.len(), "constant is out of range"); let addr = (constant.0 << Self::TAG_SHIFT) | Self::TAG_CONSTANT; let ptr = ptr::without_provenance_mut(addr); @@ -319,7 +322,7 @@ mod tests { #![expect(clippy::non_ascii_literal)] use core::mem; - use super::{ConstantSymbol, Repr, RuntimeSymbol, STRINGS}; + use super::{ConstantSymbol, Repr, RuntimeSymbol, SYMBOLS}; use crate::heap::Scratch; #[test] @@ -343,7 +346,7 @@ mod tests { let repr = Repr::constant(constant); // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. - assert_eq!(unsafe { repr.as_str() }, STRINGS[0]); + assert_eq!(unsafe { repr.as_str() }, SYMBOLS[0]); // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. assert_eq!(unsafe { repr.as_str() }, "foo"); } @@ -354,7 +357,7 @@ mod tests { let repr = Repr::constant(constant); // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. - assert_eq!(unsafe { repr.as_str() }, STRINGS[1]); + assert_eq!(unsafe { repr.as_str() }, SYMBOLS[1]); // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. 
assert_eq!(unsafe { repr.as_str() }, "bar"); } diff --git a/libs/@local/hashql/core/src/symbol/sym.rs b/libs/@local/hashql/core/src/symbol/sym.rs index e276aa5b355..69f51aeaa21 100644 --- a/libs/@local/hashql/core/src/symbol/sym.rs +++ b/libs/@local/hashql/core/src/symbol/sym.rs @@ -233,41 +233,41 @@ symbols![path; PATHS; pub(crate) const TABLES: &[&[&Symbol<'static>]] = &[LEXICAL, DIGITS, SYMBOLS, PATHS, INTERNAL]; -#[cfg(test)] -mod test { - use core::ptr; +// #[cfg(test)] +// mod test { +// use core::ptr; - use super::TABLES; - use crate::{ - heap::{Heap, ResetAllocator as _}, - symbol::sym, - }; +// use super::TABLES; +// use crate::{ +// heap::{Heap, ResetAllocator as _}, +// symbol::sym, +// }; - #[test] - fn pointer_equality_from_heap() { - let mut heap = Heap::new(); +// #[test] +// fn pointer_equality_from_heap() { +// let mut heap = Heap::new(); - let mul_heap = heap.intern_symbol("*"); - let mul_sym = sym::symbol::asterisk; +// let mul_heap = heap.intern_symbol("*"); +// let mul_sym = sym::symbol::asterisk; - assert!(ptr::eq(mul_heap.0, mul_sym.0)); +// assert!(ptr::eq(mul_heap.0, mul_sym.0)); - // even after reset that should be the case - heap.reset(); +// // even after reset that should be the case +// heap.reset(); - let mul_heap = heap.intern_symbol("*"); - let mul_sym = sym::symbol::asterisk; +// let mul_heap = heap.intern_symbol("*"); +// let mul_sym = sym::symbol::asterisk; - assert!(ptr::eq(mul_heap.0, mul_sym.0)); - } +// assert!(ptr::eq(mul_heap.0, mul_sym.0)); +// } - #[test] - fn ensure_no_collisions() { - let mut set = std::collections::HashSet::new(); - for &table in TABLES { - for &symbol in table { - assert!(set.insert(symbol.0)); - } - } - } -} +// #[test] +// fn ensure_no_collisions() { +// let mut set = std::collections::HashSet::new(); +// for &table in TABLES { +// for &symbol in table { +// assert!(set.insert(symbol.0)); +// } +// } +// } +// } diff --git a/libs/@local/hashql/core/src/symbol/sym2.rs 
b/libs/@local/hashql/core/src/symbol/sym2.rs index 8b137891791..25c24dfe1e2 100644 --- a/libs/@local/hashql/core/src/symbol/sym2.rs +++ b/libs/@local/hashql/core/src/symbol/sym2.rs @@ -1 +1,156 @@ +use phf::phf_map; +use super::Symbol; + +macro_rules! symbols { + (@strings [$($acc:tt)*];) => { + pub(crate) static SYMBOLS: &[&str] = &[ + $($acc),* + ]; + }; + (@strings [$($acc:tt)*]; , $($rest:tt)*) => { + symbols!(@strings [$($acc)*]; $($rest)*); + }; + (@strings [$($acc:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)*]; $($inner)* $(, $($rest)*)?); + }; + (@strings [$($acc:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)* $value]; $($($rest)*)?); + }; + (@strings [$($acc:tt)*]; $name:ident $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)* (stringify!($name))]; $($($rest)*)?); + }; + + (@consts @cont [$($count:tt)*] [$($next:tt)*];) => { + symbols!(@consts [$($count)*]; $($next)*); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; , $($rest:tt)*) => { + symbols!(@consts @cont [$($count)*] [$($next)*]; $($rest)*); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*];; $($inner)* $(, $($rest)*)?); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); + }; + + (@consts [$($count:tt)*];) => {}; + (@consts [$($count:tt)*]; , $($rest:tt)*) => { + symbols!(@consts [$($count)*]; $($rest)*); + }; + (@consts [$($count:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) 
=> { + pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); + symbols!(@consts [$($count)* ()]; $($($rest)*)?); + }; + (@consts [$($count:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + pub mod $module { + use super::*; + + symbols!(@consts [$($count)*]; $($inner)*); + } + + symbols!(@consts @cont [$($count)*] [$($($rest)*)?]; $($inner)*); + }; + (@consts [$($count:tt)*]; $name:ident $(, $($rest:tt)*)?) => { + pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); + symbols!(@consts [$($count)* ()]; $($($rest)*)?); + }; + + (@path [] [$($path:ident)*];) => { + $($path)::* + }; + (@path [$next:tt $($rest:tt)*] [$($path:tt)*];) => { + symbols!(@path [$($rest)*] [$next $($path)*];) + }; + + (@lookup [$(, $arm:pat => $value:expr)*] [$($path:tt),*];) => { + static LOOKUP: phf::Map<&'static str, Symbol<'static>> = phf_map! { $($arm => $value),* }; + }; + (@lookup [$($arms:tt)*] [$($path:tt),*];) => { + + }; + (@lookup [$($arms:tt)*] [$tail:tt $(, $path:tt)*]; | $($rest:tt)*) => { + symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; , $($rest:tt)*) => { + symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@lookup [$($arms)*, $value => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@lookup [$($arms)*] [$module $(, $path)*]; $($inner)* ,| $($($rest)*)?); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident $(, $($rest:tt)*)?) 
=> { + symbols!(@lookup [$($arms)*, stringify!($name) => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); + }; + + (@table; $($items:tt)*) => { + symbols!(@strings []; $($items)*); + symbols!(@consts []; $($items)*); + symbols!(@lookup [] [self]; $($items)*); + }; +} + +// symbols! { +// access, +// add, +// and, +// archived, +// archived_by_id, +// bar, +// BaseUrl, +// bit_and, +// bit_not, +// bit_or, +// r#if: "if", +// dummy: "", + +// internal: { +// ClosureEnv: "'" +// }, + +// symbol: { +// asterisk: "*", +// }, + +// digit: { +// zero: "0", +// one: "1", +// /* and so on */ +// }, + +// path: { +// option: "::core::option::Option", +// some: "::core::option::Some", +// none: "::core::option::None", +// graph_head_entities: "::graph::head::entities", +// graph_body_filter: "::graph::body::filter", +// graph_tail_collect: "::graph::tail::collect", +// } +// } + +symbols!(@table; + access +); + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use super::SYMBOLS; + + #[test] + fn symbols_are_unique() { + let mut set = HashSet::with_capacity(SYMBOLS.len()); + + for symbol in SYMBOLS { + set.insert(*symbol); + } + + assert_eq!(set.len(), SYMBOLS.len(), "duplicate symbol value found"); + } +} diff --git a/libs/@local/hashql/core/src/value/primitive/string.rs b/libs/@local/hashql/core/src/value/primitive/string.rs index d4618d860b5..0fff8c8d6ae 100644 --- a/libs/@local/hashql/core/src/value/primitive/string.rs +++ b/libs/@local/hashql/core/src/value/primitive/string.rs @@ -57,7 +57,7 @@ impl<'heap> String<'heap> { /// assert_eq!(literal.as_str(), "Hello, world!"); /// ``` #[must_use] - pub const fn as_str(&self) -> &str { + pub fn as_str(&self) -> &str { self.value.as_str() } From 942ec3ba4d0b8f41dbcf7957f2748cb866d53a75 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 14:56:58 +0100 Subject: [PATCH 04/21] feat: checkpoint --- libs/@local/hashql/core/src/lib.rs | 5 +- libs/@local/hashql/core/src/symbol/mod.rs | 
4 + libs/@local/hashql/core/src/symbol/repr.rs | 14 +- libs/@local/hashql/core/src/symbol/sym2.rs | 235 ++++++++++++++++----- 4 files changed, 205 insertions(+), 53 deletions(-) diff --git a/libs/@local/hashql/core/src/lib.rs b/libs/@local/hashql/core/src/lib.rs index e99cb0d6065..861e8482e9e 100644 --- a/libs/@local/hashql/core/src/lib.rs +++ b/libs/@local/hashql/core/src/lib.rs @@ -3,6 +3,7 @@ //! ## Workspace dependencies #![cfg_attr(doc, doc = simple_mermaid::mermaid!("../docs/dependency-diagram.mmd"))] #![expect(clippy::indexing_slicing)] +#![recursion_limit = "256"] #![feature( // Language Features arbitrary_self_types, @@ -20,6 +21,8 @@ assert_matches, binary_heap_into_iter_sorted, clone_from_ref, + const_cmp, + const_trait_impl, debug_closure_helpers, extend_one, formatting_options, @@ -30,9 +33,9 @@ slice_partition_dedup, slice_swap_unchecked, step_trait, + str_from_raw_parts, try_trait_v2, variant_count, - str_from_raw_parts )] extern crate alloc; diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index 255b5317925..448ba80313f 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -73,6 +73,10 @@ impl<'heap> Symbol<'heap> { } } + const fn into_repr(self) -> Repr { + self.repr + } + #[must_use] pub fn as_str(&self) -> &str { // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index a155df9af0f..f5bcc2b1f3b 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -182,9 +182,9 @@ impl ConstantSymbol { /// # Safety /// /// The index must be within bounds of [`STRINGS`]. - unsafe fn as_str_unchecked(self) -> &'static str { + const unsafe fn as_str_unchecked(self) -> &'static str { // SAFETY: Caller guarantees the index is in bounds. 
- unsafe { SYMBOLS.get_unchecked(self.0) } + unsafe { *SYMBOLS.as_ptr().add(self.0) } } } @@ -347,8 +347,14 @@ mod tests { // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. assert_eq!(unsafe { repr.as_str() }, SYMBOLS[0]); + } + + #[test] + fn constant_symbol_first_entry_unchecked() { + let constant = ConstantSymbol(0); + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. - assert_eq!(unsafe { repr.as_str() }, "foo"); + assert_eq!(unsafe { constant.as_str_unchecked() }, SYMBOLS[0]); } #[test] @@ -358,8 +364,6 @@ mod tests { // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. assert_eq!(unsafe { repr.as_str() }, SYMBOLS[1]); - // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. - assert_eq!(unsafe { repr.as_str() }, "bar"); } #[test] diff --git a/libs/@local/hashql/core/src/symbol/sym2.rs b/libs/@local/hashql/core/src/symbol/sym2.rs index 25c24dfe1e2..6f06a574d9d 100644 --- a/libs/@local/hashql/core/src/symbol/sym2.rs +++ b/libs/@local/hashql/core/src/symbol/sym2.rs @@ -1,5 +1,4 @@ -use phf::phf_map; - +#![expect(non_upper_case_globals, clippy::min_ident_chars)] use super::Symbol; macro_rules! symbols { @@ -42,6 +41,7 @@ macro_rules! symbols { symbols!(@consts [$($count)*]; $($rest)*); }; (@consts [$($count:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + const _: () = { assert!(SYMBOLS[${count($count)}] == $value) }; pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); symbols!(@consts [$($count)* ()]; $($($rest)*)?); }; @@ -55,6 +55,7 @@ macro_rules! symbols { symbols!(@consts @cont [$($count)*] [$($($rest)*)?]; $($inner)*); }; (@consts [$($count:tt)*]; $name:ident $(, $($rest:tt)*)?) 
=> { + const _: () = { assert!(SYMBOLS[${count($count)}] == stringify!($name)) }; pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); symbols!(@consts [$($count)* ()]; $($($rest)*)?); }; @@ -66,11 +67,17 @@ macro_rules! symbols { symbols!(@path [$($rest)*] [$next $($path)*];) }; - (@lookup [$(, $arm:pat => $value:expr)*] [$($path:tt),*];) => { - static LOOKUP: phf::Map<&'static str, Symbol<'static>> = phf_map! { $($arm => $value),* }; - }; - (@lookup [$($arms:tt)*] [$($path:tt),*];) => { + (@lookup [$(, $arm:expr => $value:expr)*] [$($path:tt),*];) => { + #[expect(unsafe_code)] + pub(crate) fn prime(map: &mut hashbrown::HashMap<&'static str, super::repr::Repr, S, A>) { + debug_assert!(map.is_empty()); + map.reserve(SYMBOLS.len()); + $( + // SAFETY: The declarative macro guarantees that the symbol is unique. + unsafe { map.insert_unique_unchecked($arm, $value.into_repr()); } + )* + } }; (@lookup [$($arms:tt)*] [$tail:tt $(, $path:tt)*]; | $($rest:tt)*) => { symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); @@ -95,47 +102,181 @@ macro_rules! symbols { }; } -// symbols! { -// access, -// add, -// and, -// archived, -// archived_by_id, -// bar, -// BaseUrl, -// bit_and, -// bit_not, -// bit_or, -// r#if: "if", -// dummy: "", - -// internal: { -// ClosureEnv: "'" -// }, - -// symbol: { -// asterisk: "*", -// }, - -// digit: { -// zero: "0", -// one: "1", -// /* and so on */ -// }, - -// path: { -// option: "::core::option::Option", -// some: "::core::option::Some", -// none: "::core::option::None", -// graph_head_entities: "::graph::head::entities", -// graph_body_filter: "::graph::body::filter", -// graph_tail_collect: "::graph::tail::collect", -// } -// } - -symbols!(@table; - access -); +symbols! 
{@table; + // [tidy] sort alphabetically start + access, + add, + and, + archived, + archived_by_id, + bar, + BaseUrl, + bit_and, + bit_not, + bit_or, + bit_shl, + bit_shr, + bit_xor, + Boolean, + collect, + confidence, + core, + created_at_decision_time, + created_at_transaction_time, + created_by_id, + decision_time, + Dict, + div, + draft_id, + dummy: "", + E, + edition, + edition_id, + encodings, + entity, + entity_edition_id, + entity_id, + entity_type_ids, + entity_uuid, + eq, + Err, + filter, + foo, + gt, + gte, + id, + index, + inferred, + input, + input_exists: "$exists", + Integer, + Intersection, + kernel, + left_entity_confidence, + left_entity_id, + left_entity_provenance, + link_data, + List, + lt, + lte, + math, + metadata, + mul, + ne, + Never, + None, + not, + Null, + null, + Number, + Ok, + option, + or, + pow, + properties, + provenance, + provided, + r#as: "as", + r#as_force: "as!", + r#else: "else", + r#false: "false", + r#fn: "fn", + r#if: "if", + r#in: "in", + r#is: "is", + r#let: "let", + r#mod: "mod", + r#newtype: "newtype", + r#true: "true", + r#type: "type", + r#use: "use", + R, + record_id, + Result, + right_entity_confidence, + right_entity_id, + right_entity_provenance, + Some, + special_form, + String, + sub, + T, + temporal_versioning, + then: "then", + thunk: "thunk", + transaction_time, + U, + Union, + Unknown, + unknown, + Url, + vectors, + web_id, + // [tidy] sort alphabetically end + + internal: { + ClosureEnv: "'" + }, + + symbol: { + // [tidy] sort alphabetically start + ampamp: "&&", + ampersand: "&", + arrow: "->", + arrow_head: "|>", + asterisk: "*", + exclamation: "!", + excleq: "!=", + brackets: "[]", + caret: "^", + colon: ":", + colon_colon: "::", + comma: ",", + dollar: "$", + dollar_question_mark: "$?", + dot: ".", + eq: "=", + eqeq: "==", + gt: ">", + gteq: ">=", + gtgt: ">>", + lt: "<", + lteq: "<=", + ltlt: "<<", + minus: "-", + pipepipe: "||", + pipe: "|", + plus: "+", + question_mark: "?", + slash: "/", + tilde: 
"~", + // [tidy] sort alphabetically end + }, + + digit: { + zero: "0", + one: "1", + two: "2", + three: "3", + four: "4", + five: "5", + six: "6", + seven: "7", + eight: "8", + nine: "9", + }, + + path: { + // [tidy] sort alphabetically start + option: "::core::option::Option", + some: "::core::option::Some", + none: "::core::option::None", + graph_head_entities: "::graph::head::entities", + graph_body_filter: "::graph::body::filter", + graph_tail_collect: "::graph::tail::collect", + // [tidy] sort alphabetically end + } +} #[cfg(test)] mod tests { From 8d5ef6a04df24939088b83343d21e69a16fcdc78 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 15:13:26 +0100 Subject: [PATCH 05/21] feat: checkpoint --- libs/@local/hashql/core/src/symbol/lookup.rs | 238 +++++++++++++++++ libs/@local/hashql/core/src/symbol/mod.rs | 4 +- libs/@local/hashql/core/src/symbol/repr.rs | 6 +- libs/@local/hashql/core/src/symbol/sym2.rs | 4 + libs/@local/hashql/core/src/symbol/table.rs | 262 +++---------------- libs/@local/hashql/hir/src/context.rs | 8 +- 6 files changed, 294 insertions(+), 228 deletions(-) create mode 100644 libs/@local/hashql/core/src/symbol/lookup.rs diff --git a/libs/@local/hashql/core/src/symbol/lookup.rs b/libs/@local/hashql/core/src/symbol/lookup.rs new file mode 100644 index 00000000000..09dfbfedc4d --- /dev/null +++ b/libs/@local/hashql/core/src/symbol/lookup.rs @@ -0,0 +1,238 @@ +use core::ops::Index; + +use super::Symbol; +use crate::{ + collections::FastHashMap, + id::{Id, IdVec}, +}; + +#[derive(Debug)] +enum SymbolLookupInner<'heap, I> { + Dense(IdVec>), + Gapped(IdVec>>), + Sparse(FastHashMap>), +} + +/// A mapping from identifiers to symbols optimized for different access patterns. +/// +/// [`SymbolTable`] provides efficient storage and retrieval of [`Symbol`] instances which are tied +/// to a specific identifier (which is any type that implements the [`Id`] trait). 
+/// +/// # Storage Strategies +/// +/// To accommodate different access patterns, [`SymbolTable`] supports three storage strategies: +/// +/// ## Dense Storage +/// +/// Created with [`SymbolTable::dense()`], this mode uses a [`Vec`] internally and requires +/// IDs to be inserted sequentially starting from 0. This provides optimal memory efficiency +/// and cache performance for contiguous ID ranges. +/// +/// ## Gapped Storage +/// +/// Created with [`SymbolTable::gapped()`], this mode uses a [`Vec`] of [`Option`] +/// internally and allows insertion at arbitrary indices. Unlike dense storage, gaps are allowed in +/// the ID sequence. This provides a balance between the memory efficiency of dense storage and the +/// flexibility of sparse storage, making it ideal for scenarios where most IDs are contiguous but +/// some gaps may exist. +/// +/// ## Sparse Storage +/// +/// Created with [`SymbolTable::sparse()`], this mode uses a [`FastHashMap`] internally and +/// supports arbitrary ID insertion order. This provides flexibility at the cost of higher +/// memory overhead per entry. 
+/// +/// # Examples +/// +/// ``` +/// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; +/// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); +/// # let mut heap = Heap::new(); +/// # let symbol = heap.intern_symbol("example"); +/// // Dense storage for sequential IDs +/// let mut dense_table = SymbolTable::::dense(); +/// dense_table.insert(MyId::from_u32(0), symbol); +/// assert_eq!(dense_table.get(MyId::from_u32(0)), Some(symbol)); +/// +/// // Gapped storage for mostly contiguous IDs with some gaps +/// let mut gapped_table = SymbolTable::::gapped(); +/// gapped_table.insert(MyId::from_u32(0), symbol); +/// gapped_table.insert(MyId::from_u32(5), symbol); // Gap at IDs 1-4 +/// assert_eq!(gapped_table.get(MyId::from_u32(0)), Some(symbol)); +/// assert_eq!(gapped_table.get(MyId::from_u32(2)), None); // Gap +/// assert_eq!(gapped_table.get(MyId::from_u32(5)), Some(symbol)); +/// +/// // Sparse storage for arbitrary IDs +/// let mut sparse_table = SymbolTable::::sparse(); +/// sparse_table.insert(MyId::from_u32(100), symbol); +/// assert_eq!(sparse_table.get(MyId::from_u32(100)), Some(symbol)); +/// sparse_table.insert(MyId::from_u32(5), symbol); +/// assert_eq!(sparse_table.get(MyId::from_u32(5)), Some(symbol)); +/// ``` +#[derive(Debug)] +pub struct SymbolLookup<'heap, I> { + inner: SymbolLookupInner<'heap, I>, +} + +impl<'heap, I> SymbolLookup<'heap, I> +where + I: Id, +{ + /// Creates a new symbol table using dense vector-based storage. + /// + /// Dense tables require sequential ID insertion starting from 0 and provide + /// optimal memory efficiency and cache performance for contiguous ID ranges. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{symbol::SymbolTable, newtype}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// let table = SymbolTable::::dense(); + /// // Insertions must be sequential: 0, 1, 2, ... 
+ /// ``` + #[must_use] + pub const fn dense() -> Self { + Self { + inner: SymbolLookupInner::Dense(IdVec::new()), + } + } + + /// Creates a new symbol table using gapped vector-based storage. + /// + /// Gapped tables allow insertion at arbitrary indices within a vector, automatically + /// filling gaps with `None` values. This provides better memory locality than sparse + /// tables while still allowing non-contiguous ID ranges. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{symbol::SymbolTable, newtype}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// let table = SymbolTable::::gapped(); + /// // Insertions can have gaps: 0, 5, 3, 10, ... + /// ``` + #[must_use] + pub const fn gapped() -> Self { + Self { + inner: SymbolLookupInner::Gapped(IdVec::new()), + } + } + + /// Creates a new symbol table using sparse hash-based storage. + /// + /// Sparse tables support arbitrary ID insertion order and provide flexibility + /// for non-contiguous ID ranges at the cost of higher memory overhead per entry. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{symbol::SymbolTable, newtype}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// let table = SymbolTable::::sparse(); + /// // Insertions can be in any order: 100, 5, 1000, ... + /// ``` + #[must_use] + pub fn sparse() -> Self { + Self { + inner: SymbolLookupInner::Sparse(FastHashMap::default()), + } + } + + /// Inserts a symbol associated with the given identifier. + /// + /// - For dense tables, the `id` must be sequential starting from 0. + /// - For gapped tables, any `id` value is accepted, and gaps will be filled with `None`. + /// - For sparse tables, any `id` value is accepted. + /// + /// If the `id` already exists in a gapped or sparse table, the previous symbol is replaced. + /// + /// # Panics + /// + /// Panics if this is a dense table and the `id` is not sequential (i.e., not equal + /// to the current length of the internal vector). 
+ /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// # let mut heap = Heap::new(); + /// # let symbol = heap.intern_symbol("example"); + /// let mut table = SymbolTable::::dense(); + /// table.insert(MyId::from_u32(0), symbol); // First insertion + /// table.insert(MyId::from_u32(1), symbol); // Sequential insertion + /// ``` + /// + /// Non-sequential insertions will panic in dense tables: + /// + /// ```should_panic + /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// # let mut heap = Heap::new(); + /// # let symbol = heap.intern_symbol("example"); + /// let mut table = SymbolTable::::dense(); + /// table.insert(MyId::from_u32(0), symbol); // First insertion + /// table.insert(MyId::from_u32(2), symbol); // Non-sequential insertion + /// ``` + pub fn insert(&mut self, id: I, symbol: Symbol<'heap>) { + match &mut self.inner { + SymbolLookupInner::Dense(vec) => { + assert_eq!( + id, + vec.bound(), + "insertions into dense symbol tables must be sequential and contiguous" + ); + + vec.push(symbol); + } + SymbolLookupInner::Gapped(vec) => { + vec.insert(id, symbol); + } + SymbolLookupInner::Sparse(map) => { + map.insert(id, symbol); + } + } + } + + /// Retrieves the symbol associated with the given identifier. + /// + /// Returns the [`Symbol`] if the `id` exists in the table, or [`None`] if + /// the `id` is not found or if the entry is a gap (in gapped tables). 
+ /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// # let mut heap = Heap::new(); + /// # let symbol = heap.intern_symbol("example"); + /// let mut table = SymbolTable::::sparse(); + /// table.insert(MyId::from_u32(42), symbol); + /// + /// assert_eq!(table.get(MyId::from_u32(42)), Some(symbol)); + /// assert_eq!(table.get(MyId::from_u32(99)), None); + /// ``` + pub fn get(&self, id: I) -> Option> { + match &self.inner { + SymbolLookupInner::Dense(vec) => vec.get(id).copied(), + SymbolLookupInner::Gapped(vec) => vec.get(id).copied().flatten(), + SymbolLookupInner::Sparse(map) => map.get(&id).copied(), + } + } +} + +impl<'heap, I> Index for SymbolLookup<'heap, I> +where + I: Id, +{ + type Output = Symbol<'heap>; + + fn index(&self, index: I) -> &Self::Output { + match &self.inner { + SymbolLookupInner::Dense(vec) => &vec[index], + SymbolLookupInner::Gapped(vec) => vec[index].as_ref().expect("index out of bounds"), + SymbolLookupInner::Sparse(map) => &map[&index], + } + } +} diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index 448ba80313f..e09f3da27b9 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -18,6 +18,7 @@ //! This encapsulation enables future optimizations such as string interning (either through //! the `string_interner` crate or a custom implementation) without requiring API changes. +mod lookup; mod repr; pub mod sym; mod sym2; @@ -28,11 +29,10 @@ use core::{ fmt::{self, Display, Formatter}, hash::{Hash, Hasher}, marker::PhantomData, - ptr, }; +pub use self::lookup::SymbolLookup; use self::repr::{ConstantSymbol, Repr}; -pub use self::table::SymbolTable; use crate::span::SpanId; /// A string-like value used throughout the HashQL compiler. 
diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index f5bcc2b1f3b..dacf6fb4416 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -57,7 +57,7 @@ use crate::heap::BumpAllocator; /// not the trailing bytes. All access must go through [`NonNull`] /// to preserve full allocation provenance. #[repr(C, align(2))] -struct RuntimeSymbol { +pub(crate) struct RuntimeSymbol { len: usize, data: [u8; 0], } @@ -80,7 +80,7 @@ impl RuntimeSymbol { /// # Panics /// /// Panics if allocation fails. - fn alloc(alloc: &A, value: &str) -> NonNull { + pub(crate) fn alloc(alloc: &A, value: &str) -> NonNull { let Ok(value) = Self::try_alloc(alloc, value) else { handle_alloc_error(Self::layout(value.len())) }; @@ -306,7 +306,7 @@ impl Repr { /// /// The pointer is stored directly with its tag bit set to 0 (which is a no-op /// since runtime allocations are already aligned). - fn runtime(symbol: NonNull) -> Self { + pub(crate) fn runtime(symbol: NonNull) -> Self { const { assert!(align_of::() >= Self::MIN_ALIGN); } diff --git a/libs/@local/hashql/core/src/symbol/sym2.rs b/libs/@local/hashql/core/src/symbol/sym2.rs index 6f06a574d9d..b8bfc2ef615 100644 --- a/libs/@local/hashql/core/src/symbol/sym2.rs +++ b/libs/@local/hashql/core/src/symbol/sym2.rs @@ -78,6 +78,10 @@ macro_rules! 
symbols { unsafe { map.insert_unique_unchecked($arm, $value.into_repr()); } )* } + + pub(crate) static LOOKUP: &[(&'static str, super::repr::Repr)] = &[ + $(($arm, $value.into_repr())),* + ]; }; (@lookup [$($arms:tt)*] [$tail:tt $(, $path:tt)*]; | $($rest:tt)*) => { symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index eabcaf8d0c1..4031dfe29c3 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -1,238 +1,62 @@ -use core::ops::Index; +use alloc::alloc::Global; +use core::{alloc::Allocator, hash::BuildHasher as _}; -use super::Symbol; -use crate::{ - collections::FastHashMap, - id::{Id, IdVec}, -}; +use foldhash::fast::RandomState; +use hashbrown::{HashTable, hash_table::Entry}; -#[derive(Debug)] -enum SymbolTableInner<'heap, I> { - Dense(IdVec>), - Gapped(IdVec>>), - Sparse(FastHashMap>), -} +use super::repr::{Repr, RuntimeSymbol}; +use crate::heap::BumpAllocator; -/// A mapping from identifiers to symbols optimized for different access patterns. -/// -/// [`SymbolTable`] provides efficient storage and retrieval of [`Symbol`] instances which are tied -/// to a specific identifier (which is any type that implements the [`Id`] trait). -/// -/// # Storage Strategies -/// -/// To accommodate different access patterns, [`SymbolTable`] supports three storage strategies: -/// -/// ## Dense Storage -/// -/// Created with [`SymbolTable::dense()`], this mode uses a [`Vec`] internally and requires -/// IDs to be inserted sequentially starting from 0. This provides optimal memory efficiency -/// and cache performance for contiguous ID ranges. -/// -/// ## Gapped Storage -/// -/// Created with [`SymbolTable::gapped()`], this mode uses a [`Vec`] of [`Option`] -/// internally and allows insertion at arbitrary indices. Unlike dense storage, gaps are allowed in -/// the ID sequence. 
This provides a balance between the memory efficiency of dense storage and the -/// flexibility of sparse storage, making it ideal for scenarios where most IDs are contiguous but -/// some gaps may exist. -/// -/// ## Sparse Storage -/// -/// Created with [`SymbolTable::sparse()`], this mode uses a [`FastHashMap`] internally and -/// supports arbitrary ID insertion order. This provides flexibility at the cost of higher -/// memory overhead per entry. -/// -/// # Examples -/// -/// ``` -/// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; -/// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); -/// # let mut heap = Heap::new(); -/// # let symbol = heap.intern_symbol("example"); -/// // Dense storage for sequential IDs -/// let mut dense_table = SymbolTable::::dense(); -/// dense_table.insert(MyId::from_u32(0), symbol); -/// assert_eq!(dense_table.get(MyId::from_u32(0)), Some(symbol)); -/// -/// // Gapped storage for mostly contiguous IDs with some gaps -/// let mut gapped_table = SymbolTable::::gapped(); -/// gapped_table.insert(MyId::from_u32(0), symbol); -/// gapped_table.insert(MyId::from_u32(5), symbol); // Gap at IDs 1-4 -/// assert_eq!(gapped_table.get(MyId::from_u32(0)), Some(symbol)); -/// assert_eq!(gapped_table.get(MyId::from_u32(2)), None); // Gap -/// assert_eq!(gapped_table.get(MyId::from_u32(5)), Some(symbol)); -/// -/// // Sparse storage for arbitrary IDs -/// let mut sparse_table = SymbolTable::::sparse(); -/// sparse_table.insert(MyId::from_u32(100), symbol); -/// assert_eq!(sparse_table.get(MyId::from_u32(100)), Some(symbol)); -/// sparse_table.insert(MyId::from_u32(5), symbol); -/// assert_eq!(sparse_table.get(MyId::from_u32(5)), Some(symbol)); -/// ``` #[derive(Debug)] -pub struct SymbolTable<'heap, I> { - inner: SymbolTableInner<'heap, I>, +struct SymbolTable { + inner: HashTable, + hasher: RandomState, } -impl<'heap, I> SymbolTable<'heap, I> -where - I: Id, -{ - /// Creates a new symbol table using dense vector-based 
storage. - /// - /// Dense tables require sequential ID insertion starting from 0 and provide - /// optimal memory efficiency and cache performance for contiguous ID ranges. - /// - /// # Examples - /// - /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::dense(); - /// // Insertions must be sequential: 0, 1, 2, ... - /// ``` - #[must_use] - pub const fn dense() -> Self { - Self { - inner: SymbolTableInner::Dense(IdVec::new()), - } +#[expect(unsafe_code)] +impl SymbolTable { + pub(crate) unsafe fn clear(&mut self) { + self.inner.clear(); } - /// Creates a new symbol table using gapped vector-based storage. - /// - /// Gapped tables allow insertion at arbitrary indices within a vector, automatically - /// filling gaps with `None` values. This provides better memory locality than sparse - /// tables while still allowing non-contiguous ID ranges. - /// - /// # Examples - /// - /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::gapped(); - /// // Insertions can have gaps: 0, 5, 3, 10, ... - /// ``` - #[must_use] - pub const fn gapped() -> Self { - Self { - inner: SymbolTableInner::Gapped(IdVec::new()), - } - } - - /// Creates a new symbol table using sparse hash-based storage. - /// - /// Sparse tables support arbitrary ID insertion order and provide flexibility - /// for non-contiguous ID ranges at the cost of higher memory overhead per entry. - /// - /// # Examples - /// - /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::sparse(); - /// // Insertions can be in any order: 100, 5, 1000, ... 
- /// ``` - #[must_use] - pub fn sparse() -> Self { - Self { - inner: SymbolTableInner::Sparse(FastHashMap::default()), - } - } + pub(crate) unsafe fn prime(&mut self) { + self.inner.reserve(super::sym2::LOOKUP.len(), |_| { + unreachable!("all entries have been cleared") + }); - /// Inserts a symbol associated with the given identifier. - /// - /// - For dense tables, the `id` must be sequential starting from 0. - /// - For gapped tables, any `id` value is accepted, and gaps will be filled with `None`. - /// - For sparse tables, any `id` value is accepted. - /// - /// If the `id` already exists in a gapped or sparse table, the previous symbol is replaced. - /// - /// # Panics - /// - /// Panics if this is a dense table and the `id` is not sequential (i.e., not equal - /// to the current length of the internal vector). - /// - /// # Examples - /// - /// ``` - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// # let mut heap = Heap::new(); - /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::dense(); - /// table.insert(MyId::from_u32(0), symbol); // First insertion - /// table.insert(MyId::from_u32(1), symbol); // Sequential insertion - /// ``` - /// - /// Non-sequential insertions will panic in dense tables: - /// - /// ```should_panic - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// # let mut heap = Heap::new(); - /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::dense(); - /// table.insert(MyId::from_u32(0), symbol); // First insertion - /// table.insert(MyId::from_u32(2), symbol); // Non-sequential insertion - /// ``` - pub fn insert(&mut self, id: I, symbol: Symbol<'heap>) { - match &mut self.inner { - SymbolTableInner::Dense(vec) => { - assert_eq!( - id, - vec.bound(), - "insertions into dense symbol tables 
must be sequential and contiguous" - ); + for &(name, value) in super::sym2::LOOKUP { + let hash = self.hasher.hash_one(name); - vec.push(symbol); - } - SymbolTableInner::Gapped(vec) => { - vec.insert(id, symbol); - } - SymbolTableInner::Sparse(map) => { - map.insert(id, symbol); - } + self.inner.insert_unique(hash, value, |_| { + unreachable!("capacity has been reserved beforehand") + }); } } - /// Retrieves the symbol associated with the given identifier. - /// - /// Returns the [`Symbol`] if the `id` exists in the table, or [`None`] if - /// the `id` is not found or if the entry is a gap (in gapped tables). - /// - /// # Examples - /// - /// ``` - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// # let mut heap = Heap::new(); - /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::sparse(); - /// table.insert(MyId::from_u32(42), symbol); - /// - /// assert_eq!(table.get(MyId::from_u32(42)), Some(symbol)); - /// assert_eq!(table.get(MyId::from_u32(99)), None); - /// ``` - pub fn get(&self, id: I) -> Option> { - match &self.inner { - SymbolTableInner::Dense(vec) => vec.get(id).copied(), - SymbolTableInner::Gapped(vec) => vec.get(id).copied().flatten(), - SymbolTableInner::Sparse(map) => map.get(&id).copied(), + pub(crate) unsafe fn reset(&mut self) { + unsafe { + self.clear(); + self.prime(); } } -} - -impl<'heap, I> Index for SymbolTable<'heap, I> -where - I: Id, -{ - type Output = Symbol<'heap>; - fn index(&self, index: I) -> &Self::Output { - match &self.inner { - SymbolTableInner::Dense(vec) => &vec[index], - SymbolTableInner::Gapped(vec) => vec[index].as_ref().expect("index out of bounds"), - SymbolTableInner::Sparse(map) => &map[&index], + pub(crate) unsafe fn intern(&mut self, alloc: &B, value: &str) -> Repr { + let hash = self.hasher.hash_one(value); + + // We hash against the string, therefore we must pull out the string 
representation, instead + // of hashing against the Repr directly, as that would lead to incorrect results. + // We're mapping string -> repr. But the string representation is already stored in the + // Repr. + match self.inner.entry( + hash, + |repr| unsafe { repr.as_str() } == value, + |repr| self.hasher.hash_one(unsafe { repr.as_str() }), + ) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let repr = Repr::runtime(RuntimeSymbol::alloc(alloc, value)); + *entry.insert(repr).get() + } } } } diff --git a/libs/@local/hashql/hir/src/context.rs b/libs/@local/hashql/hir/src/context.rs index 9d0582ee182..bfeca6b0aaf 100644 --- a/libs/@local/hashql/hir/src/context.rs +++ b/libs/@local/hashql/hir/src/context.rs @@ -1,4 +1,4 @@ -use hashql_core::{heap::Heap, id::IdCounter, module::ModuleRegistry, symbol::SymbolTable}; +use hashql_core::{heap::Heap, id::IdCounter, module::ModuleRegistry, symbol::SymbolLookup}; use crate::{ intern::Interner, @@ -6,18 +6,18 @@ use crate::{ node::{HirId, r#let::VarId}, }; -pub type BinderSymbolTable<'heap> = SymbolTable<'heap, VarId>; +pub type BinderSymbolLookup<'heap> = SymbolLookup<'heap, VarId>; #[derive(Debug)] pub struct SymbolRegistry<'heap> { - pub binder: BinderSymbolTable<'heap>, + pub binder: BinderSymbolLookup<'heap>, } impl SymbolRegistry<'_> { #[must_use] pub const fn new() -> Self { Self { - binder: BinderSymbolTable::dense(), + binder: BinderSymbolLookup::dense(), } } } From 89b2aed3e0c6b835691197472b2765f137e0df31 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 15:35:09 +0100 Subject: [PATCH 06/21] feat: checkpoint --- libs/@local/hashql/core/src/symbol/table.rs | 501 +++++++++++++++++++- 1 file changed, 497 insertions(+), 4 deletions(-) diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index 4031dfe29c3..2032e1d3ba1 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -1,3 
+1,41 @@ +//! String interning table for HashQL symbols. +//! +//! This module provides [`SymbolTable`], a hash-based interner that maps strings to their +//! canonical [`Repr`] representation. The table supports two kinds of symbols: +//! +//! - **Constant symbols**: Statically defined symbols from [`sym2::LOOKUP`]. Their [`Repr`] encodes +//! an index into the static [`sym2::SYMBOLS`] array (effectively `'static` lifetime). +//! +//! - **Runtime symbols**: Dynamically interned strings allocated on a bump allocator. Their +//! [`Repr`] holds a pointer to a [`RuntimeSymbol`] allocation. +//! +//! # Lifecycle and Epoch Coupling +//! +//! The `SymbolTable` is designed for epoch-based memory management where allocations are +//! made during a processing phase and then freed in bulk. The critical invariant is: +//! +//! **Runtime [`Repr`] values contain pointers to bump-allocated memory. When the bump +//! allocator resets, these pointers become dangling.** +//! +//! Therefore, the table must be reset **before** the bump allocator to prevent undefined +//! behavior from accessing dangling pointers during hash table operations. +//! +//! ## Correct Reset Ordering +//! +//! ```text +//! symbol_table.reset(); // Clear runtime Reprs, restore constants +//! heap.reset(); // Now safe: no dangling pointers in the table +//! ``` +//! +//! # Priming +//! +//! Calling [`SymbolTable::prime`] populates the table with predefined symbols from +//! [`sym2::LOOKUP`]. This ensures that interning a predefined string returns its +//! canonical constant [`Repr`] rather than allocating a runtime symbol. +//! +//! [`sym2::LOOKUP`]: super::sym2::LOOKUP +//! [`sym2::SYMBOLS`]: super::sym2::SYMBOLS + use alloc::alloc::Global; use core::{alloc::Allocator, hash::BuildHasher as _}; @@ -7,49 +45,210 @@ use hashbrown::{HashTable, hash_table::Entry}; use super::repr::{Repr, RuntimeSymbol}; use crate::heap::BumpAllocator; +/// A string interning table mapping `&str` to canonical [`Repr`] values. 
+/// +/// The table uses a [`HashTable`] with string-based hashing and equality. Two symbols +/// with identical string content will always map to the same [`Repr`]. +/// +/// # Safety Contract +/// +/// This type contains unsafe methods because runtime [`Repr`] values hold raw pointers +/// to bump-allocated memory. The caller must ensure: +/// +/// 1. **Epoch coupling**: [`reset`](Self::reset) must be called before resetting the bump allocator +/// that backs runtime symbols. Failure to do so causes undefined behavior when the table +/// attempts to hash or compare entries with dangling pointers. +/// +/// 2. **Allocator consistency**: The same bump allocator instance must be used for all +/// [`intern`](Self::intern) calls on this table. +/// +/// 3. **Allocator lifetime**: The bump allocator passed to [`intern`](Self::intern) must remain +/// live for as long as the table is in use (i.e., until [`reset`](Self::reset) is called). +/// +/// 4. **Priming precondition**: [`prime`](Self::prime) must only be called on an empty table +/// (typically after [`clear`](Self::clear)). +/// +/// # Drop Safety +/// +/// Dropping the `SymbolTable` after the bump allocator has been reset is **safe**. +/// [`Repr`] has no [`Drop`] implementation, so dropping the table does not dereference +/// any runtime symbol pointers. Only *using* the table (e.g., calling [`intern`](Self::intern)) +/// after the allocator reset causes undefined behavior. +/// +/// Note: This assumes the [`HashTable`]'s own allocator `A` (used for bucket storage) is +/// still valid. With the default `A = Global`, this is always the case. #[derive(Debug)] struct SymbolTable { inner: HashTable, hasher: RandomState, } +impl SymbolTable { + /// Creates a new, empty symbol table using the global allocator. + /// + /// The table is not primed. Call [`prime`](Self::prime) to populate it with + /// predefined symbols before use. 
+ fn new() -> Self { + Self { + inner: HashTable::new(), + hasher: RandomState::default(), + } + } +} + +impl SymbolTable { + /// Creates a new, empty symbol table using the given allocator. + /// + /// The table is not primed. Call [`prime`](Self::prime) to populate it with + /// predefined symbols before use. + fn new_in(alloc: A) -> Self { + Self { + inner: HashTable::new_in(alloc), + hasher: RandomState::default(), + } + } + + /// Returns the number of symbols currently in the table. + fn len(&self) -> usize { + self.inner.len() + } + + /// Returns `true` if the table contains no symbols. + fn is_empty(&self) -> bool { + self.inner.is_empty() + } +} + #[expect(unsafe_code)] impl SymbolTable { + /// Removes all entries from the table. + /// + /// After calling this method, the table is empty and must be primed before use. + /// + /// # Safety + /// + /// The caller must call [`prime`](Self::prime) before any subsequent [`intern`](Self::intern) + /// calls. Without priming, interning a predefined symbol (e.g., `"and"`) would allocate + /// a new runtime symbol instead of returning the canonical constant [`Repr`] that matches + /// the static symbols in [`sym`](super::sym). This would break the invariant that + /// predefined symbols intern to their canonical constant representations. pub(crate) unsafe fn clear(&mut self) { self.inner.clear(); } + /// Populates the table with predefined symbols from [`sym2::LOOKUP`]. + /// + /// After priming, interning any predefined symbol string will return its canonical + /// constant [`Repr`] rather than allocating a new runtime symbol. + /// + /// # Preconditions + /// + /// The table must be empty. This is typically ensured by calling [`clear`](Self::clear) + /// beforehand, or by using a freshly constructed table. + /// + /// # Safety + /// + /// The caller must ensure that the table is empty before calling this method. 
+ /// + /// [`sym2::LOOKUP`]: super::sym2::LOOKUP pub(crate) unsafe fn prime(&mut self) { self.inner.reserve(super::sym2::LOOKUP.len(), |_| { - unreachable!("all entries have been cleared") + unreachable!("prime() requires an empty table; hasher callback should not be invoked") }); for &(name, value) in super::sym2::LOOKUP { let hash = self.hasher.hash_one(name); self.inner.insert_unique(hash, value, |_| { - unreachable!("capacity has been reserved beforehand") + unreachable!("capacity was pre-reserved; hasher callback should not be invoked") }); } } + /// Resets the table to its initial primed state. + /// + /// This is equivalent to calling [`clear`](Self::clear) followed by [`prime`](Self::prime). + /// After resetting, the table contains only the predefined constant symbols. + /// + /// # Safety + /// + /// **This method must be called before resetting the bump allocator** that backs any + /// runtime symbols previously interned into this table. The reset ordering is: + /// + /// ```text + /// symbol_table.reset(); // ← First: clear dangling runtime Reprs + /// heap.reset(); // ← Second: now safe to invalidate allocations + /// ``` + /// + /// Violating this ordering causes undefined behavior: the bump allocator reset + /// invalidates runtime symbol pointers, and subsequent table operations (including + /// this method's `clear()` + `prime()` sequence, or future `intern()` calls) may + /// attempt to dereference those dangling pointers. + /// + /// # Invariants Restored + /// + /// After this method returns: + /// - All runtime symbols are removed from the table. + /// - All constant symbols from [`sym2::LOOKUP`] are present. + /// - The table is ready for a new epoch of interning. + /// + /// [`sym2::LOOKUP`]: super::sym2::LOOKUP pub(crate) unsafe fn reset(&mut self) { + // SAFETY: correct order of operations is present. unsafe { self.clear(); self.prime(); } } + /// Interns a string, returning its canonical [`Repr`]. 
+ /// + /// If the string has already been interned (either as a predefined constant or a + /// previously interned runtime symbol), returns the existing [`Repr`]. Otherwise, + /// allocates a new [`RuntimeSymbol`] on the provided bump allocator and inserts it. + /// + /// # Returns + /// + /// The canonical [`Repr`] for `value`. Interning the same string multiple times + /// is idempotent—subsequent calls return the same [`Repr`]. + /// + /// # Safety + /// + /// The caller must ensure: + /// + /// 1. **No dangling pointers**: The table must not contain dangling runtime [`Repr`] values. + /// This means [`reset`](Self::reset) must have been called before any preceding bump + /// allocator reset. + /// + /// 2. **Allocator consistency**: The same allocator instance must be used for all `intern()` + /// calls on this table. Using different allocators would result in runtime symbols from + /// multiple allocators, and resetting one would leave dangling pointers from the other. + /// + /// 3. **Allocator lifetime**: The allocator must remain live for the lifetime of this symbol + /// table, or until [`reset`](Self::reset) is called. All runtime [`Repr`] values in the + /// table point into the allocator's memory and are dereferenced during table operations. + /// + /// # Implementation Notes + /// + /// The table hashes and compares entries by their string content, not by [`Repr`] + /// identity. This means: + /// - Equality: `repr.as_str() == value` + /// - Hashing: `hash(repr.as_str())` + /// + /// Both operations dereference runtime [`Repr`] pointers, which is why the caller + /// must ensure no dangling pointers exist in the table. pub(crate) unsafe fn intern(&mut self, alloc: &B, value: &str) -> Repr { let hash = self.hasher.hash_one(value); - // We hash against the string, therefore we must pull out the string representation, instead - // of hashing against the Repr directly, as that would lead to incorrect results. 
+ // We hash against the string, therefore we must pull out the string representation, + // instead of hashing against the Repr directly, as that would lead to incorrect results. // We're mapping string -> repr. But the string representation is already stored in the // Repr. match self.inner.entry( hash, + // SAFETY: Caller guarantees no dangling runtime pointers in the table. |repr| unsafe { repr.as_str() } == value, + // SAFETY: Same as above; this is called during rehashing. |repr| self.hasher.hash_one(unsafe { repr.as_str() }), ) { Entry::Occupied(entry) => *entry.get(), @@ -60,3 +259,297 @@ impl SymbolTable { } } } + +#[cfg(test)] +mod tests { + #![expect(unsafe_code, clippy::non_ascii_literal)] + + use super::{super::sym2, SymbolTable}; + use crate::heap::Scratch; + + #[test] + fn new_table_is_empty() { + let table = SymbolTable::new(); + assert!(table.is_empty()); + assert_eq!(table.len(), 0); + } + + #[test] + fn prime_populates_table_with_lookup_entries() { + let mut table = SymbolTable::new(); + // SAFETY: Table is empty, no dangling pointers. + unsafe { + table.prime(); + } + + assert_eq!(table.len(), sym2::LOOKUP.len()); + assert!(!table.is_empty()); + } + + #[test] + fn clear_removes_all_entries() { + let mut table = SymbolTable::new(); + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + assert!(!table.is_empty()); + + // SAFETY: We will not call intern() after this without priming first. + unsafe { + table.clear(); + } + assert!(table.is_empty()); + assert_eq!(table.len(), 0); + } + + #[test] + fn reset_restores_primed_state() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + let initial_len = table.len(); + + // SAFETY: Table is primed, scratch is live. + unsafe { + table.intern(&scratch, "user_defined_symbol"); + }; + assert_eq!(table.len(), initial_len + 1); + + // SAFETY: Scratch has not been reset, so runtime pointers are valid. 
+ unsafe { + table.reset(); + }; + assert_eq!(table.len(), initial_len); + } + + #[test] + fn intern_predefined_symbol_returns_constant_repr() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // Intern a predefined symbol (e.g., "and" from LOOKUP). + // The returned Repr should match the one in LOOKUP. + for &(name, expected_repr) in sym2::LOOKUP { + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, name) }; + assert_eq!( + repr, expected_repr, + "predefined symbol '{name}' should return constant Repr" + ); + } + } + + #[test] + fn intern_is_idempotent() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + + // SAFETY: Table is primed, scratch is live. + let repr1 = unsafe { table.intern(&scratch, "my_custom_symbol") }; + // SAFETY: Table is primed, scratch is live. + let repr2 = unsafe { table.intern(&scratch, "my_custom_symbol") }; + + assert_eq!(repr1, repr2); + // SAFETY: scratch is live. + assert_eq!(unsafe { repr1.as_str() }, "my_custom_symbol"); + } + + #[test] + fn intern_different_strings_returns_different_reprs() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // SAFETY: Table is primed, scratch is live. + let repr_foo = unsafe { table.intern(&scratch, "foo_unique") }; + // SAFETY: Table is primed, scratch is live. + let repr_bar = unsafe { table.intern(&scratch, "bar_unique") }; + + assert_ne!(repr_foo, repr_bar); + } + + #[test] + fn intern_empty_string() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, "") }; + + // SAFETY: scratch is live. 
+ assert_eq!(unsafe { repr.as_str() }, ""); + } + + #[test] + fn intern_unicode_string() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, "日本語 🎉 émojis") }; + + // SAFETY: scratch is live. + assert_eq!(unsafe { repr.as_str() }, "日本語 🎉 émojis"); + } + + #[test] + fn intern_long_string() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + let long_string = "a".repeat(10_000); + + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, &long_string) }; + + // SAFETY: scratch is live. + assert_eq!(unsafe { repr.as_str() }, long_string); + } + + #[test] + fn constants_survive_reset() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + + // Get a constant Repr by interning a predefined symbol. + let (name, expected_repr) = sym2::LOOKUP[0]; + // SAFETY: Table is primed, scratch is live. + let repr_before = unsafe { table.intern(&scratch, name) }; + assert_eq!(repr_before, expected_repr); + + // SAFETY: Scratch has not been reset. + unsafe { + table.reset(); + }; + + // SAFETY: Table is primed, scratch is live. + let repr_after = unsafe { table.intern(&scratch, name) }; + + // Constants should be identical across resets. + assert_eq!(repr_before, repr_after); + } + + #[test] + fn runtime_symbols_cleared_on_reset() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + let primed_len = table.len(); + + // Intern some runtime symbols. + // SAFETY: Table is primed, scratch is live. 
+ unsafe { + table.intern(&scratch, "runtime_1"); + table.intern(&scratch, "runtime_2"); + table.intern(&scratch, "runtime_3"); + } + assert_eq!(table.len(), primed_len + 3); + + // SAFETY: Scratch has not been reset. + unsafe { + table.reset(); + }; + + // Runtime symbols should be gone, only constants remain. + assert_eq!(table.len(), primed_len); + } + + #[test] + fn multiple_intern_operations_grow_table() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + let initial_len = table.len(); + + // SAFETY: Table is primed, scratch is live. + unsafe { + for i in 0..100 { + table.intern(&scratch, &format!("symbol_{i}")); + } + } + + assert_eq!(table.len(), initial_len + 100); + } + + /// Test that dropping a `SymbolTable` after the backing allocator has been reset + /// does not cause undefined behavior. + /// + /// This test is designed to be run under Miri to verify drop safety. + /// The key invariant: `Repr` has no `Drop` impl, so dropping the table + /// does not dereference any (now-dangling) runtime symbol pointers. + #[test] + fn drop_after_allocator_reset_is_safe() { + let scratch = Scratch::new(); + let mut table = SymbolTable::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + + // Intern several runtime symbols to ensure we have dangling pointers after reset. + // SAFETY: Table is primed, scratch is live. + unsafe { + table.intern(&scratch, "runtime_symbol_1"); + table.intern(&scratch, "runtime_symbol_2"); + table.intern(&scratch, "another_runtime_symbol"); + } + + // Drop the allocator FIRST - this invalidates all runtime symbol pointers. + // The table now contains dangling pointers, but we will NOT use it. + drop(scratch); + + // Drop the table. 
This should NOT cause UB because: + // - Repr has no Drop impl (it's Copy) + // - HashTable::drop doesn't hash/compare elements, just drops them in-place + // - Dropping a Repr is a no-op that doesn't dereference the pointer + drop(table); + } +} From 3aa5c4759b743d10ecbc764c48a8a46e585a631f Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 15:47:55 +0100 Subject: [PATCH 07/21] feat: checkpoint --- libs/@local/hashql/core/src/heap/mod.rs | 79 +++-- libs/@local/hashql/core/src/symbol/mod.rs | 57 ++-- libs/@local/hashql/core/src/symbol/repr.rs | 30 +- libs/@local/hashql/core/src/symbol/sym.rs | 314 +++++++++++--------- libs/@local/hashql/core/src/symbol/sym2.rs | 301 ------------------- libs/@local/hashql/core/src/symbol/table.rs | 20 +- 6 files changed, 273 insertions(+), 528 deletions(-) delete mode 100644 libs/@local/hashql/core/src/symbol/sym2.rs diff --git a/libs/@local/hashql/core/src/heap/mod.rs b/libs/@local/hashql/core/src/heap/mod.rs index 23c30e7c6d1..188ed3d74d2 100644 --- a/libs/@local/hashql/core/src/heap/mod.rs +++ b/libs/@local/hashql/core/src/heap/mod.rs @@ -117,7 +117,7 @@ pub use self::{ }; use crate::{ collections::{FastHashSet, fast_hash_set_with_capacity}, - symbol::{Symbol, sym::TABLES}, + symbol::{Symbol, SymbolTable}, }; /// A boxed value allocated on the `Heap`. @@ -152,11 +152,7 @@ pub type HashMap<'heap, K, V, S = foldhash::fast::RandomState> = #[derive(Debug)] pub struct Heap { inner: Allocator, - // Interned strings stored as `&'static str` for implementation convenience. - // SAFETY: The `'static` is a lie. 
These point into arena memory and are safe because: - // - All access goes through `Symbol<'heap>`, bounding the effective lifetime - // - This set is cleared before `inner.reset()` is called - strings: Mutex>, + strings: Mutex, } impl Heap { @@ -178,7 +174,7 @@ impl Heap { pub fn uninitialized() -> Self { Self { inner: Allocator::new(), - strings: Mutex::default(), + strings: Mutex::new(SymbolTable::new()), } } @@ -201,7 +197,10 @@ impl Heap { "heap has already been primed or has interned symbols" ); - Self::prime_symbols(strings); + // SAFETY: We have verified that the symbol table is empty. + unsafe { + strings.prime(); + } } /// Creates a new heap. @@ -214,12 +213,16 @@ impl Heap { #[must_use] #[inline] pub fn new() -> Self { - let mut strings = fast_hash_set_with_capacity(0); - Self::prime_symbols(&mut strings); + let mut table = SymbolTable::new(); + + // SAFETY: fresh symbol table is empty + unsafe { + table.prime(); + } Self { inner: Allocator::new(), - strings: Mutex::new(strings), + strings: Mutex::new(table), } } @@ -232,12 +235,16 @@ impl Heap { #[must_use] #[inline] pub fn with_capacity(capacity: usize) -> Self { - let mut strings = fast_hash_set_with_capacity(0); - Self::prime_symbols(&mut strings); + let mut table = SymbolTable::new(); + + // SAFETY: fresh symbol table is empty + unsafe { + table.prime(); + } Self { inner: Allocator::with_capacity(capacity), - strings: Mutex::new(strings), + strings: Mutex::new(table), } } @@ -252,16 +259,6 @@ impl Heap { self.inner.alloc_with(|| value) } - fn prime_symbols(strings: &mut FastHashSet<&'static str>) { - strings.reserve(TABLES.iter().map(|table| table.len()).sum()); - - for &table in TABLES { - for symbol in table { - assert!(strings.insert(symbol.as_str())); - } - } - } - /// Interns a string symbol, returning a reference to the interned value. 
/// /// If the string has already been interned, returns the existing [`Symbol`] pointing @@ -277,22 +274,16 @@ impl Heap { pub fn intern_symbol<'this>(&'this self, value: &str) -> Symbol<'this> { let mut strings = self.strings.lock().expect("lock should not be poisoned"); - if let Some(&string) = strings.get(value) { - return Symbol::new_unchecked(string); - } - - let string = &*value.transfer_into(self); - - // SAFETY: The `'static` lifetime is a lie to enable HashSet storage. - // Sound because: (1) external access is through `Symbol<'this>`, (2) strings - // are cleared before arena reset, (3) `reset()` requires `&mut self`. - #[expect(unsafe_code)] - let string: &'static str = unsafe { &*ptr::from_ref::(string) }; + // SAFETY: `SymbolTable::intern` requires: + // 1. No dangling pointers: The table is reset before the arena in `Heap::reset`. + // 2. Allocator consistency: We always pass `self` as the allocator. + // 3. Allocator lifetime: `self` outlives the returned `Repr`. + let repr = unsafe { strings.intern(self, value) }; - strings.insert(string); - drop(strings); - - Symbol::new_unchecked(string) + // SAFETY: The `Repr` was just interned with `self` as the allocator, so it is + // valid for `'this`. Runtime symbols point into `self.inner`, and constant + // symbols have static lifetime. + unsafe { Symbol::from_repr(repr) } } } @@ -347,12 +338,14 @@ impl ResetAllocator for Heap { /// Panics if the internal mutex is poisoned. #[inline] fn reset(&mut self) { - // IMPORTANT: Clear strings BEFORE resetting the arena to prevent dangling references. - // The HashSet stores `&'static str` that actually point into arena memory. { let mut strings = self.strings.lock().expect("lock should not be poisoned"); - strings.clear(); - Self::prime_symbols(&mut strings); + + // SAFETY: The symbol table is reset before the arena, so no dangling references exist. 
+ unsafe { + strings.reset(); + }; + drop(strings); } diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index e09f3da27b9..e50d920bf82 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -21,7 +21,6 @@ mod lookup; mod repr; pub mod sym; -mod sym2; mod table; use core::{ @@ -33,6 +32,7 @@ use core::{ pub use self::lookup::SymbolLookup; use self::repr::{ConstantSymbol, Repr}; +pub(crate) use self::table::SymbolTable; use crate::span::SpanId; /// A string-like value used throughout the HashQL compiler. @@ -55,29 +55,29 @@ pub struct Symbol<'heap> { #[expect(unsafe_code)] impl<'heap> Symbol<'heap> { - /// Creates a new interned symbol from a string slice. - /// - /// The caller must ensure that the string is unique and interned. - pub(crate) const fn new_unchecked(string: &'heap str) -> Self { + #[inline] + const fn new_constant_unchecked(index: usize) -> Self { Symbol { - repr: Repr::constant(ConstantSymbol::new_unchecked(0)), + repr: Repr::constant(ConstantSymbol::new_unchecked(index)), _marker: PhantomData, } - // unimplemented!() } - const fn new_constant_unchecked(index: usize) -> Self { + #[inline] + pub(crate) const unsafe fn from_repr(repr: Repr) -> Self { Symbol { - repr: Repr::constant(ConstantSymbol::new_unchecked(index)), + repr, _marker: PhantomData, } } + #[inline] const fn into_repr(self) -> Repr { self.repr } #[must_use] + #[inline] pub fn as_str(&self) -> &str { // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. unsafe { self.repr.as_str() } @@ -89,21 +89,25 @@ impl<'heap> Symbol<'heap> { /// instead of the symbol itself, somewhat circumventing the protections given to the symbol /// itself. Any unwrapped type should be considered no longer unique and interned. 
#[must_use] - pub fn unwrap(&self) -> &'heap str { + #[inline] + pub fn unwrap(self) -> &'heap str { // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. unsafe { self.repr.as_str() } } #[must_use] - pub const fn as_bytes(&self) -> &[u8] { - unimplemented!() - // self.0.as_bytes() + #[inline] + pub fn as_bytes(&self) -> &[u8] { + // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. + unsafe { self.repr.as_bytes() } } #[must_use] + #[inline] pub fn demangle(self) -> &'heap str { - unimplemented!() - // self.0.rsplit_once(':').map_or(self.0, |(name, _)| name) + let value = self.unwrap(); + + value.rsplit_once(':').map_or(value, |(name, _)| name) } } @@ -117,8 +121,7 @@ impl AsRef for Symbol<'_> { impl PartialEq for Symbol<'_> { fn eq(&self, other: &Self) -> bool { // Pointer equality implies string equality (due to the unique contents assumption) - // ptr::eq(self.0, other.0) - unimplemented!() + self.repr == other.repr } } @@ -134,27 +137,24 @@ impl Ord for Symbol<'_> { fn cmp(&self, other: &Self) -> Ordering { // Pointer equality implies string equality (due to the unique contents assumption), but if // not the same the contents must be compared. 
- // if self == other { - // Ordering::Equal - // } else { - // self.0.cmp(other.0) - // } - unimplemented!() + if self == other { + Ordering::Equal + } else { + self.as_str().cmp(other.as_str()) + } } } impl Hash for Symbol<'_> { fn hash(&self, state: &mut H) { // Pointer hashing is sufficient (due to the unique contents assumption) - // ptr::hash(self.0, state); - unimplemented!() + Hash::hash(&self.repr, state); } } impl Display for Symbol<'_> { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { - unimplemented!() - // Display::fmt(self.0, fmt) + Display::fmt(self.as_str(), fmt) } } @@ -314,7 +314,6 @@ impl AsRef for Ident<'_> { impl Display for Ident<'_> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - unimplemented!() - // Display::fmt(&self.value.0, fmt) + Display::fmt(&self.value.as_str(), fmt) } } diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index dacf6fb4416..1a118cca1f4 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -1,4 +1,3 @@ -#![expect(unsafe_code)] //! Compact symbol representation using tagged pointers. //! //! This module provides [`Repr`], a single-word representation for symbols that can be either: @@ -25,6 +24,7 @@ //! full allocation provenance. Creating `&RuntimeSymbol` would narrow provenance to just the //! header, causing undefined behavior when accessing the trailing inline bytes under strict //! provenance / Stacked Borrows. +#![expect(unsafe_code)] use alloc::alloc::handle_alloc_error; use core::{ @@ -34,7 +34,7 @@ use core::{ ptr::{self, NonNull}, }; -use super::sym2::SYMBOLS; +use super::sym::SYMBOLS; use crate::heap::BumpAllocator; /// Header for a runtime-allocated symbol with inline string data. @@ -186,6 +186,16 @@ impl ConstantSymbol { // SAFETY: Caller guarantees the index is in bounds. 
unsafe { *SYMBOLS.as_ptr().add(self.0) } } + + /// Returns the byte slice for this constant symbol without bounds checking. + /// + /// # Safety + /// + /// The index must be within bounds of [`SYMBOLS`]. + const unsafe fn as_bytes_unchecked(self) -> &'static [u8] { + // SAFETY: Caller guarantees the index is in bounds of `SYMBOLS`. + unsafe { self.as_str_unchecked().as_bytes() } + } } /// A compact, single-word representation for symbols. @@ -276,6 +286,22 @@ impl Repr { } } + /// Returns the byte content of this symbol. + /// + /// # Safety + /// + /// - For runtime symbols: the allocation must remain live for lifetime `'str`. + /// - The returned bytes must not be mutated for lifetime `'str`. + pub(crate) unsafe fn as_bytes<'str>(self) -> &'str [u8] { + if self.tag() == Self::TAG_RUNTIME { + // SAFETY: Caller guarantees the allocation is live for 'str. + unsafe { RuntimeSymbol::as_bytes(self.as_runtime_symbol()) } + } else { + // SAFETY: Constant symbols return &'static str, which coerces to &'str. + unsafe { self.as_constant_symbol().as_bytes_unchecked() } + } + } + /// Creates a `Repr` for a constant symbol. /// /// The index is encoded directly in the pointer bits (shifted to make room for the tag). diff --git a/libs/@local/hashql/core/src/symbol/sym.rs b/libs/@local/hashql/core/src/symbol/sym.rs index 69f51aeaa21..b8bfc2ef615 100644 --- a/libs/@local/hashql/core/src/symbol/sym.rs +++ b/libs/@local/hashql/core/src/symbol/sym.rs @@ -1,66 +1,113 @@ -//! This module defines a collection of static symbol constants used throughout the codebase. -//! -//! # Usage -//! -//! These symbols should only ever be imported with the `sym` prefix to avoid naming conflicts -//! and maintain clarity about where the symbols are defined. For example: -//! -//! ```rust -//! use hashql_core::symbol::sym; -//! -//! // Correct usage: -//! let add_symbol = sym::lexical::add; -//! let asterisk = sym::symbol::asterisk; -//! -//! // Incorrect usage (avoid): -//! 
// use crate::symbol::sym::lexical::*; -//! ``` -//! -//! These symbols provide pointer equality guarantees when interned from a `Heap`, -//! which allows for efficient symbol comparison operations. #![expect(non_upper_case_globals, clippy::min_ident_chars)] use super::Symbol; -/// Macro for defining groups of static symbol constants. -/// -/// This macro creates modules containing static `Symbol` instances and -/// generates tables that group these symbols for efficient lookup. -/// -/// The macro supports several forms: -/// - Basic symbol: uses the identifier name as the symbol value -/// - Custom symbol: allows specifying a custom string value with the `name: "value"` syntax -/// - Special handling for Rust keywords using the `r#` prefix -/// -/// For each symbol group, this macro also creates a corresponding table of references -/// to all symbols in that group. macro_rules! symbols { - (@sym) => {}; - (@sym $name:ident $(, $($rest:tt)*)?) => { - pub static $name: super::Symbol<'static> = super::Symbol::new_unchecked(stringify!($name)); - $(symbols!(@sym $($rest)*);)? + (@strings [$($acc:tt)*];) => { + pub(crate) static SYMBOLS: &[&str] = &[ + $($acc),* + ]; }; - (@sym $name:ident : $value:literal $(, $($rest:tt)*)?) => { - pub static $name: super::Symbol<'static> = super::Symbol::new_unchecked($value); - $(symbols!(@sym $($rest)*);)? + (@strings [$($acc:tt)*]; , $($rest:tt)*) => { + symbols!(@strings [$($acc)*]; $($rest)*); }; - (@table $module:ident $table:ident #($($name:ident)*)) => { - const $table: &[&Symbol<'static>] = &[ - $(&$module::$name),* - ]; + (@strings [$($acc:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)*]; $($inner)* $(, $($rest)*)?); + }; + (@strings [$($acc:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)* $value]; $($($rest)*)?); + }; + (@strings [$($acc:tt)*]; $name:ident $(, $($rest:tt)*)?) 
=> { + symbols!(@strings [$($acc)* (stringify!($name))]; $($($rest)*)?); + }; + + (@consts @cont [$($count:tt)*] [$($next:tt)*];) => { + symbols!(@consts [$($count)*]; $($next)*); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; , $($rest:tt)*) => { + symbols!(@consts @cont [$($count)*] [$($next)*]; $($rest)*); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); }; - (@table $module:ident $table:ident #($($acc:tt)*) $name:ident $(: $value:literal)? $(, $($rest:tt)*)?) => { - symbols!(@table $module $table #($($acc)* $name) $($($rest)*)?); + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*];; $($inner)* $(, $($rest)*)?); }; - ($module:ident; $table:ident; $($items:tt)*) => { + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); + }; + + (@consts [$($count:tt)*];) => {}; + (@consts [$($count:tt)*]; , $($rest:tt)*) => { + symbols!(@consts [$($count)*]; $($rest)*); + }; + (@consts [$($count:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + const _: () = { assert!(SYMBOLS[${count($count)}] == $value) }; + pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); + symbols!(@consts [$($count)* ()]; $($($rest)*)?); + }; + (@consts [$($count:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { pub mod $module { - symbols!(@sym $($items)*); + use super::*; + + symbols!(@consts [$($count)*]; $($inner)*); + } + + symbols!(@consts @cont [$($count)*] [$($($rest)*)?]; $($inner)*); + }; + (@consts [$($count:tt)*]; $name:ident $(, $($rest:tt)*)?) 
=> { + const _: () = { assert!(SYMBOLS[${count($count)}] == stringify!($name)) }; + pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); + symbols!(@consts [$($count)* ()]; $($($rest)*)?); + }; + + (@path [] [$($path:ident)*];) => { + $($path)::* + }; + (@path [$next:tt $($rest:tt)*] [$($path:tt)*];) => { + symbols!(@path [$($rest)*] [$next $($path)*];) + }; + + (@lookup [$(, $arm:expr => $value:expr)*] [$($path:tt),*];) => { + #[expect(unsafe_code)] + pub(crate) fn prime(map: &mut hashbrown::HashMap<&'static str, super::repr::Repr, S, A>) { + debug_assert!(map.is_empty()); + map.reserve(SYMBOLS.len()); + + $( + // SAFETY: The declarative macro guarantees that the symbol is unique. + unsafe { map.insert_unique_unchecked($arm, $value.into_repr()); } + )* } - symbols!(@table $module $table #() $($items)*); + pub(crate) static LOOKUP: &[(&'static str, super::repr::Repr)] = &[ + $(($arm, $value.into_repr())),* + ]; + }; + (@lookup [$($arms:tt)*] [$tail:tt $(, $path:tt)*]; | $($rest:tt)*) => { + symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; , $($rest:tt)*) => { + symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@lookup [$($arms)*, $value => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@lookup [$($arms)*] [$module $(, $path)*]; $($inner)* ,| $($($rest)*)?); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident $(, $($rest:tt)*)?) 
=> { + symbols!(@lookup [$($arms)*, stringify!($name) => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); + }; + + (@table; $($items:tt)*) => { + symbols!(@strings []; $($items)*); + symbols!(@consts []; $($items)*); + symbols!(@lookup [] [self]; $($items)*); }; } -symbols![lexical; LEXICAL; +symbols! {@table; + // [tidy] sort alphabetically start access, add, and, @@ -85,6 +132,7 @@ symbols![lexical; LEXICAL; Dict, div, draft_id, + dummy: "", E, edition, edition_id, @@ -168,106 +216,86 @@ symbols![lexical; LEXICAL; Url, vectors, web_id, -]; - -// Internal names are non user constructible -symbols![internal; INTERNAL; - ClosureEnv: "'" -]; - -symbols![digit; DIGITS; - zero: "0", - one: "1", - two: "2", - three: "3", - four: "4", - five: "5", - six: "6", - seven: "7", - eight: "8", - nine: "9", -]; - -symbols![symbol; SYMBOLS; - add: "+", - ampersand: "&", - and: "&&", - arrow: "->", - arrow_head: "|>", - assign: "=", - asterisk: "*", - backets: "[]", - bit_shl: "<<", - bit_shr: ">>", - caret: "^", - colon: ":", - colon_colon: "::", - comma: ",", - dollar: "$", - dollar_question_mark: "$?", - dot: ".", - eq: "==", - exclamation_mark: "!", - gt: ">", - gte: ">=", - lt: "<", - lte: "<=", - ne: "!=", - or: "||", - pipe: "|", - question_mark: "?", - slash: "/", - sub: "-", - tilde: "~", -]; - -symbols![path; PATHS; - option: "::core::option::Option", - some: "::core::option::Some", - none: "::core::option::None", - graph_head_entities: "::graph::head::entities", - graph_body_filter: "::graph::body::filter", - graph_tail_collect: "::graph::tail::collect", - Entity: "::graph::types::knowledge::entity::Entity" -]; + // [tidy] sort alphabetically end -pub(crate) const TABLES: &[&[&Symbol<'static>]] = &[LEXICAL, DIGITS, SYMBOLS, PATHS, INTERNAL]; + internal: { + ClosureEnv: "'" + }, -// #[cfg(test)] -// mod test { -// use core::ptr; + symbol: { + // [tidy] sort alphabetically start + ampamp: "&&", + ampersand: "&", + arrow: "->", + arrow_head: "|>", + 
asterisk: "*", + exclamation: "!", + excleq: "!=", + brackets: "[]", + caret: "^", + colon: ":", + colon_colon: "::", + comma: ",", + dollar: "$", + dollar_question_mark: "$?", + dot: ".", + eq: "=", + eqeq: "==", + gt: ">", + gteq: ">=", + gtgt: ">>", + lt: "<", + lteq: "<=", + ltlt: "<<", + minus: "-", + pipepipe: "||", + pipe: "|", + plus: "+", + question_mark: "?", + slash: "/", + tilde: "~", + // [tidy] sort alphabetically end + }, -// use super::TABLES; -// use crate::{ -// heap::{Heap, ResetAllocator as _}, -// symbol::sym, -// }; + digit: { + zero: "0", + one: "1", + two: "2", + three: "3", + four: "4", + five: "5", + six: "6", + seven: "7", + eight: "8", + nine: "9", + }, -// #[test] -// fn pointer_equality_from_heap() { -// let mut heap = Heap::new(); - -// let mul_heap = heap.intern_symbol("*"); -// let mul_sym = sym::symbol::asterisk; + path: { + // [tidy] sort alphabetically start + option: "::core::option::Option", + some: "::core::option::Some", + none: "::core::option::None", + graph_head_entities: "::graph::head::entities", + graph_body_filter: "::graph::body::filter", + graph_tail_collect: "::graph::tail::collect", + // [tidy] sort alphabetically end + } +} -// assert!(ptr::eq(mul_heap.0, mul_sym.0)); +#[cfg(test)] +mod tests { + use std::collections::HashSet; -// // even after reset that should be the case -// heap.reset(); + use super::SYMBOLS; -// let mul_heap = heap.intern_symbol("*"); -// let mul_sym = sym::symbol::asterisk; + #[test] + fn symbols_are_unique() { + let mut set = HashSet::with_capacity(SYMBOLS.len()); -// assert!(ptr::eq(mul_heap.0, mul_sym.0)); -// } + for symbol in SYMBOLS { + set.insert(*symbol); + } -// #[test] -// fn ensure_no_collisions() { -// let mut set = std::collections::HashSet::new(); -// for &table in TABLES { -// for &symbol in table { -// assert!(set.insert(symbol.0)); -// } -// } -// } -// } + assert_eq!(set.len(), SYMBOLS.len(), "duplicate symbol value found"); + } +} diff --git 
a/libs/@local/hashql/core/src/symbol/sym2.rs b/libs/@local/hashql/core/src/symbol/sym2.rs deleted file mode 100644 index b8bfc2ef615..00000000000 --- a/libs/@local/hashql/core/src/symbol/sym2.rs +++ /dev/null @@ -1,301 +0,0 @@ -#![expect(non_upper_case_globals, clippy::min_ident_chars)] -use super::Symbol; - -macro_rules! symbols { - (@strings [$($acc:tt)*];) => { - pub(crate) static SYMBOLS: &[&str] = &[ - $($acc),* - ]; - }; - (@strings [$($acc:tt)*]; , $($rest:tt)*) => { - symbols!(@strings [$($acc)*]; $($rest)*); - }; - (@strings [$($acc:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { - symbols!(@strings [$($acc)*]; $($inner)* $(, $($rest)*)?); - }; - (@strings [$($acc:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { - symbols!(@strings [$($acc)* $value]; $($($rest)*)?); - }; - (@strings [$($acc:tt)*]; $name:ident $(, $($rest:tt)*)?) => { - symbols!(@strings [$($acc)* (stringify!($name))]; $($($rest)*)?); - }; - - (@consts @cont [$($count:tt)*] [$($next:tt)*];) => { - symbols!(@consts [$($count)*]; $($next)*); - }; - (@consts @cont [$($count:tt)*] [$($next:tt)*]; , $($rest:tt)*) => { - symbols!(@consts @cont [$($count)*] [$($next)*]; $($rest)*); - }; - (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { - symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); - }; - (@consts @cont [$($count:tt)*] [$($next:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { - symbols!(@consts @cont [$($count)* ()] [$($next)*];; $($inner)* $(, $($rest)*)?); - }; - (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident $(, $($rest:tt)*)?) => { - symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); - }; - - (@consts [$($count:tt)*];) => {}; - (@consts [$($count:tt)*]; , $($rest:tt)*) => { - symbols!(@consts [$($count)*]; $($rest)*); - }; - (@consts [$($count:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) 
=> { - const _: () = { assert!(SYMBOLS[${count($count)}] == $value) }; - pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); - symbols!(@consts [$($count)* ()]; $($($rest)*)?); - }; - (@consts [$($count:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { - pub mod $module { - use super::*; - - symbols!(@consts [$($count)*]; $($inner)*); - } - - symbols!(@consts @cont [$($count)*] [$($($rest)*)?]; $($inner)*); - }; - (@consts [$($count:tt)*]; $name:ident $(, $($rest:tt)*)?) => { - const _: () = { assert!(SYMBOLS[${count($count)}] == stringify!($name)) }; - pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); - symbols!(@consts [$($count)* ()]; $($($rest)*)?); - }; - - (@path [] [$($path:ident)*];) => { - $($path)::* - }; - (@path [$next:tt $($rest:tt)*] [$($path:tt)*];) => { - symbols!(@path [$($rest)*] [$next $($path)*];) - }; - - (@lookup [$(, $arm:expr => $value:expr)*] [$($path:tt),*];) => { - #[expect(unsafe_code)] - pub(crate) fn prime(map: &mut hashbrown::HashMap<&'static str, super::repr::Repr, S, A>) { - debug_assert!(map.is_empty()); - map.reserve(SYMBOLS.len()); - - $( - // SAFETY: The declarative macro guarantees that the symbol is unique. - unsafe { map.insert_unique_unchecked($arm, $value.into_repr()); } - )* - } - - pub(crate) static LOOKUP: &[(&'static str, super::repr::Repr)] = &[ - $(($arm, $value.into_repr())),* - ]; - }; - (@lookup [$($arms:tt)*] [$tail:tt $(, $path:tt)*]; | $($rest:tt)*) => { - symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); - }; - (@lookup [$($arms:tt)*] [$($path:tt),*]; , $($rest:tt)*) => { - symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); - }; - (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident : $value:literal $(, $($rest:tt)*)?) 
=> { - symbols!(@lookup [$($arms)*, $value => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); - }; - (@lookup [$($arms:tt)*] [$($path:tt),*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { - symbols!(@lookup [$($arms)*] [$module $(, $path)*]; $($inner)* ,| $($($rest)*)?); - }; - (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident $(, $($rest:tt)*)?) => { - symbols!(@lookup [$($arms)*, stringify!($name) => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); - }; - - (@table; $($items:tt)*) => { - symbols!(@strings []; $($items)*); - symbols!(@consts []; $($items)*); - symbols!(@lookup [] [self]; $($items)*); - }; -} - -symbols! {@table; - // [tidy] sort alphabetically start - access, - add, - and, - archived, - archived_by_id, - bar, - BaseUrl, - bit_and, - bit_not, - bit_or, - bit_shl, - bit_shr, - bit_xor, - Boolean, - collect, - confidence, - core, - created_at_decision_time, - created_at_transaction_time, - created_by_id, - decision_time, - Dict, - div, - draft_id, - dummy: "", - E, - edition, - edition_id, - encodings, - entity, - entity_edition_id, - entity_id, - entity_type_ids, - entity_uuid, - eq, - Err, - filter, - foo, - gt, - gte, - id, - index, - inferred, - input, - input_exists: "$exists", - Integer, - Intersection, - kernel, - left_entity_confidence, - left_entity_id, - left_entity_provenance, - link_data, - List, - lt, - lte, - math, - metadata, - mul, - ne, - Never, - None, - not, - Null, - null, - Number, - Ok, - option, - or, - pow, - properties, - provenance, - provided, - r#as: "as", - r#as_force: "as!", - r#else: "else", - r#false: "false", - r#fn: "fn", - r#if: "if", - r#in: "in", - r#is: "is", - r#let: "let", - r#mod: "mod", - r#newtype: "newtype", - r#true: "true", - r#type: "type", - r#use: "use", - R, - record_id, - Result, - right_entity_confidence, - right_entity_id, - right_entity_provenance, - Some, - special_form, - String, - sub, - T, - temporal_versioning, - then: "then", - thunk: 
"thunk", - transaction_time, - U, - Union, - Unknown, - unknown, - Url, - vectors, - web_id, - // [tidy] sort alphabetically end - - internal: { - ClosureEnv: "'" - }, - - symbol: { - // [tidy] sort alphabetically start - ampamp: "&&", - ampersand: "&", - arrow: "->", - arrow_head: "|>", - asterisk: "*", - exclamation: "!", - excleq: "!=", - brackets: "[]", - caret: "^", - colon: ":", - colon_colon: "::", - comma: ",", - dollar: "$", - dollar_question_mark: "$?", - dot: ".", - eq: "=", - eqeq: "==", - gt: ">", - gteq: ">=", - gtgt: ">>", - lt: "<", - lteq: "<=", - ltlt: "<<", - minus: "-", - pipepipe: "||", - pipe: "|", - plus: "+", - question_mark: "?", - slash: "/", - tilde: "~", - // [tidy] sort alphabetically end - }, - - digit: { - zero: "0", - one: "1", - two: "2", - three: "3", - four: "4", - five: "5", - six: "6", - seven: "7", - eight: "8", - nine: "9", - }, - - path: { - // [tidy] sort alphabetically start - option: "::core::option::Option", - some: "::core::option::Some", - none: "::core::option::None", - graph_head_entities: "::graph::head::entities", - graph_body_filter: "::graph::body::filter", - graph_tail_collect: "::graph::tail::collect", - // [tidy] sort alphabetically end - } -} - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - - use super::SYMBOLS; - - #[test] - fn symbols_are_unique() { - let mut set = HashSet::with_capacity(SYMBOLS.len()); - - for symbol in SYMBOLS { - set.insert(*symbol); - } - - assert_eq!(set.len(), SYMBOLS.len(), "duplicate symbol value found"); - } -} diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index 2032e1d3ba1..4e779e07e3d 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -78,7 +78,7 @@ use crate::heap::BumpAllocator; /// Note: This assumes the [`HashTable`]'s own allocator `A` (used for bucket storage) is /// still valid. With the default `A = Global`, this is always the case. 
#[derive(Debug)] -struct SymbolTable { +pub(crate) struct SymbolTable { inner: HashTable, hasher: RandomState, } @@ -88,7 +88,7 @@ impl SymbolTable { /// /// The table is not primed. Call [`prime`](Self::prime) to populate it with /// predefined symbols before use. - fn new() -> Self { + pub(crate) fn new() -> Self { Self { inner: HashTable::new(), hasher: RandomState::default(), @@ -109,12 +109,12 @@ impl SymbolTable { } /// Returns the number of symbols currently in the table. - fn len(&self) -> usize { + pub(crate) fn len(&self) -> usize { self.inner.len() } /// Returns `true` if the table contains no symbols. - fn is_empty(&self) -> bool { + pub(crate) fn is_empty(&self) -> bool { self.inner.is_empty() } } @@ -152,11 +152,11 @@ impl SymbolTable { /// /// [`sym2::LOOKUP`]: super::sym2::LOOKUP pub(crate) unsafe fn prime(&mut self) { - self.inner.reserve(super::sym2::LOOKUP.len(), |_| { + self.inner.reserve(super::sym::LOOKUP.len(), |_| { unreachable!("prime() requires an empty table; hasher callback should not be invoked") }); - for &(name, value) in super::sym2::LOOKUP { + for &(name, value) in super::sym::LOOKUP { let hash = self.hasher.hash_one(name); self.inner.insert_unique(hash, value, |_| { @@ -264,7 +264,7 @@ impl SymbolTable { mod tests { #![expect(unsafe_code, clippy::non_ascii_literal)] - use super::{super::sym2, SymbolTable}; + use super::{super::sym, SymbolTable}; use crate::heap::Scratch; #[test] @@ -282,7 +282,7 @@ mod tests { table.prime(); } - assert_eq!(table.len(), sym2::LOOKUP.len()); + assert_eq!(table.len(), sym::LOOKUP.len()); assert!(!table.is_empty()); } @@ -339,7 +339,7 @@ mod tests { // Intern a predefined symbol (e.g., "and" from LOOKUP). // The returned Repr should match the one in LOOKUP. - for &(name, expected_repr) in sym2::LOOKUP { + for &(name, expected_repr) in sym::LOOKUP { // SAFETY: Table is primed, scratch is live. 
let repr = unsafe { table.intern(&scratch, name) }; assert_eq!( @@ -451,7 +451,7 @@ mod tests { }; // Get a constant Repr by interning a predefined symbol. - let (name, expected_repr) = sym2::LOOKUP[0]; + let (name, expected_repr) = sym::LOOKUP[0]; // SAFETY: Table is primed, scratch is live. let repr_before = unsafe { table.intern(&scratch, name) }; assert_eq!(repr_before, expected_repr); From 1dd44d244b1e0fa76c52cfe6582f52cda84b4bdd Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 17:01:06 +0100 Subject: [PATCH 08/21] feat: checkpoint --- .../ast/src/lowering/import_resolver/error.rs | 2 +- .../ast/src/lowering/type_extractor/error.rs | 2 +- libs/@local/hashql/core/src/heap/mod.rs | 6 +- libs/@local/hashql/core/src/symbol/mod.rs | 55 ++++++---- libs/@local/hashql/core/src/symbol/repr.rs | 100 ++++++++++------- libs/@local/hashql/core/src/symbol/sym.rs | 46 ++++---- libs/@local/hashql/core/src/symbol/table.rs | 4 +- libs/@local/hashql/core/src/type/pretty.rs | 16 +-- .../hashql/core/src/value/primitive/float.rs | 6 +- .../core/src/value/primitive/integer.rs | 32 +++--- .../hashql/core/src/value/primitive/string.rs | 4 +- .../@local/hashql/eval/src/graph/read/path.rs | 46 +++----- .../hashql/hir/src/node/operation/binary.rs | 12 +- .../hashql/hir/src/node/operation/unary.rs | 4 +- libs/@local/hashql/hir/src/pretty.rs | 32 +++--- libs/@local/hashql/hir/src/reify/mod.rs | 10 +- .../hashql/mir/src/body/rvalue/binary.rs | 12 +- .../statement_placement/lookup/entity.rs | 103 ++++++------------ .../statement_placement/lookup/tests.rs | 36 +++--- .../statement_placement/lookup/trie.rs | 12 +- .../mir/src/pass/transform/inline/tests.rs | 2 +- 21 files changed, 257 insertions(+), 285 deletions(-) diff --git a/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs b/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs index da409876033..7e3c0e1476f 100644 --- a/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs +++ 
b/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs @@ -400,7 +400,7 @@ fn add_available_imports<'heap>( fn format_absolute_path<'heap>(item: &Item<'heap>, registry: &ModuleRegistry<'heap>) -> String { iter::once("") - .chain(item.absolute_path(registry).map(|symbol| symbol.unwrap())) + .chain(item.absolute_path(registry).map(Symbol::unwrap)) .intersperse("::") .collect() } diff --git a/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs b/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs index 77aea15536e..7135be9f61b 100644 --- a/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs +++ b/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs @@ -479,7 +479,7 @@ pub(crate) fn unknown_intrinsic_type( } else { let suggestions: String = similar .into_iter() - .map(|symbol| symbol.unwrap()) + .map(Symbol::unwrap) .intersperse("`, `") .collect(); diff --git a/libs/@local/hashql/core/src/heap/mod.rs b/libs/@local/hashql/core/src/heap/mod.rs index 188ed3d74d2..ad079291333 100644 --- a/libs/@local/hashql/core/src/heap/mod.rs +++ b/libs/@local/hashql/core/src/heap/mod.rs @@ -104,7 +104,6 @@ use core::{alloc, mem, ptr}; use std::sync::Mutex; use ::alloc::{boxed, collections::vec_deque, vec}; -use hashbrown::HashSet; use self::allocator::{Allocator, AllocatorScope, Checkpoint}; pub use self::{ @@ -115,10 +114,7 @@ pub use self::{ scratch::Scratch, transfer::TransferInto, }; -use crate::{ - collections::{FastHashSet, fast_hash_set_with_capacity}, - symbol::{Symbol, SymbolTable}, -}; +use crate::symbol::{Symbol, SymbolTable}; /// A boxed value allocated on the `Heap`. 
/// diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index e50d920bf82..96491667eb3 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -26,15 +26,34 @@ mod table; use core::{ cmp::Ordering, fmt::{self, Display, Formatter}, - hash::{Hash, Hasher}, + hash::Hash, marker::PhantomData, }; pub use self::lookup::SymbolLookup; -use self::repr::{ConstantSymbol, Repr}; +use self::repr::{ConstantRepr, Repr}; pub(crate) use self::table::SymbolTable; use crate::span::SpanId; +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub struct ConstantSymbol { + repr: ConstantRepr, +} + +impl ConstantSymbol { + #[inline] + const fn new_unchecked(index: usize) -> Self { + Self { + repr: ConstantRepr::new_unchecked(index), + } + } + + #[inline] + const fn from_repr(repr: ConstantRepr) -> Self { + Self { repr } + } +} + /// A string-like value used throughout the HashQL compiler. /// /// Symbols represent string data that appears in source code and persists throughout @@ -47,7 +66,9 @@ use crate::span::SpanId; /// /// The caller must ensure that the string is unique and interned. The types correctness requires /// relies on these *but it does not enforce it*. -#[derive(Debug, Copy, Clone)] +// We can relay to the derives for PartialEq, Eq, and Hash, as `_marker` is ignored, and the +// internal representation makes a pointer comparison. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub struct Symbol<'heap> { repr: Repr, _marker: PhantomData<&'heap ()>, @@ -56,9 +77,9 @@ pub struct Symbol<'heap> { #[expect(unsafe_code)] impl<'heap> Symbol<'heap> { #[inline] - const fn new_constant_unchecked(index: usize) -> Self { - Symbol { - repr: Repr::constant(ConstantSymbol::new_unchecked(index)), + const fn from_constant(constant: ConstantSymbol) -> Self { + Self { + repr: Repr::constant(constant.repr), _marker: PhantomData, } } @@ -76,6 +97,12 @@ impl<'heap> Symbol<'heap> { self.repr } + pub fn as_constant(self) -> Option { + self.repr + .try_as_constant_symbol() + .map(ConstantSymbol::from_repr) + } + #[must_use] #[inline] pub fn as_str(&self) -> &str { @@ -118,15 +145,6 @@ impl AsRef for Symbol<'_> { } } -impl PartialEq for Symbol<'_> { - fn eq(&self, other: &Self) -> bool { - // Pointer equality implies string equality (due to the unique contents assumption) - self.repr == other.repr - } -} - -impl Eq for Symbol<'_> {} - impl PartialOrd for Symbol<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -145,13 +163,6 @@ impl Ord for Symbol<'_> { } } -impl Hash for Symbol<'_> { - fn hash(&self, state: &mut H) { - // Pointer hashing is sufficient (due to the unique contents assumption) - Hash::hash(&self.repr, state); - } -} - impl Display for Symbol<'_> { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { Display::fmt(self.as_str(), fmt) diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index 1a118cca1f4..51c452fdbbe 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -57,13 +57,14 @@ use crate::heap::BumpAllocator; /// not the trailing bytes. All access must go through [`NonNull`] /// to preserve full allocation provenance. 
#[repr(C, align(2))] -pub(crate) struct RuntimeSymbol { +pub(crate) struct RuntimeRepr { len: usize, data: [u8; 0], } -impl RuntimeSymbol { +impl RuntimeRepr { /// Computes the allocation layout for a runtime symbol with `len` bytes of data. + #[inline] fn layout(len: usize) -> Layout { Layout::from_size_align( size_of::().checked_add(len).expect("overflow"), @@ -118,6 +119,7 @@ impl RuntimeSymbol { /// This performs pointer arithmetic without dereferencing, so it is safe. /// The returned pointer has provenance for the trailing bytes if `this` /// has provenance for the full allocation. + #[inline] const fn data_ptr(this: NonNull) -> NonNull { // SAFETY: `this` points to a valid `RuntimeSymbol` allocation, which // always has at least `size_of::()` bytes. Adding 1 moves past @@ -131,6 +133,7 @@ impl RuntimeSymbol { /// /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. /// - The allocation must remain live for the duration of this call. + #[inline] const unsafe fn len(this: NonNull) -> usize { // SAFETY: Caller guarantees `this` points to a valid, initialized allocation. unsafe { this.cast::().read() } @@ -143,6 +146,7 @@ impl RuntimeSymbol { /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. /// - The allocation must remain live for the lifetime `'a`. /// - The returned slice must not be mutated for the lifetime `'a`. + #[inline] const unsafe fn as_bytes<'a>(this: NonNull) -> &'a [u8] { // SAFETY: Caller guarantees `this` is valid and the allocation outlives `'a`. // `data_ptr` returns a pointer to the inline bytes, and `len` returns the count. @@ -156,6 +160,7 @@ impl RuntimeSymbol { /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. /// - The allocation must remain live for the lifetime `'a`. /// - The returned string must not be mutated for the lifetime `'a`. 
+ #[inline] const unsafe fn as_str<'a>(this: NonNull) -> &'a str { // SAFETY: Caller guarantees `this` is valid and the allocation outlives `'a`. // The bytes are valid UTF-8 because they were copied from a `&str` in `try_alloc`. @@ -164,25 +169,22 @@ impl RuntimeSymbol { } /// A constant symbol represented as an index into [`STRINGS`]. -#[derive(Copy, Clone)] -pub(crate) struct ConstantSymbol(usize); +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub(crate) struct ConstantRepr(usize); -impl ConstantSymbol { +impl ConstantRepr { + #[inline] pub(crate) const fn new_unchecked(index: usize) -> Self { Self(index) } - /// Returns the string value for this constant symbol. - fn as_str(self) -> &'static str { - SYMBOLS[self.0] - } - /// Returns the string value without bounds checking. /// /// # Safety /// /// The index must be within bounds of [`STRINGS`]. - const unsafe fn as_str_unchecked(self) -> &'static str { + #[inline] + pub(super) const unsafe fn as_str_unchecked(self) -> &'static str { // SAFETY: Caller guarantees the index is in bounds. unsafe { *SYMBOLS.as_ptr().add(self.0) } } @@ -192,7 +194,8 @@ impl ConstantSymbol { /// # Safety /// /// The index must be within bounds of [`STRINGS`]. - const unsafe fn as_bytes_unchecked(self) -> &'static [u8] { + #[inline] + pub(super) const unsafe fn as_bytes_unchecked(self) -> &'static [u8] { // SAFETY: Constant symbols return &'static str, which coerces to &'static [u8]. unsafe { self.as_str_unchecked().as_bytes() } } @@ -236,6 +239,7 @@ impl Repr { const TAG_SHIFT: u32 = 1; /// Returns the tag value (0 for runtime, 1 for constant). + #[inline] fn tag(self) -> usize { self.ptr.addr().get() & Self::TAG_MASK } @@ -246,7 +250,8 @@ impl Repr { /// /// - `self` must have been created via [`Repr::runtime`]. /// - The underlying allocation must still be live. 
- unsafe fn as_runtime_symbol(self) -> NonNull { + #[inline] + unsafe fn as_runtime(self) -> NonNull { debug_assert!(self.tag() == Self::TAG_RUNTIME); self.ptr @@ -255,7 +260,7 @@ impl Repr { // lowest bit is always 0. Masking it off preserves a valid, non-zero address. unsafe { NonZero::new_unchecked(addr.get() & !Self::TAG_MASK) } }) - .cast::() + .cast::() } /// Extracts the constant symbol index. @@ -263,11 +268,22 @@ impl Repr { /// # Safety /// /// - `self` must have been created via [`Repr::constant`]. - unsafe fn as_constant_symbol(self) -> ConstantSymbol { + #[inline] + unsafe fn as_constant(self) -> ConstantRepr { debug_assert!(self.tag() == Self::TAG_CONSTANT); let addr = self.ptr.addr().get(); - ConstantSymbol((addr & !Self::TAG_MASK) >> Self::TAG_SHIFT) + ConstantRepr((addr & !Self::TAG_MASK) >> Self::TAG_SHIFT) + } + + #[inline] + pub(super) fn try_as_constant_symbol(self) -> Option { + if self.tag() != Self::TAG_CONSTANT { + return None; + } + + // SAFETY: We have just verified that the tag is constant. + Some(unsafe { self.as_constant() }) } /// Returns the string content of this symbol. @@ -276,13 +292,14 @@ impl Repr { /// /// - For runtime symbols: the allocation must remain live for lifetime `'str`. /// - The returned string must not be mutated for lifetime `'str`. + #[inline] pub(crate) unsafe fn as_str<'str>(self) -> &'str str { if self.tag() == Self::TAG_RUNTIME { // SAFETY: Caller guarantees the allocation is live for 'str. - unsafe { RuntimeSymbol::as_str(self.as_runtime_symbol()) } + unsafe { RuntimeRepr::as_str(self.as_runtime()) } } else { // SAFETY: Constant symbols return &'static str, which coerces to &'str. - unsafe { self.as_constant_symbol().as_str_unchecked() } + unsafe { self.as_constant().as_str_unchecked() } } } @@ -292,20 +309,22 @@ impl Repr { /// /// - For runtime symbols: the allocation must remain live for lifetime `'str`. /// - The returned bytes must not be mutated for lifetime `'str`. 
+ #[inline] pub(crate) unsafe fn as_bytes<'str>(self) -> &'str [u8] { if self.tag() == Self::TAG_RUNTIME { // SAFETY: Caller guarantees the allocation is live for 'str. - unsafe { RuntimeSymbol::as_bytes(self.as_runtime_symbol()) } + unsafe { RuntimeRepr::as_bytes(self.as_runtime()) } } else { // SAFETY: Constant symbols return &'static str, which coerces to &'str. - unsafe { self.as_constant_symbol().as_bytes_unchecked() } + unsafe { self.as_constant().as_bytes_unchecked() } } } /// Creates a `Repr` for a constant symbol. /// /// The index is encoded directly in the pointer bits (shifted to make room for the tag). - pub(crate) const fn constant(constant: ConstantSymbol) -> Self { + #[inline] + pub(crate) const fn constant(constant: ConstantRepr) -> Self { const { assert!( Self::TAG_CONSTANT != 0, @@ -332,9 +351,10 @@ impl Repr { /// /// The pointer is stored directly with its tag bit set to 0 (which is a no-op /// since runtime allocations are already aligned). - pub(crate) fn runtime(symbol: NonNull) -> Self { + #[inline] + pub(crate) fn runtime(symbol: NonNull) -> Self { const { - assert!(align_of::() >= Self::MIN_ALIGN); + assert!(align_of::() >= Self::MIN_ALIGN); } let ptr = symbol.map_addr(|addr| addr | Self::TAG_RUNTIME).cast(); @@ -348,7 +368,7 @@ mod tests { #![expect(clippy::non_ascii_literal)] use core::mem; - use super::{ConstantSymbol, Repr, RuntimeSymbol, SYMBOLS}; + use super::{ConstantRepr, Repr, RuntimeRepr, SYMBOLS}; use crate::heap::Scratch; #[test] @@ -363,12 +383,12 @@ mod tests { #[test] fn runtime_symbol_has_minimum_alignment() { - assert!(mem::align_of::() >= Repr::MIN_ALIGN); + assert!(mem::align_of::() >= Repr::MIN_ALIGN); } #[test] fn constant_symbol_first_entry() { - let constant = ConstantSymbol(0); + let constant = ConstantRepr(0); let repr = Repr::constant(constant); // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. 
@@ -377,7 +397,7 @@ mod tests { #[test] fn constant_symbol_first_entry_unchecked() { - let constant = ConstantSymbol(0); + let constant = ConstantRepr(0); // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. assert_eq!(unsafe { constant.as_str_unchecked() }, SYMBOLS[0]); @@ -385,7 +405,7 @@ mod tests { #[test] fn constant_symbol_second_entry() { - let constant = ConstantSymbol(1); + let constant = ConstantRepr(1); let repr = Repr::constant(constant); // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. @@ -395,7 +415,7 @@ mod tests { #[test] fn runtime_symbol_empty_string() { let heap = Scratch::new(); - let symbol = RuntimeSymbol::alloc(&heap, ""); + let symbol = RuntimeRepr::alloc(&heap, ""); let repr = Repr::runtime(symbol); // SAFETY: `heap` is live for the duration of this assertion. @@ -405,7 +425,7 @@ mod tests { #[test] fn runtime_symbol_simple_string() { let heap = Scratch::new(); - let symbol = RuntimeSymbol::alloc(&heap, "hello"); + let symbol = RuntimeRepr::alloc(&heap, "hello"); let repr = Repr::runtime(symbol); // SAFETY: `heap` is live for the duration of this assertion. @@ -415,7 +435,7 @@ mod tests { #[test] fn runtime_symbol_unicode() { let heap = Scratch::new(); - let symbol = RuntimeSymbol::alloc(&heap, "日本語 🎉 émojis"); + let symbol = RuntimeRepr::alloc(&heap, "日本語 🎉 émojis"); let repr = Repr::runtime(symbol); // SAFETY: `heap` is live for the duration of this assertion. @@ -426,7 +446,7 @@ mod tests { fn runtime_symbol_long_string() { let heap = Scratch::new(); let long_string = "a".repeat(10_000); - let symbol = RuntimeSymbol::alloc(&heap, &long_string); + let symbol = RuntimeRepr::alloc(&heap, &long_string); let repr = Repr::runtime(symbol); // SAFETY: `heap` is live for the duration of this assertion. 
@@ -437,9 +457,9 @@ mod tests { fn multiple_runtime_symbols() { let heap = Scratch::new(); - let symbol1 = RuntimeSymbol::alloc(&heap, "first"); - let symbol2 = RuntimeSymbol::alloc(&heap, "second"); - let symbol3 = RuntimeSymbol::alloc(&heap, "third"); + let symbol1 = RuntimeRepr::alloc(&heap, "first"); + let symbol2 = RuntimeRepr::alloc(&heap, "second"); + let symbol3 = RuntimeRepr::alloc(&heap, "third"); let repr1 = Repr::runtime(symbol1); let repr2 = Repr::runtime(symbol2); @@ -457,8 +477,8 @@ mod tests { fn tag_distinguishes_constant_from_runtime() { let heap = Scratch::new(); - let constant = Repr::constant(ConstantSymbol(0)); - let runtime = Repr::runtime(RuntimeSymbol::alloc(&heap, "test")); + let constant = Repr::constant(ConstantRepr(0)); + let runtime = Repr::runtime(RuntimeRepr::alloc(&heap, "test")); assert_eq!(constant.tag(), Repr::TAG_CONSTANT); assert_eq!(runtime.tag(), Repr::TAG_RUNTIME); @@ -467,12 +487,12 @@ mod tests { #[test] fn runtime_symbol_stores_correct_length() { let heap = Scratch::new(); - let symbol = RuntimeSymbol::alloc(&heap, "hello"); + let symbol = RuntimeRepr::alloc(&heap, "hello"); // SAFETY: `symbol` points to a valid allocation and `heap` is live. unsafe { - assert_eq!(RuntimeSymbol::len(symbol), 5); - assert_eq!(RuntimeSymbol::as_str(symbol).len(), 5); + assert_eq!(RuntimeRepr::len(symbol), 5); + assert_eq!(RuntimeRepr::as_str(symbol).len(), 5); } } } diff --git a/libs/@local/hashql/core/src/symbol/sym.rs b/libs/@local/hashql/core/src/symbol/sym.rs index b8bfc2ef615..19512be01c4 100644 --- a/libs/@local/hashql/core/src/symbol/sym.rs +++ b/libs/@local/hashql/core/src/symbol/sym.rs @@ -1,5 +1,5 @@ -#![expect(non_upper_case_globals, clippy::min_ident_chars)] -use super::Symbol; +#![expect(non_upper_case_globals, non_snake_case, clippy::min_ident_chars)] +use super::{ConstantSymbol, Symbol}; macro_rules! symbols { (@strings [$($acc:tt)*];) => { @@ -42,7 +42,15 @@ macro_rules! 
symbols { }; (@consts [$($count:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { const _: () = { assert!(SYMBOLS[${count($count)}] == $value) }; - pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); + #[doc = concat!("The symbol `", $value, "`")] + pub const $name: Symbol<'static> = Symbol::from_constant($name::CONST); + + pub mod $name { + use super::*; + + pub const CONST: ConstantSymbol = ConstantSymbol::new_unchecked(${count($count)}); + } + symbols!(@consts [$($count)* ()]; $($($rest)*)?); }; (@consts [$($count:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { @@ -56,7 +64,15 @@ macro_rules! symbols { }; (@consts [$($count:tt)*]; $name:ident $(, $($rest:tt)*)?) => { const _: () = { assert!(SYMBOLS[${count($count)}] == stringify!($name)) }; - pub const $name: Symbol<'static> = Symbol::new_constant_unchecked(${count($count)}); + #[doc = concat!("The symbol `", stringify!($name), "`")] + pub const $name: Symbol<'static> = Symbol::from_constant($name::CONST); + + pub mod $name { + use super::*; + + pub const CONST: ConstantSymbol = ConstantSymbol::new_unchecked(${count($count)}); + } + symbols!(@consts [$($count)* ()]; $($($rest)*)?); }; @@ -68,17 +84,6 @@ macro_rules! symbols { }; (@lookup [$(, $arm:expr => $value:expr)*] [$($path:tt),*];) => { - #[expect(unsafe_code)] - pub(crate) fn prime(map: &mut hashbrown::HashMap<&'static str, super::repr::Repr, S, A>) { - debug_assert!(map.is_empty()); - map.reserve(SYMBOLS.len()); - - $( - // SAFETY: The declarative macro guarantees that the symbol is unique. - unsafe { map.insert_unique_unchecked($arm, $value.into_repr()); } - )* - } - pub(crate) static LOOKUP: &[(&'static str, super::repr::Repr)] = &[ $(($arm, $value.into_repr())),* ]; @@ -234,7 +239,7 @@ symbols! {@table; brackets: "[]", caret: "^", colon: ":", - colon_colon: "::", + coloncolon: "::", comma: ",", dollar: "$", dollar_question_mark: "$?", @@ -272,12 +277,13 @@ symbols! 
{@table; path: { // [tidy] sort alphabetically start - option: "::core::option::Option", - some: "::core::option::Some", - none: "::core::option::None", - graph_head_entities: "::graph::head::entities", + Entity: "::graph::types::knowledge::entity::Entity", graph_body_filter: "::graph::body::filter", + graph_head_entities: "::graph::head::entities", graph_tail_collect: "::graph::tail::collect", + none: "::core::option::None", + option: "::core::option::Option", + some: "::core::option::Some", // [tidy] sort alphabetically end } } diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index 4e779e07e3d..4c1c7e07016 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -42,7 +42,7 @@ use core::{alloc::Allocator, hash::BuildHasher as _}; use foldhash::fast::RandomState; use hashbrown::{HashTable, hash_table::Entry}; -use super::repr::{Repr, RuntimeSymbol}; +use super::repr::{Repr, RuntimeRepr}; use crate::heap::BumpAllocator; /// A string interning table mapping `&str` to canonical [`Repr`] values. 
@@ -253,7 +253,7 @@ impl SymbolTable { ) { Entry::Occupied(entry) => *entry.get(), Entry::Vacant(entry) => { - let repr = Repr::runtime(RuntimeSymbol::alloc(alloc, value)); + let repr = Repr::runtime(RuntimeRepr::alloc(alloc, value)); *entry.insert(repr).get() } } diff --git a/libs/@local/hashql/core/src/type/pretty.rs b/libs/@local/hashql/core/src/type/pretty.rs index e715adf69fd..67ed6d0558b 100644 --- a/libs/@local/hashql/core/src/type/pretty.rs +++ b/libs/@local/hashql/core/src/type/pretty.rs @@ -313,7 +313,7 @@ impl<'fmt, 'heap> FormatType<'fmt, TypeKind<'heap>> for TypeFormatter<'fmt, '_, TypeKind::Generic(generic) => self.format_type(generic), TypeKind::Param(param) => self.format_type(param), TypeKind::Infer(infer) => self.format_type(infer), - TypeKind::Never => self.fmt.type_name(sym::symbol::exclamation_mark), + TypeKind::Never => self.fmt.type_name(sym::symbol::exclamation), TypeKind::Unknown => self.fmt.type_name(sym::symbol::question_mark), }; @@ -397,11 +397,11 @@ impl<'fmt, 'heap> FormatType<'fmt, OpaqueType<'heap>> for TypeFormatter<'fmt, '_ impl<'fmt> FormatType<'fmt, PrimitiveType> for TypeFormatter<'fmt, '_, '_> { fn format_type(&mut self, value: PrimitiveType) -> Doc<'fmt> { match value { - PrimitiveType::Number => self.fmt.type_name(sym::lexical::Number), - PrimitiveType::Integer => self.fmt.type_name(sym::lexical::Integer), - PrimitiveType::String => self.fmt.type_name(sym::lexical::String), - PrimitiveType::Null => self.fmt.type_name(sym::lexical::Null), - PrimitiveType::Boolean => self.fmt.type_name(sym::lexical::Boolean), + PrimitiveType::Number => self.fmt.type_name(sym::Number), + PrimitiveType::Integer => self.fmt.type_name(sym::Integer), + PrimitiveType::String => self.fmt.type_name(sym::String), + PrimitiveType::Null => self.fmt.type_name(sym::Null), + PrimitiveType::Boolean => self.fmt.type_name(sym::Boolean), } } } @@ -409,14 +409,14 @@ impl<'fmt> FormatType<'fmt, PrimitiveType> for TypeFormatter<'fmt, '_, '_> { impl<'fmt> 
FormatType<'fmt, ListType> for TypeFormatter<'fmt, '_, '_> { fn format_type(&mut self, ListType { element }: ListType) -> Doc<'fmt> { self.fmt - .type_name(sym::lexical::List) + .type_name(sym::List) .append(self.fmt.angles(self.format_type(element))) } } impl<'fmt> FormatType<'fmt, DictType> for TypeFormatter<'fmt, '_, '_> { fn format_type(&mut self, DictType { key, value }: DictType) -> Doc<'fmt> { - self.fmt.type_name(sym::lexical::Dict).append( + self.fmt.type_name(sym::Dict).append( self.fmt.angles( self.fmt .comma_sep([self.format_type(key), self.format_type(value)]), diff --git a/libs/@local/hashql/core/src/value/primitive/float.rs b/libs/@local/hashql/core/src/value/primitive/float.rs index acd81091f63..0e14f094247 100644 --- a/libs/@local/hashql/core/src/value/primitive/float.rs +++ b/libs/@local/hashql/core/src/value/primitive/float.rs @@ -103,7 +103,7 @@ impl<'heap> Float<'heap> { /// Panics if the stored value is not a valid JSON-formatted floating-point number. /// This should never happen for properly constructed AST nodes. #[must_use] - pub fn as_f32(&self) -> f32 { + pub fn as_f32(self) -> f32 { f32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE) .expect("float literal should be formatted according to JSON specification") } @@ -140,7 +140,7 @@ impl<'heap> Float<'heap> { /// Panics if the stored value is not a valid JSON-formatted floating-point number. /// This should never happen for properly constructed AST nodes. 
#[must_use] - pub fn as_f64(&self) -> f64 { + pub fn as_f64(self) -> f64 { f64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE) .expect("float literal should be formatted according to JSON specification") } @@ -243,7 +243,7 @@ impl<'heap> Float<'heap> { /// assert_eq!(symbol.as_str(), "1.23e4"); /// ``` #[must_use] - pub const fn as_symbol(&self) -> Symbol<'heap> { + pub const fn as_symbol(self) -> Symbol<'heap> { self.value } } diff --git a/libs/@local/hashql/core/src/value/primitive/integer.rs b/libs/@local/hashql/core/src/value/primitive/integer.rs index b53b9719549..70d384dd7b7 100644 --- a/libs/@local/hashql/core/src/value/primitive/integer.rs +++ b/libs/@local/hashql/core/src/value/primitive/integer.rs @@ -83,7 +83,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("300").as_u8(), None); /// ``` #[must_use] - pub fn as_u8(&self) -> Option { + pub fn as_u8(self) -> Option { u8::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -110,7 +110,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("70000").as_u16(), None); /// ``` #[must_use] - pub fn as_u16(&self) -> Option { + pub fn as_u16(self) -> Option { u16::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -137,7 +137,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("5000000000").as_u32(), None); /// ``` #[must_use] - pub fn as_u32(&self) -> Option { + pub fn as_u32(self) -> Option { u32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -164,7 +164,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("99999999999999999999999").as_u64(), None); /// ``` #[must_use] - pub fn as_u64(&self) -> Option { + pub fn as_u64(self) -> Option { u64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -197,7 +197,7 @@ impl<'heap> Integer<'heap> { /// ); /// ``` #[must_use] - pub fn as_u128(&self) -> Option { + pub fn 
as_u128(self) -> Option { u128::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -224,7 +224,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("99999999999999999999999").as_usize(), None); /// ``` #[must_use] - pub fn as_usize(&self) -> Option { + pub fn as_usize(self) -> Option { usize::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -251,7 +251,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("200").as_i8(), None); /// ``` #[must_use] - pub fn as_i8(&self) -> Option { + pub fn as_i8(self) -> Option { i8::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -278,7 +278,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("50000").as_i16(), None); /// ``` #[must_use] - pub fn as_i16(&self) -> Option { + pub fn as_i16(self) -> Option { i16::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -305,7 +305,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("3000000000").as_i32(), None); /// ``` #[must_use] - pub fn as_i32(&self) -> Option { + pub fn as_i32(self) -> Option { i32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -332,7 +332,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("999999999999999999999").as_i64(), None); /// ``` #[must_use] - pub fn as_i64(&self) -> Option { + pub fn as_i64(self) -> Option { i64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -368,7 +368,7 @@ impl<'heap> Integer<'heap> { /// ); /// ``` #[must_use] - pub fn as_i128(&self) -> Option { + pub fn as_i128(self) -> Option { i128::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -395,7 +395,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("99999999999999999999").as_isize(), None); /// ``` #[must_use] - pub fn as_isize(&self) -> Option { + pub fn as_isize(self) -> Option { 
isize::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -428,7 +428,7 @@ impl<'heap> Integer<'heap> { /// Panics if the stored value is not a valid JSON-formatted integer. /// This should never happen for properly constructed AST nodes. #[must_use] - pub fn as_f32(&self) -> f32 { + pub fn as_f32(self) -> f32 { f32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &float::PARSE) .expect("integer literal should be formatted according to JSON specification") } @@ -462,7 +462,7 @@ impl<'heap> Integer<'heap> { /// Panics if the stored value is not a valid JSON-formatted integer. /// This should never happen for properly constructed AST nodes. #[must_use] - pub fn as_f64(&self) -> f64 { + pub fn as_f64(self) -> f64 { f64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &float::PARSE) .expect("integer literal should be formatted according to JSON specification") } @@ -498,7 +498,7 @@ impl<'heap> Integer<'heap> { /// Panics if the stored value is not a valid JSON-formatted integer. /// This should never happen for properly constructed AST nodes. 
#[must_use] - pub fn as_real(&self) -> Real { + pub fn as_real(self) -> Real { Real::from_str(self.value.as_str()) .expect("integer literal should be formatted according to JSON specification") } @@ -527,7 +527,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(symbol.as_str(), "123456789012345678901234567890"); /// ``` #[must_use] - pub const fn as_symbol(&self) -> Symbol<'heap> { + pub const fn as_symbol(self) -> Symbol<'heap> { self.value } } diff --git a/libs/@local/hashql/core/src/value/primitive/string.rs b/libs/@local/hashql/core/src/value/primitive/string.rs index 0fff8c8d6ae..1fe0d42533d 100644 --- a/libs/@local/hashql/core/src/value/primitive/string.rs +++ b/libs/@local/hashql/core/src/value/primitive/string.rs @@ -74,7 +74,7 @@ impl<'heap> String<'heap> { /// assert_eq!(literal.as_bytes(), b"Hello"); /// ``` #[must_use] - pub const fn as_bytes(&self) -> &[u8] { + pub fn as_bytes(&self) -> &[u8] { self.value.as_bytes() } @@ -91,7 +91,7 @@ impl<'heap> String<'heap> { /// assert_eq!(literal.as_symbol(), heap.intern_symbol("Hello")); /// ``` #[must_use] - pub const fn as_symbol(&self) -> Symbol<'heap> { + pub const fn as_symbol(self) -> Symbol<'heap> { self.value } } diff --git a/libs/@local/hashql/eval/src/graph/read/path.rs b/libs/@local/hashql/eval/src/graph/read/path.rs index 395d1d70cf2..810fb4e4477 100644 --- a/libs/@local/hashql/eval/src/graph/read/path.rs +++ b/libs/@local/hashql/eval/src/graph/read/path.rs @@ -97,14 +97,11 @@ impl<'heap> PartialQueryPath<'heap> for PartialEntityIdQueryPath { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::web_id { - Some(Self::WebId) - } else if field == sym::lexical::entity_uuid { - Some(Self::EntityUuid) - } else if field == sym::lexical::draft_id { - Some(Self::DraftId) - } else { - None + match field.as_constant()? 
{ + sym::web_id::CONST => Some(Self::WebId), + sym::entity_uuid::CONST => Some(Self::EntityUuid), + sym::draft_id::CONST => Some(Self::DraftId), + _ => None, } } @@ -139,12 +136,10 @@ impl<'heap> PartialQueryPath<'heap> for PartialEntityRecordIdPath { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::entity_id { - Some(Self::EntityId(None)) - } else if field == sym::lexical::entity_edition_id { - Some(Self::EntityEditionId) - } else { - None + match field.as_constant()? { + sym::entity_id::CONST => Some(Self::EntityId(None)), + sym::entity_edition_id::CONST => Some(Self::EntityEditionId), + _ => None, } } @@ -191,12 +186,10 @@ impl<'heap> PartialQueryPath<'heap> for PartialLinkDataPath { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::left_entity_id { - Some(Self::LeftEntityId(None)) - } else if field == sym::lexical::right_entity_id { - Some(Self::RightEntityId(None)) - } else { - None + match field.as_constant()? { + sym::left_entity_id::CONST => Some(Self::LeftEntityId(None)), + sym::right_entity_id::CONST => Some(Self::RightEntityId(None)), + _ => None, } } @@ -310,14 +303,11 @@ impl<'heap> PartialQueryPath<'heap> for PartialEntityQueryPath<'heap> { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::id { - Some(PartialEntityQueryPath::Id(None)) - } else if field == sym::lexical::properties { - Some(PartialEntityQueryPath::Properties(None)) - } else if field == sym::lexical::link_data { - Some(PartialEntityQueryPath::LinkData(None)) - } else { - None + match field.as_constant()? 
{ + sym::id::CONST => Some(PartialEntityQueryPath::Id(None)), + sym::properties::CONST => Some(PartialEntityQueryPath::Properties(None)), + sym::link_data::CONST => Some(PartialEntityQueryPath::LinkData(None)), + _ => None, } } diff --git a/libs/@local/hashql/hir/src/node/operation/binary.rs b/libs/@local/hashql/hir/src/node/operation/binary.rs index e8d23632e12..d9ecea6a7dd 100644 --- a/libs/@local/hashql/hir/src/node/operation/binary.rs +++ b/libs/@local/hashql/hir/src/node/operation/binary.rs @@ -71,13 +71,13 @@ impl BinOp { #[must_use] pub const fn as_symbol(self) -> Symbol<'static> { match self { - Self::And => sym::symbol::and, - Self::Or => sym::symbol::or, - Self::Eq => sym::symbol::eq, + Self::And => sym::symbol::ampamp, + Self::Or => sym::symbol::pipepipe, + Self::Eq => sym::symbol::eqeq, Self::Lt => sym::symbol::lt, - Self::Lte => sym::symbol::lte, - Self::Ne => sym::symbol::ne, - Self::Gte => sym::symbol::gte, + Self::Lte => sym::symbol::lteq, + Self::Ne => sym::symbol::excleq, + Self::Gte => sym::symbol::gteq, Self::Gt => sym::symbol::gt, } } diff --git a/libs/@local/hashql/hir/src/node/operation/unary.rs b/libs/@local/hashql/hir/src/node/operation/unary.rs index 13cb1d7b773..8bb835e0c4f 100644 --- a/libs/@local/hashql/hir/src/node/operation/unary.rs +++ b/libs/@local/hashql/hir/src/node/operation/unary.rs @@ -32,9 +32,9 @@ impl UnOp { #[must_use] pub const fn as_symbol(self) -> Symbol<'static> { match self { - Self::Not => sym::symbol::exclamation_mark, + Self::Not => sym::symbol::exclamation, Self::BitNot => sym::symbol::tilde, - Self::Neg => sym::symbol::sub, + Self::Neg => sym::symbol::minus, } } } diff --git a/libs/@local/hashql/hir/src/pretty.rs b/libs/@local/hashql/hir/src/pretty.rs index 51c58fd6381..b8c562352d9 100644 --- a/libs/@local/hashql/hir/src/pretty.rs +++ b/libs/@local/hashql/hir/src/pretty.rs @@ -256,9 +256,9 @@ impl<'fmt, 'heap> FormatNode<'fmt, &List<'heap>> for NodeFormatter<'fmt, '_, 'he impl<'fmt, 'heap> FormatNode<'fmt, 
&Primitive<'heap>> for NodeFormatter<'fmt, '_, 'heap> { fn format_node(&mut self, node: &Primitive<'heap>) -> Doc<'fmt> { match node { - Primitive::Null => self.fmt.literal(sym::lexical::null), - Primitive::Boolean(true) => self.fmt.literal(sym::lexical::r#true), - Primitive::Boolean(false) => self.fmt.literal(sym::lexical::r#false), + Primitive::Null => self.fmt.literal(sym::null), + Primitive::Boolean(true) => self.fmt.literal(sym::r#true), + Primitive::Boolean(false) => self.fmt.literal(sym::r#false), Primitive::Float(float) => self.fmt.literal(float.as_symbol()), Primitive::Integer(integer) => self.fmt.literal(integer.as_symbol()), Primitive::String(string) => { @@ -303,10 +303,10 @@ impl<'fmt, 'heap> FormatNode<'fmt, &QualifiedVariable<'heap>> for NodeFormatter< ) -> Doc<'fmt> { // Format as: ::path::to::var self.fmt - .punct(sym::symbol::colon_colon) + .punct(sym::symbol::coloncolon) .append(self.fmt.intersperse( path.0.iter().map(|ident| self.fmt.variable(ident.value)), - self.fmt.punct(sym::symbol::colon_colon), + self.fmt.punct(sym::symbol::coloncolon), )) .append(self.format_type_arguments(arguments)) } @@ -317,8 +317,8 @@ impl<'fmt, 'heap> FormatNode<'fmt, &Let<'heap>> for NodeFormatter<'fmt, '_, 'hea let fmt = self.fmt; // Format as: let foo = ..., bar = ... 
in body - let r#let = self.fmt.keyword(sym::lexical::r#let); - let r#in = self.fmt.keyword(sym::lexical::r#in); + let r#let = self.fmt.keyword(sym::r#let); + let r#in = self.fmt.keyword(sym::r#in); let bindings = bindings.iter().map(|binding| self.format_node(binding)); let bindings = fmt.intersperse( @@ -358,7 +358,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &Binding<'heap>> for NodeFormatter<'fmt, '_, name_doc .append(self.fmt.space()) - .append(self.fmt.punct(sym::symbol::assign)) + .append(self.fmt.punct(sym::symbol::eq)) .append(self.fmt.space()) .append(value_doc) } @@ -397,11 +397,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &TypeAssertion<'heap>> for NodeFormatter<'fmt let value = self.format_node(*value); let r#type = self.format_type(*r#type); - let op = if *force { - sym::lexical::r#as_force - } else { - sym::lexical::r#as - }; + let op = if *force { sym::r#as_force } else { sym::r#as }; value .append(self.fmt.space()) @@ -460,7 +456,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &InputOperation<'heap>> for NodeFormatter<'fm } InputOp::Exists => { // Format as: $exists(name) - let keyword = self.fmt.keyword(sym::lexical::input_exists); + let keyword = self.fmt.keyword(sym::input_exists); let name = self.fmt.variable(name.value); keyword.append(self.fmt.parens(name)) @@ -562,9 +558,9 @@ impl<'fmt, 'heap> FormatNode<'fmt, &If<'heap>> for NodeFormatter<'fmt, '_, 'heap // value1 // else // value2 - let if_keyword = self.fmt.keyword(sym::lexical::r#if); - let then_keyword = self.fmt.keyword(sym::lexical::then).into_doc(); - let else_keyword = self.fmt.keyword(sym::lexical::r#else).into_doc(); + let if_keyword = self.fmt.keyword(sym::r#if); + let then_keyword = self.fmt.keyword(sym::then).into_doc(); + let else_keyword = self.fmt.keyword(sym::r#else).into_doc(); let test_doc = self.format_node(test).into_doc(); let then_doc = self.format_node(then).into_doc(); @@ -688,7 +684,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &Thunk<'heap>> for NodeFormatter<'fmt, '_, 'h fn 
format_node(&mut self, Thunk { body }: &Thunk<'heap>) -> Doc<'fmt> { // Format thunks differently from closures using the thunk keyword // Format as: thunk -> body - let keyword = self.fmt.keyword(sym::lexical::thunk); + let keyword = self.fmt.keyword(sym::thunk); let arrow = self.fmt.op(sym::symbol::arrow); let body_doc = self.format_node(*body); diff --git a/libs/@local/hashql/hir/src/reify/mod.rs b/libs/@local/hashql/hir/src/reify/mod.rs index 98138f0d7e8..38dca9faa87 100644 --- a/libs/@local/hashql/hir/src/reify/mod.rs +++ b/libs/@local/hashql/hir/src/reify/mod.rs @@ -715,10 +715,7 @@ impl<'heap> ReificationContext<'_, '_, '_, 'heap> { } fn if_expr_then_some(&mut self, node: Node<'heap>) -> Node<'heap> { - let some_some = self.make_qualified_path( - node.span, - &[sym::lexical::core, sym::lexical::option, sym::lexical::Some], - ); + let some_some = self.make_qualified_path(node.span, &[sym::core, sym::option, sym::Some]); let node = NodeData { id: self.context.counter.hir.next(), @@ -737,10 +734,7 @@ impl<'heap> ReificationContext<'_, '_, '_, 'heap> { } fn if_expr_else_none(&mut self, span: SpanId) -> Node<'heap> { - let none_path = self.make_qualified_path( - span, - &[sym::lexical::core, sym::lexical::option, sym::lexical::None], - ); + let none_path = self.make_qualified_path(span, &[sym::core, sym::option, sym::None]); let node = NodeData { id: self.context.counter.hir.next(), diff --git a/libs/@local/hashql/mir/src/body/rvalue/binary.rs b/libs/@local/hashql/mir/src/body/rvalue/binary.rs index a10e946b3fc..c6bae1ae131 100644 --- a/libs/@local/hashql/mir/src/body/rvalue/binary.rs +++ b/libs/@local/hashql/mir/src/body/rvalue/binary.rs @@ -72,15 +72,15 @@ impl BinOp { #[must_use] pub const fn as_symbol(self) -> Symbol<'static> { match self { - Self::Add => sym::symbol::add, - Self::Sub => sym::symbol::sub, + Self::Add => sym::symbol::plus, + Self::Sub => sym::symbol::minus, Self::BitAnd => sym::symbol::ampersand, Self::BitOr => sym::symbol::pipe, - Self::Eq => 
sym::symbol::eq, + Self::Eq => sym::symbol::eqeq, Self::Lt => sym::symbol::lt, - Self::Lte => sym::symbol::lte, - Self::Ne => sym::symbol::ne, - Self::Gte => sym::symbol::gte, + Self::Lte => sym::symbol::lteq, + Self::Ne => sym::symbol::excleq, + Self::Gte => sym::symbol::gteq, Self::Gt => sym::symbol::gt, } } diff --git a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs index 75b1248858a..1d194e9d27f 100644 --- a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs +++ b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs @@ -17,156 +17,119 @@ use super::trie::{Access, AccessMode, PathNode}; // the same interned string. pub(super) static ENTITY_PATHS: PathNode = PathNode::root(&[ // entity_editions.properties (JSONB) - PathNode::jsonb(&sym::lexical::properties), + PathNode::jsonb(sym::properties), // (tbd) encodings PathNode::branch( - &sym::lexical::encodings, + sym::encodings, None, &[ // Vectors are stored outside the entity inside of an embeddings database - PathNode::branch( - &sym::lexical::vectors, - Access::Embedding(AccessMode::Direct), - &[], - ), + PathNode::branch(sym::vectors, Access::Embedding(AccessMode::Direct), &[]), ], ), PathNode::branch( - &sym::lexical::metadata, + sym::metadata, None, &[ // entity_temporal_metadata: web_id, entity_uuid, draft_id, entity_edition_id PathNode::branch( - &sym::lexical::record_id, + sym::record_id, Access::Postgres(AccessMode::Composite), &[ // entity_temporal_metadata: web_id, entity_uuid, draft_id PathNode::branch( - &sym::lexical::entity_id, + sym::entity_id, Access::Postgres(AccessMode::Composite), &[ // entity_temporal_metadata.web_id - PathNode::leaf( - &sym::lexical::web_id, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::web_id, Access::Postgres(AccessMode::Direct)), // 
entity_temporal_metadata.entity_uuid - PathNode::leaf( - &sym::lexical::entity_uuid, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_uuid, Access::Postgres(AccessMode::Direct)), // entity_temporal_metadata.draft_id - PathNode::leaf( - &sym::lexical::draft_id, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::draft_id, Access::Postgres(AccessMode::Direct)), ], ), // entity_temporal_metadata.entity_edition_id - PathNode::leaf( - &sym::lexical::edition_id, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::edition_id, Access::Postgres(AccessMode::Direct)), ], ), // entity_temporal_metadata: decision_time, transaction_time PathNode::branch( - &sym::lexical::temporal_versioning, + sym::temporal_versioning, Access::Postgres(AccessMode::Composite), &[ // entity_temporal_metadata.decision_time - PathNode::leaf( - &sym::lexical::decision_time, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::decision_time, Access::Postgres(AccessMode::Direct)), // entity_temporal_metadata.transaction_time - PathNode::leaf( - &sym::lexical::transaction_time, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::transaction_time, Access::Postgres(AccessMode::Direct)), ], ), // entity_is_of_type (via JOIN) - PathNode::leaf( - &sym::lexical::entity_type_ids, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_type_ids, Access::Postgres(AccessMode::Direct)), // entity_editions.archived - PathNode::leaf( - &sym::lexical::archived, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::archived, Access::Postgres(AccessMode::Direct)), // entity_editions.confidence - PathNode::leaf( - &sym::lexical::confidence, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::confidence, Access::Postgres(AccessMode::Direct)), // spans entity_ids.provenance + entity_editions.provenance PathNode::branch( - &sym::lexical::provenance, + sym::provenance, None, &[ // 
entity_ids.provenance (JSONB) - PathNode::jsonb(&sym::lexical::inferred), + PathNode::jsonb(sym::inferred), // entity_editions.provenance (JSONB) - PathNode::jsonb(&sym::lexical::edition), + PathNode::jsonb(sym::edition), ], ), // entity_editions.property_metadata (JSONB) - PathNode::jsonb(&sym::lexical::properties), + PathNode::jsonb(sym::properties), ], ), // contains synthesized draft_id fields PathNode::branch( - &sym::lexical::link_data, + sym::link_data, None, &[ // draft_id is synthesized (always None), not stored PathNode::branch( - &sym::lexical::left_entity_id, + sym::left_entity_id, None, &[ // entity_has_left_entity -> entity_edge.target_web_id - PathNode::leaf(&sym::lexical::web_id, Access::Postgres(AccessMode::Direct)), + PathNode::leaf(sym::web_id, Access::Postgres(AccessMode::Direct)), // entity_has_left_entity -> entity_edge.target_entity_uuid - PathNode::leaf( - &sym::lexical::entity_uuid, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_uuid, Access::Postgres(AccessMode::Direct)), // synthesized, not in entity_edge - PathNode::leaf(&sym::lexical::draft_id, None), + PathNode::leaf(sym::draft_id, None), ], ), // draft_id is synthesized (always None), not stored PathNode::branch( - &sym::lexical::right_entity_id, + sym::right_entity_id, None, &[ // entity_has_right_entity -> entity_edge.target_web_id - PathNode::leaf(&sym::lexical::web_id, Access::Postgres(AccessMode::Direct)), + PathNode::leaf(sym::web_id, Access::Postgres(AccessMode::Direct)), // entity_has_right_entity -> entity_edge.target_entity_uuid - PathNode::leaf( - &sym::lexical::entity_uuid, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_uuid, Access::Postgres(AccessMode::Direct)), // synthesized, not in entity_edge - PathNode::leaf(&sym::lexical::draft_id, None), + PathNode::leaf(sym::draft_id, None), ], ), // entity_edge.confidence (via entity_has_left_entity) PathNode::leaf( - &sym::lexical::left_entity_confidence, + 
sym::left_entity_confidence, Access::Postgres(AccessMode::Direct), ), // entity_edge.provenance (JSONB, via entity_has_left_entity) - PathNode::jsonb(&sym::lexical::left_entity_provenance), + PathNode::jsonb(sym::left_entity_provenance), // entity_edge.confidence (via entity_has_right_entity) PathNode::leaf( - &sym::lexical::right_entity_confidence, + sym::right_entity_confidence, Access::Postgres(AccessMode::Direct), ), // entity_edge.provenance (JSONB, via entity_has_right_entity) - PathNode::jsonb(&sym::lexical::right_entity_provenance), + PathNode::jsonb(sym::right_entity_provenance), ], ), ]); diff --git a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs index 173257fda61..e3a338d5cc5 100644 --- a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs +++ b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs @@ -19,7 +19,7 @@ fn proj(name: impl Into>) -> Projection<'st /// `[.properties]` → `Access::Postgres(Direct)` (JSONB column). #[test] fn properties_is_postgres() { - let projections = &[proj(sym::lexical::properties)]; + let projections = &[proj(sym::properties)]; let access = entity_projection_access(projections); assert_eq!(access, Some(Access::Postgres(AccessMode::Direct))); @@ -30,11 +30,7 @@ fn properties_is_postgres() { /// JSONB nodes have `otherwise` set, so any sub-path is also Postgres-accessible. 
#[test] fn properties_subpath_is_postgres() { - let projections = &[ - proj(sym::lexical::properties), - proj(sym::lexical::foo), - proj(sym::lexical::bar), - ]; + let projections = &[proj(sym::properties), proj(sym::foo), proj(sym::bar)]; let access = entity_projection_access(projections); assert_eq!(access, Some(Access::Postgres(AccessMode::Direct))); @@ -43,7 +39,7 @@ fn properties_subpath_is_postgres() { /// `[.encodings.vectors]` → `Access::Embedding(Direct)`. #[test] fn vectors_is_embedding() { - let projections = &[proj(sym::lexical::encodings), proj(sym::lexical::vectors)]; + let projections = &[proj(sym::encodings), proj(sym::vectors)]; let access = entity_projection_access(projections); assert_eq!(access, Some(Access::Embedding(AccessMode::Direct))); @@ -53,14 +49,14 @@ fn vectors_is_embedding() { #[test] fn metadata_columns_are_postgres() { // metadata.archived -> Direct - let projections = &[proj(sym::lexical::metadata), proj(sym::lexical::archived)]; + let projections = &[proj(sym::metadata), proj(sym::archived)]; assert_eq!( entity_projection_access(projections), Some(Access::Postgres(AccessMode::Direct)) ); // metadata.record_id -> Composite - let projections = &[proj(sym::lexical::metadata), proj(sym::lexical::record_id)]; + let projections = &[proj(sym::metadata), proj(sym::record_id)]; assert_eq!( entity_projection_access(projections), Some(Access::Postgres(AccessMode::Composite)) @@ -68,10 +64,10 @@ fn metadata_columns_are_postgres() { // metadata.record_id.entity_id.web_id -> Direct let projections = &[ - proj(sym::lexical::metadata), - proj(sym::lexical::record_id), - proj(sym::lexical::entity_id), - proj(sym::lexical::web_id), + proj(sym::metadata), + proj(sym::record_id), + proj(sym::entity_id), + proj(sym::web_id), ]; assert_eq!( entity_projection_access(projections), @@ -80,9 +76,9 @@ fn metadata_columns_are_postgres() { // metadata.temporal_versioning.decision_time -> Direct let projections = &[ - proj(sym::lexical::metadata), - 
proj(sym::lexical::temporal_versioning), - proj(sym::lexical::decision_time), + proj(sym::metadata), + proj(sym::temporal_versioning), + proj(sym::decision_time), ]; assert_eq!( entity_projection_access(projections), @@ -94,9 +90,9 @@ fn metadata_columns_are_postgres() { #[test] fn link_data_synthesized_is_none() { let projections = &[ - proj(sym::lexical::link_data), - proj(sym::lexical::left_entity_id), - proj(sym::lexical::draft_id), + proj(sym::link_data), + proj(sym::left_entity_id), + proj(sym::draft_id), ]; let access = entity_projection_access(projections); @@ -106,7 +102,7 @@ fn link_data_synthesized_is_none() { /// Invalid path like `[.unknown]` → `None`. #[test] fn unknown_path_returns_none() { - let projections = &[proj(sym::lexical::unknown)]; + let projections = &[proj(sym::unknown)]; let access = entity_projection_access(projections); assert_eq!(access, None); diff --git a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs index d9db87d455b..f7f56e2f2f5 100644 --- a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs +++ b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs @@ -22,7 +22,7 @@ pub(crate) enum Access { #[derive(Debug, Copy, Clone)] pub(crate) struct PathNode { /// Field name this node matches (empty string for root). - pub name: &'static Symbol<'static>, + pub name: Symbol<'static>, /// Access level when the path ends at this node (no more projections). pub access: Option, /// Access level for paths beyond known children (e.g., JSONB allows any sub-path). 
@@ -34,7 +34,7 @@ pub(crate) struct PathNode { impl PathNode { pub(crate) const fn root(children: &'static [Self]) -> Self { Self { - name: &sym::lexical::entity, + name: sym::entity, access: None, otherwise: None, children, @@ -42,7 +42,7 @@ impl PathNode { } pub(crate) const fn leaf( - name: &'static Symbol<'static>, + name: Symbol<'static>, access: impl [const] Into>, ) -> Self { Self { @@ -54,7 +54,7 @@ impl PathNode { } /// Creates a JSONB node where any sub-path is also Postgres-accessible. - pub(crate) const fn jsonb(name: &'static Symbol<'static>) -> Self { + pub(crate) const fn jsonb(name: Symbol<'static>) -> Self { Self { name, access: Some(Access::Postgres(AccessMode::Direct)), @@ -64,7 +64,7 @@ impl PathNode { } pub(crate) const fn branch( - name: &'static Symbol<'static>, + name: Symbol<'static>, access: impl [const] Into>, children: &'static [Self], ) -> Self { @@ -77,6 +77,6 @@ impl PathNode { } pub(crate) fn lookup(&self, name: Symbol<'_>) -> Option<&Self> { - self.children.iter().find(|node| *node.name == name) + self.children.iter().find(|node| node.name == name) } } diff --git a/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs b/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs index 13dec06b17a..304d20060e2 100644 --- a/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs +++ b/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs @@ -534,7 +534,7 @@ fn analysis_directives_by_source() { let mut ctor_body = closure_body.clone(); ctor_body.id = DefId::new(1); - ctor_body.source = Source::Ctor(sym::lexical::Some); + ctor_body.source = Source::Ctor(sym::Some); let mut intrinsic_body = closure_body.clone(); intrinsic_body.id = DefId::new(2); From 83f76ec53e42919bfb1d75216a7312fb07605c8e Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 17:09:15 +0100 Subject: [PATCH 09/21] fix: lints --- Cargo.lock | 1 - libs/@local/hashql/core/Cargo.toml | 1 - libs/@local/hashql/core/src/pretty/mod.rs | 4 +-- 
libs/@local/hashql/core/src/symbol/lookup.rs | 32 ++++++++++---------- libs/@local/hashql/core/src/symbol/mod.rs | 10 ++++-- libs/@local/hashql/core/src/symbol/table.rs | 11 ++++--- 6 files changed, 33 insertions(+), 26 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 4c4f9a6433a..1a95ca3e368 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3894,7 +3894,6 @@ dependencies = [ "insta", "lexical", "memchr", - "phf 0.13.1", "pretty", "proptest", "rapidfuzz", diff --git a/libs/@local/hashql/core/Cargo.toml b/libs/@local/hashql/core/Cargo.toml index 923ffa875fb..038725456eb 100644 --- a/libs/@local/hashql/core/Cargo.toml +++ b/libs/@local/hashql/core/Cargo.toml @@ -29,7 +29,6 @@ derive_more = { workspace = true, features = ["debug", "from"] } ena = { workspace = true } lexical = { workspace = true, features = ["parse-integers", "parse-floats", "format"] } memchr = { workspace = true } -phf = { version = "0.13.1", features = ["macros"] } rapidfuzz = { workspace = true } roaring = { workspace = true, features = ["std", "simd"] } rpds = { workspace = true, features = ["std"] } diff --git a/libs/@local/hashql/core/src/pretty/mod.rs b/libs/@local/hashql/core/src/pretty/mod.rs index 178729361b4..b7fa841ce28 100644 --- a/libs/@local/hashql/core/src/pretty/mod.rs +++ b/libs/@local/hashql/core/src/pretty/mod.rs @@ -24,11 +24,11 @@ //! let fmt = Formatter::new(&heap); //! //! let doc = fmt -//! .keyword(sym::lexical::r#let) +//! .keyword(sym::r#let) //! .append(fmt.space()) //! .append(fmt.literal_str("43")) //! .append(fmt.space()) -//! .append(fmt.punct(sym::symbol::assign)) +//! .append(fmt.punct(sym::symbol::eq)) //! .append(fmt.space()) //! .append(fmt.literal_str("42")); //! 
diff --git a/libs/@local/hashql/core/src/symbol/lookup.rs b/libs/@local/hashql/core/src/symbol/lookup.rs index 09dfbfedc4d..7e39019c9ee 100644 --- a/libs/@local/hashql/core/src/symbol/lookup.rs +++ b/libs/@local/hashql/core/src/symbol/lookup.rs @@ -45,17 +45,17 @@ enum SymbolLookupInner<'heap, I> { /// # Examples /// /// ``` -/// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; +/// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); /// # let mut heap = Heap::new(); /// # let symbol = heap.intern_symbol("example"); /// // Dense storage for sequential IDs -/// let mut dense_table = SymbolTable::::dense(); +/// let mut dense_table = SymbolLookup::::dense(); /// dense_table.insert(MyId::from_u32(0), symbol); /// assert_eq!(dense_table.get(MyId::from_u32(0)), Some(symbol)); /// /// // Gapped storage for mostly contiguous IDs with some gaps -/// let mut gapped_table = SymbolTable::::gapped(); +/// let mut gapped_table = SymbolLookup::::gapped(); /// gapped_table.insert(MyId::from_u32(0), symbol); /// gapped_table.insert(MyId::from_u32(5), symbol); // Gap at IDs 1-4 /// assert_eq!(gapped_table.get(MyId::from_u32(0)), Some(symbol)); @@ -63,7 +63,7 @@ enum SymbolLookupInner<'heap, I> { /// assert_eq!(gapped_table.get(MyId::from_u32(5)), Some(symbol)); /// /// // Sparse storage for arbitrary IDs -/// let mut sparse_table = SymbolTable::::sparse(); +/// let mut sparse_table = SymbolLookup::::sparse(); /// sparse_table.insert(MyId::from_u32(100), symbol); /// assert_eq!(sparse_table.get(MyId::from_u32(100)), Some(symbol)); /// sparse_table.insert(MyId::from_u32(5), symbol); @@ -86,9 +86,9 @@ where /// # Examples /// /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; + /// # use hashql_core::{symbol::SymbolLookup, newtype}; /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::dense(); + /// let table = SymbolLookup::::dense(); 
/// // Insertions must be sequential: 0, 1, 2, ... /// ``` #[must_use] @@ -107,9 +107,9 @@ where /// # Examples /// /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; + /// # use hashql_core::{symbol::SymbolLookup, newtype}; /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::gapped(); + /// let table = SymbolLookup::::gapped(); /// // Insertions can have gaps: 0, 5, 3, 10, ... /// ``` #[must_use] @@ -127,9 +127,9 @@ where /// # Examples /// /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; + /// # use hashql_core::{symbol::SymbolLookup, newtype}; /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::sparse(); + /// let table = SymbolLookup::::sparse(); /// // Insertions can be in any order: 100, 5, 1000, ... /// ``` #[must_use] @@ -155,11 +155,11 @@ where /// # Examples /// /// ``` - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; + /// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); /// # let mut heap = Heap::new(); /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::dense(); + /// let mut table = SymbolLookup::::dense(); /// table.insert(MyId::from_u32(0), symbol); // First insertion /// table.insert(MyId::from_u32(1), symbol); // Sequential insertion /// ``` @@ -167,11 +167,11 @@ where /// Non-sequential insertions will panic in dense tables: /// /// ```should_panic - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; + /// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); /// # let mut heap = Heap::new(); /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::dense(); + /// let mut table = SymbolLookup::::dense(); /// table.insert(MyId::from_u32(0), symbol); // First insertion /// 
table.insert(MyId::from_u32(2), symbol); // Non-sequential insertion /// ``` @@ -203,11 +203,11 @@ where /// # Examples /// /// ``` - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; + /// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); /// # let mut heap = Heap::new(); /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::sparse(); + /// let mut table = SymbolLookup::::sparse(); /// table.insert(MyId::from_u32(42), symbol); /// /// assert_eq!(table.get(MyId::from_u32(42)), Some(symbol)); diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index 96491667eb3..e4471d6ce52 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -25,7 +25,7 @@ mod table; use core::{ cmp::Ordering, - fmt::{self, Display, Formatter}, + fmt::{self, Debug, Display, Formatter}, hash::Hash, marker::PhantomData, }; @@ -68,7 +68,7 @@ impl ConstantSymbol { /// relies on these *but it does not enforce it*. // We can relay to the derives for PartialEq, Eq, and Hash, as `_marker` is ignored, and the // internal representation makes a pointer comparison. 
-#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +#[derive(Copy, Clone, PartialEq, Eq, Hash)] pub struct Symbol<'heap> { repr: Repr, _marker: PhantomData<&'heap ()>, @@ -163,6 +163,12 @@ impl Ord for Symbol<'_> { } } +impl Debug for Symbol<'_> { + fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { + fmt.debug_tuple("Symbol").field(&self.as_str()).finish() + } +} + impl Display for Symbol<'_> { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { Display::fmt(self.as_str(), fmt) diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index 4c1c7e07016..ff7e9df8e2b 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -88,11 +88,9 @@ impl SymbolTable { /// /// The table is not primed. Call [`prime`](Self::prime) to populate it with /// predefined symbols before use. + #[inline] pub(crate) fn new() -> Self { - Self { - inner: HashTable::new(), - hasher: RandomState::default(), - } + Self::new_in(Global) } } @@ -101,6 +99,7 @@ impl SymbolTable { /// /// The table is not primed. Call [`prime`](Self::prime) to populate it with /// predefined symbols before use. + #[inline] fn new_in(alloc: A) -> Self { Self { inner: HashTable::new_in(alloc), @@ -109,11 +108,13 @@ impl SymbolTable { } /// Returns the number of symbols currently in the table. + #[cfg(test)] pub(crate) fn len(&self) -> usize { self.inner.len() } /// Returns `true` if the table contains no symbols. + #[inline] pub(crate) fn is_empty(&self) -> bool { self.inner.is_empty() } @@ -132,6 +133,7 @@ impl SymbolTable { /// a new runtime symbol instead of returning the canonical constant [`Repr`] that matches /// the static symbols in [`sym`](super::sym). This would break the invariant that /// predefined symbols intern to their canonical constant representations. 
+ #[inline] pub(crate) unsafe fn clear(&mut self) { self.inner.clear(); } @@ -193,6 +195,7 @@ impl SymbolTable { /// - The table is ready for a new epoch of interning. /// /// [`sym2::LOOKUP`]: super::sym2::LOOKUP + #[inline] pub(crate) unsafe fn reset(&mut self) { // SAFETY: correct order of operations is present. unsafe { From 170ecb170da041d6b18fc59de2d6665832f30d7a Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 18:14:22 +0100 Subject: [PATCH 10/21] feat: benchmarking --- libs/@local/hashql/core/Cargo.toml | 4 + libs/@local/hashql/core/benches/symbol.rs | 384 ++++++++++++++++++++ libs/@local/hashql/core/src/symbol/repr.rs | 28 +- libs/@local/hashql/core/src/symbol/table.rs | 18 +- 4 files changed, 406 insertions(+), 28 deletions(-) create mode 100644 libs/@local/hashql/core/benches/symbol.rs diff --git a/libs/@local/hashql/core/Cargo.toml b/libs/@local/hashql/core/Cargo.toml index 038725456eb..9fc644fac7e 100644 --- a/libs/@local/hashql/core/Cargo.toml +++ b/libs/@local/hashql/core/Cargo.toml @@ -54,3 +54,7 @@ test-strategy = { workspace = true } [[bench]] name = "type_system" harness = false + +[[bench]] +name = "symbol" +harness = false diff --git a/libs/@local/hashql/core/benches/symbol.rs b/libs/@local/hashql/core/benches/symbol.rs new file mode 100644 index 00000000000..887a146a8fa --- /dev/null +++ b/libs/@local/hashql/core/benches/symbol.rs @@ -0,0 +1,384 @@ +//! Benchmarks for Symbol operations. +//! +//! These benchmarks measure the performance of symbol creation, comparison, +//! hashing, and string access operations. 
+#![expect( + clippy::indexing_slicing, + clippy::min_ident_chars, + clippy::significant_drop_tightening +)] +use core::{ + hash::{Hash as _, Hasher as _}, + hint::black_box, +}; +use std::collections::hash_map::DefaultHasher; + +use codspeed_criterion_compat::{ + BenchmarkId, Criterion, Throughput, criterion_group, criterion_main, +}; +use hashql_core::{ + heap::{Heap, ResetAllocator as _}, + symbol::{Symbol, sym}, +}; + +// ============================================================================= +// Test Data +// ============================================================================= + +/// Sample identifiers that simulate real source code tokens. +const IDENTIFIERS: &[&str] = &[ + // Common programming identifiers + "x", + "y", + "i", + "n", + "foo", + "bar", + "baz", + "count", + "index", + "value", + "result", + "data", + "items", + "length", + "size", + "name", + "type", + "id", + "key", + "user", + "config", + "options", + "handler", + "callback", + "response", + "request", + "context", + "state", + "props", + "children", + // Longer identifiers + "getUserById", + "setConfiguration", + "handleResponse", + "processRequest", + "validateInput", + "transformData", + "calculateTotal", + "renderComponent", + "initializeState", + "updateMetadata", +]; + +/// Generate unique identifiers with a numeric suffix. 
+fn generate_unique_identifiers(count: usize) -> Vec { + (0..count).map(|i| format!("ident_{i}")).collect() +} + +// ============================================================================= +// Interning Benchmarks +// ============================================================================= + +fn interning(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("symbol/intern"); + + // Benchmark: Intern unique strings (no dedup hits) + for count in [100, 1000, 10000] { + group.throughput(Throughput::Elements(count as u64)); + group.bench_with_input( + BenchmarkId::new("unique", count), + &count, + |bencher, &count| { + let identifiers = generate_unique_identifiers(count); + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + for ident in &identifiers { + black_box(heap.intern_symbol(ident)); + } + }); + }, + ); + } + + // Benchmark: Intern repeated strings (dedup path) + for count in [100, 1000, 10000] { + group.throughput(Throughput::Elements(count)); + group.bench_with_input( + BenchmarkId::new("repeated", count), + &count, + |bencher, &count| { + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + for _ in 0..count { + for ident in IDENTIFIERS { + black_box(heap.intern_symbol(ident)); + } + } + }); + }, + ); + } + + // Benchmark: Mixed workload (realistic lexer simulation) + group.bench_function("mixed_workload", |bencher| { + let unique = generate_unique_identifiers(100); + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + // Simulate lexing: mix of repeated keywords and unique identifiers + for _ in 0..10 { + // Keywords (repeated) + for ident in IDENTIFIERS.iter().take(20) { + black_box(heap.intern_symbol(ident)); + } + // Unique identifiers + for ident in &unique { + black_box(heap.intern_symbol(ident)); + } + } + }); + }); + + group.finish(); +} + +// ============================================================================= +// Constant Symbol Access Benchmarks +// 
============================================================================= + +fn constant_access(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("symbol/constant"); + + // Benchmark: Access pre-defined constant symbols + group.bench_function("access", |bencher| { + bencher.iter(|| black_box(sym::r#let)); + }); + + // Benchmark: Extract constant for pattern matching + // group.bench_function("as_constant", |bencher| { + // let symbol = sym::r#let; + // bencher.iter(|| black_box(symbol).as_constant()); + // }); + + group.finish(); +} + +// ============================================================================= +// Equality Comparison Benchmarks +// ============================================================================= + +fn equality(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("symbol/equality"); + + // Benchmark: Compare constant symbols (fast path - same pointer) + group.bench_function("constant_equal", |bencher| { + let a = sym::Integer; + let b = sym::Integer; + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Compare constant symbols (different) + group.bench_function("constant_not_equal", |bencher| { + let a = sym::Integer; + let b = sym::String; + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Compare runtime symbols (same interned string) + group.bench_function("runtime_equal", |bencher| { + let heap = Heap::new(); + let a = heap.intern_symbol("some_identifier"); + let b = heap.intern_symbol("some_identifier"); + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Compare runtime symbols (different strings) + group.bench_function("runtime_not_equal", |bencher| { + let heap = Heap::new(); + let a = heap.intern_symbol("identifier_one"); + let b = heap.intern_symbol("identifier_two"); + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Pattern matching on constants + 
group.bench_function("pattern_match_constant", |bencher| { + let symbol = sym::r#fn; // middle of the match arms + + bencher.iter(|| match black_box(symbol).as_constant() { + Some(sym::r#let::CONST) => 1, + Some(sym::r#if::CONST) => 2, + Some(sym::r#else::CONST) => 3, + Some(sym::r#fn::CONST) => 4, + Some(sym::Integer::CONST) => 5, + Some(sym::String::CONST) => 6, + Some(sym::Boolean::CONST) => 7, + _ => 0, + }); + }); + + group.finish(); +} + +// ============================================================================= +// Hashing Benchmarks +// ============================================================================= + +fn hashing(criterion: &mut Criterion) { + use codspeed_criterion_compat::BatchSize; + + let mut group = criterion.benchmark_group("symbol/hash"); + + // Benchmark: Hash constant symbols + group.bench_function("constant", |bencher| { + let symbol = sym::r#let; + + bencher.iter_batched( + DefaultHasher::new, + |mut hasher| { + symbol.hash(&mut hasher); + hasher.finish() + }, + BatchSize::SmallInput, + ); + }); + + // Benchmark: Hash runtime symbols + group.bench_function("runtime", |bencher| { + let heap = Heap::new(); + let symbol = heap.intern_symbol("some_identifier"); + + bencher.iter_batched( + DefaultHasher::new, + |mut hasher| { + symbol.hash(&mut hasher); + hasher.finish() + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +// ============================================================================= +// String Access Benchmarks +// ============================================================================= + +fn string_access(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("symbol/as_str"); + + // Benchmark: Access string content of constant symbols + group.bench_function("constant", |bencher| { + let symbol = sym::r#let; + bencher.iter(|| black_box(symbol.as_str())); + }); + + // Benchmark: Access string content of runtime symbols + group.bench_function("runtime", |bencher| { + let heap 
= Heap::new(); + let symbol = heap.intern_symbol("some_identifier"); + bencher.iter(|| black_box(symbol.as_str())); + }); + + group.finish(); +} + +// ============================================================================= +// Realistic Workload Benchmarks +// ============================================================================= +#[expect(clippy::integer_division_remainder_used)] +fn realistic(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("symbol/realistic"); + + // Simulate a lexer: tokenize identifiers and compare against keywords + group.bench_function("lexer_simulation", |bencher| { + // Pre-generate "source code" tokens + let source_tokens: Vec<&str> = (0..1000) + .map(|index| IDENTIFIERS[index % IDENTIFIERS.len()]) + .collect(); + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + let mut keyword_count = 0; + let mut ident_count = 0; + + for token in &source_tokens { + let symbol = heap.intern_symbol(token); + + // Check if it's a keyword + if matches!( + symbol.as_constant(), + Some( + sym::r#let::CONST + | sym::r#if::CONST + | sym::r#else::CONST + | sym::r#fn::CONST + | sym::r#type::CONST + ) + ) { + keyword_count += 1; + } else { + ident_count += 1; + } + } + + black_box((keyword_count, ident_count)); + }); + }); + + // Simulate type checker: lots of symbol comparisons + group.bench_function("type_checker_simulation", |bencher| { + let heap = Heap::new(); + let symbols: Vec<_> = IDENTIFIERS.iter().map(|s| heap.intern_symbol(s)).collect(); + + bencher.iter(|| { + let mut matches = 0; + + // Compare each symbol against a set of "expected" symbols + for &symbol in &symbols { + if matches!( + symbol.as_constant(), + Some( + sym::Integer::CONST + | sym::String::CONST + | sym::Boolean::CONST + | sym::List::CONST + | sym::Dict::CONST + ) + ) { + matches += 1; + } + } + + black_box(matches); + }); + }); + + group.finish(); +} + +// 
============================================================================= +// Entry Point +// ============================================================================= + +criterion_group!( + benches, + interning, + constant_access, + equality, + hashing, + string_access, + realistic, +); +criterion_main!(benches); diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index 51c452fdbbe..15b7944eb16 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -184,9 +184,9 @@ impl ConstantRepr { /// /// The index must be within bounds of [`STRINGS`]. #[inline] - pub(super) const unsafe fn as_str_unchecked(self) -> &'static str { + pub(super) unsafe fn as_str_unchecked(self) -> &'static str { // SAFETY: Caller guarantees the index is in bounds. - unsafe { *SYMBOLS.as_ptr().add(self.0) } + unsafe { SYMBOLS.get_unchecked(self.0) } } /// Returns the byte slice for this constant symbol without bounds checking. @@ -195,7 +195,7 @@ impl ConstantRepr { /// /// The index must be within bounds of [`STRINGS`]. #[inline] - pub(super) const unsafe fn as_bytes_unchecked(self) -> &'static [u8] { + pub(super) unsafe fn as_bytes_unchecked(self) -> &'static [u8] { // SAFETY: Constant symbols return &'static str, which coerces to &'static [u8]. 
unsafe { self.as_str_unchecked().as_bytes() } } @@ -363,29 +363,19 @@ impl Repr { } } +const _: () = { + assert!(size_of::<Repr>() == size_of::<*const ()>()); + assert!(size_of::<Option<Repr>>() == size_of::<*const ()>()); + assert!(align_of::<RuntimeRepr>() >= Repr::MIN_ALIGN); +}; + #[cfg(test)] mod tests { #![expect(clippy::non_ascii_literal)] - use core::mem; use super::{ConstantRepr, Repr, RuntimeRepr, SYMBOLS}; use crate::heap::Scratch; - #[test] - fn repr_size_is_one_pointer() { - assert_eq!(mem::size_of::<Repr>(), mem::size_of::<*const ()>()); - } - - #[test] - fn option_repr_size_is_one_pointer() { - assert_eq!(mem::size_of::<Option<Repr>>(), mem::size_of::<*const ()>()); - } - - #[test] - fn runtime_symbol_has_minimum_alignment() { - assert!(mem::align_of::<RuntimeRepr>() >= Repr::MIN_ALIGN); - } - #[test] fn constant_symbol_first_entry() { let constant = ConstantRepr(0); diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index ff7e9df8e2b..0755d12bd64 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -3,8 +3,8 @@ //! This module provides [`SymbolTable`], a hash-based interner that maps strings to their //! canonical [`Repr`] representation. The table supports two kinds of symbols: //! -//! - **Constant symbols**: Statically defined symbols from [`sym2::LOOKUP`]. Their [`Repr`] encodes -//! an index into the static [`sym2::SYMBOLS`] array (effectively `'static` lifetime). +//! - **Constant symbols**: Statically defined symbols from [`sym::LOOKUP`]. Their [`Repr`] encodes +//! an index into the static [`sym::SYMBOLS`] array (effectively `'static` lifetime). //! //! - **Runtime symbols**: Dynamically interned strings allocated on a bump allocator. Their //! [`Repr`] holds a pointer to a [`RuntimeSymbol`] allocation. @@ -30,11 +30,11 @@ //! # Priming //! //! Calling [`SymbolTable::prime`] populates the table with predefined symbols from -//! [`sym2::LOOKUP`]. 
This ensures that interning a predefined string returns its +//! [`sym::LOOKUP`]. This ensures that interning a predefined string returns its //! canonical constant [`Repr`] rather than allocating a runtime symbol. //! -//! [`sym2::LOOKUP`]: super::sym2::LOOKUP -//! [`sym2::SYMBOLS`]: super::sym2::SYMBOLS +//! [`sym::LOOKUP`]: super::sym::LOOKUP +//! [`sym::SYMBOLS`]: super::sym::SYMBOLS use alloc::alloc::Global; use core::{alloc::Allocator, hash::BuildHasher as _}; @@ -138,7 +138,7 @@ impl SymbolTable { self.inner.clear(); } - /// Populates the table with predefined symbols from [`sym2::LOOKUP`]. + /// Populates the table with predefined symbols from [`sym::LOOKUP`]. /// /// After priming, interning any predefined symbol string will return its canonical /// constant [`Repr`] rather than allocating a new runtime symbol. @@ -152,7 +152,7 @@ impl SymbolTable { /// /// The caller must ensure that the table is empty before calling this method. /// - /// [`sym2::LOOKUP`]: super::sym2::LOOKUP + /// [`sym::LOOKUP`]: super::sym::LOOKUP pub(crate) unsafe fn prime(&mut self) { self.inner.reserve(super::sym::LOOKUP.len(), |_| { unreachable!("prime() requires an empty table; hasher callback should not be invoked") @@ -191,10 +191,10 @@ impl SymbolTable { /// /// After this method returns: /// - All runtime symbols are removed from the table. - /// - All constant symbols from [`sym2::LOOKUP`] are present. + /// - All constant symbols from [`sym::LOOKUP`] are present. /// - The table is ready for a new epoch of interning. /// - /// [`sym2::LOOKUP`]: super::sym2::LOOKUP + /// [`sym::LOOKUP`]: super::sym::LOOKUP #[inline] pub(crate) unsafe fn reset(&mut self) { // SAFETY: correct order of operations is present. 
From af0e3667bd0208e56d25fe1e76da04dd98e47eb5 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 18:35:53 +0100 Subject: [PATCH 11/21] fix: docs --- libs/@local/hashql/core/package.json | 2 +- libs/@local/hashql/core/src/symbol/lookup.rs | 10 +- libs/@local/hashql/core/src/symbol/mod.rs | 246 +++++++++++++++++-- libs/@local/hashql/core/src/symbol/repr.rs | 10 +- 4 files changed, 238 insertions(+), 30 deletions(-) diff --git a/libs/@local/hashql/core/package.json b/libs/@local/hashql/core/package.json index 60f659b7864..bc96b2c8164 100644 --- a/libs/@local/hashql/core/package.json +++ b/libs/@local/hashql/core/package.json @@ -9,7 +9,7 @@ "fix:clippy": "just clippy --fix", "lint:clippy": "just clippy", "test:codspeed": "cargo codspeed run -p hashql-core", - "test:miri": "cargo miri nextest run -- co_sort try_scan heap::transfer stable_empty_slice id::slice tarjan::tests::members", + "test:miri": "cargo miri nextest run -- co_sort try_scan heap::transfer stable_empty_slice id::slice tarjan::tests::members symbol", "test:unit": "mise run test:unit @rust/hashql-core" }, "dependencies": { diff --git a/libs/@local/hashql/core/src/symbol/lookup.rs b/libs/@local/hashql/core/src/symbol/lookup.rs index 7e39019c9ee..61c6360b0b4 100644 --- a/libs/@local/hashql/core/src/symbol/lookup.rs +++ b/libs/@local/hashql/core/src/symbol/lookup.rs @@ -15,22 +15,22 @@ enum SymbolLookupInner<'heap, I> { /// A mapping from identifiers to symbols optimized for different access patterns. /// -/// [`SymbolTable`] provides efficient storage and retrieval of [`Symbol`] instances which are tied +/// `SymbolLookup` provides efficient storage and retrieval of [`Symbol`] instances which are tied /// to a specific identifier (which is any type that implements the [`Id`] trait). 
/// /// # Storage Strategies /// -/// To accommodate different access patterns, [`SymbolTable`] supports three storage strategies: +/// To accommodate different access patterns, `SymbolLookup` supports three storage strategies: /// /// ## Dense Storage /// -/// Created with [`SymbolTable::dense()`], this mode uses a [`Vec`] internally and requires +/// Created with [`SymbolLookup::dense()`], this mode uses a [`Vec`] internally and requires /// IDs to be inserted sequentially starting from 0. This provides optimal memory efficiency /// and cache performance for contiguous ID ranges. /// /// ## Gapped Storage /// -/// Created with [`SymbolTable::gapped()`], this mode uses a [`Vec`] of [`Option`] +/// Created with [`SymbolLookup::gapped()`], this mode uses a [`Vec`] of [`Option`] /// internally and allows insertion at arbitrary indices. Unlike dense storage, gaps are allowed in /// the ID sequence. This provides a balance between the memory efficiency of dense storage and the /// flexibility of sparse storage, making it ideal for scenarios where most IDs are contiguous but @@ -38,7 +38,7 @@ enum SymbolLookupInner<'heap, I> { /// /// ## Sparse Storage /// -/// Created with [`SymbolTable::sparse()`], this mode uses a [`FastHashMap`] internally and +/// Created with [`SymbolLookup::sparse()`], this mode uses a [`FastHashMap`] internally and /// supports arbitrary ID insertion order. This provides flexibility at the cost of higher /// memory overhead per entry. /// diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index e4471d6ce52..7ab38ad8f0d 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -7,16 +7,28 @@ //! //! The module provides: //! -//! - [`Symbol`]: An opaque wrapper around string data that enables efficient storage and comparison -//! - [`SymbolTable`]: A mapping from identifiers to symbols optimized for different access patterns +//! 
- [`Symbol`]: An interned string reference used throughout the compiler +//! - [`ConstantSymbol`]: A wrapper for predefined symbols, enabling pattern matching +//! - [`SymbolLookup`]: A mapping from identifiers to symbols optimized for different access +//! patterns //! - [`Ident`]: A named identifier with source location and categorization //! - [`IdentKind`]: Classification of different identifier types in HashQL //! -//! ## Design Philosophy +//! # Pattern Matching on Predefined Symbols //! -//! The [`Symbol`] type is designed as an opaque wrapper around its internal string storage. -//! This encapsulation enables future optimizations such as string interning (either through -//! the `string_interner` crate or a custom implementation) without requiring API changes. +//! Use [`Symbol::as_constant()`] to match against predefined symbols from the [`sym`] module: +//! +//! ``` +//! # use hashql_core::symbol::{Symbol, sym}; +//! fn classify(symbol: Symbol<'_>) -> &'static str { +//! match symbol.as_constant() { +//! Some(sym::r#let::CONST) => "let keyword", +//! Some(sym::r#if::CONST) => "if keyword", +//! Some(sym::Integer::CONST) => "Integer type", +//! _ => "other", +//! } +//! } +//! ``` mod lookup; mod repr; @@ -35,12 +47,39 @@ use self::repr::{ConstantRepr, Repr}; pub(crate) use self::table::SymbolTable; use crate::span::SpanId; +/// A predefined symbol that can be used in pattern matching. +/// +/// This is a structural wrapper around a constant symbol index, designed to +/// enable exhaustive pattern matching on predefined symbols. Unlike [`Symbol`], +/// which uses a tagged pointer that cannot appear in const patterns, `ConstantSymbol` +/// is a simple newtype over an index that derives [`PartialEq`] and [`Eq`] structurally. 
+/// +/// # Usage +/// +/// Obtained via [`Symbol::as_constant()`], then matched against `sym::NAME::CONST`: +/// +/// ``` +/// # use hashql_core::symbol::{Symbol, ConstantSymbol, sym}; +/// fn handle_keyword(sym: Symbol<'_>) { +/// if let Some(c) = sym.as_constant() { +/// match c { +/// sym::r#let::CONST => println!("let keyword"), +/// sym::r#fn::CONST => println!("fn keyword"), +/// _ => {} +/// } +/// } +/// } +/// ``` #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub struct ConstantSymbol { repr: ConstantRepr, } impl ConstantSymbol { + /// Creates a `ConstantSymbol` from a raw index without bounds checking. + /// + /// This is used by the [`sym`] macro to generate constant symbol definitions. + /// The index must be valid for the static `SYMBOLS` table. #[inline] const fn new_unchecked(index: usize) -> Self { Self { @@ -54,19 +93,17 @@ impl ConstantSymbol { } } -/// A string-like value used throughout the HashQL compiler. +/// An interned string reference used throughout the HashQL compiler. /// /// Symbols represent string data that appears in source code and persists throughout -/// compilation, they are read-only and immutable. +/// compilation. They are read-only, immutable, and designed for efficient comparison +/// and hashing. /// -/// This type is deliberately opaque to hide its internal representation, -/// allowing for future optimizations like string interning without changing -/// the public API. Symbols are designed to be efficient for long-lived objects -/// that are frequently compared, hashed, and referenced during compilation. +/// # Pattern Matching /// -/// The caller must ensure that the string is unique and interned. The types correctness requires -/// relies on these *but it does not enforce it*. 
-// We can relay to the derives for PartialEq, Eq, and Hash, as `_marker` is ignored, and the +/// Use [`as_constant()`](Self::as_constant) to extract a [`ConstantSymbol`] for pattern +/// matching against predefined symbols from the [`sym`] module. +// We can rely on the derives for PartialEq, Eq, and Hash, as `_marker` is ignored, and the // internal representation makes a pointer comparison. #[derive(Copy, Clone, PartialEq, Eq, Hash)] pub struct Symbol<'heap> { @@ -84,6 +121,16 @@ impl<'heap> Symbol<'heap> { } } + /// Creates a [`Symbol`] from a raw [`Repr`]. + /// + /// # Safety + /// + /// The caller must ensure: + /// + /// - For runtime symbols: the [`Repr`] must point to a valid allocation that remains live for + /// the `'heap` lifetime. + /// - For constant symbols: the [`Repr`] must encode a valid index into the static symbol table. + /// - The symbol must be properly interned (unique string content maps to unique [`Repr`]). #[inline] pub(crate) const unsafe fn from_repr(repr: Repr) -> Self { Symbol { @@ -97,12 +144,46 @@ impl<'heap> Symbol<'heap> { self.repr } + /// Returns the constant symbol representation if this is a predefined symbol. + /// + /// Use this to pattern match against predefined symbols from the [`sym`] module: + /// + /// ``` + /// # use hashql_core::symbol::{Symbol, sym}; + /// fn is_keyword(sym: Symbol<'_>) -> bool { + /// matches!( + /// sym.as_constant(), + /// Some(sym::r#let::CONST | sym::r#if::CONST | sym::r#fn::CONST) + /// ) + /// } + /// ``` + /// + /// Returns [`None`] for runtime (heap-allocated) symbols. pub fn as_constant(self) -> Option<ConstantSymbol> { self.repr .try_as_constant_symbol() .map(ConstantSymbol::from_repr) } + /// Returns the string content of this symbol. + /// + /// The returned reference is valid for the lifetime of this symbol. For access with the + /// full `'heap` lifetime, use [`unwrap()`](Self::unwrap) instead. 
+ /// + /// # Examples + /// + /// ``` + /// # use hashql_core::symbol::sym; + /// assert_eq!(sym::Integer.as_str(), "Integer"); + /// assert_eq!(sym::r#let.as_str(), "let"); + /// ``` + /// + /// ``` + /// # use hashql_core::heap::Heap; + /// let heap = Heap::new(); + /// let symbol = heap.intern_symbol("hello"); + /// assert_eq!(symbol.as_str(), "hello"); + /// ``` #[must_use] #[inline] pub fn as_str(&self) -> &str { @@ -110,11 +191,22 @@ impl<'heap> Symbol<'heap> { unsafe { self.repr.as_str() } } - /// Returns the string representation of the symbol. + /// Returns the string content with the full heap lifetime. + /// + /// Unlike [`as_str()`](Self::as_str), this method returns a reference with the `'heap` + /// lifetime rather than the symbol's own lifetime. This is useful when the string needs + /// to outlive the symbol itself. + /// + /// Note that the returned string should be treated as no longer subject to the interning + /// guarantee—it's just a plain `&str`. + /// + /// # Examples /// - /// Unlike [`Self::as_str`], this method provides access for the lifetime of the interner - /// instead of the symbol itself, somewhat circumventing the protections given to the symbol - /// itself. Any unwrapped type should be considered no longer unique and interned. + /// ``` + /// # use hashql_core::symbol::sym; + /// let s: &'static str = sym::Integer.unwrap(); + /// assert_eq!(s, "Integer"); + /// ``` #[must_use] #[inline] pub fn unwrap(self) -> &'heap str { @@ -122,6 +214,14 @@ impl<'heap> Symbol<'heap> { unsafe { self.repr.as_str() } } + /// Returns the raw bytes of this symbol's string content. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::symbol::sym; + /// assert_eq!(sym::Integer.as_bytes(), b"Integer"); + /// ``` #[must_use] #[inline] pub fn as_bytes(&self) -> &[u8] { @@ -129,6 +229,26 @@ impl<'heap> Symbol<'heap> { unsafe { self.repr.as_bytes() } } + /// Returns the demangled name, stripping any suffix after the last `:`. 
+ /// + /// This is used for symbols with mangled names (e.g., `"foo:123"` → `"foo"`). + /// If there is no `:`, returns the full symbol content. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::heap::Heap; + /// let heap = Heap::new(); + /// + /// let mangled = heap.intern_symbol("variable:42"); + /// assert_eq!(mangled.demangle(), "variable"); + /// + /// let plain = heap.intern_symbol("plain_name"); + /// assert_eq!(plain.demangle(), "plain_name"); + /// + /// let multiple = heap.intern_symbol("a:b:c"); + /// assert_eq!(multiple.demangle(), "a:b"); + /// ``` #[must_use] #[inline] pub fn demangle(self) -> &'heap str { @@ -313,6 +433,23 @@ pub struct Ident<'heap> { } impl<'heap> Ident<'heap> { + /// Creates a synthetic identifier with no source location. + /// + /// Synthetic identifiers are used for compiler-generated names that don't + /// correspond to any location in source code. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::symbol::{Ident, IdentKind, sym}; + /// # use hashql_core::span::SpanId; + /// let ident = Ident::synthetic(sym::foo); + /// + /// assert_eq!(ident.span, SpanId::SYNTHETIC); + /// assert_eq!(ident.value, sym::foo); + /// assert_eq!(ident.kind, IdentKind::Lexical); + /// assert_eq!(ident.as_ref(), "foo"); + /// ``` #[must_use] pub const fn synthetic(value: Symbol<'heap>) -> Self { Self { @@ -334,3 +471,74 @@ impl Display for Ident<'_> { Display::fmt(&self.value.as_str(), fmt) } } + +const _: () = { + assert!(size_of::() == size_of::()); + assert!(size_of::>() == size_of::()); +}; + +#[cfg(test)] +mod tests { + #![expect(clippy::min_ident_chars, clippy::many_single_char_names)] + use core::{cmp::Ordering, hash::BuildHasher as _}; + use std::hash::RandomState; + + use super::sym; + use crate::heap::Heap; + + #[test] + fn symbol_equality() { + let heap = Heap::new(); + let a = heap.intern_symbol("foo"); + let b = heap.intern_symbol("bar"); + let c = heap.intern_symbol("bar"); + let d = sym::Integer; + let e = 
sym::String; + let f = sym::String; + + assert_ne!(a, b); + assert_eq!(b, c); + assert_ne!(c, d); + assert_ne!(d, e); + assert_eq!(e, f); + } + + #[test] + fn symbol_ordering() { + let heap = Heap::new(); + let a = heap.intern_symbol("aaa"); + let b = sym::bar; + let c = heap.intern_symbol("ccc"); + + assert_eq!(a.cmp(&b), Ordering::Less); + assert_eq!(b.cmp(&c), Ordering::Less); + assert_eq!(c.cmp(&a), Ordering::Greater); + assert_eq!(b.cmp(&b), Ordering::Equal); + } + + #[test] + fn symbol_consistent_hashing() { + let heap = Heap::new(); + let a = heap.intern_symbol("test"); + + let hasher = RandomState::new(); + + assert_eq!(hasher.hash_one(a), hasher.hash_one(a.repr)); + } + + #[test] + fn interned_predefined_returns_constant() { + let heap = Heap::new(); + let interned = heap.intern_symbol("let"); + + assert_eq!(interned.as_constant(), Some(sym::r#let::CONST)); + } + + #[test] + fn runtime_symbol_returns_no_constant() { + let heap = Heap::new(); + let runtime = heap.intern_symbol("not_a_keyword"); + + assert!(runtime.as_constant().is_none()); + } +} diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index 15b7944eb16..229086827c9 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -168,7 +168,7 @@ impl RuntimeRepr { } } -/// A constant symbol represented as an index into [`STRINGS`]. +/// A constant symbol represented as an index into [`SYMBOLS`]. #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] pub(crate) struct ConstantRepr(usize); @@ -182,7 +182,7 @@ impl ConstantRepr { /// /// # Safety /// - /// The index must be within bounds of [`STRINGS`]. + /// The index must be within bounds of [`SYMBOLS`]. #[inline] pub(super) unsafe fn as_str_unchecked(self) -> &'static str { // SAFETY: Caller guarantees the index is in bounds. @@ -193,7 +193,7 @@ impl ConstantRepr { /// /// # Safety /// - /// The index must be within bounds of [`STRINGS`]. 
+ /// The index must be within bounds of [`SYMBOLS`]. #[inline] pub(super) unsafe fn as_bytes_unchecked(self) -> &'static [u8] { // SAFETY: Constant symbols return &'static str, which coerces to &'static [u8]. @@ -205,8 +205,8 @@ impl ConstantRepr { /// /// Uses a tagged pointer to distinguish between runtime and constant symbols: /// -/// - **Runtime** (tag = 0): Pointer to a [`RuntimeSymbol`] allocation -/// - **Constant** (tag = 1): Index into [`STRINGS`] encoded in the pointer bits +/// - **Runtime** (tag = 0): Pointer to a [`RuntimeRepr`] allocation +/// - **Constant** (tag = 1): Index into [`SYMBOLS`] encoded in the pointer bits /// /// # Size /// From f7967e3149dfaf049512c548494d5dbd4df52668 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 18:39:35 +0100 Subject: [PATCH 12/21] chore: use local lock for interner --- libs/@local/hashql/core/benches/symbol.rs | 2 +- libs/@local/hashql/core/src/heap/mod.rs | 20 +++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/libs/@local/hashql/core/benches/symbol.rs b/libs/@local/hashql/core/benches/symbol.rs index 887a146a8fa..dd12a7ec99c 100644 --- a/libs/@local/hashql/core/benches/symbol.rs +++ b/libs/@local/hashql/core/benches/symbol.rs @@ -18,7 +18,7 @@ use codspeed_criterion_compat::{ }; use hashql_core::{ heap::{Heap, ResetAllocator as _}, - symbol::{Symbol, sym}, + symbol::sym, }; // ============================================================================= diff --git a/libs/@local/hashql/core/src/heap/mod.rs b/libs/@local/hashql/core/src/heap/mod.rs index ad079291333..eda86f1decc 100644 --- a/libs/@local/hashql/core/src/heap/mod.rs +++ b/libs/@local/hashql/core/src/heap/mod.rs @@ -101,7 +101,6 @@ mod scratch; mod transfer; use core::{alloc, mem, ptr}; -use std::sync::Mutex; use ::alloc::{boxed, collections::vec_deque, vec}; @@ -114,7 +113,10 @@ pub use self::{ scratch::Scratch, transfer::TransferInto, }; -use crate::symbol::{Symbol, SymbolTable}; +use crate::{ + 
symbol::{Symbol, SymbolTable}, + sync::lock::LocalLock, +}; /// A boxed value allocated on the `Heap`. /// @@ -148,7 +150,7 @@ pub type HashMap<'heap, K, V, S = foldhash::fast::RandomState> = #[derive(Debug)] pub struct Heap { inner: Allocator, - strings: Mutex<SymbolTable>, + strings: LocalLock<SymbolTable>, } impl Heap { @@ -170,7 +172,7 @@ impl Heap { pub fn uninitialized() -> Self { Self { inner: Allocator::new(), - strings: Mutex::new(SymbolTable::new()), + strings: LocalLock::new(SymbolTable::new()), } } @@ -187,7 +189,7 @@ impl Heap { /// /// Panics if the heap is already primed. pub fn prime(&mut self) { - let strings = self.strings.get_mut().expect("lock should not be poisoned"); + let strings = self.strings.get_mut(); assert!( strings.is_empty(), "heap has already been primed or has interned symbols" @@ -218,7 +220,7 @@ impl Heap { Self { inner: Allocator::new(), - strings: Mutex::new(table), + strings: LocalLock::new(table), } } @@ -240,7 +242,7 @@ impl Heap { Self { inner: Allocator::with_capacity(capacity), - strings: Mutex::new(table), + strings: LocalLock::new(table), } } @@ -268,7 +270,7 @@ impl Heap { /// /// Panics if the internal mutex is poisoned. pub fn intern_symbol<'this>(&'this self, value: &str) -> Symbol<'this> { - let mut strings = self.strings.lock().expect("lock should not be poisoned"); + let mut strings = self.strings.lock(); // SAFETY: `SymbolTable::intern` requires: // 1. No dangling pointers: The table is reset before the arena in `Heap::reset`. @@ -335,7 +337,7 @@ impl ResetAllocator for Heap { #[inline] fn reset(&mut self) { { - let mut strings = self.strings.lock().expect("lock should not be poisoned"); + let mut strings = self.strings.lock(); // SAFETY: The symbol table is reset before the arena, so no dangling references exist. 
unsafe { From 67655252ef2e0035595fe7e4ccf80da4ed601fa4 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Sun, 1 Feb 2026 18:44:07 +0100 Subject: [PATCH 13/21] fix: benchmark naming --- libs/@local/hashql/core/benches/symbol.rs | 24 +++++++++++------------ 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/libs/@local/hashql/core/benches/symbol.rs b/libs/@local/hashql/core/benches/symbol.rs index dd12a7ec99c..583743f9be0 100644 --- a/libs/@local/hashql/core/benches/symbol.rs +++ b/libs/@local/hashql/core/benches/symbol.rs @@ -14,7 +14,7 @@ use core::{ use std::collections::hash_map::DefaultHasher; use codspeed_criterion_compat::{ - BenchmarkId, Criterion, Throughput, criterion_group, criterion_main, + BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main, }; use hashql_core::{ heap::{Heap, ResetAllocator as _}, @@ -81,7 +81,7 @@ fn generate_unique_identifiers(count: usize) -> Vec { // ============================================================================= fn interning(criterion: &mut Criterion) { - let mut group = criterion.benchmark_group("symbol/intern"); + let mut group = criterion.benchmark_group("intern"); // Benchmark: Intern unique strings (no dedup hits) for count in [100, 1000, 10000] { @@ -153,7 +153,7 @@ fn interning(criterion: &mut Criterion) { // ============================================================================= fn constant_access(criterion: &mut Criterion) { - let mut group = criterion.benchmark_group("symbol/constant"); + let mut group = criterion.benchmark_group("constant"); // Benchmark: Access pre-defined constant symbols group.bench_function("access", |bencher| { @@ -161,10 +161,10 @@ fn constant_access(criterion: &mut Criterion) { }); // Benchmark: Extract constant for pattern matching - // group.bench_function("as_constant", |bencher| { - // let symbol = sym::r#let; - // bencher.iter(|| black_box(symbol).as_constant()); - // }); + group.bench_function("as_constant", |bencher| { + let symbol 
= sym::r#let; + bencher.iter(|| black_box(symbol).as_constant()); + }); group.finish(); } @@ -174,7 +174,7 @@ fn constant_access(criterion: &mut Criterion) { // ============================================================================= fn equality(criterion: &mut Criterion) { - let mut group = criterion.benchmark_group("symbol/equality"); + let mut group = criterion.benchmark_group("equality"); // Benchmark: Compare constant symbols (fast path - same pointer) group.bench_function("constant_equal", |bencher| { @@ -234,9 +234,7 @@ fn equality(criterion: &mut Criterion) { // ============================================================================= fn hashing(criterion: &mut Criterion) { - use codspeed_criterion_compat::BatchSize; - - let mut group = criterion.benchmark_group("symbol/hash"); + let mut group = criterion.benchmark_group("hash"); // Benchmark: Hash constant symbols group.bench_function("constant", |bencher| { @@ -275,7 +273,7 @@ fn hashing(criterion: &mut Criterion) { // ============================================================================= fn string_access(criterion: &mut Criterion) { - let mut group = criterion.benchmark_group("symbol/as_str"); + let mut group = criterion.benchmark_group("as_str"); // Benchmark: Access string content of constant symbols group.bench_function("constant", |bencher| { @@ -298,7 +296,7 @@ fn string_access(criterion: &mut Criterion) { // ============================================================================= #[expect(clippy::integer_division_remainder_used)] fn realistic(criterion: &mut Criterion) { - let mut group = criterion.benchmark_group("symbol/realistic"); + let mut group = criterion.benchmark_group("realistic"); // Simulate a lexer: tokenize identifiers and compare against keywords group.bench_function("lexer_simulation", |bencher| { From b81bed002f68d8f4e4bb9cb0971a16fb0c56d2e7 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Mon, 2 Feb 2026 10:23:35 +0100 Subject: [PATCH 14/21] chore: document 
macro --- libs/@local/hashql/core/src/symbol/sym.rs | 49 +++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/libs/@local/hashql/core/src/symbol/sym.rs b/libs/@local/hashql/core/src/symbol/sym.rs index 19512be01c4..ba6012bbc0e 100644 --- a/libs/@local/hashql/core/src/symbol/sym.rs +++ b/libs/@local/hashql/core/src/symbol/sym.rs @@ -1,6 +1,55 @@ #![expect(non_upper_case_globals, non_snake_case, clippy::min_ident_chars)] use super::{ConstantSymbol, Symbol}; +/// Generates pre-interned symbols available at compile time. +/// +/// This macro produces three artifacts from a single symbol table definition: +/// +/// 1. **`SYMBOLS`** - A static slice of string values for interner pre-population +/// 2. **Symbol constants** - `Symbol<'static>` constants (e.g., `sym::foo`, `sym::symbol::plus`) +/// 3. **`LOOKUP`** - A static slice mapping string values to their [`Repr`] for fast lookup +/// +/// # Syntax +/// +/// ```text +/// symbols! {@table; +/// // Simple symbol: name becomes both the constant and string value +/// foo, +/// +/// // Explicit string: use when string differs from identifier +/// r#true: "true", +/// input_exists: "$exists", +/// +/// // Nested module: groups related symbols under a namespace +/// symbol: { +/// plus: "+", +/// minus: "-", +/// }, +/// } +/// ``` +/// +/// Each symbol `name` or `name: "value"` generates: +/// - A constant `name: Symbol<'static>` with auto-generated docs +/// - A submodule `name` containing `CONST: ConstantSymbol` for pattern matching +/// +/// Modules create nested namespaces, so `symbol::plus` becomes accessible as `sym::symbol::plus`. 
+/// +/// # Internal Rules +/// +/// The macro uses internal rules (prefixed with `@`) to process the token stream: +/// +/// - **`@strings`** - Collects all string values into the `SYMBOLS` slice +/// - **`@consts`** - Generates `Symbol` constants and companion modules with index tracking +/// - **`@consts @cont`** - Continuation after processing a nested module to resume counting +/// - **`@lookup`** - Builds the string-to-repr mapping table for runtime lookup +/// - **`@path`** - Helper to construct module paths (reverses accumulated path segments) +/// - **`@table`** - Entry point that dispatches to all three generators +/// +/// Index tracking uses the `${count($count)}` metavariable to assign sequential indices. +/// Each processed symbol appends `()` to the count accumulator, and `${count(...)}` returns +/// the number of elements. +/// +/// [`Repr`]: super::repr::Repr macro_rules! symbols { (@strings [$($acc:tt)*];) => { pub(crate) static SYMBOLS: &[&str] = &[ From 394ebaecf68f1f6668e6f3853057d48a086fd3e2 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Mon, 2 Feb 2026 10:35:03 +0100 Subject: [PATCH 15/21] chore: remove spec --- SPEC.md | 77 --------------------------------------------------------- 1 file changed, 77 deletions(-) delete mode 100644 SPEC.md diff --git a/SPEC.md b/SPEC.md deleted file mode 100644 index 89e288927e1..00000000000 --- a/SPEC.md +++ /dev/null @@ -1,77 +0,0 @@ -# sym2.rs symbols! macro specification - -## Goal - -Define a declarative `symbols!` macro in `libs/@local/hashql/core/src/symbol/sym2.rs` that expands a compact symbol list into: - -- A single global string table `SYMBOLS` containing every symbol literal exactly once. -- `const` `Symbol` values for each symbol name (including nested modules). -- A `phf::Map` lookup table from string to `Symbol`. - -## Inputs - -The macro is invoked in `sym2.rs` with a mixture of: - -- Bare identifiers (e.g., `access, add`). 
-- Identifier-to-string pairs using `name: "..."` (e.g., `r#if: "if"`). -- Nested module blocks using `module_name: { ... }` with the same item forms inside. - -## Outputs and behavior - -1. **Global string table** - - Emit `static SYMBOLS: &[&str] = &[ ... ];` that contains **all** symbol string values, in macro traversal order. - - Order is deterministic and mirrors the macro input order, flattening nested blocks in-place. - - Each string appears **exactly once**; duplicates are detected by a runtime test. - -2. **Symbol constants** - - For each symbol item, emit a `const <name>: Symbol = Symbol::constant_unchecked(<index>);`. - - The `<index>` is the position of the symbol’s string in `SYMBOLS`. - - When items are inside `module_name: { ... }`, emit a `mod module_name { use super::*; ... }` containing the `const`s for that module’s items. - -3. **Lookup map** - - Emit `static LOOKUP: phf::Map<&'static str, Symbol> = phf_map! { ... };`. - - Each entry maps the **string value** to the corresponding `Symbol` constant. - - The map includes entries for both top-level and nested module items, with the value referencing the correct constant (e.g., `"*" => symbol::asterisk`). - -4. **Uniqueness checks** - - No macro-time checks; uniqueness is enforced by a runtime test that fails if duplicate strings exist in `SYMBOLS`. - -## Expansion details - -Given the existing example in `sym2.rs`, the expansion will: - -- Generate `SYMBOLS` covering: `"access"`, `"add"`, `"and"`, `"archived"`, `"archived_by_id"`, `"bar"`, `"BaseUrl"`, `"bit_and"`, `"bit_not"`, `"bit_or"`, `"if"`, `""`, `"'"`, `"*"`, `"0"`, `"1"`, `"::core::option::Option"`, `"::core::option::Some"`, `"::core::option::None"`, `"::graph::head::entities"`, `"::graph::body::filter"`, `"::graph::tail::collect"`. -- Create `const` bindings for each name in the correct scope, each using the index into `SYMBOLS`. -- Create a `LOOKUP` `phf_map!` with all strings mapped to their corresponding `Symbol` constants.
- -## Implementation approach - -1. **Define the macro interface** to accept a comma-separated list of `symbol_item` forms: - - `ident` (string = ident name) - - `ident : literal` (string = literal) - - `module_ident : { ... }` - -2. **Flatten items** into a single sequence of `(string_literal, const_path)` in the exact order of appearance. - - For nested modules, the `const_path` is `module_ident::item_ident`. - -3. **Generate indices** by counting from `0` in the flattened order. - - Use a recursive macro to emit tuples `((string, path), index)` as it walks the input. - -4. **Emit `SYMBOLS`** by collecting the flattened string list. - -5. **Emit consts** - - For top-level items: `const name: Symbol = Symbol::constant_unchecked(index);`. - - For module items: `mod module { use super::*; const name: Symbol = Symbol::constant_unchecked(index); }`. - -6. **Emit `LOOKUP`** by mapping each flattened string to its `const_path`. - -7. **Uniqueness enforcement** - - Add a `#[test]` in `sym2.rs` that inserts every entry from `SYMBOLS` into a `HashSet` and asserts that the set size equals `SYMBOLS.len()`. - - The test is the only enforcement mechanism; compilation is not affected. - -## Definition of done - -- `sym2.rs` contains the new `symbols!` macro that expands as specified. -- `SYMBOLS`, all `const` symbols, and `LOOKUP` are generated from the macro invocation. -- Duplicate string values fail the uniqueness test. -- Code builds without additional files or edits outside `sym2.rs` (other than this spec file). 
From f0c622bada485e4c46232d7f98cdc60eecffa814 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Mon, 2 Feb 2026 10:43:44 +0100 Subject: [PATCH 16/21] chore: docs --- libs/@local/hashql/core/src/symbol/repr.rs | 12 ++++++------ libs/@local/hashql/core/src/symbol/table.rs | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs index 229086827c9..891017c53a8 100644 --- a/libs/@local/hashql/core/src/symbol/repr.rs +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -15,13 +15,13 @@ //! //! Uses the lowest bit as a discriminant tag (possible because allocations are 2-byte aligned): //! -//! - Bit 0 = `0`: Runtime symbol (pointer to [`RuntimeSymbol`] allocation) +//! - Bit 0 = `0`: Runtime symbol (pointer to [`RuntimeRepr`] allocation) //! - Bit 0 = `1`: Constant symbol (index shifted left by 1, `OR`ed with tag) //! //! # Provenance //! -//! Runtime symbols store a [`NonNull<RuntimeSymbol>`] rather than a reference to preserve -//! full allocation provenance. Creating `&RuntimeSymbol` would narrow provenance to just the +//! Runtime symbols store a [`NonNull<RuntimeRepr>`] rather than a reference to preserve +//! full allocation provenance. Creating `&RuntimeRepr` would narrow provenance to just the //! header, causing undefined behavior when accessing the trailing inline bytes under strict //! provenance / Stacked Borrows. #![expect(unsafe_code)] @@ -131,7 +131,7 @@ impl RuntimeRepr { /// /// # Safety /// - /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. + /// - `this` must point to a valid, initialized [`RuntimeRepr`] allocation. /// - The allocation must remain live for the duration of this call. #[inline] const unsafe fn len(this: NonNull) -> usize { @@ -143,7 +143,7 @@ impl RuntimeRepr { /// /// # Safety /// - /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation.
+ /// - `this` must point to a valid, initialized [`RuntimeRepr`] allocation. /// - The allocation must remain live for the lifetime `'a`. /// - The returned slice must not be mutated for the lifetime `'a`. #[inline] @@ -157,7 +157,7 @@ impl RuntimeRepr { /// /// # Safety /// - /// - `this` must point to a valid, initialized [`RuntimeSymbol`] allocation. + /// - `this` must point to a valid, initialized [`RuntimeRepr`] allocation. /// - The allocation must remain live for the lifetime `'a`. /// - The returned string must not be mutated for the lifetime `'a`. #[inline] diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index 0755d12bd64..858da553711 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -7,7 +7,7 @@ //! an index into the static [`sym::SYMBOLS`] array (effectively `'static` lifetime). //! //! - **Runtime symbols**: Dynamically interned strings allocated on a bump allocator. Their -//! [`Repr`] holds a pointer to a [`RuntimeSymbol`] allocation. +//! [`Repr`] holds a pointer to a [`RuntimeRepr`] allocation. //! //! # Lifecycle and Epoch Coupling //! @@ -208,7 +208,7 @@ impl SymbolTable { /// /// If the string has already been interned (either as a predefined constant or a /// previously interned runtime symbol), returns the existing [`Repr`]. Otherwise, - /// allocates a new [`RuntimeSymbol`] on the provided bump allocator and inserts it. + /// allocates a new [`RuntimeRepr`] on the provided bump allocator and inserts it. 
/// /// # Returns /// From 6f8dcc8215e48680ee0e98f60c51fe6da53610db Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Mon, 2 Feb 2026 10:46:56 +0100 Subject: [PATCH 17/21] chore: docs --- libs/@local/hashql/core/src/heap/mod.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/@local/hashql/core/src/heap/mod.rs b/libs/@local/hashql/core/src/heap/mod.rs index eda86f1decc..6049dfa8c6a 100644 --- a/libs/@local/hashql/core/src/heap/mod.rs +++ b/libs/@local/hashql/core/src/heap/mod.rs @@ -18,7 +18,7 @@ //! // Intern strings for efficient comparison //! let sym1 = heap.intern_symbol("hello"); //! let sym2 = heap.intern_symbol("hello"); -//! assert!(std::ptr::eq(sym1.as_str(), sym2.as_str())); // Same pointer +//! assert!(core::ptr::eq(sym1.as_str(), sym2.as_str())); // Same pointer //! ``` //! //! # Type Aliases @@ -80,8 +80,8 @@ //! //! ## [`TransferInto`] //! -//! Copy borrowed data (`&[T]` or `&str`) into the arena. Only implemented for arena allocators -//! to prevent memory leaks from creating `&'static` references: +//! Copy borrowed data (`&[T]` or `&str`) into the arena. The returned reference is tied to the +//! arena's lifetime, ensuring the data is freed when the arena resets: //! //! ``` //! # #![feature(allocator_api)] @@ -143,10 +143,10 @@ pub type VecDeque<'heap, T> = vec_deque::VecDeque; pub type HashMap<'heap, K, V, S = foldhash::fast::RandomState> = hashbrown::HashMap; -/// An arena allocator for AST nodes and collections with string interning. +/// An arena allocator for AST nodes, collections, and symbols. /// /// Combines a bump allocator with a string interning table for deduplicated -/// symbol storage. Interned strings enable O(1) comparison via pointer equality. +/// symbol storage. Interned symbols enable O(1) comparison via pointer equality. #[derive(Debug)] pub struct Heap { inner: Allocator, @@ -248,7 +248,7 @@ impl Heap { /// Allocates a value in the arena, returning a mutable reference. 
/// - /// Only accepts types that do **not** require [`Drop`]. Types requiring destructors + /// Only accepts types that do not require [`Drop`]. Types requiring destructors /// must use [`heap::Box`](Box) or [`heap::Vec`](Vec) instead. #[inline] pub fn alloc(&self, value: T) -> &mut T { From 9e194a90de9e5771d1f6ecc51919257a1fd9851e Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Mon, 2 Feb 2026 10:47:45 +0100 Subject: [PATCH 18/21] chore: promote nested module in module to a compile error --- libs/@local/hashql/core/src/symbol/sym.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/@local/hashql/core/src/symbol/sym.rs b/libs/@local/hashql/core/src/symbol/sym.rs index ba6012bbc0e..578e7327372 100644 --- a/libs/@local/hashql/core/src/symbol/sym.rs +++ b/libs/@local/hashql/core/src/symbol/sym.rs @@ -79,7 +79,7 @@ macro_rules! symbols { symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); }; (@consts @cont [$($count:tt)*] [$($next:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { - symbols!(@consts @cont [$($count)* ()] [$($next)*];; $($inner)* $(, $($rest)*)?); + compile_error!("nested modules in modules are not supported"); }; (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident $(, $($rest:tt)*)?) 
=> { symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); From 6691c2f83db61cff2c8af9857e95ee155b3809f1 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Wed, 4 Feb 2026 11:22:16 +0100 Subject: [PATCH 19/21] chore: correctly calculate the throughput --- libs/@local/hashql/core/benches/symbol.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/@local/hashql/core/benches/symbol.rs b/libs/@local/hashql/core/benches/symbol.rs index 583743f9be0..24115969529 100644 --- a/libs/@local/hashql/core/benches/symbol.rs +++ b/libs/@local/hashql/core/benches/symbol.rs @@ -105,7 +105,7 @@ fn interning(criterion: &mut Criterion) { // Benchmark: Intern repeated strings (dedup path) for count in [100, 1000, 10000] { - group.throughput(Throughput::Elements(count)); + group.throughput(Throughput::Elements((count * IDENTIFIERS.len()) as u64)); group.bench_with_input( BenchmarkId::new("repeated", count), &count, From 1e572adc57bbb2947dcc57575a8e62cbf687f186 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Wed, 4 Feb 2026 11:23:03 +0100 Subject: [PATCH 20/21] chore: correctly calculate the throughput --- libs/@local/hashql/core/benches/symbol.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libs/@local/hashql/core/benches/symbol.rs b/libs/@local/hashql/core/benches/symbol.rs index 24115969529..56a8e1afe6e 100644 --- a/libs/@local/hashql/core/benches/symbol.rs +++ b/libs/@local/hashql/core/benches/symbol.rs @@ -112,6 +112,11 @@ fn interning(criterion: &mut Criterion) { |bencher, &count| { let mut heap = Heap::new(); + // pre-intern so that the dedup path is exercised + for ident in IDENTIFIERS { + black_box(heap.intern_symbol(ident)); + } + bencher.iter(|| { heap.reset(); for _ in 0..count { From a78848f995130fe37810d0d4dd2eb24f76be9cc0 Mon Sep 17 00:00:00 2001 From: Bilal Mahmoud Date: Wed, 4 Feb 2026 11:23:18 +0100 Subject: [PATCH 21/21] chore: correctly calculate the throughput --- libs/@local/hashql/core/benches/symbol.rs | 5 ----- 1 
file changed, 5 deletions(-) diff --git a/libs/@local/hashql/core/benches/symbol.rs b/libs/@local/hashql/core/benches/symbol.rs index 56a8e1afe6e..24115969529 100644 --- a/libs/@local/hashql/core/benches/symbol.rs +++ b/libs/@local/hashql/core/benches/symbol.rs @@ -112,11 +112,6 @@ fn interning(criterion: &mut Criterion) { |bencher, &count| { let mut heap = Heap::new(); - // pre-intern so that the dedup path is exercised - for ident in IDENTIFIERS { - black_box(heap.intern_symbol(ident)); - } - bencher.iter(|| { heap.reset(); for _ in 0..count {