diff --git a/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs b/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs index da409876033..7e3c0e1476f 100644 --- a/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs +++ b/libs/@local/hashql/ast/src/lowering/import_resolver/error.rs @@ -400,7 +400,7 @@ fn add_available_imports<'heap>( fn format_absolute_path<'heap>(item: &Item<'heap>, registry: &ModuleRegistry<'heap>) -> String { iter::once("") - .chain(item.absolute_path(registry).map(|symbol| symbol.unwrap())) + .chain(item.absolute_path(registry).map(Symbol::unwrap)) .intersperse("::") .collect() } diff --git a/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs b/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs index 77aea15536e..7135be9f61b 100644 --- a/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs +++ b/libs/@local/hashql/ast/src/lowering/type_extractor/error.rs @@ -479,7 +479,7 @@ pub(crate) fn unknown_intrinsic_type( } else { let suggestions: String = similar .into_iter() - .map(|symbol| symbol.unwrap()) + .map(Symbol::unwrap) .intersperse("`, `") .collect(); diff --git a/libs/@local/hashql/core/Cargo.toml b/libs/@local/hashql/core/Cargo.toml index 038725456eb..9fc644fac7e 100644 --- a/libs/@local/hashql/core/Cargo.toml +++ b/libs/@local/hashql/core/Cargo.toml @@ -54,3 +54,7 @@ test-strategy = { workspace = true } [[bench]] name = "type_system" harness = false + +[[bench]] +name = "symbol" +harness = false diff --git a/libs/@local/hashql/core/benches/symbol.rs b/libs/@local/hashql/core/benches/symbol.rs new file mode 100644 index 00000000000..24115969529 --- /dev/null +++ b/libs/@local/hashql/core/benches/symbol.rs @@ -0,0 +1,382 @@ +//! Benchmarks for Symbol operations. +//! +//! These benchmarks measure the performance of symbol creation, comparison, +//! hashing, and string access operations. 
+#![expect( + clippy::indexing_slicing, + clippy::min_ident_chars, + clippy::significant_drop_tightening +)] +use core::{ + hash::{Hash as _, Hasher as _}, + hint::black_box, +}; +use std::collections::hash_map::DefaultHasher; + +use codspeed_criterion_compat::{ + BatchSize, BenchmarkId, Criterion, Throughput, criterion_group, criterion_main, +}; +use hashql_core::{ + heap::{Heap, ResetAllocator as _}, + symbol::sym, +}; + +// ============================================================================= +// Test Data +// ============================================================================= + +/// Sample identifiers that simulate real source code tokens. +const IDENTIFIERS: &[&str] = &[ + // Common programming identifiers + "x", + "y", + "i", + "n", + "foo", + "bar", + "baz", + "count", + "index", + "value", + "result", + "data", + "items", + "length", + "size", + "name", + "type", + "id", + "key", + "user", + "config", + "options", + "handler", + "callback", + "response", + "request", + "context", + "state", + "props", + "children", + // Longer identifiers + "getUserById", + "setConfiguration", + "handleResponse", + "processRequest", + "validateInput", + "transformData", + "calculateTotal", + "renderComponent", + "initializeState", + "updateMetadata", +]; + +/// Generate unique identifiers with a numeric suffix. 
+fn generate_unique_identifiers(count: usize) -> Vec { + (0..count).map(|i| format!("ident_{i}")).collect() +} + +// ============================================================================= +// Interning Benchmarks +// ============================================================================= + +fn interning(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("intern"); + + // Benchmark: Intern unique strings (no dedup hits) + for count in [100, 1000, 10000] { + group.throughput(Throughput::Elements(count as u64)); + group.bench_with_input( + BenchmarkId::new("unique", count), + &count, + |bencher, &count| { + let identifiers = generate_unique_identifiers(count); + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + for ident in &identifiers { + black_box(heap.intern_symbol(ident)); + } + }); + }, + ); + } + + // Benchmark: Intern repeated strings (dedup path) + for count in [100, 1000, 10000] { + group.throughput(Throughput::Elements((count * IDENTIFIERS.len()) as u64)); + group.bench_with_input( + BenchmarkId::new("repeated", count), + &count, + |bencher, &count| { + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + for _ in 0..count { + for ident in IDENTIFIERS { + black_box(heap.intern_symbol(ident)); + } + } + }); + }, + ); + } + + // Benchmark: Mixed workload (realistic lexer simulation) + group.bench_function("mixed_workload", |bencher| { + let unique = generate_unique_identifiers(100); + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + // Simulate lexing: mix of repeated keywords and unique identifiers + for _ in 0..10 { + // Keywords (repeated) + for ident in IDENTIFIERS.iter().take(20) { + black_box(heap.intern_symbol(ident)); + } + // Unique identifiers + for ident in &unique { + black_box(heap.intern_symbol(ident)); + } + } + }); + }); + + group.finish(); +} + +// ============================================================================= +// Constant Symbol Access 
Benchmarks +// ============================================================================= + +fn constant_access(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("constant"); + + // Benchmark: Access pre-defined constant symbols + group.bench_function("access", |bencher| { + bencher.iter(|| black_box(sym::r#let)); + }); + + // Benchmark: Extract constant for pattern matching + group.bench_function("as_constant", |bencher| { + let symbol = sym::r#let; + bencher.iter(|| black_box(symbol).as_constant()); + }); + + group.finish(); +} + +// ============================================================================= +// Equality Comparison Benchmarks +// ============================================================================= + +fn equality(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("equality"); + + // Benchmark: Compare constant symbols (fast path - same pointer) + group.bench_function("constant_equal", |bencher| { + let a = sym::Integer; + let b = sym::Integer; + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Compare constant symbols (different) + group.bench_function("constant_not_equal", |bencher| { + let a = sym::Integer; + let b = sym::String; + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Compare runtime symbols (same interned string) + group.bench_function("runtime_equal", |bencher| { + let heap = Heap::new(); + let a = heap.intern_symbol("some_identifier"); + let b = heap.intern_symbol("some_identifier"); + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Compare runtime symbols (different strings) + group.bench_function("runtime_not_equal", |bencher| { + let heap = Heap::new(); + let a = heap.intern_symbol("identifier_one"); + let b = heap.intern_symbol("identifier_two"); + + bencher.iter(|| black_box(a) == black_box(b)); + }); + + // Benchmark: Pattern matching on constants + group.bench_function("pattern_match_constant", 
|bencher| { + let symbol = sym::r#fn; // middle of the match arms + + bencher.iter(|| match black_box(symbol).as_constant() { + Some(sym::r#let::CONST) => 1, + Some(sym::r#if::CONST) => 2, + Some(sym::r#else::CONST) => 3, + Some(sym::r#fn::CONST) => 4, + Some(sym::Integer::CONST) => 5, + Some(sym::String::CONST) => 6, + Some(sym::Boolean::CONST) => 7, + _ => 0, + }); + }); + + group.finish(); +} + +// ============================================================================= +// Hashing Benchmarks +// ============================================================================= + +fn hashing(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("hash"); + + // Benchmark: Hash constant symbols + group.bench_function("constant", |bencher| { + let symbol = sym::r#let; + + bencher.iter_batched( + DefaultHasher::new, + |mut hasher| { + symbol.hash(&mut hasher); + hasher.finish() + }, + BatchSize::SmallInput, + ); + }); + + // Benchmark: Hash runtime symbols + group.bench_function("runtime", |bencher| { + let heap = Heap::new(); + let symbol = heap.intern_symbol("some_identifier"); + + bencher.iter_batched( + DefaultHasher::new, + |mut hasher| { + symbol.hash(&mut hasher); + hasher.finish() + }, + BatchSize::SmallInput, + ); + }); + + group.finish(); +} + +// ============================================================================= +// String Access Benchmarks +// ============================================================================= + +fn string_access(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("as_str"); + + // Benchmark: Access string content of constant symbols + group.bench_function("constant", |bencher| { + let symbol = sym::r#let; + bencher.iter(|| black_box(symbol.as_str())); + }); + + // Benchmark: Access string content of runtime symbols + group.bench_function("runtime", |bencher| { + let heap = Heap::new(); + let symbol = heap.intern_symbol("some_identifier"); + bencher.iter(|| 
black_box(symbol.as_str())); + }); + + group.finish(); +} + +// ============================================================================= +// Realistic Workload Benchmarks +// ============================================================================= +#[expect(clippy::integer_division_remainder_used)] +fn realistic(criterion: &mut Criterion) { + let mut group = criterion.benchmark_group("realistic"); + + // Simulate a lexer: tokenize identifiers and compare against keywords + group.bench_function("lexer_simulation", |bencher| { + // Pre-generate "source code" tokens + let source_tokens: Vec<&str> = (0..1000) + .map(|index| IDENTIFIERS[index % IDENTIFIERS.len()]) + .collect(); + let mut heap = Heap::new(); + + bencher.iter(|| { + heap.reset(); + let mut keyword_count = 0; + let mut ident_count = 0; + + for token in &source_tokens { + let symbol = heap.intern_symbol(token); + + // Check if it's a keyword + if matches!( + symbol.as_constant(), + Some( + sym::r#let::CONST + | sym::r#if::CONST + | sym::r#else::CONST + | sym::r#fn::CONST + | sym::r#type::CONST + ) + ) { + keyword_count += 1; + } else { + ident_count += 1; + } + } + + black_box((keyword_count, ident_count)); + }); + }); + + // Simulate type checker: lots of symbol comparisons + group.bench_function("type_checker_simulation", |bencher| { + let heap = Heap::new(); + let symbols: Vec<_> = IDENTIFIERS.iter().map(|s| heap.intern_symbol(s)).collect(); + + bencher.iter(|| { + let mut matches = 0; + + // Compare each symbol against a set of "expected" symbols + for &symbol in &symbols { + if matches!( + symbol.as_constant(), + Some( + sym::Integer::CONST + | sym::String::CONST + | sym::Boolean::CONST + | sym::List::CONST + | sym::Dict::CONST + ) + ) { + matches += 1; + } + } + + black_box(matches); + }); + }); + + group.finish(); +} + +// ============================================================================= +// Entry Point +// 
============================================================================= + +criterion_group!( + benches, + interning, + constant_access, + equality, + hashing, + string_access, + realistic, +); +criterion_main!(benches); diff --git a/libs/@local/hashql/core/package.json b/libs/@local/hashql/core/package.json index 60f659b7864..bc96b2c8164 100644 --- a/libs/@local/hashql/core/package.json +++ b/libs/@local/hashql/core/package.json @@ -9,7 +9,7 @@ "fix:clippy": "just clippy --fix", "lint:clippy": "just clippy", "test:codspeed": "cargo codspeed run -p hashql-core", - "test:miri": "cargo miri nextest run -- co_sort try_scan heap::transfer stable_empty_slice id::slice tarjan::tests::members", + "test:miri": "cargo miri nextest run -- co_sort try_scan heap::transfer stable_empty_slice id::slice tarjan::tests::members symbol", "test:unit": "mise run test:unit @rust/hashql-core" }, "dependencies": { diff --git a/libs/@local/hashql/core/src/heap/mod.rs b/libs/@local/hashql/core/src/heap/mod.rs index 23c30e7c6d1..6049dfa8c6a 100644 --- a/libs/@local/hashql/core/src/heap/mod.rs +++ b/libs/@local/hashql/core/src/heap/mod.rs @@ -18,7 +18,7 @@ //! // Intern strings for efficient comparison //! let sym1 = heap.intern_symbol("hello"); //! let sym2 = heap.intern_symbol("hello"); -//! assert!(std::ptr::eq(sym1.as_str(), sym2.as_str())); // Same pointer +//! assert!(core::ptr::eq(sym1.as_str(), sym2.as_str())); // Same pointer //! ``` //! //! # Type Aliases @@ -80,8 +80,8 @@ //! //! ## [`TransferInto`] //! -//! Copy borrowed data (`&[T]` or `&str`) into the arena. Only implemented for arena allocators -//! to prevent memory leaks from creating `&'static` references: +//! Copy borrowed data (`&[T]` or `&str`) into the arena. The returned reference is tied to the +//! arena's lifetime, ensuring the data is freed when the arena resets: //! //! ``` //! 
# #![feature(allocator_api)] @@ -101,10 +101,8 @@ mod scratch; mod transfer; use core::{alloc, mem, ptr}; -use std::sync::Mutex; use ::alloc::{boxed, collections::vec_deque, vec}; -use hashbrown::HashSet; use self::allocator::{Allocator, AllocatorScope, Checkpoint}; pub use self::{ @@ -116,8 +114,8 @@ pub use self::{ transfer::TransferInto, }; use crate::{ - collections::{FastHashSet, fast_hash_set_with_capacity}, - symbol::{Symbol, sym::TABLES}, + symbol::{Symbol, SymbolTable}, + sync::lock::LocalLock, }; /// A boxed value allocated on the `Heap`. @@ -145,18 +143,14 @@ pub type VecDeque<'heap, T> = vec_deque::VecDeque; pub type HashMap<'heap, K, V, S = foldhash::fast::RandomState> = hashbrown::HashMap; -/// An arena allocator for AST nodes and collections with string interning. +/// An arena allocator for AST nodes, collections, and symbols. /// /// Combines a bump allocator with a string interning table for deduplicated -/// symbol storage. Interned strings enable O(1) comparison via pointer equality. +/// symbol storage. Interned symbols enable O(1) comparison via pointer equality. #[derive(Debug)] pub struct Heap { inner: Allocator, - // Interned strings stored as `&'static str` for implementation convenience. - // SAFETY: The `'static` is a lie. These point into arena memory and are safe because: - // - All access goes through `Symbol<'heap>`, bounding the effective lifetime - // - This set is cleared before `inner.reset()` is called - strings: Mutex>, + strings: LocalLock, } impl Heap { @@ -178,7 +172,7 @@ impl Heap { pub fn uninitialized() -> Self { Self { inner: Allocator::new(), - strings: Mutex::default(), + strings: LocalLock::new(SymbolTable::new()), } } @@ -195,13 +189,16 @@ impl Heap { /// /// Panics if the heap is already primed. 
pub fn prime(&mut self) { - let strings = self.strings.get_mut().expect("lock should not be poisoned"); + let strings = self.strings.get_mut(); assert!( strings.is_empty(), "heap has already been primed or has interned symbols" ); - Self::prime_symbols(strings); + // SAFETY: We have verified that the symbol table is empty. + unsafe { + strings.prime(); + } } /// Creates a new heap. @@ -214,12 +211,16 @@ impl Heap { #[must_use] #[inline] pub fn new() -> Self { - let mut strings = fast_hash_set_with_capacity(0); - Self::prime_symbols(&mut strings); + let mut table = SymbolTable::new(); + + // SAFETY: fresh symbol table is empty + unsafe { + table.prime(); + } Self { inner: Allocator::new(), - strings: Mutex::new(strings), + strings: LocalLock::new(table), } } @@ -232,18 +233,22 @@ impl Heap { #[must_use] #[inline] pub fn with_capacity(capacity: usize) -> Self { - let mut strings = fast_hash_set_with_capacity(0); - Self::prime_symbols(&mut strings); + let mut table = SymbolTable::new(); + + // SAFETY: fresh symbol table is empty + unsafe { + table.prime(); + } Self { inner: Allocator::with_capacity(capacity), - strings: Mutex::new(strings), + strings: LocalLock::new(table), } } /// Allocates a value in the arena, returning a mutable reference. /// - /// Only accepts types that do **not** require [`Drop`]. Types requiring destructors + /// Only accepts types that do not require [`Drop`]. Types requiring destructors /// must use [`heap::Box`](Box) or [`heap::Vec`](Vec) instead. #[inline] pub fn alloc(&self, value: T) -> &mut T { @@ -252,16 +257,6 @@ impl Heap { self.inner.alloc_with(|| value) } - fn prime_symbols(strings: &mut FastHashSet<&'static str>) { - strings.reserve(TABLES.iter().map(|table| table.len()).sum()); - - for &table in TABLES { - for symbol in table { - assert!(strings.insert(symbol.as_str())); - } - } - } - /// Interns a string symbol, returning a reference to the interned value. 
/// /// If the string has already been interned, returns the existing [`Symbol`] pointing @@ -275,24 +270,18 @@ impl Heap { /// /// Panics if the internal mutex is poisoned. pub fn intern_symbol<'this>(&'this self, value: &str) -> Symbol<'this> { - let mut strings = self.strings.lock().expect("lock should not be poisoned"); - - if let Some(&string) = strings.get(value) { - return Symbol::new_unchecked(string); - } - - let string = &*value.transfer_into(self); - - // SAFETY: The `'static` lifetime is a lie to enable HashSet storage. - // Sound because: (1) external access is through `Symbol<'this>`, (2) strings - // are cleared before arena reset, (3) `reset()` requires `&mut self`. - #[expect(unsafe_code)] - let string: &'static str = unsafe { &*ptr::from_ref::(string) }; - - strings.insert(string); - drop(strings); - - Symbol::new_unchecked(string) + let mut strings = self.strings.lock(); + + // SAFETY: `SymbolTable::intern` requires: + // 1. No dangling pointers: The table is reset before the arena in `Heap::reset`. + // 2. Allocator consistency: We always pass `self` as the allocator. + // 3. Allocator lifetime: `self` outlives the returned `Repr`. + let repr = unsafe { strings.intern(self, value) }; + + // SAFETY: The `Repr` was just interned with `self` as the allocator, so it is + // valid for `'this`. Runtime symbols point into `self.inner`, and constant + // symbols have static lifetime. + unsafe { Symbol::from_repr(repr) } } } @@ -347,12 +336,14 @@ impl ResetAllocator for Heap { /// Panics if the internal mutex is poisoned. #[inline] fn reset(&mut self) { - // IMPORTANT: Clear strings BEFORE resetting the arena to prevent dangling references. - // The HashSet stores `&'static str` that actually point into arena memory. 
{ - let mut strings = self.strings.lock().expect("lock should not be poisoned"); - strings.clear(); - Self::prime_symbols(&mut strings); + let mut strings = self.strings.lock(); + + // SAFETY: The symbol table is reset before the arena, so no dangling references exist. + unsafe { + strings.reset(); + }; + drop(strings); } diff --git a/libs/@local/hashql/core/src/lib.rs b/libs/@local/hashql/core/src/lib.rs index 64671748fd5..861e8482e9e 100644 --- a/libs/@local/hashql/core/src/lib.rs +++ b/libs/@local/hashql/core/src/lib.rs @@ -3,6 +3,7 @@ //! ## Workspace dependencies #![cfg_attr(doc, doc = simple_mermaid::mermaid!("../docs/dependency-diagram.mmd"))] #![expect(clippy::indexing_slicing)] +#![recursion_limit = "256"] #![feature( // Language Features arbitrary_self_types, @@ -20,6 +21,8 @@ assert_matches, binary_heap_into_iter_sorted, clone_from_ref, + const_cmp, + const_trait_impl, debug_closure_helpers, extend_one, formatting_options, @@ -30,6 +33,7 @@ slice_partition_dedup, slice_swap_unchecked, step_trait, + str_from_raw_parts, try_trait_v2, variant_count, )] diff --git a/libs/@local/hashql/core/src/pretty/mod.rs b/libs/@local/hashql/core/src/pretty/mod.rs index 178729361b4..b7fa841ce28 100644 --- a/libs/@local/hashql/core/src/pretty/mod.rs +++ b/libs/@local/hashql/core/src/pretty/mod.rs @@ -24,11 +24,11 @@ //! let fmt = Formatter::new(&heap); //! //! let doc = fmt -//! .keyword(sym::lexical::r#let) +//! .keyword(sym::r#let) //! .append(fmt.space()) //! .append(fmt.literal_str("43")) //! .append(fmt.space()) -//! .append(fmt.punct(sym::symbol::assign)) +//! .append(fmt.punct(sym::symbol::eq)) //! .append(fmt.space()) //! .append(fmt.literal_str("42")); //! 
diff --git a/libs/@local/hashql/core/src/symbol/lookup.rs b/libs/@local/hashql/core/src/symbol/lookup.rs new file mode 100644 index 00000000000..61c6360b0b4 --- /dev/null +++ b/libs/@local/hashql/core/src/symbol/lookup.rs @@ -0,0 +1,238 @@ +use core::ops::Index; + +use super::Symbol; +use crate::{ + collections::FastHashMap, + id::{Id, IdVec}, +}; + +#[derive(Debug)] +enum SymbolLookupInner<'heap, I> { + Dense(IdVec>), + Gapped(IdVec>>), + Sparse(FastHashMap>), +} + +/// A mapping from identifiers to symbols optimized for different access patterns. +/// +/// `SymbolLookup` provides efficient storage and retrieval of [`Symbol`] instances which are tied +/// to a specific identifier (which is any type that implements the [`Id`] trait). +/// +/// # Storage Strategies +/// +/// To accommodate different access patterns, `SymbolLookup` supports three storage strategies: +/// +/// ## Dense Storage +/// +/// Created with [`SymbolLookup::dense()`], this mode uses a [`Vec`] internally and requires +/// IDs to be inserted sequentially starting from 0. This provides optimal memory efficiency +/// and cache performance for contiguous ID ranges. +/// +/// ## Gapped Storage +/// +/// Created with [`SymbolLookup::gapped()`], this mode uses a [`Vec`] of [`Option`] +/// internally and allows insertion at arbitrary indices. Unlike dense storage, gaps are allowed in +/// the ID sequence. This provides a balance between the memory efficiency of dense storage and the +/// flexibility of sparse storage, making it ideal for scenarios where most IDs are contiguous but +/// some gaps may exist. +/// +/// ## Sparse Storage +/// +/// Created with [`SymbolLookup::sparse()`], this mode uses a [`FastHashMap`] internally and +/// supports arbitrary ID insertion order. This provides flexibility at the cost of higher +/// memory overhead per entry. 
+/// +/// # Examples +/// +/// ``` +/// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; +/// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); +/// # let mut heap = Heap::new(); +/// # let symbol = heap.intern_symbol("example"); +/// // Dense storage for sequential IDs +/// let mut dense_table = SymbolLookup::<MyId>::dense(); +/// dense_table.insert(MyId::from_u32(0), symbol); +/// assert_eq!(dense_table.get(MyId::from_u32(0)), Some(symbol)); +/// +/// // Gapped storage for mostly contiguous IDs with some gaps +/// let mut gapped_table = SymbolLookup::<MyId>::gapped(); +/// gapped_table.insert(MyId::from_u32(0), symbol); +/// gapped_table.insert(MyId::from_u32(5), symbol); // Gap at IDs 1-4 +/// assert_eq!(gapped_table.get(MyId::from_u32(0)), Some(symbol)); +/// assert_eq!(gapped_table.get(MyId::from_u32(2)), None); // Gap +/// assert_eq!(gapped_table.get(MyId::from_u32(5)), Some(symbol)); +/// +/// // Sparse storage for arbitrary IDs +/// let mut sparse_table = SymbolLookup::<MyId>::sparse(); +/// sparse_table.insert(MyId::from_u32(100), symbol); +/// assert_eq!(sparse_table.get(MyId::from_u32(100)), Some(symbol)); +/// sparse_table.insert(MyId::from_u32(5), symbol); +/// assert_eq!(sparse_table.get(MyId::from_u32(5)), Some(symbol)); +/// ``` +#[derive(Debug)] +pub struct SymbolLookup<'heap, I> { + inner: SymbolLookupInner<'heap, I>, +} + +impl<'heap, I> SymbolLookup<'heap, I> +where + I: Id, +{ + /// Creates a new symbol table using dense vector-based storage. + /// + /// Dense tables require sequential ID insertion starting from 0 and provide + /// optimal memory efficiency and cache performance for contiguous ID ranges. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{symbol::SymbolLookup, newtype}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// let table = SymbolLookup::<MyId>::dense(); + /// // Insertions must be sequential: 0, 1, 2, ... 
+ /// ``` + #[must_use] + pub const fn dense() -> Self { + Self { + inner: SymbolLookupInner::Dense(IdVec::new()), + } + } + + /// Creates a new symbol table using gapped vector-based storage. + /// + /// Gapped tables allow insertion at arbitrary indices within a vector, automatically + /// filling gaps with `None` values. This provides better memory locality than sparse + /// tables while still allowing non-contiguous ID ranges. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{symbol::SymbolLookup, newtype}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// let table = SymbolLookup::<MyId>::gapped(); + /// // Insertions can have gaps: 0, 5, 3, 10, ... + /// ``` + #[must_use] + pub const fn gapped() -> Self { + Self { + inner: SymbolLookupInner::Gapped(IdVec::new()), + } + } + + /// Creates a new symbol table using sparse hash-based storage. + /// + /// Sparse tables support arbitrary ID insertion order and provide flexibility + /// for non-contiguous ID ranges at the cost of higher memory overhead per entry. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{symbol::SymbolLookup, newtype}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// let table = SymbolLookup::<MyId>::sparse(); + /// // Insertions can be in any order: 100, 5, 1000, ... + /// ``` + #[must_use] + pub fn sparse() -> Self { + Self { + inner: SymbolLookupInner::Sparse(FastHashMap::default()), + } + } + + /// Inserts a symbol associated with the given identifier. + /// + /// - For dense tables, the `id` must be sequential starting from 0. + /// - For gapped tables, any `id` value is accepted, and gaps will be filled with `None`. + /// - For sparse tables, any `id` value is accepted. + /// + /// If the `id` already exists in a gapped or sparse table, the previous symbol is replaced. + /// + /// # Panics + /// + /// Panics if this is a dense table and the `id` is not sequential (i.e., not equal + /// to the current length of the internal vector). 
+ /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// # let mut heap = Heap::new(); + /// # let symbol = heap.intern_symbol("example"); + /// let mut table = SymbolLookup::<MyId>::dense(); + /// table.insert(MyId::from_u32(0), symbol); // First insertion + /// table.insert(MyId::from_u32(1), symbol); // Sequential insertion + /// ``` + /// + /// Non-sequential insertions will panic in dense tables: + /// + /// ```should_panic + /// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// # let mut heap = Heap::new(); + /// # let symbol = heap.intern_symbol("example"); + /// let mut table = SymbolLookup::<MyId>::dense(); + /// table.insert(MyId::from_u32(0), symbol); // First insertion + /// table.insert(MyId::from_u32(2), symbol); // Non-sequential insertion + /// ``` + pub fn insert(&mut self, id: I, symbol: Symbol<'heap>) { + match &mut self.inner { + SymbolLookupInner::Dense(vec) => { + assert_eq!( + id, + vec.bound(), + "insertions into dense symbol tables must be sequential and contiguous" + ); + + vec.push(symbol); + } + SymbolLookupInner::Gapped(vec) => { + vec.insert(id, symbol); + } + SymbolLookupInner::Sparse(map) => { + map.insert(id, symbol); + } + } + } + + /// Retrieves the symbol associated with the given identifier. + /// + /// Returns the [`Symbol`] if the `id` exists in the table, or [`None`] if + /// the `id` is not found or if the entry is a gap (in gapped tables). 
+ /// + /// # Examples + /// + /// ``` + /// # use hashql_core::{heap::Heap, symbol::SymbolLookup, newtype, id::Id as _}; + /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); + /// # let mut heap = Heap::new(); + /// # let symbol = heap.intern_symbol("example"); + /// let mut table = SymbolLookup::::sparse(); + /// table.insert(MyId::from_u32(42), symbol); + /// + /// assert_eq!(table.get(MyId::from_u32(42)), Some(symbol)); + /// assert_eq!(table.get(MyId::from_u32(99)), None); + /// ``` + pub fn get(&self, id: I) -> Option> { + match &self.inner { + SymbolLookupInner::Dense(vec) => vec.get(id).copied(), + SymbolLookupInner::Gapped(vec) => vec.get(id).copied().flatten(), + SymbolLookupInner::Sparse(map) => map.get(&id).copied(), + } + } +} + +impl<'heap, I> Index for SymbolLookup<'heap, I> +where + I: Id, +{ + type Output = Symbol<'heap>; + + fn index(&self, index: I) -> &Self::Output { + match &self.inner { + SymbolLookupInner::Dense(vec) => &vec[index], + SymbolLookupInner::Gapped(vec) => vec[index].as_ref().expect("index out of bounds"), + SymbolLookupInner::Sparse(map) => &map[&index], + } + } +} diff --git a/libs/@local/hashql/core/src/symbol/mod.rs b/libs/@local/hashql/core/src/symbol/mod.rs index 94f8d355f83..7ab38ad8f0d 100644 --- a/libs/@local/hashql/core/src/symbol/mod.rs +++ b/libs/@local/hashql/core/src/symbol/mod.rs @@ -7,76 +7,254 @@ //! //! The module provides: //! -//! - [`Symbol`]: An opaque wrapper around string data that enables efficient storage and comparison -//! - [`SymbolTable`]: A mapping from identifiers to symbols optimized for different access patterns +//! - [`Symbol`]: An interned string reference used throughout the compiler +//! - [`ConstantSymbol`]: A wrapper for predefined symbols, enabling pattern matching +//! - [`SymbolLookup`]: A mapping from identifiers to symbols optimized for different access +//! patterns //! - [`Ident`]: A named identifier with source location and categorization //! 
- [`IdentKind`]: Classification of different identifier types in HashQL //! -//! ## Design Philosophy +//! # Pattern Matching on Predefined Symbols //! -//! The [`Symbol`] type is designed as an opaque wrapper around its internal string storage. -//! This encapsulation enables future optimizations such as string interning (either through -//! the `string_interner` crate or a custom implementation) without requiring API changes. +//! Use [`Symbol::as_constant()`] to match against predefined symbols from the [`sym`] module: +//! +//! ``` +//! # use hashql_core::symbol::{Symbol, sym}; +//! fn classify(symbol: Symbol<'_>) -> &'static str { +//! match symbol.as_constant() { +//! Some(sym::r#let::CONST) => "let keyword", +//! Some(sym::r#if::CONST) => "if keyword", +//! Some(sym::Integer::CONST) => "Integer type", +//! _ => "other", +//! } +//! } +//! ``` +mod lookup; +mod repr; pub mod sym; mod table; use core::{ cmp::Ordering, - fmt::{self, Display, Formatter}, - hash::{Hash, Hasher}, - ptr, + fmt::{self, Debug, Display, Formatter}, + hash::Hash, + marker::PhantomData, }; -pub use self::table::SymbolTable; +pub use self::lookup::SymbolLookup; +use self::repr::{ConstantRepr, Repr}; +pub(crate) use self::table::SymbolTable; use crate::span::SpanId; -/// A string-like value used throughout the HashQL compiler. +/// A predefined symbol that can be used in pattern matching. +/// +/// This is a structural wrapper around a constant symbol index, designed to +/// enable exhaustive pattern matching on predefined symbols. Unlike [`Symbol`], +/// which uses a tagged pointer that cannot appear in const patterns, `ConstantSymbol` +/// is a simple newtype over an index that derives [`PartialEq`] and [`Eq`] structurally. 
+/// +/// # Usage +/// +/// Obtained via [`Symbol::as_constant()`], then matched against `sym::NAME::CONST`: +/// +/// ``` +/// # use hashql_core::symbol::{Symbol, ConstantSymbol, sym}; +/// fn handle_keyword(sym: Symbol<'_>) { +/// if let Some(c) = sym.as_constant() { +/// match c { +/// sym::r#let::CONST => println!("let keyword"), +/// sym::r#fn::CONST => println!("fn keyword"), +/// _ => {} +/// } +/// } +/// } +/// ``` +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub struct ConstantSymbol { + repr: ConstantRepr, +} + +impl ConstantSymbol { + /// Creates a `ConstantSymbol` from a raw index without bounds checking. + /// + /// This is used by the [`sym`] macro to generate constant symbol definitions. + /// The index must be valid for the static `SYMBOLS` table. + #[inline] + const fn new_unchecked(index: usize) -> Self { + Self { + repr: ConstantRepr::new_unchecked(index), + } + } + + #[inline] + const fn from_repr(repr: ConstantRepr) -> Self { + Self { repr } + } +} + +/// An interned string reference used throughout the HashQL compiler. /// /// Symbols represent string data that appears in source code and persists throughout -/// compilation, they are read-only and immutable. +/// compilation. They are read-only, immutable, and designed for efficient comparison +/// and hashing. /// -/// This type is deliberately opaque to hide its internal representation, -/// allowing for future optimizations like string interning without changing -/// the public API. Symbols are designed to be efficient for long-lived objects -/// that are frequently compared, hashed, and referenced during compilation. +/// # Pattern Matching /// -/// The caller must ensure that the string is unique and interned. The types correctness requires -/// relies on these *but it does not enforce it*. 
-#[derive(Debug, Copy, Clone)] -pub struct Symbol<'heap>(&'heap str); +/// Use [`as_constant()`](Self::as_constant) to extract a [`ConstantSymbol`] for pattern +/// matching against predefined symbols from the [`sym`] module. +// We can rely on the derives for PartialEq, Eq, and Hash, as `_marker` is ignored, and the +// internal representation makes a pointer comparison. +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Symbol<'heap> { + repr: Repr, + _marker: PhantomData<&'heap ()>, +} +#[expect(unsafe_code)] impl<'heap> Symbol<'heap> { - /// Creates a new interned symbol from a string slice. + #[inline] + const fn from_constant(constant: ConstantSymbol) -> Self { + Self { + repr: Repr::constant(constant.repr), + _marker: PhantomData, + } + } + + /// Creates a [`Symbol`] from a raw [`Repr`]. + /// + /// # Safety + /// + /// The caller must ensure: /// - /// The caller must ensure that the string is unique and interned. - pub(crate) const fn new_unchecked(string: &'heap str) -> Self { - Self(string) + /// - For runtime symbols: the [`Repr`] must point to a valid allocation that remains live for + /// the `'heap` lifetime. + /// - For constant symbols: the [`Repr`] must encode a valid index into the static symbol table. + /// - The symbol must be properly interned (unique string content maps to unique [`Repr`]). + #[inline] + pub(crate) const unsafe fn from_repr(repr: Repr) -> Self { + Symbol { + repr, + _marker: PhantomData, + } } + #[inline] + const fn into_repr(self) -> Repr { + self.repr + } + + /// Returns the constant symbol representation if this is a predefined symbol. 
+ /// + /// Use this to pattern match against predefined symbols from the [`sym`] module: + /// + /// ``` + /// # use hashql_core::symbol::{Symbol, sym}; + /// fn is_keyword(sym: Symbol<'_>) -> bool { + /// matches!( + /// sym.as_constant(), + /// Some(sym::r#let::CONST | sym::r#if::CONST | sym::r#fn::CONST) + /// ) + /// } + /// ``` + /// + /// Returns [`None`] for runtime (heap-allocated) symbols. + pub fn as_constant(self) -> Option { + self.repr + .try_as_constant_symbol() + .map(ConstantSymbol::from_repr) + } + + /// Returns the string content of this symbol. + /// + /// The returned reference is valid for the lifetime of this symbol. For access with the + /// full `'heap` lifetime, use [`unwrap()`](Self::unwrap) instead. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::symbol::sym; + /// assert_eq!(sym::Integer.as_str(), "Integer"); + /// assert_eq!(sym::r#let.as_str(), "let"); + /// ``` + /// + /// ``` + /// # use hashql_core::heap::Heap; + /// let heap = Heap::new(); + /// let symbol = heap.intern_symbol("hello"); + /// assert_eq!(symbol.as_str(), "hello"); + /// ``` #[must_use] - pub const fn as_str(&self) -> &str { - self.0 + #[inline] + pub fn as_str(&self) -> &str { + // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. + unsafe { self.repr.as_str() } } - /// Returns the string representation of the symbol. + /// Returns the string content with the full heap lifetime. + /// + /// Unlike [`as_str()`](Self::as_str), this method returns a reference with the `'heap` + /// lifetime rather than the symbol's own lifetime. This is useful when the string needs + /// to outlive the symbol itself. + /// + /// Note that the returned string should be treated as no longer subject to the interning + /// guarantee—it's just a plain `&str`. 
+ /// + /// # Examples /// - /// Unlike [`Self::as_str`], this method provides access for the lifetime of the interner - /// instead of the symbol itself, somewhat circumventing the protections given to the symbol - /// itself. Any unwrapped type should be considered no longer unique and interned. + /// ``` + /// # use hashql_core::symbol::sym; + /// let s: &'static str = sym::Integer.unwrap(); + /// assert_eq!(s, "Integer"); + /// ``` #[must_use] - pub const fn unwrap(&self) -> &'heap str { - self.0 + #[inline] + pub fn unwrap(self) -> &'heap str { + // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. + unsafe { self.repr.as_str() } } + /// Returns the raw bytes of this symbol's string content. + /// + /// # Examples + /// + /// ``` + /// # use hashql_core::symbol::sym; + /// assert_eq!(sym::Integer.as_bytes(), b"Integer"); + /// ``` #[must_use] - pub const fn as_bytes(&self) -> &[u8] { - self.0.as_bytes() + #[inline] + pub fn as_bytes(&self) -> &[u8] { + // SAFETY: Symbol carries a `'heap` lifetime, that is tied to the allocation of the string. + unsafe { self.repr.as_bytes() } } + /// Returns the demangled name, stripping any suffix after the last `:`. + /// + /// This is used for symbols with mangled names (e.g., `"foo:123"` → `"foo"`). + /// If there is no `:`, returns the full symbol content. 
+ /// + /// # Examples + /// + /// ``` + /// # use hashql_core::heap::Heap; + /// let heap = Heap::new(); + /// + /// let mangled = heap.intern_symbol("variable:42"); + /// assert_eq!(mangled.demangle(), "variable"); + /// + /// let plain = heap.intern_symbol("plain_name"); + /// assert_eq!(plain.demangle(), "plain_name"); + /// + /// let multiple = heap.intern_symbol("a:b:c"); + /// assert_eq!(multiple.demangle(), "a:b"); + /// ``` #[must_use] + #[inline] pub fn demangle(self) -> &'heap str { - self.0.rsplit_once(':').map_or(self.0, |(name, _)| name) + let value = self.unwrap(); + + value.rsplit_once(':').map_or(value, |(name, _)| name) } } @@ -87,15 +265,6 @@ impl AsRef for Symbol<'_> { } } -impl PartialEq for Symbol<'_> { - fn eq(&self, other: &Self) -> bool { - // Pointer equality implies string equality (due to the unique contents assumption) - ptr::eq(self.0, other.0) - } -} - -impl Eq for Symbol<'_> {} - impl PartialOrd for Symbol<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -109,21 +278,20 @@ impl Ord for Symbol<'_> { if self == other { Ordering::Equal } else { - self.0.cmp(other.0) + self.as_str().cmp(other.as_str()) } } } -impl Hash for Symbol<'_> { - fn hash(&self, state: &mut H) { - // Pointer hashing is sufficient (due to the unique contents assumption) - ptr::hash(self.0, state); +impl Debug for Symbol<'_> { + fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { + fmt.debug_tuple("Symbol").field(&self.as_str()).finish() } } impl Display for Symbol<'_> { fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result { - Display::fmt(self.0, fmt) + Display::fmt(self.as_str(), fmt) } } @@ -265,6 +433,23 @@ pub struct Ident<'heap> { } impl<'heap> Ident<'heap> { + /// Creates a synthetic identifier with no source location. + /// + /// Synthetic identifiers are used for compiler-generated names that don't + /// correspond to any location in source code. 
+ /// + /// # Examples + /// + /// ``` + /// # use hashql_core::symbol::{Ident, IdentKind, sym}; + /// # use hashql_core::span::SpanId; + /// let ident = Ident::synthetic(sym::foo); + /// + /// assert_eq!(ident.span, SpanId::SYNTHETIC); + /// assert_eq!(ident.value, sym::foo); + /// assert_eq!(ident.kind, IdentKind::Lexical); + /// assert_eq!(ident.as_ref(), "foo"); + /// ``` #[must_use] pub const fn synthetic(value: Symbol<'heap>) -> Self { Self { @@ -283,6 +468,77 @@ impl AsRef for Ident<'_> { impl Display for Ident<'_> { fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - Display::fmt(&self.value.0, fmt) + Display::fmt(&self.value.as_str(), fmt) + } +} + +const _: () = { + assert!(size_of::() == size_of::()); + assert!(size_of::>() == size_of::()); +}; + +#[cfg(test)] +mod tests { + #![expect(clippy::min_ident_chars, clippy::many_single_char_names)] + use core::{cmp::Ordering, hash::BuildHasher as _}; + use std::hash::RandomState; + + use super::sym; + use crate::heap::Heap; + + #[test] + fn symbol_equality() { + let heap = Heap::new(); + let a = heap.intern_symbol("foo"); + let b = heap.intern_symbol("bar"); + let c = heap.intern_symbol("bar"); + let d = sym::Integer; + let e = sym::String; + let f = sym::String; + + assert_ne!(a, b); + assert_eq!(b, c); + assert_ne!(c, d); + assert_ne!(d, e); + assert_eq!(e, f); + } + + #[test] + fn symbol_ordering() { + let heap = Heap::new(); + let a = heap.intern_symbol("aaa"); + let b = sym::bar; + let c = heap.intern_symbol("ccc"); + + assert_eq!(a.cmp(&b), Ordering::Less); + assert_eq!(b.cmp(&c), Ordering::Less); + assert_eq!(c.cmp(&a), Ordering::Greater); + assert_eq!(b.cmp(&b), Ordering::Equal); + } + + #[test] + fn symbol_consistent_hashing() { + let heap = Heap::new(); + let a = heap.intern_symbol("test"); + + let hasher = RandomState::new(); + + assert_eq!(hasher.hash_one(a), hasher.hash_one(a.repr)); + } + + #[test] + fn interned_predefined_returns_constant() { + let heap = Heap::new(); + let interned = 
heap.intern_symbol("let"); + + assert_eq!(interned.as_constant(), Some(sym::r#let::CONST)); + } + + #[test] + fn runtime_symbol_returns_no_constant() { + let heap = Heap::new(); + let runtime = heap.intern_symbol("not_a_keyword"); + + assert!(runtime.as_constant().is_none()); } } diff --git a/libs/@local/hashql/core/src/symbol/repr.rs b/libs/@local/hashql/core/src/symbol/repr.rs new file mode 100644 index 00000000000..891017c53a8 --- /dev/null +++ b/libs/@local/hashql/core/src/symbol/repr.rs @@ -0,0 +1,488 @@ +//! Compact symbol representation using tagged pointers. +//! +//! This module provides [`Repr`], a single-word representation for symbols that can be either: +//! +//! - **Runtime symbols**: Heap-allocated on a bump allocator with inline string data +//! - **Constant symbols**: Indices into a static string table, encoded directly in pointer bits +//! +//! # Design Goals +//! +//! - **Compact**: `Repr` is exactly one pointer in size (8 bytes on 64-bit) +//! - **Niche optimization**: `Option` is also one pointer in size +//! - **Efficient**: Symbols are frequently created but rarely accessed +//! +//! # Tagged Pointer Scheme +//! +//! Uses the lowest bit as a discriminant tag (possible because allocations are 2-byte aligned): +//! +//! - Bit 0 = `0`: Runtime symbol (pointer to [`RuntimeRepr`] allocation) +//! - Bit 0 = `1`: Constant symbol (index shifted left by 1, `OR`ed with tag) +//! +//! # Provenance +//! +//! Runtime symbols store a [`NonNull`] rather than a reference to preserve +//! full allocation provenance. Creating `&RuntimeRepr` would narrow provenance to just the +//! header, causing undefined behavior when accessing the trailing inline bytes under strict +//! provenance / Stacked Borrows. 
+#![expect(unsafe_code)] + +use alloc::alloc::handle_alloc_error; +use core::{ + alloc::{AllocError, Layout}, + mem, + num::NonZero, + ptr::{self, NonNull}, +}; + +use super::sym::SYMBOLS; +use crate::heap::BumpAllocator; + +/// Header for a runtime-allocated symbol with inline string data. +/// +/// # Memory Layout +/// +/// ```text +/// ┌──────────────┬──────────────────────┐ +/// │ len: usize │ data: [u8; len] │ +/// └──────────────┴──────────────────────┘ +/// ``` +/// +/// The `data` field is a zero-sized array marker; actual bytes are allocated +/// immediately after the header. The struct uses `#[repr(C)]` to guarantee +/// this layout. +/// +/// # Provenance +/// +/// References to this type (`&RuntimeSymbol`) only have provenance for the header, +/// not the trailing bytes. All access must go through [`NonNull`] +/// to preserve full allocation provenance. +#[repr(C, align(2))] +pub(crate) struct RuntimeRepr { + len: usize, + data: [u8; 0], +} + +impl RuntimeRepr { + /// Computes the allocation layout for a runtime symbol with `len` bytes of data. + #[inline] + fn layout(len: usize) -> Layout { + Layout::from_size_align( + size_of::().checked_add(len).expect("overflow"), + mem::align_of::(), + ) + .expect("invalid RuntimeSymbol layout") + } + + /// Allocates a runtime symbol containing `value` on the given allocator. + /// + /// Returns a [`NonNull`] pointer with provenance for the entire allocation, + /// including the trailing string bytes. + /// + /// # Panics + /// + /// Panics if allocation fails. + pub(crate) fn alloc(alloc: &A, value: &str) -> NonNull { + let Ok(value) = Self::try_alloc(alloc, value) else { + handle_alloc_error(Self::layout(value.len())) + }; + + value + } + + /// Attempts to allocate a runtime symbol containing `value`. + /// + /// # Errors + /// + /// Returns [`AllocError`] if the allocator cannot satisfy the request. 
+ fn try_alloc(alloc: &A, value: &str) -> Result, AllocError> { + let len = value.len(); + + let layout = Self::layout(value.len()); + + let ptr = alloc.allocate(layout)?.cast::(); + + // SAFETY: `ptr` points to a freshly allocated block of `layout` size. + // We write `len` to the header and copy `len` bytes of string data + // immediately after the header, which fits within the allocation. + unsafe { + ptr.cast::().write(len); + + let buf = ptr.add(1).cast::(); + ptr::copy_nonoverlapping(value.as_ptr(), buf.as_ptr(), len); + } + + Ok(ptr) + } + + /// Returns a pointer to the inline string data. + /// + /// This performs pointer arithmetic without dereferencing, so it is safe. + /// The returned pointer has provenance for the trailing bytes if `this` + /// has provenance for the full allocation. + #[inline] + const fn data_ptr(this: NonNull) -> NonNull { + // SAFETY: `this` points to a valid `RuntimeSymbol` allocation, which + // always has at least `size_of::()` bytes. Adding 1 moves past + // the header to the inline data region. + unsafe { this.add(1) }.cast() + } + + /// Reads the length of the inline string data. + /// + /// # Safety + /// + /// - `this` must point to a valid, initialized [`RuntimeRepr`] allocation. + /// - The allocation must remain live for the duration of this call. + #[inline] + const unsafe fn len(this: NonNull) -> usize { + // SAFETY: Caller guarantees `this` points to a valid, initialized allocation. + unsafe { this.cast::().read() } + } + + /// Returns the inline data as a byte slice. + /// + /// # Safety + /// + /// - `this` must point to a valid, initialized [`RuntimeRepr`] allocation. + /// - The allocation must remain live for the lifetime `'a`. + /// - The returned slice must not be mutated for the lifetime `'a`. + #[inline] + const unsafe fn as_bytes<'a>(this: NonNull) -> &'a [u8] { + // SAFETY: Caller guarantees `this` is valid and the allocation outlives `'a`. 
+ // `data_ptr` returns a pointer to the inline bytes, and `len` returns the count. + unsafe { core::slice::from_raw_parts(Self::data_ptr(this).as_ptr(), Self::len(this)) } + } + + /// Returns the inline data as a string slice. + /// + /// # Safety + /// + /// - `this` must point to a valid, initialized [`RuntimeRepr`] allocation. + /// - The allocation must remain live for the lifetime `'a`. + /// - The returned string must not be mutated for the lifetime `'a`. + #[inline] + const unsafe fn as_str<'a>(this: NonNull) -> &'a str { + // SAFETY: Caller guarantees `this` is valid and the allocation outlives `'a`. + // The bytes are valid UTF-8 because they were copied from a `&str` in `try_alloc`. + unsafe { core::str::from_raw_parts(Self::data_ptr(this).as_ptr(), Self::len(this)) } + } +} + +/// A constant symbol represented as an index into [`SYMBOLS`]. +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub(crate) struct ConstantRepr(usize); + +impl ConstantRepr { + #[inline] + pub(crate) const fn new_unchecked(index: usize) -> Self { + Self(index) + } + + /// Returns the string value without bounds checking. + /// + /// # Safety + /// + /// The index must be within bounds of [`SYMBOLS`]. + #[inline] + pub(super) unsafe fn as_str_unchecked(self) -> &'static str { + // SAFETY: Caller guarantees the index is in bounds. + unsafe { SYMBOLS.get_unchecked(self.0) } + } + + /// Returns the byte slice for this constant symbol without bounds checking. + /// + /// # Safety + /// + /// The index must be within bounds of [`SYMBOLS`]. + #[inline] + pub(super) unsafe fn as_bytes_unchecked(self) -> &'static [u8] { + // SAFETY: Constant symbols return &'static str, which coerces to &'static [u8]. + unsafe { self.as_str_unchecked().as_bytes() } + } +} + +/// A compact, single-word representation for symbols. 
+/// +/// Uses a tagged pointer to distinguish between runtime and constant symbols: +/// +/// - **Runtime** (tag = 0): Pointer to a [`RuntimeRepr`] allocation +/// - **Constant** (tag = 1): Index into [`SYMBOLS`] encoded in the pointer bits +/// +/// # Size +/// +/// `Repr` is exactly one pointer in size. Thanks to [`NonNull`], `Option` +/// is also one pointer in size (niche optimization). +#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub(crate) struct Repr { + ptr: NonNull, +} + +// SAFETY: while NonNull (for niche optimization), the pointer itself is only accessed via `*const` +// ptr and never modified. The underlying data is Send + Sync. +unsafe impl Send for Repr {} +// SAFETY: while NonNull (for niche optimization), the pointer itself is only accessed via `*const` +// ptr and never modified. The underlying data is Send + Sync. +unsafe impl Sync for Repr {} + +impl Repr { + /// Minimum alignment for runtime symbol allocations. + /// + /// Must be at least 2 to ensure the lowest bit is always 0 for valid pointers. + const MIN_ALIGN: usize = 2; + /// Tag value for constant symbols (bit 0 = 1). + const TAG_CONSTANT: usize = 0b1; + /// Bitmask for extracting the tag from a pointer address. + const TAG_MASK: usize = 0b1; + /// Tag value for runtime symbols (bit 0 = 0). + const TAG_RUNTIME: usize = 0b0; + /// Number of bits used for the tag (determines how much to shift indices). + const TAG_SHIFT: u32 = 1; + + /// Returns the tag value (0 for runtime, 1 for constant). + #[inline] + fn tag(self) -> usize { + self.ptr.addr().get() & Self::TAG_MASK + } + + /// Extracts the runtime symbol pointer. + /// + /// # Safety + /// + /// - `self` must have been created via [`Repr::runtime`]. + /// - The underlying allocation must still be live. 
+ #[inline] + unsafe fn as_runtime(self) -> NonNull { + debug_assert!(self.tag() == Self::TAG_RUNTIME); + + self.ptr + .map_addr(|addr| { + // SAFETY: Runtime symbols are aligned to at least MIN_ALIGN (2), so the + // lowest bit is always 0. Masking it off preserves a valid, non-zero address. + unsafe { NonZero::new_unchecked(addr.get() & !Self::TAG_MASK) } + }) + .cast::() + } + + /// Extracts the constant symbol index. + /// + /// # Safety + /// + /// - `self` must have been created via [`Repr::constant`]. + #[inline] + unsafe fn as_constant(self) -> ConstantRepr { + debug_assert!(self.tag() == Self::TAG_CONSTANT); + + let addr = self.ptr.addr().get(); + ConstantRepr((addr & !Self::TAG_MASK) >> Self::TAG_SHIFT) + } + + #[inline] + pub(super) fn try_as_constant_symbol(self) -> Option { + if self.tag() != Self::TAG_CONSTANT { + return None; + } + + // SAFETY: We have just verified that the tag is constant. + Some(unsafe { self.as_constant() }) + } + + /// Returns the string content of this symbol. + /// + /// # Safety + /// + /// - For runtime symbols: the allocation must remain live for lifetime `'str`. + /// - The returned string must not be mutated for lifetime `'str`. + #[inline] + pub(crate) unsafe fn as_str<'str>(self) -> &'str str { + if self.tag() == Self::TAG_RUNTIME { + // SAFETY: Caller guarantees the allocation is live for 'str. + unsafe { RuntimeRepr::as_str(self.as_runtime()) } + } else { + // SAFETY: Constant symbols return &'static str, which coerces to &'str. + unsafe { self.as_constant().as_str_unchecked() } + } + } + + /// Returns the byte content of this symbol. + /// + /// # Safety + /// + /// - For runtime symbols: the allocation must remain live for lifetime `'str`. + /// - The returned bytes must not be mutated for lifetime `'str`. + #[inline] + pub(crate) unsafe fn as_bytes<'str>(self) -> &'str [u8] { + if self.tag() == Self::TAG_RUNTIME { + // SAFETY: Caller guarantees the allocation is live for 'str. 
+ unsafe { RuntimeRepr::as_bytes(self.as_runtime()) } + } else { + // SAFETY: Constant symbols return &'static str, which coerces to &'str. + unsafe { self.as_constant().as_bytes_unchecked() } + } + } + + /// Creates a `Repr` for a constant symbol. + /// + /// The index is encoded directly in the pointer bits (shifted to make room for the tag). + #[inline] + pub(crate) const fn constant(constant: ConstantRepr) -> Self { + const { + assert!( + Self::TAG_CONSTANT != 0, + "Constant symbol tag must be non-zero" + ); + } + + debug_assert!( + (constant.0 << Self::TAG_SHIFT >> Self::TAG_SHIFT) == constant.0, + "constant has set the top most bit" + ); + debug_assert!(constant.0 < SYMBOLS.len(), "constant is out of range"); + + let addr = (constant.0 << Self::TAG_SHIFT) | Self::TAG_CONSTANT; + let ptr = ptr::without_provenance_mut(addr); + + Self { + // SAFETY: TAG_CONSTANT is non-zero, therefore `addr` is non-null. + ptr: unsafe { NonNull::new_unchecked(ptr) }, + } + } + + /// Creates a `Repr` for a runtime symbol. + /// + /// The pointer is stored directly with its tag bit set to 0 (which is a no-op + /// since runtime allocations are already aligned). + #[inline] + pub(crate) fn runtime(symbol: NonNull) -> Self { + const { + assert!(align_of::() >= Self::MIN_ALIGN); + } + + let ptr = symbol.map_addr(|addr| addr | Self::TAG_RUNTIME).cast(); + + Self { ptr } + } +} + +const _: () = { + assert!(size_of::() == size_of::<*const ()>()); + assert!(size_of::>() == size_of::<*const ()>()); + assert!(align_of::() >= Repr::MIN_ALIGN); +}; + +#[cfg(test)] +mod tests { + #![expect(clippy::non_ascii_literal)] + + use super::{ConstantRepr, Repr, RuntimeRepr, SYMBOLS}; + use crate::heap::Scratch; + + #[test] + fn constant_symbol_first_entry() { + let constant = ConstantRepr(0); + let repr = Repr::constant(constant); + + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. 
+ assert_eq!(unsafe { repr.as_str() }, SYMBOLS[0]); + } + + #[test] + fn constant_symbol_first_entry_unchecked() { + let constant = ConstantRepr(0); + + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. + assert_eq!(unsafe { constant.as_str_unchecked() }, SYMBOLS[0]); + } + + #[test] + fn constant_symbol_second_entry() { + let constant = ConstantRepr(1); + let repr = Repr::constant(constant); + + // SAFETY: `repr` is a constant symbol with a valid index, no allocation lifetime concerns. + assert_eq!(unsafe { repr.as_str() }, SYMBOLS[1]); + } + + #[test] + fn runtime_symbol_empty_string() { + let heap = Scratch::new(); + let symbol = RuntimeRepr::alloc(&heap, ""); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. + assert_eq!(unsafe { repr.as_str() }, ""); + } + + #[test] + fn runtime_symbol_simple_string() { + let heap = Scratch::new(); + let symbol = RuntimeRepr::alloc(&heap, "hello"); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. + assert_eq!(unsafe { repr.as_str() }, "hello"); + } + + #[test] + fn runtime_symbol_unicode() { + let heap = Scratch::new(); + let symbol = RuntimeRepr::alloc(&heap, "日本語 🎉 émojis"); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. + assert_eq!(unsafe { repr.as_str() }, "日本語 🎉 émojis"); + } + + #[test] + fn runtime_symbol_long_string() { + let heap = Scratch::new(); + let long_string = "a".repeat(10_000); + let symbol = RuntimeRepr::alloc(&heap, &long_string); + let repr = Repr::runtime(symbol); + + // SAFETY: `heap` is live for the duration of this assertion. 
+ assert_eq!(unsafe { repr.as_str() }, long_string); + } + + #[test] + fn multiple_runtime_symbols() { + let heap = Scratch::new(); + + let symbol1 = RuntimeRepr::alloc(&heap, "first"); + let symbol2 = RuntimeRepr::alloc(&heap, "second"); + let symbol3 = RuntimeRepr::alloc(&heap, "third"); + + let repr1 = Repr::runtime(symbol1); + let repr2 = Repr::runtime(symbol2); + let repr3 = Repr::runtime(symbol3); + + // SAFETY: `heap` is live for the duration of these assertions. + assert_eq!(unsafe { repr1.as_str() }, "first"); + // SAFETY: `heap` is live for the duration of these assertions. + assert_eq!(unsafe { repr2.as_str() }, "second"); + // SAFETY: `heap` is live for the duration of these assertions. + assert_eq!(unsafe { repr3.as_str() }, "third"); + } + + #[test] + fn tag_distinguishes_constant_from_runtime() { + let heap = Scratch::new(); + + let constant = Repr::constant(ConstantRepr(0)); + let runtime = Repr::runtime(RuntimeRepr::alloc(&heap, "test")); + + assert_eq!(constant.tag(), Repr::TAG_CONSTANT); + assert_eq!(runtime.tag(), Repr::TAG_RUNTIME); + } + + #[test] + fn runtime_symbol_stores_correct_length() { + let heap = Scratch::new(); + let symbol = RuntimeRepr::alloc(&heap, "hello"); + + // SAFETY: `symbol` points to a valid allocation and `heap` is live. + unsafe { + assert_eq!(RuntimeRepr::len(symbol), 5); + assert_eq!(RuntimeRepr::as_str(symbol).len(), 5); + } + } +} diff --git a/libs/@local/hashql/core/src/symbol/sym.rs b/libs/@local/hashql/core/src/symbol/sym.rs index e276aa5b355..578e7327372 100644 --- a/libs/@local/hashql/core/src/symbol/sym.rs +++ b/libs/@local/hashql/core/src/symbol/sym.rs @@ -1,66 +1,167 @@ -//! This module defines a collection of static symbol constants used throughout the codebase. -//! -//! # Usage -//! -//! These symbols should only ever be imported with the `sym` prefix to avoid naming conflicts -//! and maintain clarity about where the symbols are defined. For example: -//! -//! ```rust -//! 
use hashql_core::symbol::sym; -//! -//! // Correct usage: -//! let add_symbol = sym::lexical::add; -//! let asterisk = sym::symbol::asterisk; -//! -//! // Incorrect usage (avoid): -//! // use crate::symbol::sym::lexical::*; -//! ``` -//! -//! These symbols provide pointer equality guarantees when interned from a `Heap`, -//! which allows for efficient symbol comparison operations. -#![expect(non_upper_case_globals, clippy::min_ident_chars)] -use super::Symbol; +#![expect(non_upper_case_globals, non_snake_case, clippy::min_ident_chars)] +use super::{ConstantSymbol, Symbol}; -/// Macro for defining groups of static symbol constants. +/// Generates pre-interned symbols available at compile time. /// -/// This macro creates modules containing static `Symbol` instances and -/// generates tables that group these symbols for efficient lookup. +/// This macro produces three artifacts from a single symbol table definition: /// -/// The macro supports several forms: -/// - Basic symbol: uses the identifier name as the symbol value -/// - Custom symbol: allows specifying a custom string value with the `name: "value"` syntax -/// - Special handling for Rust keywords using the `r#` prefix +/// 1. **`SYMBOLS`** - A static slice of string values for interner pre-population +/// 2. **Symbol constants** - `Symbol<'static>` constants (e.g., `sym::foo`, `sym::symbol::plus`) +/// 3. **`LOOKUP`** - A static slice mapping string values to their [`Repr`] for fast lookup /// -/// For each symbol group, this macro also creates a corresponding table of references -/// to all symbols in that group. +/// # Syntax +/// +/// ```text +/// symbols! 
{@table; +/// // Simple symbol: name becomes both the constant and string value +/// foo, +/// +/// // Explicit string: use when string differs from identifier +/// r#true: "true", +/// input_exists: "$exists", +/// +/// // Nested module: groups related symbols under a namespace +/// symbol: { +/// plus: "+", +/// minus: "-", +/// }, +/// } +/// ``` +/// +/// Each symbol `name` or `name: "value"` generates: +/// - A constant `name: Symbol<'static>` with auto-generated docs +/// - A submodule `name` containing `CONST: ConstantSymbol` for pattern matching +/// +/// Modules create nested namespaces, so `symbol::plus` becomes accessible as `sym::symbol::plus`. +/// +/// # Internal Rules +/// +/// The macro uses internal rules (prefixed with `@`) to process the token stream: +/// +/// - **`@strings`** - Collects all string values into the `SYMBOLS` slice +/// - **`@consts`** - Generates `Symbol` constants and companion modules with index tracking +/// - **`@consts @cont`** - Continuation after processing a nested module to resume counting +/// - **`@lookup`** - Builds the string-to-repr mapping table for runtime lookup +/// - **`@path`** - Helper to construct module paths (reverses accumulated path segments) +/// - **`@table`** - Entry point that dispatches to all three generators +/// +/// Index tracking uses the `${count($count)}` metavariable to assign sequential indices. +/// Each processed symbol appends `()` to the count accumulator, and `${count(...)}` returns +/// the number of elements. +/// +/// [`Repr`]: super::repr::Repr macro_rules! symbols { - (@sym) => {}; - (@sym $name:ident $(, $($rest:tt)*)?) => { - pub static $name: super::Symbol<'static> = super::Symbol::new_unchecked(stringify!($name)); - $(symbols!(@sym $($rest)*);)? + (@strings [$($acc:tt)*];) => { + pub(crate) static SYMBOLS: &[&str] = &[ + $($acc),* + ]; }; - (@sym $name:ident : $value:literal $(, $($rest:tt)*)?) 
=> { - pub static $name: super::Symbol<'static> = super::Symbol::new_unchecked($value); - $(symbols!(@sym $($rest)*);)? + (@strings [$($acc:tt)*]; , $($rest:tt)*) => { + symbols!(@strings [$($acc)*]; $($rest)*); }; - (@table $module:ident $table:ident #($($name:ident)*)) => { - const $table: &[&Symbol<'static>] = &[ - $(&$module::$name),* - ]; + (@strings [$($acc:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)*]; $($inner)* $(, $($rest)*)?); }; - (@table $module:ident $table:ident #($($acc:tt)*) $name:ident $(: $value:literal)? $(, $($rest:tt)*)?) => { - symbols!(@table $module $table #($($acc)* $name) $($($rest)*)?); + (@strings [$($acc:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)* $value]; $($($rest)*)?); + }; + (@strings [$($acc:tt)*]; $name:ident $(, $($rest:tt)*)?) => { + symbols!(@strings [$($acc)* (stringify!($name))]; $($($rest)*)?); + }; + + (@consts @cont [$($count:tt)*] [$($next:tt)*];) => { + symbols!(@consts [$($count)*]; $($next)*); }; - ($module:ident; $table:ident; $($items:tt)*) => { + (@consts @cont [$($count:tt)*] [$($next:tt)*]; , $($rest:tt)*) => { + symbols!(@consts @cont [$($count)*] [$($next)*]; $($rest)*); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + compile_error!("nested modules in modules are not supported"); + }; + (@consts @cont [$($count:tt)*] [$($next:tt)*]; $name:ident $(, $($rest:tt)*)?) => { + symbols!(@consts @cont [$($count)* ()] [$($next)*]; $($($rest)*)?); + }; + + (@consts [$($count:tt)*];) => {}; + (@consts [$($count:tt)*]; , $($rest:tt)*) => { + symbols!(@consts [$($count)*]; $($rest)*); + }; + (@consts [$($count:tt)*]; $name:ident : $value:literal $(, $($rest:tt)*)?) 
=> { + const _: () = { assert!(SYMBOLS[${count($count)}] == $value) }; + #[doc = concat!("The symbol `", $value, "`")] + pub const $name: Symbol<'static> = Symbol::from_constant($name::CONST); + + pub mod $name { + use super::*; + + pub const CONST: ConstantSymbol = ConstantSymbol::new_unchecked(${count($count)}); + } + + symbols!(@consts [$($count)* ()]; $($($rest)*)?); + }; + (@consts [$($count:tt)*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { pub mod $module { - symbols!(@sym $($items)*); + use super::*; + + symbols!(@consts [$($count)*]; $($inner)*); } - symbols!(@table $module $table #() $($items)*); + symbols!(@consts @cont [$($count)*] [$($($rest)*)?]; $($inner)*); + }; + (@consts [$($count:tt)*]; $name:ident $(, $($rest:tt)*)?) => { + const _: () = { assert!(SYMBOLS[${count($count)}] == stringify!($name)) }; + #[doc = concat!("The symbol `", stringify!($name), "`")] + pub const $name: Symbol<'static> = Symbol::from_constant($name::CONST); + + pub mod $name { + use super::*; + + pub const CONST: ConstantSymbol = ConstantSymbol::new_unchecked(${count($count)}); + } + + symbols!(@consts [$($count)* ()]; $($($rest)*)?); + }; + + (@path [] [$($path:ident)*];) => { + $($path)::* + }; + (@path [$next:tt $($rest:tt)*] [$($path:tt)*];) => { + symbols!(@path [$($rest)*] [$next $($path)*];) + }; + + (@lookup [$(, $arm:expr => $value:expr)*] [$($path:tt),*];) => { + pub(crate) static LOOKUP: &[(&'static str, super::repr::Repr)] = &[ + $(($arm, $value.into_repr())),* + ]; + }; + (@lookup [$($arms:tt)*] [$tail:tt $(, $path:tt)*]; | $($rest:tt)*) => { + symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; , $($rest:tt)*) => { + symbols!(@lookup [$($arms)*] [$($path),*]; $($rest)*); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident : $value:literal $(, $($rest:tt)*)?) 
=> { + symbols!(@lookup [$($arms)*, $value => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $module:ident : { $($inner:tt)* } $(, $($rest:tt)*)?) => { + symbols!(@lookup [$($arms)*] [$module $(, $path)*]; $($inner)* ,| $($($rest)*)?); + }; + (@lookup [$($arms:tt)*] [$($path:tt),*]; $name:ident $(, $($rest:tt)*)?) => { + symbols!(@lookup [$($arms)*, stringify!($name) => symbols!(@path [$name $($path)*] [];)] [$($path),*]; $($($rest)*)?); + }; + + (@table; $($items:tt)*) => { + symbols!(@strings []; $($items)*); + symbols!(@consts []; $($items)*); + symbols!(@lookup [] [self]; $($items)*); }; } -symbols![lexical; LEXICAL; +symbols! {@table; + // [tidy] sort alphabetically start access, add, and, @@ -85,6 +186,7 @@ symbols![lexical; LEXICAL; Dict, div, draft_id, + dummy: "", E, edition, edition_id, @@ -168,106 +270,87 @@ symbols![lexical; LEXICAL; Url, vectors, web_id, -]; + // [tidy] sort alphabetically end -// Internal names are non user constructible -symbols![internal; INTERNAL; - ClosureEnv: "'" -]; + internal: { + ClosureEnv: "'" + }, -symbols![digit; DIGITS; - zero: "0", - one: "1", - two: "2", - three: "3", - four: "4", - five: "5", - six: "6", - seven: "7", - eight: "8", - nine: "9", -]; + symbol: { + // [tidy] sort alphabetically start + ampamp: "&&", + ampersand: "&", + arrow: "->", + arrow_head: "|>", + asterisk: "*", + exclamation: "!", + excleq: "!=", + brackets: "[]", + caret: "^", + colon: ":", + coloncolon: "::", + comma: ",", + dollar: "$", + dollar_question_mark: "$?", + dot: ".", + eq: "=", + eqeq: "==", + gt: ">", + gteq: ">=", + gtgt: ">>", + lt: "<", + lteq: "<=", + ltlt: "<<", + minus: "-", + pipepipe: "||", + pipe: "|", + plus: "+", + question_mark: "?", + slash: "/", + tilde: "~", + // [tidy] sort alphabetically end + }, -symbols![symbol; SYMBOLS; - add: "+", - ampersand: "&", - and: "&&", - arrow: "->", - arrow_head: "|>", - assign: "=", - asterisk: "*", - backets: "[]", 
- bit_shl: "<<", - bit_shr: ">>", - caret: "^", - colon: ":", - colon_colon: "::", - comma: ",", - dollar: "$", - dollar_question_mark: "$?", - dot: ".", - eq: "==", - exclamation_mark: "!", - gt: ">", - gte: ">=", - lt: "<", - lte: "<=", - ne: "!=", - or: "||", - pipe: "|", - question_mark: "?", - slash: "/", - sub: "-", - tilde: "~", -]; + digit: { + zero: "0", + one: "1", + two: "2", + three: "3", + four: "4", + five: "5", + six: "6", + seven: "7", + eight: "8", + nine: "9", + }, -symbols![path; PATHS; - option: "::core::option::Option", - some: "::core::option::Some", - none: "::core::option::None", - graph_head_entities: "::graph::head::entities", - graph_body_filter: "::graph::body::filter", - graph_tail_collect: "::graph::tail::collect", - Entity: "::graph::types::knowledge::entity::Entity" -]; - -pub(crate) const TABLES: &[&[&Symbol<'static>]] = &[LEXICAL, DIGITS, SYMBOLS, PATHS, INTERNAL]; + path: { + // [tidy] sort alphabetically start + Entity: "::graph::types::knowledge::entity::Entity", + graph_body_filter: "::graph::body::filter", + graph_head_entities: "::graph::head::entities", + graph_tail_collect: "::graph::tail::collect", + none: "::core::option::None", + option: "::core::option::Option", + some: "::core::option::Some", + // [tidy] sort alphabetically end + } +} #[cfg(test)] -mod test { - use core::ptr; +mod tests { + use std::collections::HashSet; - use super::TABLES; - use crate::{ - heap::{Heap, ResetAllocator as _}, - symbol::sym, - }; + use super::SYMBOLS; #[test] - fn pointer_equality_from_heap() { - let mut heap = Heap::new(); - - let mul_heap = heap.intern_symbol("*"); - let mul_sym = sym::symbol::asterisk; + fn symbols_are_unique() { + let mut set = HashSet::with_capacity(SYMBOLS.len()); - assert!(ptr::eq(mul_heap.0, mul_sym.0)); - - // even after reset that should be the case - heap.reset(); - - let mul_heap = heap.intern_symbol("*"); - let mul_sym = sym::symbol::asterisk; - - assert!(ptr::eq(mul_heap.0, mul_sym.0)); - } - - #[test] - 
fn ensure_no_collisions() { - let mut set = std::collections::HashSet::new(); - for &table in TABLES { - for &symbol in table { - assert!(set.insert(symbol.0)); - } + for symbol in SYMBOLS { + set.insert(*symbol); } + + assert_eq!(set.len(), SYMBOLS.len(), "duplicate symbol value found"); } } diff --git a/libs/@local/hashql/core/src/symbol/table.rs b/libs/@local/hashql/core/src/symbol/table.rs index eabcaf8d0c1..858da553711 100644 --- a/libs/@local/hashql/core/src/symbol/table.rs +++ b/libs/@local/hashql/core/src/symbol/table.rs @@ -1,238 +1,558 @@ -use core::ops::Index; +//! String interning table for HashQL symbols. +//! +//! This module provides [`SymbolTable`], a hash-based interner that maps strings to their +//! canonical [`Repr`] representation. The table supports two kinds of symbols: +//! +//! - **Constant symbols**: Statically defined symbols from [`sym::LOOKUP`]. Their [`Repr`] encodes +//! an index into the static [`sym::SYMBOLS`] array (effectively `'static` lifetime). +//! +//! - **Runtime symbols**: Dynamically interned strings allocated on a bump allocator. Their +//! [`Repr`] holds a pointer to a [`RuntimeRepr`] allocation. +//! +//! # Lifecycle and Epoch Coupling +//! +//! The `SymbolTable` is designed for epoch-based memory management where allocations are +//! made during a processing phase and then freed in bulk. The critical invariant is: +//! +//! **Runtime [`Repr`] values contain pointers to bump-allocated memory. When the bump +//! allocator resets, these pointers become dangling.** +//! +//! Therefore, the table must be reset **before** the bump allocator to prevent undefined +//! behavior from accessing dangling pointers during hash table operations. +//! +//! ## Correct Reset Ordering +//! +//! ```text +//! symbol_table.reset(); // Clear runtime Reprs, restore constants +//! heap.reset(); // Now safe: no dangling pointers in the table +//! ``` +//! +//! # Priming +//! +//! 
Calling [`SymbolTable::prime`] populates the table with predefined symbols from +//! [`sym::LOOKUP`]. This ensures that interning a predefined string returns its +//! canonical constant [`Repr`] rather than allocating a runtime symbol. +//! +//! [`sym::LOOKUP`]: super::sym::LOOKUP +//! [`sym::SYMBOLS`]: super::sym::SYMBOLS -use super::Symbol; -use crate::{ - collections::FastHashMap, - id::{Id, IdVec}, -}; +use alloc::alloc::Global; +use core::{alloc::Allocator, hash::BuildHasher as _}; -#[derive(Debug)] -enum SymbolTableInner<'heap, I> { - Dense(IdVec>), - Gapped(IdVec>>), - Sparse(FastHashMap>), -} +use foldhash::fast::RandomState; +use hashbrown::{HashTable, hash_table::Entry}; -/// A mapping from identifiers to symbols optimized for different access patterns. -/// -/// [`SymbolTable`] provides efficient storage and retrieval of [`Symbol`] instances which are tied -/// to a specific identifier (which is any type that implements the [`Id`] trait). -/// -/// # Storage Strategies -/// -/// To accommodate different access patterns, [`SymbolTable`] supports three storage strategies: +use super::repr::{Repr, RuntimeRepr}; +use crate::heap::BumpAllocator; + +/// A string interning table mapping `&str` to canonical [`Repr`] values. /// -/// ## Dense Storage +/// The table uses a [`HashTable`] with string-based hashing and equality. Two symbols +/// with identical string content will always map to the same [`Repr`]. /// -/// Created with [`SymbolTable::dense()`], this mode uses a [`Vec`] internally and requires -/// IDs to be inserted sequentially starting from 0. This provides optimal memory efficiency -/// and cache performance for contiguous ID ranges. +/// # Safety Contract /// -/// ## Gapped Storage +/// This type contains unsafe methods because runtime [`Repr`] values hold raw pointers +/// to bump-allocated memory. 
The caller must ensure: /// -/// Created with [`SymbolTable::gapped()`], this mode uses a [`Vec`] of [`Option`] -/// internally and allows insertion at arbitrary indices. Unlike dense storage, gaps are allowed in -/// the ID sequence. This provides a balance between the memory efficiency of dense storage and the -/// flexibility of sparse storage, making it ideal for scenarios where most IDs are contiguous but -/// some gaps may exist. +/// 1. **Epoch coupling**: [`reset`](Self::reset) must be called before resetting the bump allocator +/// that backs runtime symbols. Failure to do so causes undefined behavior when the table +/// attempts to hash or compare entries with dangling pointers. /// -/// ## Sparse Storage +/// 2. **Allocator consistency**: The same bump allocator instance must be used for all +/// [`intern`](Self::intern) calls on this table. /// -/// Created with [`SymbolTable::sparse()`], this mode uses a [`FastHashMap`] internally and -/// supports arbitrary ID insertion order. This provides flexibility at the cost of higher -/// memory overhead per entry. +/// 3. **Allocator lifetime**: The bump allocator passed to [`intern`](Self::intern) must remain +/// live for as long as the table is in use (i.e., until [`reset`](Self::reset) is called). /// -/// # Examples +/// 4. **Priming precondition**: [`prime`](Self::prime) must only be called on an empty table +/// (typically after [`clear`](Self::clear)). 
/// -/// ``` -/// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; -/// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); -/// # let mut heap = Heap::new(); -/// # let symbol = heap.intern_symbol("example"); -/// // Dense storage for sequential IDs -/// let mut dense_table = SymbolTable::::dense(); -/// dense_table.insert(MyId::from_u32(0), symbol); -/// assert_eq!(dense_table.get(MyId::from_u32(0)), Some(symbol)); +/// # Drop Safety /// -/// // Gapped storage for mostly contiguous IDs with some gaps -/// let mut gapped_table = SymbolTable::::gapped(); -/// gapped_table.insert(MyId::from_u32(0), symbol); -/// gapped_table.insert(MyId::from_u32(5), symbol); // Gap at IDs 1-4 -/// assert_eq!(gapped_table.get(MyId::from_u32(0)), Some(symbol)); -/// assert_eq!(gapped_table.get(MyId::from_u32(2)), None); // Gap -/// assert_eq!(gapped_table.get(MyId::from_u32(5)), Some(symbol)); +/// Dropping the `SymbolTable` after the bump allocator has been reset is **safe**. +/// [`Repr`] has no [`Drop`] implementation, so dropping the table does not dereference +/// any runtime symbol pointers. Only *using* the table (e.g., calling [`intern`](Self::intern)) +/// after the allocator reset causes undefined behavior. /// -/// // Sparse storage for arbitrary IDs -/// let mut sparse_table = SymbolTable::::sparse(); -/// sparse_table.insert(MyId::from_u32(100), symbol); -/// assert_eq!(sparse_table.get(MyId::from_u32(100)), Some(symbol)); -/// sparse_table.insert(MyId::from_u32(5), symbol); -/// assert_eq!(sparse_table.get(MyId::from_u32(5)), Some(symbol)); -/// ``` +/// Note: This assumes the [`HashTable`]'s own allocator `A` (used for bucket storage) is +/// still valid. With the default `A = Global`, this is always the case. 
#[derive(Debug)] -pub struct SymbolTable<'heap, I> { - inner: SymbolTableInner<'heap, I>, +pub(crate) struct SymbolTable { + inner: HashTable, + hasher: RandomState, } -impl<'heap, I> SymbolTable<'heap, I> -where - I: Id, -{ - /// Creates a new symbol table using dense vector-based storage. +impl SymbolTable { + /// Creates a new, empty symbol table using the global allocator. /// - /// Dense tables require sequential ID insertion starting from 0 and provide - /// optimal memory efficiency and cache performance for contiguous ID ranges. - /// - /// # Examples + /// The table is not primed. Call [`prime`](Self::prime) to populate it with + /// predefined symbols before use. + #[inline] + pub(crate) fn new() -> Self { + Self::new_in(Global) + } +} + +impl SymbolTable { + /// Creates a new, empty symbol table using the given allocator. /// - /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::dense(); - /// // Insertions must be sequential: 0, 1, 2, ... - /// ``` - #[must_use] - pub const fn dense() -> Self { + /// The table is not primed. Call [`prime`](Self::prime) to populate it with + /// predefined symbols before use. + #[inline] + fn new_in(alloc: A) -> Self { Self { - inner: SymbolTableInner::Dense(IdVec::new()), + inner: HashTable::new_in(alloc), + hasher: RandomState::default(), } } - /// Creates a new symbol table using gapped vector-based storage. + /// Returns the number of symbols currently in the table. + #[cfg(test)] + pub(crate) fn len(&self) -> usize { + self.inner.len() + } + + /// Returns `true` if the table contains no symbols. + #[inline] + pub(crate) fn is_empty(&self) -> bool { + self.inner.is_empty() + } +} + +#[expect(unsafe_code)] +impl SymbolTable { + /// Removes all entries from the table. /// - /// Gapped tables allow insertion at arbitrary indices within a vector, automatically - /// filling gaps with `None` values. 
This provides better memory locality than sparse - /// tables while still allowing non-contiguous ID ranges. + /// After calling this method, the table is empty and must be primed before use. /// - /// # Examples + /// # Safety /// - /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::gapped(); - /// // Insertions can have gaps: 0, 5, 3, 10, ... - /// ``` - #[must_use] - pub const fn gapped() -> Self { - Self { - inner: SymbolTableInner::Gapped(IdVec::new()), - } + /// The caller must call [`prime`](Self::prime) before any subsequent [`intern`](Self::intern) + /// calls. Without priming, interning a predefined symbol (e.g., `"and"`) would allocate + /// a new runtime symbol instead of returning the canonical constant [`Repr`] that matches + /// the static symbols in [`sym`](super::sym). This would break the invariant that + /// predefined symbols intern to their canonical constant representations. + #[inline] + pub(crate) unsafe fn clear(&mut self) { + self.inner.clear(); } - /// Creates a new symbol table using sparse hash-based storage. + /// Populates the table with predefined symbols from [`sym::LOOKUP`]. /// - /// Sparse tables support arbitrary ID insertion order and provide flexibility - /// for non-contiguous ID ranges at the cost of higher memory overhead per entry. + /// After priming, interning any predefined symbol string will return its canonical + /// constant [`Repr`] rather than allocating a new runtime symbol. /// - /// # Examples + /// # Preconditions /// - /// ``` - /// # use hashql_core::{symbol::SymbolTable, newtype}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// let table = SymbolTable::::sparse(); - /// // Insertions can be in any order: 100, 5, 1000, ... - /// ``` - #[must_use] - pub fn sparse() -> Self { - Self { - inner: SymbolTableInner::Sparse(FastHashMap::default()), + /// The table must be empty. 
This is typically ensured by calling [`clear`](Self::clear) + /// beforehand, or by using a freshly constructed table. + /// + /// # Safety + /// + /// The caller must ensure that the table is empty before calling this method. + /// + /// [`sym::LOOKUP`]: super::sym::LOOKUP + pub(crate) unsafe fn prime(&mut self) { + self.inner.reserve(super::sym::LOOKUP.len(), |_| { + unreachable!("prime() requires an empty table; hasher callback should not be invoked") + }); + + for &(name, value) in super::sym::LOOKUP { + let hash = self.hasher.hash_one(name); + + self.inner.insert_unique(hash, value, |_| { + unreachable!("capacity was pre-reserved; hasher callback should not be invoked") + }); } } - /// Inserts a symbol associated with the given identifier. + /// Resets the table to its initial primed state. /// - /// - For dense tables, the `id` must be sequential starting from 0. - /// - For gapped tables, any `id` value is accepted, and gaps will be filled with `None`. - /// - For sparse tables, any `id` value is accepted. + /// This is equivalent to calling [`clear`](Self::clear) followed by [`prime`](Self::prime). + /// After resetting, the table contains only the predefined constant symbols. /// - /// If the `id` already exists in a gapped or sparse table, the previous symbol is replaced. + /// # Safety /// - /// # Panics + /// **This method must be called before resetting the bump allocator** that backs any + /// runtime symbols previously interned into this table. The reset ordering is: /// - /// Panics if this is a dense table and the `id` is not sequential (i.e., not equal - /// to the current length of the internal vector). 
+ /// ```text + /// symbol_table.reset(); // ← First: clear runtime Reprs while their pointers are still valid + /// heap.reset(); // ← Second: now safe to invalidate allocations + /// ``` /// - /// # Examples + /// Violating this ordering causes undefined behavior: the bump allocator reset + /// invalidates runtime symbol pointers, and subsequent table operations (including + /// this method's `clear()` + `prime()` sequence, or future `intern()` calls) may + /// attempt to dereference those dangling pointers. /// - /// ``` - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// # let mut heap = Heap::new(); - /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::dense(); - /// table.insert(MyId::from_u32(0), symbol); // First insertion - /// table.insert(MyId::from_u32(1), symbol); // Sequential insertion - /// ``` + /// # Invariants Restored /// - /// Non-sequential insertions will panic in dense tables: + /// After this method returns: + /// - All runtime symbols are removed from the table. + /// - All constant symbols from [`sym::LOOKUP`] are present. + /// - The table is ready for a new epoch of interning.
/// - /// ```should_panic - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// # let mut heap = Heap::new(); - /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::dense(); - /// table.insert(MyId::from_u32(0), symbol); // First insertion - /// table.insert(MyId::from_u32(2), symbol); // Non-sequential insertion - /// ``` - pub fn insert(&mut self, id: I, symbol: Symbol<'heap>) { - match &mut self.inner { - SymbolTableInner::Dense(vec) => { - assert_eq!( - id, - vec.bound(), - "insertions into dense symbol tables must be sequential and contiguous" - ); - - vec.push(symbol); - } - SymbolTableInner::Gapped(vec) => { - vec.insert(id, symbol); - } - SymbolTableInner::Sparse(map) => { - map.insert(id, symbol); - } + /// [`sym::LOOKUP`]: super::sym::LOOKUP + #[inline] + pub(crate) unsafe fn reset(&mut self) { + // SAFETY: `prime` runs right after `clear`, on the now-empty table it requires. + unsafe { + self.clear(); + self.prime(); } } - /// Retrieves the symbol associated with the given identifier. + /// Interns a string, returning its canonical [`Repr`]. /// - /// Returns the [`Symbol`] if the `id` exists in the table, or [`None`] if - /// the `id` is not found or if the entry is a gap (in gapped tables). + /// If the string has already been interned (either as a predefined constant or a + /// previously interned runtime symbol), returns the existing [`Repr`]. Otherwise, + /// allocates a new [`RuntimeRepr`] on the provided bump allocator and inserts it.
/// - /// # Examples + /// # Returns /// - /// ``` - /// # use hashql_core::{heap::Heap, symbol::SymbolTable, newtype, id::Id as _}; - /// # newtype!(struct MyId(u32 is 0..=0xFFFF_FF00)); - /// # let mut heap = Heap::new(); - /// # let symbol = heap.intern_symbol("example"); - /// let mut table = SymbolTable::::sparse(); - /// table.insert(MyId::from_u32(42), symbol); - /// - /// assert_eq!(table.get(MyId::from_u32(42)), Some(symbol)); - /// assert_eq!(table.get(MyId::from_u32(99)), None); - /// ``` - pub fn get(&self, id: I) -> Option> { - match &self.inner { - SymbolTableInner::Dense(vec) => vec.get(id).copied(), - SymbolTableInner::Gapped(vec) => vec.get(id).copied().flatten(), - SymbolTableInner::Sparse(map) => map.get(&id).copied(), + /// The canonical [`Repr`] for `value`. Interning the same string multiple times + /// is idempotent—subsequent calls return the same [`Repr`]. + /// + /// # Safety + /// + /// The caller must ensure: + /// + /// 1. **No dangling pointers**: The table must not contain dangling runtime [`Repr`] values. + /// This means [`reset`](Self::reset) must have been called before any preceding bump + /// allocator reset. + /// + /// 2. **Allocator consistency**: The same allocator instance must be used for all `intern()` + /// calls on this table. Using different allocators would result in runtime symbols from + /// multiple allocators, and resetting one would leave dangling pointers from the other. + /// + /// 3. **Allocator lifetime**: The allocator must remain live for the lifetime of this symbol + /// table, or until [`reset`](Self::reset) is called. All runtime [`Repr`] values in the + /// table point into the allocator's memory and are dereferenced during table operations. + /// + /// # Implementation Notes + /// + /// The table hashes and compares entries by their string content, not by [`Repr`] + /// identity. 
This means: + /// - Equality: `repr.as_str() == value` + /// - Hashing: `hash(repr.as_str())` + /// + /// Both operations dereference runtime [`Repr`] pointers, which is why the caller + /// must ensure no dangling pointers exist in the table. + pub(crate) unsafe fn intern(&mut self, alloc: &B, value: &str) -> Repr { + let hash = self.hasher.hash_one(value); + + // We hash against the string, therefore we must pull out the string representation, + // instead of hashing against the Repr directly, as that would lead to incorrect results. + // We're mapping string -> repr. But the string representation is already stored in the + // Repr. + match self.inner.entry( + hash, + // SAFETY: Caller guarantees no dangling runtime pointers in the table. + |repr| unsafe { repr.as_str() } == value, + // SAFETY: Same as above; this is called during rehashing. + |repr| self.hasher.hash_one(unsafe { repr.as_str() }), + ) { + Entry::Occupied(entry) => *entry.get(), + Entry::Vacant(entry) => { + let repr = Repr::runtime(RuntimeRepr::alloc(alloc, value)); + *entry.insert(repr).get() + } } } } -impl<'heap, I> Index for SymbolTable<'heap, I> -where - I: Id, -{ - type Output = Symbol<'heap>; +#[cfg(test)] +mod tests { + #![expect(unsafe_code, clippy::non_ascii_literal)] + + use super::{super::sym, SymbolTable}; + use crate::heap::Scratch; + + #[test] + fn new_table_is_empty() { + let table = SymbolTable::new(); + assert!(table.is_empty()); + assert_eq!(table.len(), 0); + } + + #[test] + fn prime_populates_table_with_lookup_entries() { + let mut table = SymbolTable::new(); + // SAFETY: Table is empty, no dangling pointers. + unsafe { + table.prime(); + } + + assert_eq!(table.len(), sym::LOOKUP.len()); + assert!(!table.is_empty()); + } + + #[test] + fn clear_removes_all_entries() { + let mut table = SymbolTable::new(); + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + assert!(!table.is_empty()); + + // SAFETY: We will not call intern() after this without priming first. 
+ unsafe { + table.clear(); + } + assert!(table.is_empty()); + assert_eq!(table.len(), 0); + } + + #[test] + fn reset_restores_primed_state() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + let initial_len = table.len(); + + // SAFETY: Table is primed, scratch is live. + unsafe { + table.intern(&scratch, "user_defined_symbol"); + }; + assert_eq!(table.len(), initial_len + 1); - fn index(&self, index: I) -> &Self::Output { - match &self.inner { - SymbolTableInner::Dense(vec) => &vec[index], - SymbolTableInner::Gapped(vec) => vec[index].as_ref().expect("index out of bounds"), - SymbolTableInner::Sparse(map) => &map[&index], + // SAFETY: Scratch has not been reset, so runtime pointers are valid. + unsafe { + table.reset(); + }; + assert_eq!(table.len(), initial_len); + } + + #[test] + fn intern_predefined_symbol_returns_constant_repr() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // Intern a predefined symbol (e.g., "and" from LOOKUP). + // The returned Repr should match the one in LOOKUP. + for &(name, expected_repr) in sym::LOOKUP { + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, name) }; + assert_eq!( + repr, expected_repr, + "predefined symbol '{name}' should return constant Repr" + ); + } + } + + #[test] + fn intern_is_idempotent() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + + // SAFETY: Table is primed, scratch is live. + let repr1 = unsafe { table.intern(&scratch, "my_custom_symbol") }; + // SAFETY: Table is primed, scratch is live. + let repr2 = unsafe { table.intern(&scratch, "my_custom_symbol") }; + + assert_eq!(repr1, repr2); + // SAFETY: scratch is live. 
+ assert_eq!(unsafe { repr1.as_str() }, "my_custom_symbol"); + } + + #[test] + fn intern_different_strings_returns_different_reprs() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // SAFETY: Table is primed, scratch is live. + let repr_foo = unsafe { table.intern(&scratch, "foo_unique") }; + // SAFETY: Table is primed, scratch is live. + let repr_bar = unsafe { table.intern(&scratch, "bar_unique") }; + + assert_ne!(repr_foo, repr_bar); + } + + #[test] + fn intern_empty_string() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, "") }; + + // SAFETY: scratch is live. + assert_eq!(unsafe { repr.as_str() }, ""); + } + + #[test] + fn intern_unicode_string() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + } + + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, "日本語 🎉 émojis") }; + + // SAFETY: scratch is live. + assert_eq!(unsafe { repr.as_str() }, "日本語 🎉 émojis"); + } + + #[test] + fn intern_long_string() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); } + + let long_string = "a".repeat(10_000); + + // SAFETY: Table is primed, scratch is live. + let repr = unsafe { table.intern(&scratch, &long_string) }; + + // SAFETY: scratch is live. + assert_eq!(unsafe { repr.as_str() }, long_string); + } + + #[test] + fn constants_survive_reset() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + + // Get a constant Repr by interning a predefined symbol. 
+ let (name, expected_repr) = sym::LOOKUP[0]; + // SAFETY: Table is primed, scratch is live. + let repr_before = unsafe { table.intern(&scratch, name) }; + assert_eq!(repr_before, expected_repr); + + // SAFETY: Scratch has not been reset. + unsafe { + table.reset(); + }; + + // SAFETY: Table is primed, scratch is live. + let repr_after = unsafe { table.intern(&scratch, name) }; + + // Constants should be identical across resets. + assert_eq!(repr_before, repr_after); + } + + #[test] + fn runtime_symbols_cleared_on_reset() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + let primed_len = table.len(); + + // Intern some runtime symbols. + // SAFETY: Table is primed, scratch is live. + unsafe { + table.intern(&scratch, "runtime_1"); + table.intern(&scratch, "runtime_2"); + table.intern(&scratch, "runtime_3"); + } + assert_eq!(table.len(), primed_len + 3); + + // SAFETY: Scratch has not been reset. + unsafe { + table.reset(); + }; + + // Runtime symbols should be gone, only constants remain. + assert_eq!(table.len(), primed_len); + } + + #[test] + fn multiple_intern_operations_grow_table() { + let mut table = SymbolTable::new(); + let scratch = Scratch::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + let initial_len = table.len(); + + // SAFETY: Table is primed, scratch is live. + unsafe { + for i in 0..100 { + table.intern(&scratch, &format!("symbol_{i}")); + } + } + + assert_eq!(table.len(), initial_len + 100); + } + + /// Test that dropping a `SymbolTable` after the backing allocator has been reset + /// does not cause undefined behavior. + /// + /// This test is designed to be run under Miri to verify drop safety. + /// The key invariant: `Repr` has no `Drop` impl, so dropping the table + /// does not dereference any (now-dangling) runtime symbol pointers. 
+ #[test] + fn drop_after_allocator_reset_is_safe() { + let scratch = Scratch::new(); + let mut table = SymbolTable::new(); + + // SAFETY: Table is empty. + unsafe { + table.prime(); + }; + + // Intern several runtime symbols to ensure we have dangling pointers after reset. + // SAFETY: Table is primed, scratch is live. + unsafe { + table.intern(&scratch, "runtime_symbol_1"); + table.intern(&scratch, "runtime_symbol_2"); + table.intern(&scratch, "another_runtime_symbol"); + } + + // Drop the allocator FIRST - this invalidates all runtime symbol pointers. + // The table now contains dangling pointers, but we will NOT use it. + drop(scratch); + + // Drop the table. This should NOT cause UB because: + // - Repr has no Drop impl (it's Copy) + // - HashTable::drop doesn't hash/compare elements, just drops them in-place + // - Dropping a Repr is a no-op that doesn't dereference the pointer + drop(table); } } diff --git a/libs/@local/hashql/core/src/type/pretty.rs b/libs/@local/hashql/core/src/type/pretty.rs index e715adf69fd..67ed6d0558b 100644 --- a/libs/@local/hashql/core/src/type/pretty.rs +++ b/libs/@local/hashql/core/src/type/pretty.rs @@ -313,7 +313,7 @@ impl<'fmt, 'heap> FormatType<'fmt, TypeKind<'heap>> for TypeFormatter<'fmt, '_, TypeKind::Generic(generic) => self.format_type(generic), TypeKind::Param(param) => self.format_type(param), TypeKind::Infer(infer) => self.format_type(infer), - TypeKind::Never => self.fmt.type_name(sym::symbol::exclamation_mark), + TypeKind::Never => self.fmt.type_name(sym::symbol::exclamation), TypeKind::Unknown => self.fmt.type_name(sym::symbol::question_mark), }; @@ -397,11 +397,11 @@ impl<'fmt, 'heap> FormatType<'fmt, OpaqueType<'heap>> for TypeFormatter<'fmt, '_ impl<'fmt> FormatType<'fmt, PrimitiveType> for TypeFormatter<'fmt, '_, '_> { fn format_type(&mut self, value: PrimitiveType) -> Doc<'fmt> { match value { - PrimitiveType::Number => self.fmt.type_name(sym::lexical::Number), - PrimitiveType::Integer => 
self.fmt.type_name(sym::lexical::Integer), - PrimitiveType::String => self.fmt.type_name(sym::lexical::String), - PrimitiveType::Null => self.fmt.type_name(sym::lexical::Null), - PrimitiveType::Boolean => self.fmt.type_name(sym::lexical::Boolean), + PrimitiveType::Number => self.fmt.type_name(sym::Number), + PrimitiveType::Integer => self.fmt.type_name(sym::Integer), + PrimitiveType::String => self.fmt.type_name(sym::String), + PrimitiveType::Null => self.fmt.type_name(sym::Null), + PrimitiveType::Boolean => self.fmt.type_name(sym::Boolean), } } } @@ -409,14 +409,14 @@ impl<'fmt> FormatType<'fmt, PrimitiveType> for TypeFormatter<'fmt, '_, '_> { impl<'fmt> FormatType<'fmt, ListType> for TypeFormatter<'fmt, '_, '_> { fn format_type(&mut self, ListType { element }: ListType) -> Doc<'fmt> { self.fmt - .type_name(sym::lexical::List) + .type_name(sym::List) .append(self.fmt.angles(self.format_type(element))) } } impl<'fmt> FormatType<'fmt, DictType> for TypeFormatter<'fmt, '_, '_> { fn format_type(&mut self, DictType { key, value }: DictType) -> Doc<'fmt> { - self.fmt.type_name(sym::lexical::Dict).append( + self.fmt.type_name(sym::Dict).append( self.fmt.angles( self.fmt .comma_sep([self.format_type(key), self.format_type(value)]), diff --git a/libs/@local/hashql/core/src/value/primitive/float.rs b/libs/@local/hashql/core/src/value/primitive/float.rs index acd81091f63..0e14f094247 100644 --- a/libs/@local/hashql/core/src/value/primitive/float.rs +++ b/libs/@local/hashql/core/src/value/primitive/float.rs @@ -103,7 +103,7 @@ impl<'heap> Float<'heap> { /// Panics if the stored value is not a valid JSON-formatted floating-point number. /// This should never happen for properly constructed AST nodes. 
#[must_use] - pub fn as_f32(&self) -> f32 { + pub fn as_f32(self) -> f32 { f32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE) .expect("float literal should be formatted according to JSON specification") } @@ -140,7 +140,7 @@ impl<'heap> Float<'heap> { /// Panics if the stored value is not a valid JSON-formatted floating-point number. /// This should never happen for properly constructed AST nodes. #[must_use] - pub fn as_f64(&self) -> f64 { + pub fn as_f64(self) -> f64 { f64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE) .expect("float literal should be formatted according to JSON specification") } @@ -243,7 +243,7 @@ impl<'heap> Float<'heap> { /// assert_eq!(symbol.as_str(), "1.23e4"); /// ``` #[must_use] - pub const fn as_symbol(&self) -> Symbol<'heap> { + pub const fn as_symbol(self) -> Symbol<'heap> { self.value } } diff --git a/libs/@local/hashql/core/src/value/primitive/integer.rs b/libs/@local/hashql/core/src/value/primitive/integer.rs index b53b9719549..70d384dd7b7 100644 --- a/libs/@local/hashql/core/src/value/primitive/integer.rs +++ b/libs/@local/hashql/core/src/value/primitive/integer.rs @@ -83,7 +83,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("300").as_u8(), None); /// ``` #[must_use] - pub fn as_u8(&self) -> Option { + pub fn as_u8(self) -> Option { u8::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -110,7 +110,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("70000").as_u16(), None); /// ``` #[must_use] - pub fn as_u16(&self) -> Option { + pub fn as_u16(self) -> Option { u16::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -137,7 +137,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("5000000000").as_u32(), None); /// ``` #[must_use] - pub fn as_u32(&self) -> Option { + pub fn as_u32(self) -> Option { u32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), 
&PARSE).ok() } @@ -164,7 +164,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("99999999999999999999999").as_u64(), None); /// ``` #[must_use] - pub fn as_u64(&self) -> Option { + pub fn as_u64(self) -> Option { u64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -197,7 +197,7 @@ impl<'heap> Integer<'heap> { /// ); /// ``` #[must_use] - pub fn as_u128(&self) -> Option { + pub fn as_u128(self) -> Option { u128::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -224,7 +224,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("99999999999999999999999").as_usize(), None); /// ``` #[must_use] - pub fn as_usize(&self) -> Option { + pub fn as_usize(self) -> Option { usize::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -251,7 +251,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("200").as_i8(), None); /// ``` #[must_use] - pub fn as_i8(&self) -> Option { + pub fn as_i8(self) -> Option { i8::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -278,7 +278,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("50000").as_i16(), None); /// ``` #[must_use] - pub fn as_i16(&self) -> Option { + pub fn as_i16(self) -> Option { i16::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -305,7 +305,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("3000000000").as_i32(), None); /// ``` #[must_use] - pub fn as_i32(&self) -> Option { + pub fn as_i32(self) -> Option { i32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -332,7 +332,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("999999999999999999999").as_i64(), None); /// ``` #[must_use] - pub fn as_i64(&self) -> Option { + pub fn as_i64(self) -> Option { i64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -368,7 +368,7 @@ impl<'heap> 
Integer<'heap> { /// ); /// ``` #[must_use] - pub fn as_i128(&self) -> Option { + pub fn as_i128(self) -> Option { i128::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -395,7 +395,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(integer("99999999999999999999").as_isize(), None); /// ``` #[must_use] - pub fn as_isize(&self) -> Option { + pub fn as_isize(self) -> Option { isize::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &PARSE).ok() } @@ -428,7 +428,7 @@ impl<'heap> Integer<'heap> { /// Panics if the stored value is not a valid JSON-formatted integer. /// This should never happen for properly constructed AST nodes. #[must_use] - pub fn as_f32(&self) -> f32 { + pub fn as_f32(self) -> f32 { f32::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &float::PARSE) .expect("integer literal should be formatted according to JSON specification") } @@ -462,7 +462,7 @@ impl<'heap> Integer<'heap> { /// Panics if the stored value is not a valid JSON-formatted integer. /// This should never happen for properly constructed AST nodes. #[must_use] - pub fn as_f64(&self) -> f64 { + pub fn as_f64(self) -> f64 { f64::from_lexical_with_options::<{ format::JSON }>(self.value.as_bytes(), &float::PARSE) .expect("integer literal should be formatted according to JSON specification") } @@ -498,7 +498,7 @@ impl<'heap> Integer<'heap> { /// Panics if the stored value is not a valid JSON-formatted integer. /// This should never happen for properly constructed AST nodes. 
#[must_use] - pub fn as_real(&self) -> Real { + pub fn as_real(self) -> Real { Real::from_str(self.value.as_str()) .expect("integer literal should be formatted according to JSON specification") } @@ -527,7 +527,7 @@ impl<'heap> Integer<'heap> { /// assert_eq!(symbol.as_str(), "123456789012345678901234567890"); /// ``` #[must_use] - pub const fn as_symbol(&self) -> Symbol<'heap> { + pub const fn as_symbol(self) -> Symbol<'heap> { self.value } } diff --git a/libs/@local/hashql/core/src/value/primitive/string.rs b/libs/@local/hashql/core/src/value/primitive/string.rs index d4618d860b5..1fe0d42533d 100644 --- a/libs/@local/hashql/core/src/value/primitive/string.rs +++ b/libs/@local/hashql/core/src/value/primitive/string.rs @@ -57,7 +57,7 @@ impl<'heap> String<'heap> { /// assert_eq!(literal.as_str(), "Hello, world!"); /// ``` #[must_use] - pub const fn as_str(&self) -> &str { + pub fn as_str(&self) -> &str { self.value.as_str() } @@ -74,7 +74,7 @@ impl<'heap> String<'heap> { /// assert_eq!(literal.as_bytes(), b"Hello"); /// ``` #[must_use] - pub const fn as_bytes(&self) -> &[u8] { + pub fn as_bytes(&self) -> &[u8] { self.value.as_bytes() } @@ -91,7 +91,7 @@ impl<'heap> String<'heap> { /// assert_eq!(literal.as_symbol(), heap.intern_symbol("Hello")); /// ``` #[must_use] - pub const fn as_symbol(&self) -> Symbol<'heap> { + pub const fn as_symbol(self) -> Symbol<'heap> { self.value } } diff --git a/libs/@local/hashql/eval/src/graph/read/path.rs b/libs/@local/hashql/eval/src/graph/read/path.rs index 395d1d70cf2..810fb4e4477 100644 --- a/libs/@local/hashql/eval/src/graph/read/path.rs +++ b/libs/@local/hashql/eval/src/graph/read/path.rs @@ -97,14 +97,11 @@ impl<'heap> PartialQueryPath<'heap> for PartialEntityIdQueryPath { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::web_id { - Some(Self::WebId) - } else if field == sym::lexical::entity_uuid { - Some(Self::EntityUuid) - } else if field == 
sym::lexical::draft_id { - Some(Self::DraftId) - } else { - None + match field.as_constant()? { + sym::web_id::CONST => Some(Self::WebId), + sym::entity_uuid::CONST => Some(Self::EntityUuid), + sym::draft_id::CONST => Some(Self::DraftId), + _ => None, } } @@ -139,12 +136,10 @@ impl<'heap> PartialQueryPath<'heap> for PartialEntityRecordIdPath { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::entity_id { - Some(Self::EntityId(None)) - } else if field == sym::lexical::entity_edition_id { - Some(Self::EntityEditionId) - } else { - None + match field.as_constant()? { + sym::entity_id::CONST => Some(Self::EntityId(None)), + sym::entity_edition_id::CONST => Some(Self::EntityEditionId), + _ => None, } } @@ -191,12 +186,10 @@ impl<'heap> PartialQueryPath<'heap> for PartialLinkDataPath { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::left_entity_id { - Some(Self::LeftEntityId(None)) - } else if field == sym::lexical::right_entity_id { - Some(Self::RightEntityId(None)) - } else { - None + match field.as_constant()? { + sym::left_entity_id::CONST => Some(Self::LeftEntityId(None)), + sym::right_entity_id::CONST => Some(Self::RightEntityId(None)), + _ => None, } } @@ -310,14 +303,11 @@ impl<'heap> PartialQueryPath<'heap> for PartialEntityQueryPath<'heap> { type QueryPath = EntityQueryPath<'heap>; fn from_field(_: &'heap Heap, field: Symbol<'heap>) -> Option { - if field == sym::lexical::id { - Some(PartialEntityQueryPath::Id(None)) - } else if field == sym::lexical::properties { - Some(PartialEntityQueryPath::Properties(None)) - } else if field == sym::lexical::link_data { - Some(PartialEntityQueryPath::LinkData(None)) - } else { - None + match field.as_constant()? 
{ + sym::id::CONST => Some(PartialEntityQueryPath::Id(None)), + sym::properties::CONST => Some(PartialEntityQueryPath::Properties(None)), + sym::link_data::CONST => Some(PartialEntityQueryPath::LinkData(None)), + _ => None, } } diff --git a/libs/@local/hashql/hir/src/context.rs b/libs/@local/hashql/hir/src/context.rs index 9d0582ee182..bfeca6b0aaf 100644 --- a/libs/@local/hashql/hir/src/context.rs +++ b/libs/@local/hashql/hir/src/context.rs @@ -1,4 +1,4 @@ -use hashql_core::{heap::Heap, id::IdCounter, module::ModuleRegistry, symbol::SymbolTable}; +use hashql_core::{heap::Heap, id::IdCounter, module::ModuleRegistry, symbol::SymbolLookup}; use crate::{ intern::Interner, @@ -6,18 +6,18 @@ use crate::{ node::{HirId, r#let::VarId}, }; -pub type BinderSymbolTable<'heap> = SymbolTable<'heap, VarId>; +pub type BinderSymbolLookup<'heap> = SymbolLookup<'heap, VarId>; #[derive(Debug)] pub struct SymbolRegistry<'heap> { - pub binder: BinderSymbolTable<'heap>, + pub binder: BinderSymbolLookup<'heap>, } impl SymbolRegistry<'_> { #[must_use] pub const fn new() -> Self { Self { - binder: BinderSymbolTable::dense(), + binder: BinderSymbolLookup::dense(), } } } diff --git a/libs/@local/hashql/hir/src/node/operation/binary.rs b/libs/@local/hashql/hir/src/node/operation/binary.rs index e8d23632e12..d9ecea6a7dd 100644 --- a/libs/@local/hashql/hir/src/node/operation/binary.rs +++ b/libs/@local/hashql/hir/src/node/operation/binary.rs @@ -71,13 +71,13 @@ impl BinOp { #[must_use] pub const fn as_symbol(self) -> Symbol<'static> { match self { - Self::And => sym::symbol::and, - Self::Or => sym::symbol::or, - Self::Eq => sym::symbol::eq, + Self::And => sym::symbol::ampamp, + Self::Or => sym::symbol::pipepipe, + Self::Eq => sym::symbol::eqeq, Self::Lt => sym::symbol::lt, - Self::Lte => sym::symbol::lte, - Self::Ne => sym::symbol::ne, - Self::Gte => sym::symbol::gte, + Self::Lte => sym::symbol::lteq, + Self::Ne => sym::symbol::excleq, + Self::Gte => sym::symbol::gteq, Self::Gt => 
sym::symbol::gt, } } diff --git a/libs/@local/hashql/hir/src/node/operation/unary.rs b/libs/@local/hashql/hir/src/node/operation/unary.rs index 13cb1d7b773..8bb835e0c4f 100644 --- a/libs/@local/hashql/hir/src/node/operation/unary.rs +++ b/libs/@local/hashql/hir/src/node/operation/unary.rs @@ -32,9 +32,9 @@ impl UnOp { #[must_use] pub const fn as_symbol(self) -> Symbol<'static> { match self { - Self::Not => sym::symbol::exclamation_mark, + Self::Not => sym::symbol::exclamation, Self::BitNot => sym::symbol::tilde, - Self::Neg => sym::symbol::sub, + Self::Neg => sym::symbol::minus, } } } diff --git a/libs/@local/hashql/hir/src/pretty.rs b/libs/@local/hashql/hir/src/pretty.rs index 51c58fd6381..b8c562352d9 100644 --- a/libs/@local/hashql/hir/src/pretty.rs +++ b/libs/@local/hashql/hir/src/pretty.rs @@ -256,9 +256,9 @@ impl<'fmt, 'heap> FormatNode<'fmt, &List<'heap>> for NodeFormatter<'fmt, '_, 'he impl<'fmt, 'heap> FormatNode<'fmt, &Primitive<'heap>> for NodeFormatter<'fmt, '_, 'heap> { fn format_node(&mut self, node: &Primitive<'heap>) -> Doc<'fmt> { match node { - Primitive::Null => self.fmt.literal(sym::lexical::null), - Primitive::Boolean(true) => self.fmt.literal(sym::lexical::r#true), - Primitive::Boolean(false) => self.fmt.literal(sym::lexical::r#false), + Primitive::Null => self.fmt.literal(sym::null), + Primitive::Boolean(true) => self.fmt.literal(sym::r#true), + Primitive::Boolean(false) => self.fmt.literal(sym::r#false), Primitive::Float(float) => self.fmt.literal(float.as_symbol()), Primitive::Integer(integer) => self.fmt.literal(integer.as_symbol()), Primitive::String(string) => { @@ -303,10 +303,10 @@ impl<'fmt, 'heap> FormatNode<'fmt, &QualifiedVariable<'heap>> for NodeFormatter< ) -> Doc<'fmt> { // Format as: ::path::to::var self.fmt - .punct(sym::symbol::colon_colon) + .punct(sym::symbol::coloncolon) .append(self.fmt.intersperse( path.0.iter().map(|ident| self.fmt.variable(ident.value)), - self.fmt.punct(sym::symbol::colon_colon), + 
self.fmt.punct(sym::symbol::coloncolon), )) .append(self.format_type_arguments(arguments)) } @@ -317,8 +317,8 @@ impl<'fmt, 'heap> FormatNode<'fmt, &Let<'heap>> for NodeFormatter<'fmt, '_, 'hea let fmt = self.fmt; // Format as: let foo = ..., bar = ... in body - let r#let = self.fmt.keyword(sym::lexical::r#let); - let r#in = self.fmt.keyword(sym::lexical::r#in); + let r#let = self.fmt.keyword(sym::r#let); + let r#in = self.fmt.keyword(sym::r#in); let bindings = bindings.iter().map(|binding| self.format_node(binding)); let bindings = fmt.intersperse( @@ -358,7 +358,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &Binding<'heap>> for NodeFormatter<'fmt, '_, name_doc .append(self.fmt.space()) - .append(self.fmt.punct(sym::symbol::assign)) + .append(self.fmt.punct(sym::symbol::eq)) .append(self.fmt.space()) .append(value_doc) } @@ -397,11 +397,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &TypeAssertion<'heap>> for NodeFormatter<'fmt let value = self.format_node(*value); let r#type = self.format_type(*r#type); - let op = if *force { - sym::lexical::r#as_force - } else { - sym::lexical::r#as - }; + let op = if *force { sym::r#as_force } else { sym::r#as }; value .append(self.fmt.space()) @@ -460,7 +456,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &InputOperation<'heap>> for NodeFormatter<'fm } InputOp::Exists => { // Format as: $exists(name) - let keyword = self.fmt.keyword(sym::lexical::input_exists); + let keyword = self.fmt.keyword(sym::input_exists); let name = self.fmt.variable(name.value); keyword.append(self.fmt.parens(name)) @@ -562,9 +558,9 @@ impl<'fmt, 'heap> FormatNode<'fmt, &If<'heap>> for NodeFormatter<'fmt, '_, 'heap // value1 // else // value2 - let if_keyword = self.fmt.keyword(sym::lexical::r#if); - let then_keyword = self.fmt.keyword(sym::lexical::then).into_doc(); - let else_keyword = self.fmt.keyword(sym::lexical::r#else).into_doc(); + let if_keyword = self.fmt.keyword(sym::r#if); + let then_keyword = self.fmt.keyword(sym::then).into_doc(); + let else_keyword = 
self.fmt.keyword(sym::r#else).into_doc(); let test_doc = self.format_node(test).into_doc(); let then_doc = self.format_node(then).into_doc(); @@ -688,7 +684,7 @@ impl<'fmt, 'heap> FormatNode<'fmt, &Thunk<'heap>> for NodeFormatter<'fmt, '_, 'h fn format_node(&mut self, Thunk { body }: &Thunk<'heap>) -> Doc<'fmt> { // Format thunks differently from closures using the thunk keyword // Format as: thunk -> body - let keyword = self.fmt.keyword(sym::lexical::thunk); + let keyword = self.fmt.keyword(sym::thunk); let arrow = self.fmt.op(sym::symbol::arrow); let body_doc = self.format_node(*body); diff --git a/libs/@local/hashql/hir/src/reify/mod.rs b/libs/@local/hashql/hir/src/reify/mod.rs index 98138f0d7e8..38dca9faa87 100644 --- a/libs/@local/hashql/hir/src/reify/mod.rs +++ b/libs/@local/hashql/hir/src/reify/mod.rs @@ -715,10 +715,7 @@ impl<'heap> ReificationContext<'_, '_, '_, 'heap> { } fn if_expr_then_some(&mut self, node: Node<'heap>) -> Node<'heap> { - let some_some = self.make_qualified_path( - node.span, - &[sym::lexical::core, sym::lexical::option, sym::lexical::Some], - ); + let some_some = self.make_qualified_path(node.span, &[sym::core, sym::option, sym::Some]); let node = NodeData { id: self.context.counter.hir.next(), @@ -737,10 +734,7 @@ impl<'heap> ReificationContext<'_, '_, '_, 'heap> { } fn if_expr_else_none(&mut self, span: SpanId) -> Node<'heap> { - let none_path = self.make_qualified_path( - span, - &[sym::lexical::core, sym::lexical::option, sym::lexical::None], - ); + let none_path = self.make_qualified_path(span, &[sym::core, sym::option, sym::None]); let node = NodeData { id: self.context.counter.hir.next(), diff --git a/libs/@local/hashql/mir/src/body/rvalue/binary.rs b/libs/@local/hashql/mir/src/body/rvalue/binary.rs index a10e946b3fc..c6bae1ae131 100644 --- a/libs/@local/hashql/mir/src/body/rvalue/binary.rs +++ b/libs/@local/hashql/mir/src/body/rvalue/binary.rs @@ -72,15 +72,15 @@ impl BinOp { #[must_use] pub const fn as_symbol(self) -> 
Symbol<'static> { match self { - Self::Add => sym::symbol::add, - Self::Sub => sym::symbol::sub, + Self::Add => sym::symbol::plus, + Self::Sub => sym::symbol::minus, Self::BitAnd => sym::symbol::ampersand, Self::BitOr => sym::symbol::pipe, - Self::Eq => sym::symbol::eq, + Self::Eq => sym::symbol::eqeq, Self::Lt => sym::symbol::lt, - Self::Lte => sym::symbol::lte, - Self::Ne => sym::symbol::ne, - Self::Gte => sym::symbol::gte, + Self::Lte => sym::symbol::lteq, + Self::Ne => sym::symbol::excleq, + Self::Gte => sym::symbol::gteq, Self::Gt => sym::symbol::gt, } } diff --git a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs index 75b1248858a..1d194e9d27f 100644 --- a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs +++ b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/entity.rs @@ -17,156 +17,119 @@ use super::trie::{Access, AccessMode, PathNode}; // the same interned string. 
pub(super) static ENTITY_PATHS: PathNode = PathNode::root(&[ // entity_editions.properties (JSONB) - PathNode::jsonb(&sym::lexical::properties), + PathNode::jsonb(sym::properties), // (tbd) encodings PathNode::branch( - &sym::lexical::encodings, + sym::encodings, None, &[ // Vectors are stored outside the entity inside of an embeddings database - PathNode::branch( - &sym::lexical::vectors, - Access::Embedding(AccessMode::Direct), - &[], - ), + PathNode::branch(sym::vectors, Access::Embedding(AccessMode::Direct), &[]), ], ), PathNode::branch( - &sym::lexical::metadata, + sym::metadata, None, &[ // entity_temporal_metadata: web_id, entity_uuid, draft_id, entity_edition_id PathNode::branch( - &sym::lexical::record_id, + sym::record_id, Access::Postgres(AccessMode::Composite), &[ // entity_temporal_metadata: web_id, entity_uuid, draft_id PathNode::branch( - &sym::lexical::entity_id, + sym::entity_id, Access::Postgres(AccessMode::Composite), &[ // entity_temporal_metadata.web_id - PathNode::leaf( - &sym::lexical::web_id, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::web_id, Access::Postgres(AccessMode::Direct)), // entity_temporal_metadata.entity_uuid - PathNode::leaf( - &sym::lexical::entity_uuid, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_uuid, Access::Postgres(AccessMode::Direct)), // entity_temporal_metadata.draft_id - PathNode::leaf( - &sym::lexical::draft_id, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::draft_id, Access::Postgres(AccessMode::Direct)), ], ), // entity_temporal_metadata.entity_edition_id - PathNode::leaf( - &sym::lexical::edition_id, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::edition_id, Access::Postgres(AccessMode::Direct)), ], ), // entity_temporal_metadata: decision_time, transaction_time PathNode::branch( - &sym::lexical::temporal_versioning, + sym::temporal_versioning, Access::Postgres(AccessMode::Composite), &[ // 
entity_temporal_metadata.decision_time - PathNode::leaf( - &sym::lexical::decision_time, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::decision_time, Access::Postgres(AccessMode::Direct)), // entity_temporal_metadata.transaction_time - PathNode::leaf( - &sym::lexical::transaction_time, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::transaction_time, Access::Postgres(AccessMode::Direct)), ], ), // entity_is_of_type (via JOIN) - PathNode::leaf( - &sym::lexical::entity_type_ids, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_type_ids, Access::Postgres(AccessMode::Direct)), // entity_editions.archived - PathNode::leaf( - &sym::lexical::archived, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::archived, Access::Postgres(AccessMode::Direct)), // entity_editions.confidence - PathNode::leaf( - &sym::lexical::confidence, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::confidence, Access::Postgres(AccessMode::Direct)), // spans entity_ids.provenance + entity_editions.provenance PathNode::branch( - &sym::lexical::provenance, + sym::provenance, None, &[ // entity_ids.provenance (JSONB) - PathNode::jsonb(&sym::lexical::inferred), + PathNode::jsonb(sym::inferred), // entity_editions.provenance (JSONB) - PathNode::jsonb(&sym::lexical::edition), + PathNode::jsonb(sym::edition), ], ), // entity_editions.property_metadata (JSONB) - PathNode::jsonb(&sym::lexical::properties), + PathNode::jsonb(sym::properties), ], ), // contains synthesized draft_id fields PathNode::branch( - &sym::lexical::link_data, + sym::link_data, None, &[ // draft_id is synthesized (always None), not stored PathNode::branch( - &sym::lexical::left_entity_id, + sym::left_entity_id, None, &[ // entity_has_left_entity -> entity_edge.target_web_id - PathNode::leaf(&sym::lexical::web_id, Access::Postgres(AccessMode::Direct)), + PathNode::leaf(sym::web_id, Access::Postgres(AccessMode::Direct)), // 
entity_has_left_entity -> entity_edge.target_entity_uuid - PathNode::leaf( - &sym::lexical::entity_uuid, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_uuid, Access::Postgres(AccessMode::Direct)), // synthesized, not in entity_edge - PathNode::leaf(&sym::lexical::draft_id, None), + PathNode::leaf(sym::draft_id, None), ], ), // draft_id is synthesized (always None), not stored PathNode::branch( - &sym::lexical::right_entity_id, + sym::right_entity_id, None, &[ // entity_has_right_entity -> entity_edge.target_web_id - PathNode::leaf(&sym::lexical::web_id, Access::Postgres(AccessMode::Direct)), + PathNode::leaf(sym::web_id, Access::Postgres(AccessMode::Direct)), // entity_has_right_entity -> entity_edge.target_entity_uuid - PathNode::leaf( - &sym::lexical::entity_uuid, - Access::Postgres(AccessMode::Direct), - ), + PathNode::leaf(sym::entity_uuid, Access::Postgres(AccessMode::Direct)), // synthesized, not in entity_edge - PathNode::leaf(&sym::lexical::draft_id, None), + PathNode::leaf(sym::draft_id, None), ], ), // entity_edge.confidence (via entity_has_left_entity) PathNode::leaf( - &sym::lexical::left_entity_confidence, + sym::left_entity_confidence, Access::Postgres(AccessMode::Direct), ), // entity_edge.provenance (JSONB, via entity_has_left_entity) - PathNode::jsonb(&sym::lexical::left_entity_provenance), + PathNode::jsonb(sym::left_entity_provenance), // entity_edge.confidence (via entity_has_right_entity) PathNode::leaf( - &sym::lexical::right_entity_confidence, + sym::right_entity_confidence, Access::Postgres(AccessMode::Direct), ), // entity_edge.provenance (JSONB, via entity_has_right_entity) - PathNode::jsonb(&sym::lexical::right_entity_provenance), + PathNode::jsonb(sym::right_entity_provenance), ], ), ]); diff --git a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs index 173257fda61..e3a338d5cc5 100644 
--- a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs +++ b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/tests.rs @@ -19,7 +19,7 @@ fn proj(name: impl Into>) -> Projection<'st /// `[.properties]` → `Access::Postgres(Direct)` (JSONB column). #[test] fn properties_is_postgres() { - let projections = &[proj(sym::lexical::properties)]; + let projections = &[proj(sym::properties)]; let access = entity_projection_access(projections); assert_eq!(access, Some(Access::Postgres(AccessMode::Direct))); @@ -30,11 +30,7 @@ fn properties_is_postgres() { /// JSONB nodes have `otherwise` set, so any sub-path is also Postgres-accessible. #[test] fn properties_subpath_is_postgres() { - let projections = &[ - proj(sym::lexical::properties), - proj(sym::lexical::foo), - proj(sym::lexical::bar), - ]; + let projections = &[proj(sym::properties), proj(sym::foo), proj(sym::bar)]; let access = entity_projection_access(projections); assert_eq!(access, Some(Access::Postgres(AccessMode::Direct))); @@ -43,7 +39,7 @@ fn properties_subpath_is_postgres() { /// `[.encodings.vectors]` → `Access::Embedding(Direct)`. 
#[test] fn vectors_is_embedding() { - let projections = &[proj(sym::lexical::encodings), proj(sym::lexical::vectors)]; + let projections = &[proj(sym::encodings), proj(sym::vectors)]; let access = entity_projection_access(projections); assert_eq!(access, Some(Access::Embedding(AccessMode::Direct))); @@ -53,14 +49,14 @@ fn vectors_is_embedding() { #[test] fn metadata_columns_are_postgres() { // metadata.archived -> Direct - let projections = &[proj(sym::lexical::metadata), proj(sym::lexical::archived)]; + let projections = &[proj(sym::metadata), proj(sym::archived)]; assert_eq!( entity_projection_access(projections), Some(Access::Postgres(AccessMode::Direct)) ); // metadata.record_id -> Composite - let projections = &[proj(sym::lexical::metadata), proj(sym::lexical::record_id)]; + let projections = &[proj(sym::metadata), proj(sym::record_id)]; assert_eq!( entity_projection_access(projections), Some(Access::Postgres(AccessMode::Composite)) @@ -68,10 +64,10 @@ fn metadata_columns_are_postgres() { // metadata.record_id.entity_id.web_id -> Direct let projections = &[ - proj(sym::lexical::metadata), - proj(sym::lexical::record_id), - proj(sym::lexical::entity_id), - proj(sym::lexical::web_id), + proj(sym::metadata), + proj(sym::record_id), + proj(sym::entity_id), + proj(sym::web_id), ]; assert_eq!( entity_projection_access(projections), @@ -80,9 +76,9 @@ fn metadata_columns_are_postgres() { // metadata.temporal_versioning.decision_time -> Direct let projections = &[ - proj(sym::lexical::metadata), - proj(sym::lexical::temporal_versioning), - proj(sym::lexical::decision_time), + proj(sym::metadata), + proj(sym::temporal_versioning), + proj(sym::decision_time), ]; assert_eq!( entity_projection_access(projections), @@ -94,9 +90,9 @@ fn metadata_columns_are_postgres() { #[test] fn link_data_synthesized_is_none() { let projections = &[ - proj(sym::lexical::link_data), - proj(sym::lexical::left_entity_id), - proj(sym::lexical::draft_id), + proj(sym::link_data), + 
proj(sym::left_entity_id), + proj(sym::draft_id), ]; let access = entity_projection_access(projections); @@ -106,7 +102,7 @@ fn link_data_synthesized_is_none() { /// Invalid path like `[.unknown]` → `None`. #[test] fn unknown_path_returns_none() { - let projections = &[proj(sym::lexical::unknown)]; + let projections = &[proj(sym::unknown)]; let access = entity_projection_access(projections); assert_eq!(access, None); diff --git a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs index d9db87d455b..f7f56e2f2f5 100644 --- a/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs +++ b/libs/@local/hashql/mir/src/pass/analysis/execution/statement_placement/lookup/trie.rs @@ -22,7 +22,7 @@ pub(crate) enum Access { #[derive(Debug, Copy, Clone)] pub(crate) struct PathNode { /// Field name this node matches (empty string for root). - pub name: &'static Symbol<'static>, + pub name: Symbol<'static>, /// Access level when the path ends at this node (no more projections). pub access: Option, /// Access level for paths beyond known children (e.g., JSONB allows any sub-path). @@ -34,7 +34,7 @@ pub(crate) struct PathNode { impl PathNode { pub(crate) const fn root(children: &'static [Self]) -> Self { Self { - name: &sym::lexical::entity, + name: sym::entity, access: None, otherwise: None, children, @@ -42,7 +42,7 @@ impl PathNode { } pub(crate) const fn leaf( - name: &'static Symbol<'static>, + name: Symbol<'static>, access: impl [const] Into>, ) -> Self { Self { @@ -54,7 +54,7 @@ impl PathNode { } /// Creates a JSONB node where any sub-path is also Postgres-accessible. 
- pub(crate) const fn jsonb(name: &'static Symbol<'static>) -> Self { + pub(crate) const fn jsonb(name: Symbol<'static>) -> Self { Self { name, access: Some(Access::Postgres(AccessMode::Direct)), @@ -64,7 +64,7 @@ impl PathNode { } pub(crate) const fn branch( - name: &'static Symbol<'static>, + name: Symbol<'static>, access: impl [const] Into>, children: &'static [Self], ) -> Self { @@ -77,6 +77,6 @@ impl PathNode { } pub(crate) fn lookup(&self, name: Symbol<'_>) -> Option<&Self> { - self.children.iter().find(|node| *node.name == name) + self.children.iter().find(|node| node.name == name) } } diff --git a/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs b/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs index 13dec06b17a..304d20060e2 100644 --- a/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs +++ b/libs/@local/hashql/mir/src/pass/transform/inline/tests.rs @@ -534,7 +534,7 @@ fn analysis_directives_by_source() { let mut ctor_body = closure_body.clone(); ctor_body.id = DefId::new(1); - ctor_body.source = Source::Ctor(sym::lexical::Some); + ctor_body.source = Source::Ctor(sym::Some); let mut intrinsic_body = closure_body.clone(); intrinsic_body.id = DefId::new(2);