Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 34 additions & 5 deletions parquet-variant/src/path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
// under the License.
use std::{borrow::Cow, ops::Deref};

use crate::utils::parse_path;

/// Represents a qualified path to a potential subfield or index of a variant
/// value.
///
Expand Down Expand Up @@ -112,11 +114,7 @@ impl<'a> From<Vec<VariantPathElement<'a>>> for VariantPath<'a> {
/// Create from &str with support for dot notation
impl<'a> From<&'a str> for VariantPath<'a> {
fn from(path: &'a str) -> Self {
if path.is_empty() {
VariantPath::new(vec![])
} else {
VariantPath::new(path.split('.').map(Into::into).collect())
}
VariantPath::new(path.split(".").flat_map(parse_path).collect())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is beautiful

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does the flat_map mean this will parse paths like foo..bar as [foo] [bar]?

I am not sure exactly what should happen when parsing a path like that 🤔

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that you mention, it seems like the existing code already had that bug?

In theory, that should be a malformed path that causes an error. Ditto for mismatched [ or ], or e.g. [a.b] (which this PR would incorrectly attempt to treat as [a and b]).

It seems like we'll need a more sophisticated (and integrated) parsing approach here?

Copy link
Contributor

@scovich scovich Dec 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A bit of quality time with an LLM produced the following (along with a bunch of unit tests to validate it works):

Details
/// Create from &str with support for dot notation and array indices.
///
/// # Example
/// ```
/// # use parquet_variant::VariantPath;
/// let path: VariantPath = "foo.bar[0]".try_into().unwrap();
/// ```
impl<'a> TryFrom<&'a str> for VariantPath<'a> {
    type Error = ArrowError;

    fn try_from(path: &'a str) -> Result<Self, Self::Error> {
        parse_path(path).map(VariantPath::new)
    }
}

/// Parse a path string into a vector of [`VariantPathElement`].
///
/// Supports the following syntax:
/// - `""` - empty path
/// - `"foo"` - single field
/// - `"foo.bar"` - nested fields
/// - `"[1]"` - single array index
/// - `"[1][2]"` - multiple array indices
/// - `"foo[1]"` - field with array index
/// - `"foo[1][2]"` - field with multiple array indices
/// - `"foo[1].bar"` - mixed field and array access
/// - etc.
///
/// Field names can contain any characters except `.`, `[`, and `]`.
fn parse_path(s: &str) -> Result<Vec<VariantPathElement<'_>>, ArrowError> {
    let scan_field = |start: usize| {
        s[start..].find(['.', '[', ']']).map_or_else(|| s.len(), |p| start + p)
    };

    let bytes = s.as_bytes();
    if let Some(b'.') = bytes.first() {
        return Err(ArrowError::ParseError("unexpected leading '.'".into()));
    }

    let mut elements = Vec::new();
    let mut i = 0;

    while i < bytes.len() {
        let (elem, end) = match bytes[i] {
            b'.' => {
                i += 1; // skip the dot; a field must follow
                let end = scan_field(i);
                if end == i {
                    return Err(ArrowError::ParseError(match bytes.get(i) {
                        None => "unexpected trailing '.'".into(),
                        Some(&c) => format!("unexpected '{}' at byte {i}", c as char),
                    }));
                }
                (VariantPathElement::field(&s[i..end]), end)
            }
            b'[' => {
                let (idx, end) = parse_index(s, i)?;
                (VariantPathElement::index(idx), end)
            }
            b']' => {
                return Err(ArrowError::ParseError(format!(
                    "unexpected ']' at byte {i}"
                )));
            }
            _ => {
                let end = scan_field(i);
                (VariantPathElement::field(&s[i..end]), end)
            }
        };
        elements.push(elem);
        i = end;
    }

    Ok(elements)
}


/// Parse `[digits]` starting at `i` (which points to `[`).
/// Returns (index_value, position after `]`).
fn parse_index(s: &str, i: usize) -> Result<(usize, usize), ArrowError> {
    let start = i + 1; // skip '['

    // Find closing ']'
    let end = match s[start..].find(']') {
        Some(p) => start + p,
        None => return Err(ArrowError::ParseError(format!("unterminated '[' at byte {i}"))),
    };

    let idx = s[start..end]
        .parse()
        .map_err(|_| ArrowError::ParseError(format!("invalid index at byte {start}")))?;

    Ok((idx, end + 1))
}

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I initially considered using TryFrom with explicit error boundaries , but doing so would require changing an existing public API, which would be a breaking change

pub fn with_path<'a, P, F>(mut self, path: P, field: F) -> Self
where
P: Into<VariantPath<'a>>,
F: IntoShreddingField,
{
let path: VariantPath<'a> = path.into();
self.root.insert_path(&path, field.into_shredding_field());
self
}

and as @scovich mentioned the current approach can also parse invalid inputs as fields, making debugging hard. So I'm a bit skeptical about how to proceed from here.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

AFAIK, variant is still experimental, so breaking changes are ok?

Copy link
Contributor

@scovich scovich Dec 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Meanwhile, even the parser I cooked up above is technically incomplete, because JSON allows field names to contain any character (including ., [ and ] which my suggestion would reject as ambiguous). The only way to avoid that is with some kind of escaping syntax. I believe Iceberg uses canonical JSONpath, for example.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's an open issue regarding the escaping syntax #8954

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting. It looks like the mini-parser I posted above is a strict subset of the databricks syntax linked above. Would it be a reasonable starting point to merge this PR and add the missing ['fieldName'] syntax as a follow-up? Or just expand the above suggestion into a full parser right away?

}
}

Expand Down Expand Up @@ -223,4 +221,35 @@ mod tests {
let path = VariantPath::from_iter([p]);
assert!(!path.is_empty());
}

#[test]
fn test_variant_path_dot_notation_with_array_index() {
let path = VariantPath::from("city.store.books[3].title");

let expected = VariantPath::from("city")
.join("store")
.join("books")
.join(3)
.join("title");

assert_eq!(path, expected);
}

#[test]
fn test_variant_path_dot_notation_with_only_array_index() {
let path = VariantPath::from("[3]");

let expected = VariantPath::from(3);

assert_eq!(path, expected);
}

#[test]
fn test_variant_path_dot_notation_with_starting_array_index() {
let path = VariantPath::from("[3].title");

let expected = VariantPath::from(3).join("title");

assert_eq!(path, expected);
}
}
33 changes: 33 additions & 0 deletions parquet-variant/src/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
// under the License.
use std::{array::TryFromSliceError, ops::Range, str};

use crate::VariantPathElement;
use arrow_schema::ArrowError;

use std::cmp::Ordering;
Expand Down Expand Up @@ -149,6 +150,38 @@ pub(crate) fn fits_precision<const N: u32>(n: impl Into<i64>) -> bool {
n.into().unsigned_abs().leading_zeros() >= (i64::BITS - N)
}

// Helper fn to parse input segments like foo[0] or foo[0][0]
#[inline]
pub(crate) fn parse_path<'a>(segment: &'a str) -> Vec<VariantPathElement<'a>> {
if segment.is_empty() {
return Vec::new();
}

let mut path_elements = Vec::new();
let mut base = segment;

while let Some(stripped) = base.strip_suffix(']') {
let Some(open_pos) = stripped.rfind('[') else {
return vec![VariantPathElement::field(segment)];
};

let index_str = &stripped[open_pos + 1..];
let Ok(index) = index_str.parse::<usize>() else {
return vec![VariantPathElement::field(segment)];
};

path_elements.push(VariantPathElement::index(index));
base = &stripped[..open_pos];
}

if !base.is_empty() {
path_elements.push(VariantPathElement::field(base));
}

path_elements.reverse();
path_elements
}

#[cfg(test)]
mod test {
use super::*;
Expand Down
Loading