From 3b159682a6296abe54bf0022f14d22fe7307884d Mon Sep 17 00:00:00 2001 From: Jubin Soni Date: Fri, 19 Jun 2026 17:25:31 -0700 Subject: [PATCH 1/2] [SPARK-57578][SQL] Fix UTF8String.codePointFrom/trimLeft/trimRight out-of-bounds read on truncated trailing UTF-8 sequences --- .../apache/spark/unsafe/types/UTF8String.java | 10 +++-- .../spark/unsafe/types/UTF8StringSuite.java | 39 +++++++++++++++++++ 2 files changed, 46 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index 06ec4af7a5f27..1b238ce78a6b8 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -728,7 +728,8 @@ public int getChar(int charIndex) { public int codePointFrom(int byteIndex) { Objects.checkIndex(byteIndex, numBytes); byte b = getByte(byteIndex); - int numBytes = numBytesForFirstByte(b); + // Clamp to remaining bytes so a truncated trailing sequence never reads past the end. 
+ int numBytes = Math.min(numBytesForFirstByte(b), this.numBytes - byteIndex); return switch (numBytes) { case 1 -> b & 0x7F; @@ -1049,7 +1050,9 @@ public UTF8String trimLeft(UTF8String trimString) { while (searchIdx < numBytes) { UTF8String searchChar = copyUTF8String( - searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1); + searchIdx, + searchIdx + Math.min(numBytesForFirstByte(this.getByte(searchIdx)), + numBytes - searchIdx) - 1); int searchCharBytes = searchChar.numBytes; // try to find the matching for the searchChar in the trimString set if (trimString.find(searchChar, 0) >= 0) { @@ -1121,7 +1124,8 @@ public UTF8String trimRight(UTF8String trimString) { // build the position and length array while (charIdx < numBytes) { stringCharPos[numChars] = charIdx; - stringCharLen[numChars] = numBytesForFirstByte(getByte(charIdx)); + stringCharLen[numChars] = Math.min(numBytesForFirstByte(getByte(charIdx)), + numBytes - charIdx); charIdx += stringCharLen[numChars]; numChars ++; } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index a9f365173be64..a1c15080dcb80 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -889,6 +889,28 @@ public void trimRightWithTrimString() { assertEquals(fromString(""), fromString("数数数据砖ab").trimRight(fromString("数据砖ab"))); assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a"))); assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b"))); + + // SPARK-57578: trimLeft/trimRight with a trim-set whose last byte is a truncated + // multi-byte leader must not read past the end of the backing buffer. + // 0xC3 is the leading byte of a 2-byte sequence; the continuation byte is absent. 
+ UTF8String truncated2 = fromBytes(new byte[]{(byte) 0xC3}); + assertEquals(fromString("hello "), fromString("hello ").trimLeft(truncated2)); + assertEquals(fromString(" hello"), fromString(" hello").trimRight(truncated2)); + + // 'A' followed by a lone 2-byte leader — truncation is at the end of the source string. + UTF8String srcWithTruncatedTail = fromBytes(new byte[]{0x41, (byte) 0xC3}); + assertEquals(fromString("B"), srcWithTruncatedTail.trimLeft(fromString("A"))); + assertEquals(fromString("B"), fromBytes(new byte[]{(byte) 0xC3, 0x42}) + .trimRight(fromString("B"))); + + // Lone 3-byte and 4-byte leaders as the trim string. + UTF8String truncated3 = fromBytes(new byte[]{(byte) 0xE4}); + assertEquals(fromString("hello"), fromString("hello").trimLeft(truncated3)); + assertEquals(fromString("hello"), fromString("hello").trimRight(truncated3)); + + UTF8String truncated4 = fromBytes(new byte[]{(byte) 0xF0}); + assertEquals(fromString("hello"), fromString("hello").trimLeft(truncated4)); + assertEquals(fromString("hello"), fromString("hello").trimRight(truncated4)); } @Test @@ -1228,6 +1250,23 @@ public void testCodePointFrom() { assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(-1)); assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length())); assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length() + 1)); + + // SPARK-57578: a string ending in a truncated multi-byte leader must not read past the buffer. + // 0xC3 alone is the leading byte of a 2-byte sequence; codePointFrom(0) must not read byte 1. + UTF8String lone2 = fromBytes(new byte[]{(byte) 0xC3}); + // Returns the partial/clamped code point without throwing or crashing. + lone2.codePointFrom(0); + + // 'A' (0x41) followed by a lone 3-byte leader (0xE4) — codePointFrom(1) must not read bytes + // 2 or 3 which do not exist. 
+ UTF8String aLone3 = fromBytes(new byte[]{0x41, (byte) 0xE4}); + assertEquals(0x41, aLone3.codePointFrom(0)); + aLone3.codePointFrom(1); + + // 'A' followed by a lone 4-byte leader (0xF0). + UTF8String aLone4 = fromBytes(new byte[]{0x41, (byte) 0xF0}); + assertEquals(0x41, aLone4.codePointFrom(0)); + aLone4.codePointFrom(1); } @Test From e0763a6c25f5bd569e13adbc3ec08384f68f6db1 Mon Sep 17 00:00:00 2001 From: Jubin Soni Date: Sat, 20 Jun 2026 22:51:42 -0700 Subject: [PATCH 2/2] Attempt to fix tests --- .../org/apache/spark/unsafe/types/UTF8StringSuite.java | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index a1c15080dcb80..c3c258c4e5f43 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -898,10 +898,14 @@ public void trimRightWithTrimString() { assertEquals(fromString(" hello"), fromString(" hello").trimRight(truncated2)); // 'A' followed by a lone 2-byte leader — truncation is at the end of the source string. + // After trimming 'A' from the left, the lone 0xC3 leader byte remains. + UTF8String truncated2Tail = fromBytes(new byte[]{(byte) 0xC3}); UTF8String srcWithTruncatedTail = fromBytes(new byte[]{0x41, (byte) 0xC3}); - assertEquals(fromString("B"), srcWithTruncatedTail.trimLeft(fromString("A"))); - assertEquals(fromString("B"), fromBytes(new byte[]{(byte) 0xC3, 0x42}) - .trimRight(fromString("B"))); + assertEquals(truncated2Tail, srcWithTruncatedTail.trimLeft(fromString("A"))); + // {0xC3, 0x42}: the lone 2-byte leader 0xC3 at position 0 consumes the next byte (0x42 = 'B') + // into one 2-byte character, so trimRight("B") finds no standalone 'B' and returns unchanged. 
+ UTF8String lone2LeaderPlusB = fromBytes(new byte[]{(byte) 0xC3, 0x42}); + assertEquals(lone2LeaderPlusB, lone2LeaderPlusB.trimRight(fromString("B"))); // Lone 3-byte and 4-byte leaders as the trim string. UTF8String truncated3 = fromBytes(new byte[]{(byte) 0xE4});