From 3b159682a6296abe54bf0022f14d22fe7307884d Mon Sep 17 00:00:00 2001
From: Jubin Soni <jubins@nyu.edu>
Date: Fri, 19 Jun 2026 17:25:31 -0700
Subject: [PATCH 1/2] [SPARK-57578][SQL] Fix
 UTF8String.codePointFrom/trimLeft/trimRight out-of-bounds read on truncated
 trailing UTF-8 sequences

---
 .../apache/spark/unsafe/types/UTF8String.java | 10 +++--
 .../spark/unsafe/types/UTF8StringSuite.java   | 39 +++++++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index 06ec4af7a5f27..1b238ce78a6b8 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -728,7 +728,8 @@ public int getChar(int charIndex) {
   public int codePointFrom(int byteIndex) {
     Objects.checkIndex(byteIndex, numBytes);
     byte b = getByte(byteIndex);
-    int numBytes = numBytesForFirstByte(b);
+    // Clamp to remaining bytes so a truncated trailing sequence never reads past the end.
+    int numBytes = Math.min(numBytesForFirstByte(b), this.numBytes - byteIndex);
     return switch (numBytes) {
       case 1 ->
         b & 0x7F;
@@ -1049,7 +1050,9 @@ public UTF8String trimLeft(UTF8String trimString) {
 
     while (searchIdx < numBytes) {
       UTF8String searchChar = copyUTF8String(
-          searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1);
+          searchIdx,
+          searchIdx + Math.min(numBytesForFirstByte(this.getByte(searchIdx)),
+                               numBytes - searchIdx) - 1);
       int searchCharBytes = searchChar.numBytes;
       // try to find the matching for the searchChar in the trimString set
       if (trimString.find(searchChar, 0) >= 0) {
@@ -1121,7 +1124,8 @@ public UTF8String trimRight(UTF8String trimString) {
     // build the position and length array
     while (charIdx < numBytes) {
       stringCharPos[numChars] = charIdx;
-      stringCharLen[numChars] = numBytesForFirstByte(getByte(charIdx));
+      stringCharLen[numChars] = Math.min(numBytesForFirstByte(getByte(charIdx)),
+                                         numBytes - charIdx);
       charIdx += stringCharLen[numChars];
       numChars ++;
     }
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index a9f365173be64..a1c15080dcb80 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -889,6 +889,28 @@ public void trimRightWithTrimString() {
     assertEquals(fromString(""), fromString("数数数据砖ab").trimRight(fromString("数据砖ab")));
     assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a")));
     assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b")));
+
+    // SPARK-57578: trimLeft/trimRight with a trim-set whose last byte is a truncated
+    // multi-byte leader must not read past the end of the backing buffer.
+    // 0xC3 is the leading byte of a 2-byte sequence; the continuation byte is absent.
+    UTF8String truncated2 = fromBytes(new byte[]{(byte) 0xC3});
+    assertEquals(fromString("hello "), fromString("hello ").trimLeft(truncated2));
+    assertEquals(fromString(" hello"), fromString(" hello").trimRight(truncated2));
+
+    // 'A' followed by a lone 2-byte leader — truncation is at the end of the source string.
+    UTF8String srcWithTruncatedTail = fromBytes(new byte[]{0x41, (byte) 0xC3});
+    assertEquals(fromString("B"), srcWithTruncatedTail.trimLeft(fromString("A")));
+    assertEquals(fromString("B"), fromBytes(new byte[]{(byte) 0xC3, 0x42})
+        .trimRight(fromString("B")));
+
+    // Lone 3-byte and 4-byte leaders as the trim string.
+    UTF8String truncated3 = fromBytes(new byte[]{(byte) 0xE4});
+    assertEquals(fromString("hello"), fromString("hello").trimLeft(truncated3));
+    assertEquals(fromString("hello"), fromString("hello").trimRight(truncated3));
+
+    UTF8String truncated4 = fromBytes(new byte[]{(byte) 0xF0});
+    assertEquals(fromString("hello"), fromString("hello").trimLeft(truncated4));
+    assertEquals(fromString("hello"), fromString("hello").trimRight(truncated4));
   }
 
   @Test
@@ -1228,6 +1250,23 @@ public void testCodePointFrom() {
     assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(-1));
     assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length()));
     assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length() + 1));
+
+    // SPARK-57578: a string ending in a truncated multi-byte leader must not read past the buffer.
+    // 0xC3 alone is the leading byte of a 2-byte sequence; codePointFrom(0) must not read byte 1.
+    UTF8String lone2 = fromBytes(new byte[]{(byte) 0xC3});
+    // The length clamps to 1, so this decodes as the 1-byte case (0xC3 & 0x7F) without throwing.
+    lone2.codePointFrom(0);
+
+    // 'A' (0x41) followed by a lone 3-byte leader (0xE4) — codePointFrom(1) must not read bytes
+    // 2 or 3, which do not exist.
+    UTF8String aLone3 = fromBytes(new byte[]{0x41, (byte) 0xE4});
+    assertEquals(0x41, aLone3.codePointFrom(0));
+    aLone3.codePointFrom(1);
+
+    // 'A' followed by a lone 4-byte leader (0xF0).
+    UTF8String aLone4 = fromBytes(new byte[]{0x41, (byte) 0xF0});
+    assertEquals(0x41, aLone4.codePointFrom(0));
+    aLone4.codePointFrom(1);
   }
 
   @Test

From e0763a6c25f5bd569e13adbc3ec08384f68f6db1 Mon Sep 17 00:00:00 2001
From: Jubin Soni <jubins@nyu.edu>
Date: Sat, 20 Jun 2026 22:51:42 -0700
Subject: [PATCH 2/2] Correct expected values in truncated-leader trim tests

---
 .../org/apache/spark/unsafe/types/UTF8StringSuite.java | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index a1c15080dcb80..c3c258c4e5f43 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -898,10 +898,14 @@ public void trimRightWithTrimString() {
     assertEquals(fromString(" hello"), fromString(" hello").trimRight(truncated2));
 
     // 'A' followed by a lone 2-byte leader — truncation is at the end of the source string.
+    // After trimming 'A' from the left, the lone 0xC3 leader byte remains.
+    UTF8String truncated2Tail = fromBytes(new byte[]{(byte) 0xC3});
     UTF8String srcWithTruncatedTail = fromBytes(new byte[]{0x41, (byte) 0xC3});
-    assertEquals(fromString("B"), srcWithTruncatedTail.trimLeft(fromString("A")));
-    assertEquals(fromString("B"), fromBytes(new byte[]{(byte) 0xC3, 0x42})
-        .trimRight(fromString("B")));
+    assertEquals(truncated2Tail, srcWithTruncatedTail.trimLeft(fromString("A")));
+    // {0xC3, 0x42}: two bytes remain at position 0, so the 2-byte leader 0xC3 is not clamped and
+    // absorbs 0x42 ('B'); trimRight("B") finds no standalone 'B' and returns the input unchanged.
+    UTF8String lone2LeaderPlusB = fromBytes(new byte[]{(byte) 0xC3, 0x42});
+    assertEquals(lone2LeaderPlusB, lone2LeaderPlusB.trimRight(fromString("B")));
 
     // Lone 3-byte and 4-byte leaders as the trim string.
     UTF8String truncated3 = fromBytes(new byte[]{(byte) 0xE4});