Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -728,7 +728,8 @@ public int getChar(int charIndex) {
public int codePointFrom(int byteIndex) {
Objects.checkIndex(byteIndex, numBytes);
byte b = getByte(byteIndex);
int numBytes = numBytesForFirstByte(b);
// Clamp to remaining bytes so a truncated trailing sequence never reads past the end.
int numBytes = Math.min(numBytesForFirstByte(b), this.numBytes - byteIndex);
return switch (numBytes) {
case 1 ->
b & 0x7F;
Expand Down Expand Up @@ -1049,7 +1050,9 @@ public UTF8String trimLeft(UTF8String trimString) {

while (searchIdx < numBytes) {
UTF8String searchChar = copyUTF8String(
searchIdx, searchIdx + numBytesForFirstByte(this.getByte(searchIdx)) - 1);
searchIdx,
searchIdx + Math.min(numBytesForFirstByte(this.getByte(searchIdx)),
numBytes - searchIdx) - 1);
int searchCharBytes = searchChar.numBytes;
// try to find the matching for the searchChar in the trimString set
if (trimString.find(searchChar, 0) >= 0) {
Expand Down Expand Up @@ -1121,7 +1124,8 @@ public UTF8String trimRight(UTF8String trimString) {
// build the position and length array
while (charIdx < numBytes) {
stringCharPos[numChars] = charIdx;
stringCharLen[numChars] = numBytesForFirstByte(getByte(charIdx));
stringCharLen[numChars] = Math.min(numBytesForFirstByte(getByte(charIdx)),
numBytes - charIdx);
charIdx += stringCharLen[numChars];
numChars ++;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -889,6 +889,32 @@ public void trimRightWithTrimString() {
assertEquals(fromString(""), fromString("数数数据砖ab").trimRight(fromString("数据砖ab")));
assertEquals(fromString("头"), fromString("头a???/").trimRight(fromString("数?/*&^%a")));
assertEquals(fromString("头"), fromString("头数b数数 [").trimRight(fromString(" []数b")));

// SPARK-57578: trimLeft/trimRight with a trim-set whose last byte is a truncated
// multi-byte leader must not read past the end of the backing buffer.
// 0xC3 is the leading byte of a 2-byte sequence; the continuation byte is absent.
UTF8String truncated2 = fromBytes(new byte[]{(byte) 0xC3});
assertEquals(fromString("hello "), fromString("hello ").trimLeft(truncated2));
assertEquals(fromString(" hello"), fromString(" hello").trimRight(truncated2));

// 'A' followed by a lone 2-byte leader — truncation is at the end of the source string.
// After trimming 'A' from the left, the lone 0xC3 leader byte remains.
UTF8String truncated2Tail = fromBytes(new byte[]{(byte) 0xC3});
UTF8String srcWithTruncatedTail = fromBytes(new byte[]{0x41, (byte) 0xC3});
assertEquals(truncated2Tail, srcWithTruncatedTail.trimLeft(fromString("A")));
// {0xC3, 0x42}: the 2-byte leader 0xC3 at position 0 consumes the next byte (0x42 = 'B') as
// its continuation (both bytes exist, so the clamp is a no-op), forming one 2-byte character;
// trimRight("B") therefore finds no standalone 'B' and returns the string unchanged.
UTF8String lone2LeaderPlusB = fromBytes(new byte[]{(byte) 0xC3, 0x42});
assertEquals(lone2LeaderPlusB, lone2LeaderPlusB.trimRight(fromString("B")));

// Lone 3-byte and 4-byte leaders as the trim string.
UTF8String truncated3 = fromBytes(new byte[]{(byte) 0xE4});
assertEquals(fromString("hello"), fromString("hello").trimLeft(truncated3));
assertEquals(fromString("hello"), fromString("hello").trimRight(truncated3));

UTF8String truncated4 = fromBytes(new byte[]{(byte) 0xF0});
assertEquals(fromString("hello"), fromString("hello").trimLeft(truncated4));
assertEquals(fromString("hello"), fromString("hello").trimRight(truncated4));
}

@Test
Expand Down Expand Up @@ -1228,6 +1254,23 @@ public void testCodePointFrom() {
assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(-1));
assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length()));
assertThrows(IndexOutOfBoundsException.class, () -> s.codePointFrom(str.length() + 1));

// SPARK-57578: a string ending in a truncated multi-byte leader must not read past the buffer.
// 0xC3 alone is the leading byte of a 2-byte sequence; codePointFrom(0) must not read byte 1.
UTF8String lone2 = fromBytes(new byte[]{(byte) 0xC3});
// Returns the partial/clamped code point without throwing or crashing.
lone2.codePointFrom(0);

// 'A' (0x41) followed by a lone 3-byte leader (0xE4) — codePointFrom(1) must not read bytes
// 2 or 3 which do not exist.
UTF8String aLone3 = fromBytes(new byte[]{0x41, (byte) 0xE4});
assertEquals(0x41, aLone3.codePointFrom(0));
aLone3.codePointFrom(1);

// 'A' followed by a lone 4-byte leader (0xF0).
UTF8String aLone4 = fromBytes(new byte[]{0x41, (byte) 0xF0});
assertEquals(0x41, aLone4.codePointFrom(0));
aLone4.codePointFrom(1);
}

@Test
Expand Down