Skip to content

Commit

Permalink
[GR-44464] TruffleString: add ToValidStringNode.
Browse files Browse the repository at this point in the history
PullRequest: graal/15062
  • Loading branch information
djoooooe committed Oct 25, 2023
2 parents b7efba3 + 772dd13 commit 1e9366e
Show file tree
Hide file tree
Showing 10 changed files with 457 additions and 10 deletions.
1 change: 1 addition & 0 deletions truffle/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ This changelog summarizes major changes between Truffle versions relevant to lan
* Bundle the necessary files into a jar distribution.
* Implement the `InternalResource` interface for handling the resource file unpacking.
* Call the `Env#getInternalResource` when the language or instrument needs the bundled resource files. This method ensures that the requested `InternalResource` is unpacked and provides a directory containing the unpacked files. Since unpacking internal resources can be an expensive operation, the implementation ensures that internal resources are cached.
* GR-44464 Added `TruffleString.ToValidStringNode` for encoding-level string sanitization.

## Version 23.0.0

Expand Down
2 changes: 2 additions & 0 deletions truffle/docs/TruffleStrings.md
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ Conversion:
Convert a MutableTruffleString to an immutable TruffleString.
* [AsManaged](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.AsManagedNode.html):
Convert a TruffleString backed by a native pointer to one backed by a java byte array.
* [ToValidString](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.ToValidStringNode.html):
Convert a TruffleString to a correctly encoded version of itself, in which invalid sequences are replaced by a replacement character.
* [CopyToByteArray](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.CopyToByteArrayNode.html):
Copy a string's content into a byte array.
* [GetInternalByteArray](https://www.graalvm.org/truffle/javadoc/com/oracle/truffle/api/strings/TruffleString.GetInternalByteArrayNode.html):
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* The Universal Permissive License (UPL), Version 1.0
*
* Subject to the condition set forth below, permission is hereby granted to any
* person obtaining a copy of this software, associated documentation and/or
* data (collectively the "Software"), free of charge and under any and all
* copyright rights in the Software, and any and all patent rights owned or
* freely licensable by each licensor hereunder covering either (i) the
* unmodified Software as contributed to or provided by such licensor, or (ii)
* the Larger Works (as defined below), to deal in both
*
* (a) the Software, and
*
* (b) any piece of software and/or hardware listed in the lrgrwrks.txt file if
* one is included with the Software each a "Larger Work" to which the Software
* is contributed by such licensors),
*
* without restriction, including without limitation the rights to copy, create
* derivative works of, display, perform, and distribute the Software and make,
* use, sell, offer for sale, import, export, have made, and have sold the
* Software and the Larger Work(s), and to sublicense the foregoing rights on
* either these or other terms.
*
* This license is subject to the following condition:
*
* The above copyright notice and either this complete permission notice or at a
* minimum a reference to the UPL must be included in all copies or substantial
* portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/

package com.oracle.truffle.api.strings.test.ops;

import static com.oracle.truffle.api.strings.TruffleString.Encoding.BYTES;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.ISO_8859_1;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.US_ASCII;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_16;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_32;
import static com.oracle.truffle.api.strings.TruffleString.Encoding.UTF_8;
import static com.oracle.truffle.api.strings.test.TStringTestUtil.byteArray;
import static org.junit.runners.Parameterized.Parameter;

import java.util.Arrays;

import org.junit.Assert;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import org.junit.runners.Parameterized.Parameters;

import com.oracle.truffle.api.strings.TruffleString;
import com.oracle.truffle.api.strings.test.TStringTestBase;

/**
 * Tests for {@link TruffleString.ToValidStringNode}. Every test runs twice: once with a freshly
 * created node instance and once with the uncached node (see {@link #data()}).
 */
@RunWith(Parameterized.class)
public class TStringToValidStringTest extends TStringTestBase {

    /** Node under test, injected by the parameterized runner from {@link #data()}. */
    @Parameter public TruffleString.ToValidStringNode node;

    /**
     * Supplies the node variants to test.
     *
     * @return a created node and the uncached node
     */
    @Parameters(name = "{0}")
    public static Iterable<TruffleString.ToValidStringNode> data() {
        return Arrays.asList(TruffleString.ToValidStringNode.create(), TruffleString.ToValidStringNode.getUncached());
    }

    /**
     * Runs the node over all generated strings of the listed encodings and checks that the result
     * is always valid, and that an already valid {@link TruffleString} is returned unchanged.
     */
    @Test
    public void testAll() throws Exception {
        forAllStrings(new TruffleString.Encoding[]{US_ASCII, ISO_8859_1, BYTES, UTF_8, UTF_16, UTF_32}, true, (a, array, codeRange, isValid, encoding, codepoints, byteIndices) -> {
            TruffleString wellFormed = node.execute(a, encoding);
            if (isValid && a instanceof TruffleString) {
                // A valid TruffleString needs no repair, so the very same instance is expected back.
                Assert.assertSame(a, wellFormed);
            }
            Assert.assertTrue(wellFormed.isValidUncached(encoding));
        });
    }

    /** Bytes outside the 7-bit range are expected to be replaced by {@code '?'} in US-ASCII. */
    @Test
    public void testAscii() {
        testAscii(byteArray('a', '?'), byteArray('a', 0xff));
        testAscii(byteArray('a', '?'), byteArray('a', 0x80));
        testAscii(byteArray('a', '?', 'b'), byteArray('a', 0xff, 'b'));
        testAscii(byteArray('a', '?', 'b'), byteArray('a', 0x80, 'b'));
        testAscii(byteArray('a', 0x7f, 'b'), byteArray('a', 0x7f, 'b'));
    }

    /**
     * Invalid or truncated UTF-8 sequences are expected to be replaced by the UTF-8 encoding of
     * U+FFFD ({@code EF BF BD}), while complete sequences are kept.
     */
    @Test
    public void testUTF8() {
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD), byteArray('a', 0xff));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD), byteArray('a', 0xf0, 0x90));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD), byteArray('a', 0xf0, 0x90, 0x80));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xf0, 0x90, 0x80, 0x80), byteArray('a', 0xf0, 0x90, 0x80, 0xf0, 0x90, 0x80, 0x80));
        testUTF8(byteArray('a', 0xf0, 0x90, 0x80, 0x80, 0xEF, 0xBF, 0xBD), byteArray('a', 0xf0, 0x90, 0x80, 0x80, 0xf0, 0x90, 0x80));
        testUTF8(byteArray('a', 0xf0, 0x90, 0x80, 0x80), byteArray('a', 0xf0, 0x90, 0x80, 0x80));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD), byteArray('a', 0xf8, 0x90));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xff, 'b'));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xf0, 0x90, 'b'));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xf0, 0x90, 0x80, 'b'));
        testUTF8(byteArray('a', 0xf0, 0x90, 0x80, 0x80, 'b'), byteArray('a', 0xf0, 0x90, 0x80, 0x80, 'b'));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xf0, 0x90, 0x80, 0x80, 'b'), byteArray('a', 0xf0, 0x90, 0x80, 0xf0, 0x90, 0x80, 0x80, 'b'));
        testUTF8(byteArray('a', 0xEF, 0xBF, 0xBD, 0xEF, 0xBF, 0xBD, 'b'), byteArray('a', 0xf8, 0x90, 'b'));
    }

    private void testAscii(byte[] expected, byte[] input) {
        testByteArray(expected, input, US_ASCII);
    }

    private void testUTF8(byte[] expected, byte[] input) {
        testByteArray(expected, input, UTF_8);
    }

    /**
     * Runs the node on {@code input} interpreted in {@code encoding} and checks that the result
     * consists of exactly the bytes in {@code expected} and is valid.
     *
     * @param expected the complete expected byte content of the result
     * @param input raw bytes of the (possibly invalid) input string
     * @param encoding encoding of both the input and the expected result
     */
    private void testByteArray(byte[] expected, byte[] input, TruffleString.Encoding encoding) {
        TruffleString wellFormed = node.execute(TruffleString.fromByteArrayUncached(input, encoding), encoding);
        // The loop below only visits expected.length bytes, so without this check a result with
        // additional trailing bytes would be accepted.
        Assert.assertEquals(expected.length, wellFormed.byteLength(encoding));
        for (int i = 0; i < expected.length; i++) {
            Assert.assertEquals(Byte.toUnsignedInt(expected[i]), wellFormed.readByteUncached(i, encoding));
        }
        Assert.assertTrue(wellFormed.isValidUncached(encoding));
    }

    /** Unpaired surrogates are expected to be replaced by U+FFFD; valid pairs are kept. */
    @Test
    public void testUTF16() {
        testUTF16("a\ufffd", "a\udfff");
        testUTF16("a\ufffd", "a\udbff");
        testUTF16("a\ufffd\ufffd", "a\udfff\udfff");
        testUTF16("a\ufffd\ufffd", "a\udbff\udbff");
        testUTF16("a\udbff\udfff\ufffd", "a\udbff\udfff\udbff");
        testUTF16("a\udbff\udfff\ufffdb", "a\udbff\udfff\udbffb");
    }

    /**
     * Runs the node on {@code input} as UTF-16 and compares the result, converted back to a Java
     * string, with {@code expected}.
     */
    private void testUTF16(String expected, String input) {
        TruffleString wellFormed = node.execute(TruffleString.fromJavaStringUncached(input, UTF_16), UTF_16);
        Assert.assertEquals(expected, wellFormed.toJavaStringUncached());
        Assert.assertTrue(wellFormed.isValidUncached(UTF_16));
    }

    /**
     * Surrogate values and values above {@link Character#MAX_CODE_POINT} are expected to be
     * replaced by U+FFFD in UTF-32.
     */
    @Test
    public void testUTF32() {
        testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Character.MIN_SURROGATE});
        testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Character.MAX_SURROGATE});
        testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Integer.MAX_VALUE});
        testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', Integer.MIN_VALUE});
        testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', 0x110000});
        testUTF32(new int[]{'a', 0xfffd}, new int[]{'a', 0xffff_ffff});
        testUTF32(new int[]{'a', Character.MAX_CODE_POINT}, new int[]{'a', Character.MAX_CODE_POINT});
        testUTF32(new int[]{'a', Character.MAX_CODE_POINT, 0xfffd}, new int[]{'a', Character.MAX_CODE_POINT, Character.MIN_SURROGATE});
    }

    /**
     * Runs the node on {@code input} as UTF-32 and checks that the result consists of exactly the
     * code points in {@code expected} and is valid.
     *
     * @param expected the complete expected code point sequence of the result
     * @param input the (possibly invalid) UTF-32 values of the input string
     */
    private void testUTF32(int[] expected, int[] input) {
        TruffleString wellFormed = node.execute(TruffleString.fromIntArrayUTF32Uncached(input), UTF_32);
        // The loop below only visits expected.length code points, so without this check a result
        // with additional trailing code points would be accepted.
        Assert.assertEquals(expected.length, wellFormed.codePointLengthUncached(UTF_32));
        for (int i = 0; i < expected.length; i++) {
            Assert.assertEquals(expected[i], wellFormed.codePointAtIndexUncached(i, UTF_32));
        }
        Assert.assertTrue(wellFormed.isValidUncached(UTF_32));
    }

    /** Both the string and the encoding argument must be rejected when {@code null}. */
    @Test
    public void testNull() throws Exception {
        expectNullPointerException(() -> node.execute(null, UTF_16));
        expectNullPointerException(() -> node.execute(S_UTF16, null));
    }
}
11 changes: 10 additions & 1 deletion truffle/src/com.oracle.truffle.api.strings/snapshot.sigtest
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ hfds GIL_LOCK,PARENT_LIMIT,SAME_LANGUAGE_CHECK_VISITOR,parent
CLSS public abstract interface com.oracle.truffle.api.nodes.NodeInterface

CLSS public abstract com.oracle.truffle.api.strings.AbstractTruffleString
meth public com.oracle.truffle.api.strings.TruffleString toValidStringUncached(com.oracle.truffle.api.strings.TruffleString$Encoding)
meth public final boolean codeRangeEqualsUncached(com.oracle.truffle.api.strings.TruffleString$CodeRange)
meth public final boolean equals(java.lang.Object)
meth public final boolean equalsUncached(com.oracle.truffle.api.strings.AbstractTruffleString,com.oracle.truffle.api.strings.TruffleString$Encoding)
Expand Down Expand Up @@ -353,6 +354,7 @@ innr public abstract static SubstringByteIndexNode
innr public abstract static SubstringNode
innr public abstract static SwitchEncodingNode
innr public abstract static ToJavaStringNode
innr public abstract static ToValidStringNode
innr public final static !enum CodeRange
innr public final static !enum CompactionLevel
innr public final static !enum Encoding
Expand Down Expand Up @@ -985,6 +987,13 @@ meth public static com.oracle.truffle.api.strings.TruffleString$ToJavaStringNode
meth public static com.oracle.truffle.api.strings.TruffleString$ToJavaStringNode getUncached()
supr com.oracle.truffle.api.nodes.Node

CLSS public abstract static com.oracle.truffle.api.strings.TruffleString$ToValidStringNode
outer com.oracle.truffle.api.strings.TruffleString
meth public abstract com.oracle.truffle.api.strings.TruffleString execute(com.oracle.truffle.api.strings.AbstractTruffleString,com.oracle.truffle.api.strings.TruffleString$Encoding)
meth public static com.oracle.truffle.api.strings.TruffleString$ToValidStringNode create()
meth public static com.oracle.truffle.api.strings.TruffleString$ToValidStringNode getUncached()
supr com.oracle.truffle.api.nodes.Node

CLSS public final static com.oracle.truffle.api.strings.TruffleString$WithMask
outer com.oracle.truffle.api.strings.TruffleString
innr public abstract static CreateNode
Expand Down Expand Up @@ -1139,7 +1148,7 @@ CLSS public final com.oracle.truffle.api.strings.TruffleStringFactory
cons public init()
innr public final static WithMaskFactory
supr java.lang.Object
hcls AsManagedNodeGen,AsNativeNodeGen,AsTruffleStringNodeGen,ByteIndexOfAnyByteNodeGen,ByteIndexOfCodePointNodeGen,ByteIndexOfCodePointSetNodeGen,ByteIndexOfStringNodeGen,ByteIndexToCodePointIndexNodeGen,ByteLengthOfCodePointNodeGen,CharIndexOfAnyCharUTF16NodeGen,CodePointAtByteIndexNodeGen,CodePointAtIndexNodeGen,CodePointIndexToByteIndexNodeGen,CodePointLengthNodeGen,CodeRangeEqualsNodeGen,CompareBytesNodeGen,CompareCharsUTF16NodeGen,CompareIntsUTF32NodeGen,ConcatNodeGen,CopyToByteArrayNodeGen,CopyToNativeMemoryNodeGen,CreateBackwardCodePointIteratorNodeGen,CreateCodePointIteratorNodeGen,EqualNodeGen,ForceEncodingNodeGen,FromByteArrayNodeGen,FromCharArrayUTF16NodeGen,FromCodePointNodeGen,FromIntArrayUTF32NodeGen,FromJavaStringNodeGen,FromLongNodeGen,FromNativePointerNodeGen,GetByteCodeRangeNodeGen,GetCodeRangeImpreciseNodeGen,GetCodeRangeNodeGen,GetInternalByteArrayNodeGen,GetInternalNativePointerNodeGen,GetStringCompactionLevelNodeGen,HashCodeNodeGen,IndexOfCodePointNodeGen,IndexOfStringNodeGen,IntIndexOfAnyIntUTF32NodeGen,InternalAsTruffleStringNodeGen,InternalCopyToByteArrayNodeGen,InternalSwitchEncodingNodeGen,IsValidNodeGen,LastByteIndexOfCodePointNodeGen,LastByteIndexOfStringNodeGen,LastIndexOfCodePointNodeGen,LastIndexOfStringNodeGen,MaterializeNodeGen,ParseDoubleNodeGen,ParseIntNodeGen,ParseLongNodeGen,ReadByteNodeGen,ReadCharUTF16NodeGen,RegionEqualByteIndexNodeGen,RegionEqualNodeGen,RepeatNodeGen,SubstringByteIndexNodeGen,SubstringNodeGen,SwitchEncodingNodeGen,ToIndexableNodeGen,ToJavaStringNodeGen
hcls AsManagedNodeGen,AsNativeNodeGen,AsTruffleStringNodeGen,ByteIndexOfAnyByteNodeGen,ByteIndexOfCodePointNodeGen,ByteIndexOfCodePointSetNodeGen,ByteIndexOfStringNodeGen,ByteIndexToCodePointIndexNodeGen,ByteLengthOfCodePointNodeGen,CharIndexOfAnyCharUTF16NodeGen,CodePointAtByteIndexNodeGen,CodePointAtIndexNodeGen,CodePointIndexToByteIndexNodeGen,CodePointLengthNodeGen,CodeRangeEqualsNodeGen,CompareBytesNodeGen,CompareCharsUTF16NodeGen,CompareIntsUTF32NodeGen,ConcatNodeGen,CopyToByteArrayNodeGen,CopyToNativeMemoryNodeGen,CreateBackwardCodePointIteratorNodeGen,CreateCodePointIteratorNodeGen,EqualNodeGen,ForceEncodingNodeGen,FromByteArrayNodeGen,FromCharArrayUTF16NodeGen,FromCodePointNodeGen,FromIntArrayUTF32NodeGen,FromJavaStringNodeGen,FromLongNodeGen,FromNativePointerNodeGen,GetByteCodeRangeNodeGen,GetCodeRangeImpreciseNodeGen,GetCodeRangeNodeGen,GetInternalByteArrayNodeGen,GetInternalNativePointerNodeGen,GetStringCompactionLevelNodeGen,HashCodeNodeGen,IndexOfCodePointNodeGen,IndexOfStringNodeGen,IntIndexOfAnyIntUTF32NodeGen,InternalAsTruffleStringNodeGen,InternalCopyToByteArrayNodeGen,InternalSwitchEncodingNodeGen,IsValidNodeGen,LastByteIndexOfCodePointNodeGen,LastByteIndexOfStringNodeGen,LastIndexOfCodePointNodeGen,LastIndexOfStringNodeGen,MaterializeNodeGen,ParseDoubleNodeGen,ParseIntNodeGen,ParseLongNodeGen,ReadByteNodeGen,ReadCharUTF16NodeGen,RegionEqualByteIndexNodeGen,RegionEqualNodeGen,RepeatNodeGen,SubstringByteIndexNodeGen,SubstringNodeGen,SwitchEncodingNodeGen,ToIndexableNodeGen,ToJavaStringNodeGen,ToValidStringNodeGen

CLSS public final static com.oracle.truffle.api.strings.TruffleStringFactory$WithMaskFactory
outer com.oracle.truffle.api.strings.TruffleStringFactory
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,10 @@ final void invalidateCodePointLength() {
codePointLength = -1;
}

/**
 * Reports whether {@code codePointLength} currently holds a non-negative value.
 *
 * @return {@code true} if {@code codePointLength >= 0}, {@code false} otherwise
 */
final boolean isCodePointLengthKnown() {
return codePointLength >= 0;
}

/**
 * Resets the {@code hashCode} field to 0.
 * NOTE(review): 0 presumably marks the hash code as not yet computed; confirm against the code
 * that computes and reads the hash.
 */
final void invalidateHashCode() {
hashCode = 0;
}
Expand Down Expand Up @@ -1207,6 +1211,16 @@ public final void copyToNativeMemoryUncached(int byteFromIndexA, Object pointerO
TruffleString.CopyToNativeMemoryNode.getUncached().execute(this, byteFromIndexA, pointerObject, byteFromIndexDst, byteLength, expectedEncoding);
}

/**
 * Convenience method that runs the uncached {@link TruffleString.ToValidStringNode} on this
 * string.
 *
 * @param expectedEncoding forwarded unchanged as the node's encoding argument
 * @return the string produced by the node
 * @since 23.1
 */
@TruffleBoundary
public TruffleString toValidStringUncached(Encoding expectedEncoding) {
    final TruffleString.ToValidStringNode uncachedNode = TruffleString.ToValidStringNode.getUncached();
    return uncachedNode.execute(this, expectedEncoding);
}

/**
* Shorthand for calling the uncached version of {@link TruffleString.ToJavaStringNode}.
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ final class Encodings {
static final byte UTF8_ACCEPT = 0;
static final byte UTF8_REJECT = 12;
static final byte UTF8_REVERSE_INCOMPLETE_SEQ = 24;
/**
 * UTF-8 encoding of the code point 0xFFFD (U+FFFD, the Unicode replacement character): the three
 * bytes {@code EF BF BD}.
 */
static final byte[] CONVERSION_REPLACEMENT_UTF_8 = {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD};

static byte[] getUTF8DecodingStateMachine(DecodingErrorHandler errorHandler) {
return errorHandler == DecodingErrorHandler.DEFAULT_KEEP_SURROGATES_IN_UTF8 ? Encodings.UTF_8_STATE_MACHINE_ALLOW_UTF16_SURROGATES : Encodings.UTF_8_STATE_MACHINE;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,6 @@ private static void econvInsertOutput(TruffleString.Encoding targetEncoding, Enc
}

private static final byte[] CONVERSION_REPLACEMENT = {'?'};
private static final byte[] CONVERSION_REPLACEMENT_UTF_8 = {(byte) 0xEF, (byte) 0xBF, (byte) 0xBD};
private static final byte[] CONVERSION_REPLACEMENT_UTF_16 = TStringGuards.littleEndian() ? new byte[]{(byte) 0xFD, (byte) 0xFF} : new byte[]{(byte) 0xFF, (byte) 0xFD};
private static final byte[] CONVERSION_REPLACEMENT_UTF_32 = TStringGuards.littleEndian() ? new byte[]{(byte) 0xFD, (byte) 0xFF, 0, 0} : new byte[]{0, 0, (byte) 0xFF, (byte) 0xFD};

Expand Down Expand Up @@ -318,7 +317,7 @@ public TruffleString transcode(Node location, AbstractTruffleString a, Object ar
} else {
final byte[] replacement;
if (isUTF8(targetEncoding)) {
replacement = CONVERSION_REPLACEMENT_UTF_8;
replacement = Encodings.CONVERSION_REPLACEMENT_UTF_8;
} else if (isUTF16(targetEncoding)) {
replacement = CONVERSION_REPLACEMENT_UTF_16;
} else if (isUTF32(targetEncoding)) {
Expand Down
Loading

0 comments on commit 1e9366e

Please sign in to comment.