diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java index 9c9a887aa7e36c..70263b4bf1649c 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java +++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java @@ -30,6 +30,8 @@ import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.Collectors; /** @@ -60,11 +62,12 @@ * Virtual Host AWS Client (Hadoop S3) Mixed Style: isPathStyle = false && forceParsingByStandardUri = true * Path AWS Client (Hadoop S3) Mixed Style: isPathStyle = true && forceParsingByStandardUri = true * - * When the incoming location is url encoded, the encoded string will be returned. - * For getKey(), getQueryParams() will return the encoding string */ public class S3URI { + + private static final Pattern URI_PATTERN = + Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?"); public static final String SCHEME_DELIM = "://"; public static final String PATH_DELIM = "/"; private static final Set VALID_SCHEMES = ImmutableSet.of("http", "https", "s3", "s3a", "s3n", @@ -117,7 +120,8 @@ private S3URI(String location, boolean isPathStyle, boolean forceParsingByStanda } private void parseUri(String location, boolean forceParsingStandardUri) throws UserException { - validateUri(location); + parseURILocation(location); + validateUri(); if (!forceParsingStandardUri && OS_SCHEMES.contains(uri.getScheme().toLowerCase())) { parseAwsCliStyleUri(); @@ -127,12 +131,29 @@ private void parseUri(String location, boolean forceParsingStandardUri) throws U parseEndpointAndRegion(); } - private void validateUri(String location) throws UserException { + /** + * Splits the location with URI_PATTERN and rebuilds it as a normalized URI stored in the uri field. 
+ * @param location raw location string, broken into scheme, authority, path, query and fragment + * @throws UserException if location does not match URI_PATTERN or the URI constructor rejects the parts + */ + private void parseURILocation(String location) throws UserException { + Matcher matcher = URI_PATTERN.matcher(location); + if (!matcher.matches()) { + throw new UserException("Failed to parse uri: " + location); + } + String scheme = matcher.group(2); + String authority = matcher.group(4); + String path = matcher.group(5); + String query = matcher.group(7); + String fragment = matcher.group(9); try { - uri = new URI(location); + uri = new URI(scheme, authority, path, query, fragment).normalize(); } catch (URISyntaxException e) { throw new UserException(e); } + } + + private void validateUri() throws UserException { if (uri.getScheme() == null || !VALID_SCHEMES.contains(uri.getScheme().toLowerCase())) { throw new UserException("Invalid scheme: " + this.uri); } @@ -143,7 +164,7 @@ private void parseAwsCliStyleUri() throws UserException { if (bucket == null) { throw new UserException("missing bucket: " + uri); } - String path = uri.getRawPath(); + String path = uri.getPath(); if (path.length() > 1) { key = path.substring(1); } else { @@ -173,7 +194,7 @@ private void parseStandardUri() throws UserException { private void addQueryParamsIfNeeded() { if (uri.getQuery() != null) { - queryParams = splitQueryString(uri.getRawQuery()).stream().map((s) -> s.split("=")) + queryParams = splitQueryString(uri.getQuery()).stream().map((s) -> s.split("=")) .map((s) -> s.length == 1 ? 
new String[] {s[0], null} : s).collect( Collectors.groupingBy((a) -> a[0], Collectors.mapping((a) -> a[1], Collectors.toList()))); @@ -201,7 +222,7 @@ private static List splitQueryString(String queryString) { } private void parsePathStyleUri() throws UserException { - String path = uri.getRawPath(); + String path = uri.getPath(); if (!StringUtils.isEmpty(path) && !"/".equals(path)) { int index = path.indexOf('/', 1); @@ -226,7 +247,7 @@ private void parsePathStyleUri() throws UserException { private void parseVirtualHostedStyleUri() throws UserException { bucket = uri.getHost().split("\\.")[0]; - String path = uri.getRawPath(); + String path = uri.getPath(); if (!StringUtils.isEmpty(path) && !"/".equals(path)) { key = path.substring(1); } else { diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java index 1d92158c9cf0cf..87fcdae27ac585 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java @@ -170,6 +170,19 @@ public void testEncodedString() throws UserException { Assert.assertTrue(uri1.getQueryParams().get().get("partNumber").contains("88")); } + @Test + public void testHadoopEncodedString() throws UserException { + String p1 = "s3://bucket/path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz"; /* escapes and raw space must survive */ + boolean isPathStyle = false; + boolean forceParsingStandardUri = false; + S3URI uri1 = S3URI.create(p1, isPathStyle, forceParsingStandardUri); + + Assert.assertEquals("bucket", uri1.getBucket()); + Assert.assertEquals("path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz", uri1.getKey()); + Assert.assertEquals(Optional.empty(), uri1.getEndpoint()); + Assert.assertEquals(Optional.empty(), uri1.getRegion()); + } + @Test(expected = UserException.class) public void missingBucket() throws UserException { S3URI.create("https:///");