diff --git a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
index 9c9a887aa7e36c..70263b4bf1649c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
@@ -30,6 +30,8 @@
import java.util.Map;
import java.util.Optional;
import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import java.util.stream.Collectors;
/**
@@ -60,11 +62,12 @@
* Virtual Host AWS Client (Hadoop S3) Mixed Style: isPathStyle = false && forceParsingByStandardUri = true
* Path AWS Client (Hadoop S3) Mixed Style: isPathStyle = true && forceParsingByStandardUri = true
*
- * When the incoming location is url encoded, the encoded string will be returned.
- * For getKey(), getQueryParams() will return the encoding string
*/
public class S3URI {
+
+ private static final Pattern URI_PATTERN =
+ Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?");
public static final String SCHEME_DELIM = "://";
public static final String PATH_DELIM = "/";
private static final Set VALID_SCHEMES = ImmutableSet.of("http", "https", "s3", "s3a", "s3n",
@@ -117,7 +120,8 @@ private S3URI(String location, boolean isPathStyle, boolean forceParsingByStanda
}
private void parseUri(String location, boolean forceParsingStandardUri) throws UserException {
- validateUri(location);
+ parseURILocation(location);
+ validateUri();
if (!forceParsingStandardUri && OS_SCHEMES.contains(uri.getScheme().toLowerCase())) {
parseAwsCliStyleUri();
@@ -127,12 +131,29 @@ private void parseUri(String location, boolean forceParsingStandardUri) throws U
parseEndpointAndRegion();
}
- private void validateUri(String location) throws UserException {
+ /**
+ * Splits location into scheme, authority, path, query and fragment via URI_PATTERN and stores a normalized URI in uri.
+ * @param location raw location string; its matched components are passed to the multi-argument URI constructor
+ * @throws UserException if location does not match URI_PATTERN or the components do not form a valid URI
+ */
+ private void parseURILocation(String location) throws UserException {
+ Matcher matcher = URI_PATTERN.matcher(location);
+ if (!matcher.matches()) {
+ throw new UserException("Failed to parse uri: " + location);
+ }
+ String scheme = matcher.group(2);
+ String authority = matcher.group(4);
+ String path = matcher.group(5);
+ String query = matcher.group(7);
+ String fragment = matcher.group(9);
try {
- uri = new URI(location);
+ uri = new URI(scheme, authority, path, query, fragment).normalize();
} catch (URISyntaxException e) {
throw new UserException(e);
}
+ }
+
+ private void validateUri() throws UserException {
if (uri.getScheme() == null || !VALID_SCHEMES.contains(uri.getScheme().toLowerCase())) {
throw new UserException("Invalid scheme: " + this.uri);
}
@@ -143,7 +164,7 @@ private void parseAwsCliStyleUri() throws UserException {
if (bucket == null) {
throw new UserException("missing bucket: " + uri);
}
- String path = uri.getRawPath();
+ String path = uri.getPath();
if (path.length() > 1) {
key = path.substring(1);
} else {
@@ -173,7 +194,7 @@ private void parseStandardUri() throws UserException {
private void addQueryParamsIfNeeded() {
if (uri.getQuery() != null) {
- queryParams = splitQueryString(uri.getRawQuery()).stream().map((s) -> s.split("="))
+ queryParams = splitQueryString(uri.getQuery()).stream().map((s) -> s.split("="))
.map((s) -> s.length == 1 ? new String[] {s[0], null} : s).collect(
Collectors.groupingBy((a) -> a[0],
Collectors.mapping((a) -> a[1], Collectors.toList())));
@@ -201,7 +222,7 @@ private static List splitQueryString(String queryString) {
}
private void parsePathStyleUri() throws UserException {
- String path = uri.getRawPath();
+ String path = uri.getPath();
if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
int index = path.indexOf('/', 1);
@@ -226,7 +247,7 @@ private void parsePathStyleUri() throws UserException {
private void parseVirtualHostedStyleUri() throws UserException {
bucket = uri.getHost().split("\\.")[0];
- String path = uri.getRawPath();
+ String path = uri.getPath();
if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
key = path.substring(1);
} else {
diff --git a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java
index 1d92158c9cf0cf..87fcdae27ac585 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/common/util/S3URITest.java
@@ -170,6 +170,19 @@ public void testEncodedString() throws UserException {
Assert.assertTrue(uri1.getQueryParams().get().get("partNumber").contains("88"));
}
+ @Test
+ public void testHadoopEncodedString() throws UserException {
+ String p1 = "s3://bucket/path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz";
+ boolean isPathStyle = false;
+ boolean forceParsingStandardUri = false;
+ S3URI uri1 = S3URI.create(p1, isPathStyle, forceParsingStandardUri);
+ // The key is expected back exactly as written in p1: the percent sequences and the literal space stay unchanged.
+ Assert.assertEquals("bucket", uri1.getBucket());
+ Assert.assertEquals("path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz", uri1.getKey());
+ Assert.assertEquals(Optional.empty(), uri1.getEndpoint());
+ Assert.assertEquals(Optional.empty(), uri1.getRegion());
+ }
+
@Test(expected = UserException.class)
public void missingBucket() throws UserException {
S3URI.create("https:///");