Skip to content

Commit

Permalink
(Fix)[hive-writer] Fixed the issue when partition values contain spac…
Browse files Browse the repository at this point in the history
…es when writing to s3. (apache#35645)

## Proposed changes

Issue Number: close apache#31442

(Fix) [hive-writer] Fixed the issue when partition values contain spaces
when writing to s3.

### Error msg
```
org.apache.doris.common.UserException: errCode = 2, detailMessage = java.net.URISyntaxException: Illegal character in path at index 114: oss://xxxxxxxxxxx/hive/tpcds1000_partition_oss/call_center/cc_call_center_sk=1/cc_mkt_class=A bit narrow forms matter animals. Consist/cc_market_manager=Daniel Weller/cc_rec_end_date=2001-12-31/f6b5ff4253414b06-9fd365ef68e5ddc5_133f02fb-a7e0-4109-9100-fb748a28259e-0.zlib.orc
        at org.apache.doris.common.util.S3URI.validateUri(S3URI.java:134) ~[doris-fe.jar:1.2-SNAPSHOT]
        at org.apache.doris.common.util.S3URI.parseUri(S3URI.java:120) ~[doris-fe.jar:1.2-SNAPSHOT]
        at org.apache.doris.common.util.S3URI.<init>(S3URI.java:116) ~[doris-fe.jar:1.2-SNAPSHOT]
        at org.apache.doris.common.util.S3URI.create(S3URI.java:108) ~[doris-fe.jar:1.2-SNAPSHOT]
        at org.apache.doris.fs.obj.S3ObjStorage.deleteObject(S3ObjStorage.java:194) ~[doris-fe.jar:1.2-SNAPSHOT]
        at org.apache.doris.fs.remote.ObjFileSystem.delete(ObjFileSystem.java:150) ~[doris-fe.jar:1.2-SNAPSHOT]
        at org.apache.doris.fs.remote.SwitchingFileSystem.delete(SwitchingFileSystem.java:92) ~[doris-fe.jar:1.2-
```

### Root Cause
Hadoop partition names will encode some special characters, but not
space characters, which is different from URI encoding. Therefore, an
error will be reported when constructing URI.

### Solution
The solution is to use regular expressions to parse URI, and then pass
in each part of URI to construct URI. This URI constructor will encode
each part of URI.
  • Loading branch information
kaka11chen authored and Doris-Extras committed May 31, 2024
1 parent cb96a79 commit 2bad561
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 9 deletions.
39 changes: 30 additions & 9 deletions fe/fe-core/src/main/java/org/apache/doris/common/util/S3URI.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;

/**
Expand Down Expand Up @@ -60,11 +62,12 @@
* Virtual Host AWS Client (Hadoop S3) Mixed Style: <code>isPathStyle = false && forceParsingByStandardUri = true</code>
* Path AWS Client (Hadoop S3) Mixed Style: <code>isPathStyle = true && forceParsingByStandardUri = true</code>
*
* When the incoming location is url encoded, the encoded string will be returned.
* For <code>getKey()</code>, <code>getQueryParams()</code> will return the encoding string
*/

public class S3URI {

private static final Pattern URI_PATTERN =
Pattern.compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?");
public static final String SCHEME_DELIM = "://";
public static final String PATH_DELIM = "/";
private static final Set<String> VALID_SCHEMES = ImmutableSet.of("http", "https", "s3", "s3a", "s3n",
Expand Down Expand Up @@ -117,7 +120,8 @@ private S3URI(String location, boolean isPathStyle, boolean forceParsingByStanda
}

private void parseUri(String location, boolean forceParsingStandardUri) throws UserException {
validateUri(location);
parseURILocation(location);
validateUri();

if (!forceParsingStandardUri && OS_SCHEMES.contains(uri.getScheme().toLowerCase())) {
parseAwsCliStyleUri();
Expand All @@ -127,12 +131,29 @@ private void parseUri(String location, boolean forceParsingStandardUri) throws U
parseEndpointAndRegion();
}

private void validateUri(String location) throws UserException {
/**
* parse uri location and encode to a URI.
* @param location
* @throws UserException
*/
private void parseURILocation(String location) throws UserException {
Matcher matcher = URI_PATTERN.matcher(location);
if (!matcher.matches()) {
throw new UserException("Failed to parse uri: " + location);
}
String scheme = matcher.group(2);
String authority = matcher.group(4);
String path = matcher.group(5);
String query = matcher.group(7);
String fragment = matcher.group(9);
try {
uri = new URI(location);
uri = new URI(scheme, authority, path, query, fragment).normalize();
} catch (URISyntaxException e) {
throw new UserException(e);
}
}

private void validateUri() throws UserException {
if (uri.getScheme() == null || !VALID_SCHEMES.contains(uri.getScheme().toLowerCase())) {
throw new UserException("Invalid scheme: " + this.uri);
}
Expand All @@ -143,7 +164,7 @@ private void parseAwsCliStyleUri() throws UserException {
if (bucket == null) {
throw new UserException("missing bucket: " + uri);
}
String path = uri.getRawPath();
String path = uri.getPath();
if (path.length() > 1) {
key = path.substring(1);
} else {
Expand Down Expand Up @@ -173,7 +194,7 @@ private void parseStandardUri() throws UserException {

private void addQueryParamsIfNeeded() {
if (uri.getQuery() != null) {
queryParams = splitQueryString(uri.getRawQuery()).stream().map((s) -> s.split("="))
queryParams = splitQueryString(uri.getQuery()).stream().map((s) -> s.split("="))
.map((s) -> s.length == 1 ? new String[] {s[0], null} : s).collect(
Collectors.groupingBy((a) -> a[0],
Collectors.mapping((a) -> a[1], Collectors.toList())));
Expand Down Expand Up @@ -201,7 +222,7 @@ private static List<String> splitQueryString(String queryString) {
}

private void parsePathStyleUri() throws UserException {
String path = uri.getRawPath();
String path = uri.getPath();

if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
int index = path.indexOf('/', 1);
Expand All @@ -226,7 +247,7 @@ private void parsePathStyleUri() throws UserException {
private void parseVirtualHostedStyleUri() throws UserException {
bucket = uri.getHost().split("\\.")[0];

String path = uri.getRawPath();
String path = uri.getPath();
if (!StringUtils.isEmpty(path) && !"/".equals(path)) {
key = path.substring(1);
} else {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,19 @@ public void testEncodedString() throws UserException {
Assert.assertTrue(uri1.getQueryParams().get().get("partNumber").contains("88"));
}

@Test
public void testHadoopEncodedString() throws UserException {
String p1 = "s3://bucket/path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz";
boolean isPathStyle = false;
boolean forceParsingStandardUri = false;
S3URI uri1 = S3URI.create(p1, isPathStyle, forceParsingStandardUri);

Assert.assertEquals("bucket", uri1.getBucket());
Assert.assertEquals("path%20to%20file/abc%3Aqqq=xyz%2Fyyy zzz", uri1.getKey());
Assert.assertEquals(Optional.empty(), uri1.getEndpoint());
Assert.assertEquals(Optional.empty(), uri1.getRegion());
}

@Test(expected = UserException.class)
public void missingBucket() throws UserException {
S3URI.create("https:///");
Expand Down

0 comments on commit 2bad561

Please sign in to comment.