This repository has been archived by the owner on Nov 6, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
s3.yaml
49 lines (47 loc) · 2.54 KB
/
s3.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
# S3 collector-config.yaml example
# All AWS S3 parameters are optional, following the default behavior of boto3.
# If they are not provided, boto3 searches for credentials in environment variables, ~/.aws/credentials, and ~/.aws/config.
# Base URL of the platform that the collector reports to.
platform_host_url: 'http://localhost:8080'
# Pulling interval in minutes. Omit this key to run the collector only once.
default_pulling_interval: 60
# Token that must be retrieved from the platform.
token: ''
plugins:
# Plain S3 adapter: explicit credentials, a filename filter and one dataset location.
- type: s3
  name: s3_adapter
  aws_secret_access_key: <aws_secret_access_key>  # Optional.
  aws_access_key_id: <aws_access_key_id>  # Optional.
  aws_session_token: <aws_session_token>  # Optional. Required if using temporary credentials.
  aws_region: <aws_region>  # Optional.
  aws_role_arn: <aws_role_arn>  # Optional. Required for assuming role with temporary credentials.
  aws_role_session_name: <aws_role_session_name>  # Optional. Required for assuming role with temporary credentials.
  profile_name: <profile_name>  # Optional.
  # Optional. Default filter allows each file to be ingested to platform.
  # Patterns are presumably regular expressions (confirm against the collector);
  # the dot before "parquet" is escaped so that it matches a literal '.' only.
  filename_filter:
    include: ['.*\.parquet']
    exclude: ['dev_.*']
  dataset_config:
    bucket: my_bucket
    # Optional. Default is empty string.
    # NOTE(review): the include filter above admits only .parquet names, so a
    # prefix pointing at a .csv file would presumably be filtered out - confirm
    # that the two example values are meant to be used together.
    prefix: folder/subfolder/file.csv
# When we want to use the folder as a dataset. Very useful for partitioned datasets.
- type: s3
  name: s3_partitioned_adapter
  aws_secret_access_key: <aws_secret_access_key>  # Optional.
  aws_access_key_id: <aws_access_key_id>  # Optional.
  aws_session_token: <aws_session_token>  # Optional. Required if using temporary credentials.
  aws_region: <aws_region>  # Optional.
  aws_role_arn: <aws_role_arn>  # Optional. Required for assuming role with temporary credentials.
  aws_role_session_name: <aws_role_session_name>  # Optional. Required for assuming role with temporary credentials.
  profile_name: <profile_name>  # Optional.
  dataset_config:
    bucket: my_bucket
    prefix: partitioned_data/
    folder_as_dataset:
      # Format of the files in the folder. Can be parquet, csv, or tsv.
      file_format: parquet
      # Optional. Default is hive. Can be hive or presto.
      flavor: hive
      # Optional. Must be provided if flavor is other than hive,
      # i.e. for a structure like s3://my_bucket/partitioned_data/year/...
      field_names: ['year', 'month']
# When S3 storage is compatible with AWS S3 API, for example Minio.
- type: s3
  name: s3_minio_adapter
  # URL of the S3-compatible service, used instead of the AWS endpoint.
  endpoint_url: http://localhost:9000
  # minioadmin/minioadmin are MinIO's documented default root credentials;
  # keep them for local testing only and replace them anywhere else.
  aws_secret_access_key: minioadmin
  aws_access_key_id: minioadmin
  dataset_config:
    bucket: my_bucket