# Base image from https://nightlies.apache.org/flink/flink-docs-release-1.20/docs/deployment/resource-providers/standalone/docker/
FROM flink:1.20.0-scala_2.12-java17
# Build argument(s)
ARG FLINK_LANGUAGE
ARG AWS_ACCESS_KEY_ID
ARG AWS_SECRET_ACCESS_KEY
ARG AWS_REGION
ARG AWS_S3_BUCKET
ARG AWS_SESSION_TOKEN
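# Illustrative build command (the image tag and argument values below are
# assumptions, not taken from this repository):
#   docker build -f linux-Dockerfile \
#     --build-arg FLINK_LANGUAGE=python \
#     --build-arg AWS_ACCESS_KEY_ID=<key-id> \
#     --build-arg AWS_SECRET_ACCESS_KEY=<secret> \
#     --build-arg AWS_REGION=us-east-1 \
#     --build-arg AWS_S3_BUCKET=<bucket> \
#     --build-arg AWS_SESSION_TOKEN=<token> \
#     -t flink-kickstarter .
# Note that values passed as build args are recorded in the image's layer history.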
# Map build argument(s) to container environment variable(s)
ENV FLINK_LANGUAGE=${FLINK_LANGUAGE}
ENV AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
ENV AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
ENV AWS_REGION=${AWS_REGION}
ENV AWS_S3_BUCKET=${AWS_S3_BUCKET}
ENV AWS_SESSION_TOKEN=${AWS_SESSION_TOKEN}
# Create/set version number container environment variable(s)
ENV APACHE_FLINK_VERSION=1.20
ENV APACHE_FLINK_PATCH=.0
ENV APACHE_FLINK_VERSION_WITH_PATCH=${APACHE_FLINK_VERSION}${APACHE_FLINK_PATCH}
ENV PYTHON_VERSION=3.11
ENV PYTHON_PATCH=.9
ENV PYTHON_VERSION_WITH_PATCH=${PYTHON_VERSION}${PYTHON_PATCH}
ENV HADOOP_VERSION=3.4.1
ENV ICEBERG_VERSION=1.7.0
# Create/set folder container environment variable(s)
ENV FLINK_CONF_DIR=/opt/flink/conf
ENV FLINK_LIB_DIR=/opt/flink/lib/
# Create/set URL container environment variable(s)
ENV MAVEN_ROOT_URL=https://repo1.maven.org/maven2/org/apache/
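# With the value above, artifact URLs compose as, for example (illustrative):
#   ${MAVEN_ROOT_URL}flink/flink-json/1.20.0/flink-json-1.20.0.jar
#   -> https://repo1.maven.org/maven2/org/apache/flink/flink-json/1.20.0/flink-json-1.20.0.jar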
# Container metadata
LABEL maintainer="j3@thej3.com" \
description="Apache Flink ${APACHE_FLINK_VERSION_WITH_PATCH} container with Python ${PYTHON_VERSION_WITH_PATCH} installed."
# Install the build dependencies and utilities needed to compile Python from source, plus general tooling used later in the build
# Install libbz2-dev, a development package for the bzip2 library (libbz2), which is used for compressing and decompressing data using the Burrows-Wheeler block-sorting text compression algorithm and Huffman coding.
# Install libffi-dev, a development package for the libffi library, which provides a portable, high-level programming interface to various calling conventions.
# Install libssl-dev, a development package for the OpenSSL library, which is a general-purpose cryptography library that provides an open-source implementation of the Secure Sockets Layer (SSL) and Transport Layer Security (TLS) protocols.
# Install zlib1g-dev, a development package for the zlib library, which is a software library used for data compression.
# Install openjdk-17-jdk-headless, a headless version of the OpenJDK 17 Java Development Kit (JDK) that does not include any graphical user interface (GUI) libraries or tools.
RUN apt-get update && apt-get install -y \
    build-essential \
    openjdk-17-jdk-headless \
    zlib1g-dev \
    libbz2-dev \
    libpq-dev \
    libncurses5-dev \
    libgdbm-dev \
    liblzma-dev \
    libnss3-dev \
    libssl-dev \
    libsqlite3-dev \
    libreadline-dev \
    libffi-dev \
    wget \
    git \
    python3-pip \
    python3-venv \
    zip \
    gradle \
    file \
    nano && \
    rm -rf /var/lib/apt/lists/*
# Download JAR files shared by both the Java and Python builds
RUN curl -fL "${MAVEN_ROOT_URL}flink/flink-s3-fs-hadoop/${APACHE_FLINK_VERSION_WITH_PATCH}/flink-s3-fs-hadoop-${APACHE_FLINK_VERSION_WITH_PATCH}.jar" -o "${FLINK_LIB_DIR}flink-s3-fs-hadoop-${APACHE_FLINK_VERSION_WITH_PATCH}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}hadoop/hadoop-common/${HADOOP_VERSION}/hadoop-common-${HADOOP_VERSION}.jar" -o "${FLINK_LIB_DIR}hadoop-common-${HADOOP_VERSION}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}flink/flink-shaded-hadoop-2-uber/2.8.3-10.0/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar" -o "${FLINK_LIB_DIR}flink-shaded-hadoop-2-uber-2.8.3-10.0.jar"
# Download additional JAR files needed only for the Python (PyFlink) build
RUN if [ "$FLINK_LANGUAGE" = "python" ]; then \
    curl -fL "${MAVEN_ROOT_URL}iceberg/iceberg-flink-runtime-${APACHE_FLINK_VERSION}/${ICEBERG_VERSION}/iceberg-flink-runtime-${APACHE_FLINK_VERSION}-${ICEBERG_VERSION}.jar" -o "${FLINK_LIB_DIR}iceberg-flink-runtime-${APACHE_FLINK_VERSION}-${ICEBERG_VERSION}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar" -o "${FLINK_LIB_DIR}iceberg-aws-bundle-${ICEBERG_VERSION}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}hadoop/hadoop-hdfs-client/${HADOOP_VERSION}/hadoop-hdfs-client-${HADOOP_VERSION}.jar" -o "${FLINK_LIB_DIR}hadoop-hdfs-client-${HADOOP_VERSION}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}flink/flink-sql-connector-kafka/3.3.0-${APACHE_FLINK_VERSION}/flink-sql-connector-kafka-3.3.0-${APACHE_FLINK_VERSION}.jar" -o "${FLINK_LIB_DIR}flink-sql-connector-kafka-3.3.0-${APACHE_FLINK_VERSION}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}flink/flink-connector-kafka/3.3.0-${APACHE_FLINK_VERSION}/flink-connector-kafka-3.3.0-${APACHE_FLINK_VERSION}.jar" -o "${FLINK_LIB_DIR}flink-connector-kafka-3.3.0-${APACHE_FLINK_VERSION}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}flink/flink-json/${APACHE_FLINK_VERSION_WITH_PATCH}/flink-json-${APACHE_FLINK_VERSION_WITH_PATCH}.jar" -o "${FLINK_LIB_DIR}flink-json-${APACHE_FLINK_VERSION_WITH_PATCH}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}flink/flink-avro/${APACHE_FLINK_VERSION_WITH_PATCH}/flink-avro-${APACHE_FLINK_VERSION_WITH_PATCH}.jar" -o "${FLINK_LIB_DIR}flink-avro-${APACHE_FLINK_VERSION_WITH_PATCH}.jar" && \
    curl -fL "${MAVEN_ROOT_URL}avro/avro/1.12.0/avro-1.12.0.jar" -o "${FLINK_LIB_DIR}avro-1.12.0.jar" && \
    curl -fL "${MAVEN_ROOT_URL}kafka/kafka-clients/3.8.1/kafka-clients-3.8.1.jar" -o "${FLINK_LIB_DIR}kafka-clients-3.8.1.jar"; \
    fi
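# Optional extra check (an illustrative sketch, not part of the original build):
# confirm each downloaded file is a real archive rather than a saved error page.
# The `file` utility is installed earlier in this image.
# RUN for jar in ${FLINK_LIB_DIR}*.jar; do \
#         file "$jar" | grep -qE 'Zip archive|Java archive' || { echo "bad jar: $jar"; exit 1; }; \
#     done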
# Install Python from source
RUN cd /usr/local && \
    wget https://www.python.org/ftp/python/${PYTHON_VERSION_WITH_PATCH}/Python-${PYTHON_VERSION_WITH_PATCH}.tgz && \
    tar -xf Python-${PYTHON_VERSION_WITH_PATCH}.tgz && \
    rm Python-${PYTHON_VERSION_WITH_PATCH}.tgz && \
    cd Python-${PYTHON_VERSION_WITH_PATCH} && \
    ./configure --enable-optimizations && \
    make -j$(nproc) && \
    make altinstall
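# `make altinstall` installs the interpreter as python${PYTHON_VERSION} under
# /usr/local/bin without overwriting the distribution's `python3`. A quick
# smoke test (illustrative):
# RUN python${PYTHON_VERSION} --version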
# Create a symbolic link to the installed Python binary
RUN ln -s /usr/local/bin/python${PYTHON_VERSION} /usr/bin/python
# Make sure the install location of the new interpreter is on the PATH
ENV PATH="/usr/local/bin:${PATH}"
# Set JAVA_HOME (this path assumes an arm64 base image; on x86_64 the package
# installs to /usr/lib/jvm/java-17-openjdk-amd64)
ENV JAVA_HOME=/usr/lib/jvm/java-17-openjdk-arm64
# Copy Java files to the container
COPY java /opt/flink/java_apps/
# Set the working directory to the Java files
WORKDIR /opt/flink/java_apps
# Build the Java application
RUN ./gradlew app:clean app:build
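# An illustrative way to submit the built artifact once a cluster is running
# (the JAR path and entry class below are assumptions based on a standard
# Gradle layout, not taken from this repository):
#   flink run -c com.example.Main app/build/libs/app.jar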
# Upgrade pip for the newly installed Python
RUN python${PYTHON_VERSION} -m pip install --upgrade pip
# Install the AWS CLI to interact with AWS services when debugging
RUN python${PYTHON_VERSION} -m pip install --no-cache-dir --upgrade awscli
# Set CLASSPATH environment variable
ENV CLASSPATH="${FLINK_LIB_DIR}*"
# Copy the uv binary from the official uv image
COPY --from=ghcr.io/astral-sh/uv:latest /uv /usr/local/bin/uv
# Copy Python files to the container
COPY python /opt/flink/python_apps/
# Install Python dependencies based on the `uv.lock` file
WORKDIR /opt/flink/python_apps
RUN uv sync --frozen
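# `uv sync --frozen` installs exactly the versions pinned in uv.lock into the
# project's virtual environment (it fails rather than re-resolving if the lock
# file is out of date). An illustrative check that the environment works:
# RUN uv run python --version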
# Zip Python files
WORKDIR /opt/flink/python_apps/src/kickstarter
RUN zip -r python_files.zip * -x '__pycache__/*' -x 'helper/__pycache__/*' -x 'model/__pycache__/*'
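# An illustrative PyFlink submission using the bundled sources (the job script
# name below is a placeholder, not taken from this repository):
#   flink run --python <job>.py --pyFiles python_files.zip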
# Start the standalone Flink cluster by default
CMD ["./bin/start-cluster.sh"]
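# Illustrative run command (the image tag is an assumption); port 8081 exposes
# the Flink web UI:
#   docker run -p 8081:8081 flink-kickstarter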