
Commit

⬆️ Upgrade Jupyter image (#76)
Signed-off-by: Jacob Woffenden <jacob.woffenden@digital.justice.gov.uk>
jacobwoffenden authored Jan 29, 2024
1 parent 6c69339 commit 26a21c0
Showing 53 changed files with 279 additions and 828 deletions.
62 changes: 62 additions & 0 deletions .github/workflows/build-and-test.yml
@@ -0,0 +1,62 @@
---
name: Test and Build

on:
  pull_request:
    branches:
      - main

permissions: {} # yamllint disable-line

jobs:
  yamllint:
    name: YAML Lint
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - name: Checkout
        id: checkout
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Run yamllint
        id: run_yamllint
        uses: actionshub/yamllint@b772a30c3ba90c5f5aadfe94d8f3599e3a7099c8 # v1.8.2

  markdownlint:
    name: Markdown Lint
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - name: Checkout
        id: checkout
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Run mdl
        id: run_mdl
        uses: actionshub/markdownlint@6c82ff529253530dfbf75c37570876c52692835f # v3.1.4

  build-and-test:
    if: github.ref != 'main'
    name: Build and Test
    runs-on: ubuntu-latest
    permissions:
      contents: read
    strategy:
      fail-fast: false
      max-parallel: 3
      matrix:
        flavour:
          - "allspark-notebook"
          - "datascience-notebook"
    steps:
      - name: Checkout
        id: checkout
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Build and Test
        id: build_and_test
        shell: bash
        run: |
          bash scripts/build-and-test.sh "${{ matrix.flavour }}"
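
The build-and-test job hands off to scripts/build-and-test.sh, which is not included in this diff. A minimal sketch of what such a script might do, assuming it builds the flavour's Dockerfile and runs a basic smoke test (the image tag and the test command below are illustrative assumptions, not the repository's actual script):

#!/usr/bin/env bash
# Hypothetical sketch of scripts/build-and-test.sh -- the real script is not shown in this commit.
set -euo pipefail

FLAVOUR="${1:?usage: build-and-test.sh <flavour>}"  # e.g. allspark-notebook or datascience-notebook

# Build the image from the flavour's directory, matching the layout used by publish.yml
docker build --file "${FLAVOUR}/Dockerfile" --tag "${FLAVOUR}:test" "${FLAVOUR}"

# Smoke test: confirm JupyterLab is importable inside the built image (assumed check)
docker run --rm "${FLAVOUR}:test" python -c "import jupyterlab; print(jupyterlab.__version__)"
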
108 changes: 0 additions & 108 deletions .github/workflows/jupyter-lab-test-and-build.yml

This file was deleted.

46 changes: 46 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,46 @@
---
name: Publish

on:
  push:
    tags:
      - "v*"

permissions: {} # yamllint disable-line

jobs:
  publish:
    name: Publish
    runs-on: ubuntu-latest
    permissions:
      contents: read
      id-token: write
      packages: write
    strategy:
      fail-fast: false
      max-parallel: 3
      matrix:
        flavour:
          - "allspark-notebook"
          - "datascience-notebook"
    steps:
      - name: Checkout
        id: checkout
        uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1

      - name: Log in to GitHub Container Registry
        id: login_ghcr
        uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and Push
        id: build_and_push
        uses: docker/build-push-action@4a13e500e55cf31b7a5d59a38ab2040ab0f42f56 # v5.1.0
        with:
          context: ${{ matrix.flavour }}
          file: ${{ matrix.flavour }}/Dockerfile
          push: true
          tags: ghcr.io/ministryofjustice/analytical-platform-${{ matrix.flavour }}:${{ github.ref_name }}
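
Once a v* tag is pushed, the workflow above publishes each flavour to GitHub Container Registry under ghcr.io/ministryofjustice/analytical-platform-<flavour>:<tag>. A usage example for a consumer of the published image, assuming a hypothetical v1.0.0 release tag:

# Pull and run a published image locally; the v1.0.0 tag is an assumed example, not a real release
docker pull ghcr.io/ministryofjustice/analytical-platform-datascience-notebook:v1.0.0
docker run --rm -p 8888:8888 ghcr.io/ministryofjustice/analytical-platform-datascience-notebook:v1.0.0
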
80 changes: 29 additions & 51 deletions allspark-notebook/Dockerfile
@@ -1,57 +1,35 @@
-FROM jupyter/all-spark-notebook:spark-3.1.1@sha256:b73dad39ad5c469a92764e38d7cc4321040d3fedddcad7fcebc4ddc7f9c15ff2
+# lab-4.0.11
+FROM quay.io/jupyter/all-spark-notebook@sha256:a63b0faed54bc21d17a4691d8fae177dd95236e0adddbd9d43ee448dc2d5ba1e

-LABEL maintainer=analytics-platform-tech@digital.justice.gov.uk
+LABEL org.opencontainers.image.vendor="Ministry of Justice" \
+      org.opencontainers.image.authors="Analytical Platform" \
+      org.opencontainers.image.title="Jupyter All Spark Notebook" \
+      maintainer="analytics-platform-tech@digital.justice.gov.uk"

-ENV PATH=$PATH:$HOME/.local/bin
+ENV PATH="${PATH}:${HOME}/.local/bin" \
+    CHOWN_HOME="no" \
+    PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.12.134,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"

-# Home directory contents is already owned by UID 1000
-ENV CHOWN_HOME=no
-
-# NB these are sensible defaults but may need to be changed programatically for
-# non local spark (ie. EMR etc.)
-ENV PYSPARK_SUBMIT_ARGS="--packages com.amazonaws:aws-java-sdk:1.12.134,org.apache.hadoop:hadoop-aws:3.0.1 pyspark-shell"
-
 # Container must be run as root to use NB_UID
 USER root

-# Install OS pacakges
-#
-# The reason we have installed these has been lost. Including just in case.
-#
-# - gdal-bin
-# - libspatialindex-dev
-# - openssh-client
-#
-RUN apt-get update && \
-    apt-get install -y \
-    gdal-bin \
-    libspatialindex-dev \
-    openssh-client && \
-    rm -rf /var/lib/apt/lists/*
-
-# I'm not sure this has any effect
+RUN apt-get update --yes \
+    && apt-get install --yes \
+        gdal-bin \
+        libspatialindex-dev \
+        openssh-client \
+    && apt-get clean --yes \
+    && rm -rf /var/lib/apt/lists/* \
+    && pip install --no-cache-dir --upgrade \
+        pip \
+        boto3 \
+        nbstripout \
+        s3fs==2023.12.2 \
+        dataengineeringutils3==1.4.3 \
+        etl-manager==7.6.0 \
+    && conda install --yes \
+        nbstripout \
+    && nbstripout --install --system \
+    && update-alternatives --set editor /bin/nano-tiny

+COPY files/add-user-to-group.sh /usr/local/bin/before-notebook.d/add-user-to-group.sh
 COPY files/hdfs-site.xml /usr/local/spark/conf/hdfs-site.xml

-# add-user-to-group.sh adds the $NB_USER to group 50 (staff) used by RStudio
-COPY files/add-user-to-group.sh /usr/local/bin/before-notebook.d/
-
-# Install python packages
-# - pip - python package manager
-# - boto3 - python AWS library
-# - nbstripout - tool for stripping sensitive data out of notebooks
-#
-RUN pip install --upgrade \
-    pip \
-    boto3 \
-    nbstripout \
-    "s3fs<=0.4" \
-    dataengineeringutils3==1.3.0 \
-    etl-manager==7.4.0
-
-RUN conda install --yes \
-    'nbstripout'
-
-RUN nbstripout --install --system
-
-# Vi just doesn't cut it for some people
-RUN update-alternatives --set editor /bin/nano-tiny
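
The Dockerfile keeps USER root so the Jupyter docker-stacks start-up scripts can honour NB_UID, and sets CHOWN_HOME to "no" because the home directory is already owned by UID 1000. A sketch of running the image with a custom UID, assuming a local allspark-notebook:test tag produced by the build script sketched earlier (the tag and UID are illustrative assumptions):

# Run as root so the start-up hooks can switch to the requested NB_UID; tag is an assumed local build
docker run --rm --user root -e NB_UID=1001 -p 8888:8888 allspark-notebook:test
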
4 changes: 0 additions & 4 deletions allspark-notebook/Dockerfile.tests

This file was deleted.

31 changes: 0 additions & 31 deletions allspark-notebook/Makefile

This file was deleted.

32 changes: 0 additions & 32 deletions allspark-notebook/docker-compose.yml

This file was deleted.

7 changes: 0 additions & 7 deletions allspark-notebook/files/pyspark-s3.py

This file was deleted.

