6 changes: 6 additions & 0 deletions docker-compose.yml
@@ -290,12 +290,14 @@ services:
       - AWS_ACCESS_KEY_ID=${MINIO_ROOT_USER}
       - AWS_SECRET_ACCESS_KEY=${MINIO_ROOT_PASSWORD}
       - AWS_JAVA_V1_DISABLE_DEPRECATION_ANNOUNCEMENT=true
+      - PYSPARK_PYTHON=/opt/python/bin/python
     ports:
       - "8091:8091"
     extra_hosts:
       - "localhost:host-gateway"
     volumes:
       - ./infra/airflow/processing/spark/jobs:/opt/spark/jobs
+      - ./notebooks:/home/jovyan/work
     healthcheck:
       test: ["CMD", "curl", "-f", "http://spark-master:8080"]
       interval: 30s
@@ -328,12 +330,14 @@ services:
       - AWS_ACCESS_KEY_ID=${MINIO_ROOT_USER}
       - AWS_SECRET_ACCESS_KEY=${MINIO_ROOT_PASSWORD}
       - AWS_JAVA_V1_DISABLE_DEPRECATION_ANNOUNCEMENT=true
+      - PYSPARK_PYTHON=/opt/python/bin/python
     ports:
       - "8092:8092"
     extra_hosts:
       - "localhost:host-gateway"
     volumes:
       - ./infra/airflow/processing/spark/jobs:/opt/spark/jobs
+      - ./notebooks:/home/jovyan/work
     healthcheck:
       test: ["CMD", "curl", "-f", "http://spark-master:8080"]
       interval: 30s
@@ -599,6 +603,8 @@ services:
       - TRINO_URL=http://trino:8080
       - SUPERSET_URL=http://superset:8088
       - AIRFLOW_URL=http://airflow-apiserver:8080
+      - PYSPARK_PYTHON=/opt/conda/envs/notebook/bin/python
+      - PYSPARK_DRIVER_PYTHON=/opt/conda/envs/notebook/bin/python
     profiles: [explore]
 
 volumes:
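These compose additions pin the driver-side interpreter for each Spark service and mount the notebooks directory into the workers, so driver and executors resolve the same Python. A quick driver-side check from a notebook in the jupyterlab service; a minimal sketch against the variables added above, not part of the PR:

```python
# Driver-side sanity check, run from a notebook in the jupyterlab service.
# Paths mirror the environment variables added in this compose change.
import os
import sys

print("kernel interpreter:", sys.executable)
print("PYSPARK_PYTHON:", os.environ.get("PYSPARK_PYTHON"))
print("PYSPARK_DRIVER_PYTHON:", os.environ.get("PYSPARK_DRIVER_PYTHON"))

# Resolve symlinks before comparing; `python` is usually a link to python3.12.
driver = os.environ.get("PYSPARK_DRIVER_PYTHON", sys.executable)
print("kernel matches driver:",
      os.path.realpath(sys.executable) == os.path.realpath(driver))
```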
13 changes: 13 additions & 0 deletions infra/jupyterlab/Dockerfile
@@ -8,6 +8,7 @@ RUN apt-get update && apt-get install -y \
     && rm -rf /var/lib/apt/lists/*
 
 ARG SPARK_VERSION=3.5.6
+ARG PYTHON_VERSION=3.12
 RUN set -eux; \
     curl -fSL --connect-timeout 20 --max-time 900 --retry 5 --retry-connrefused \
     -o /tmp/spark.tgz "https://downloads.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz"; \
@@ -17,6 +18,18 @@ RUN set -eux; \
 ENV SPARK_HOME=/opt/spark
 ENV PATH="$SPARK_HOME/bin:$PATH"
 
+SHELL ["bash","-o","pipefail","-c"]
+
+ENV CONDA_ENV_PATH=/opt/conda/envs/notebook
+
+RUN mamba create -y -n notebook python=${PYTHON_VERSION} pip jupyterlab notebook && \
+    mamba clean -afy
+
+ENV PATH="${CONDA_ENV_PATH}/bin:${PATH}"
+ENV CONDA_DEFAULT_ENV=notebook
+ENV PYSPARK_PYTHON="${CONDA_ENV_PATH}/bin/python"
+ENV PYSPARK_DRIVER_PYTHON="${CONDA_ENV_PATH}/bin/python"
+
 USER $NB_UID
 RUN pip install --no-cache-dir \
     trino[sqlalchemy] \
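Because the pinned env's bin directory is prepended to PATH before the later pip layer runs, those pip installs land in the notebook env rather than the base conda env. A quick way to confirm inside the built image; a sketch using a package visible in the pip layer above, not part of the PR:

```python
# Run inside the jupyterlab image: sys.prefix should point at the pinned
# env, and packages from the pip layer should resolve under it.
import sys
import sqlalchemy  # pulled in via trino[sqlalchemy] in this image

print(sys.prefix)            # expect /opt/conda/envs/notebook
print(sqlalchemy.__file__)   # expect a site-packages path under that prefix
```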
22 changes: 21 additions & 1 deletion infra/spark/Dockerfile
@@ -2,7 +2,25 @@ FROM bitnamilegacy/spark:3.5.6

 USER 0
 
-RUN pip install --no-cache-dir pyarrow pandas fastavro confluent-kafka[avro]
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    bzip2 \
+    ca-certificates \
+    && rm -rf /var/lib/apt/lists/*
+
+ARG PYTHON_VERSION=3.12
+ENV MAMBA_ROOT_PREFIX=/opt/micromamba
+
+RUN curl -Ls "https://micro.mamba.pm/api/micromamba/linux-64/latest" | tar -xj -C /tmp && \
+    install -m 0755 /tmp/bin/micromamba /usr/local/bin/micromamba && \
+    rm -rf /tmp/bin
+
+RUN micromamba create -y -p /opt/python python=${PYTHON_VERSION} pip && \
+    micromamba clean -afy
+
+ENV PATH="/opt/python/bin:${PATH}"
+
+RUN /opt/python/bin/pip install --no-cache-dir pyarrow pandas fastavro confluent-kafka[avro]
+
 RUN set -eux; \
     mkdir -p "${SPARK_HOME}/jars"; \
Expand All @@ -21,5 +39,7 @@ RUN chown -R 1001:1001 ${SPARK_HOME}/conf

 ENV SPARK_JARS_DIR="${SPARK_HOME}/jars"
 ENV PYTHONPATH="/opt/spark/jobs:${PYTHONPATH}"
+ENV PYSPARK_PYTHON="/opt/python/bin/python"
+ENV PYSPARK_DRIVER_PYTHON="/opt/python/bin/python"
 
 USER 1001
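With both images pinned to Python 3.12 (micromamba-built /opt/python on the workers, the conda notebook env on the driver), the match can be verified end to end. A minimal sketch, assuming the conventional master URL spark://spark-master:7077 (the master port is not shown in this diff) and an illustrative app name:

```python
# End-to-end check that driver and executor interpreters agree on
# the same minor Python version.
import sys
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("spark://spark-master:7077")  # assumed default master port
    .appName("python-version-check")      # illustrative name
    .getOrCreate()
)

driver = sys.version_info[:2]
# Run a single task on an executor and read its interpreter version.
executor = (
    spark.sparkContext
    .parallelize([0], numSlices=1)
    .map(lambda _: __import__("sys").version_info[:2])
    .first()
)
print(f"driver={driver} executor={executor}")
assert driver == executor, "PYSPARK_PYTHON mismatch between driver and executors"
spark.stop()
```

Before this change, the mismatch surfaced as PySpark's "Python in worker has different version than that in driver" error; pinning both sides to the same interpreter path removes that failure mode.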