Skip to content

Commit

Permalink
Merge pull request #4 from openaire/bump_spark_version
Browse files: browse the repository at this point in the history
Bump spark version
  • Loading branch information
andremann authored Nov 6, 2024
2 parents 9a36d8c + 01ea0d2 commit f6587fd
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 10 deletions.
18 changes: 8 additions & 10 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,41 +1,39 @@
# NOTE(review): this span is a rendered diff of the Dockerfile without +/- markers, so a
# removed line and the line that replaced it appear back to back (for example the 3.5.1
# and 3.5.3 variants below). As shown it is not a buildable Dockerfile.
# Base image: eclipse-temurin, tag 21.
FROM eclipse-temurin:21

# Install OS-level tools plus Python 3 and pip.
# (Shown twice; presumably the before/after sides of a whitespace-only change -- TODO confirm.)
RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates python3 python3-pip
RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates python3 python3-pip

# Register python3 as the "python" alternative at /usr/bin/python.
RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1

# Spark settings (version, Hadoop build, install location) plus a fixed PYTHONHASHSEED.
# Note: fixing the hash seed is needed when you use Python 3.3 or greater.
ENV SPARK_VERSION=3.5.1 \
ENV SPARK_VERSION=3.5.3 \
HADOOP_VERSION=3 \
SPARK_HOME=/opt/spark \
PYTHONHASHSEED=1


# Download the Spark binary distribution from dlcdn.apache.org, unpack it under /opt/,
# delete the tarball and rename the unpacked directory to /opt/spark.
# NOTE(review): the URL and file names hard-code the version instead of using
# SPARK_VERSION / HADOOP_VERSION, so they must be kept in sync with the ENV above by hand.
RUN wget https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
RUN wget https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz

RUN mkdir -p /opt/
RUN tar -xf spark-3.5.1-bin-hadoop3.tgz -C /opt/
RUN rm spark-3.5.1-bin-hadoop3.tgz
RUN mv /opt/spark-3.5.1-bin-hadoop3 /opt/spark
RUN tar -xf spark-3.5.3-bin-hadoop3.tgz -C /opt/
RUN rm spark-3.5.3-bin-hadoop3.tgz
RUN mv /opt/spark-3.5.3-bin-hadoop3 /opt/spark

# Copy the log4j2 configuration from the build context into Spark's conf directory.
COPY log4j2.properties /opt/spark/conf

# Create user "openaire": UID 1001, home /app, bash login shell, supplementary group sudo.
RUN useradd -d /app -s /bin/bash -G sudo -u 1001 openaire

# /app is the working directory for the remaining instructions.
WORKDIR /app


# Hand /app over to openaire before dropping root.
RUN chown -R openaire /app

# Everything below runs as the unprivileged user.
USER openaire

# Install the jupyter and notebook packages.
# (The same instruction appears again after EXPOSE; presumably one occurrence is the
# removed side of the diff -- TODO confirm which one is live.)
RUN pip install jupyter notebook
# Prepare environment for python
COPY requirements.txt .
# Install the Python dependencies (shown without and with --break-system-packages;
# presumably the flagged variant is the one added by this commit -- TODO confirm).
RUN pip install -r requirements.txt
RUN pip install -r requirements.txt --break-system-packages
# Documents the port used by --port in PYSPARK_DRIVER_PYTHON_OPTS below.
EXPOSE 8889
RUN pip install jupyter notebook
# Append Spark's bin directory and /app/.local/bin to PATH
# (the latter is presumably where openaire's user-level pip installs land -- confirm).
ENV PATH="$PATH:/opt/spark/bin:/app/.local/bin"
# Driver settings: use jupyter as the PySpark driver Python, with options for
# "lab" on 0.0.0.0, port 8889, no browser.
ENV PYSPARK_DRIVER_PYTHON='jupyter'
ENV PYSPARK_DRIVER_PYTHON_OPTS='lab --ip 0.0.0.0 --no-browser --port=8889'
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
jupyter
numpy
pandas
coverage
Expand Down

0 comments on commit f6587fd

Please sign in to comment.