diff --git a/Dockerfile b/Dockerfile
index 7cf547c..b168d23 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,24 +1,24 @@
 FROM eclipse-temurin:21
 
-RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates python3 python3-pip
+RUN apt-get update && apt-get install -y --no-install-recommends curl vim wget software-properties-common ssh net-tools ca-certificates python3 python3-pip && rm -rf /var/lib/apt/lists/*
 RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1
 
 # Fix the value of PYTHONHASHSEED
 # Note: this is needed when you use Python 3.3 or greater
-ENV SPARK_VERSION=3.5.1 \
+ENV SPARK_VERSION=3.5.3 \
 HADOOP_VERSION=3 \
 SPARK_HOME=/opt/spark \
 PYTHONHASHSEED=1
 
 # Download and uncompress spark from the apache archive
-RUN wget https://dlcdn.apache.org/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
+RUN wget https://dlcdn.apache.org/spark/spark-3.5.3/spark-3.5.3-bin-hadoop3.tgz
 
 RUN mkdir -p /opt/
-RUN tar -xf spark-3.5.1-bin-hadoop3.tgz -C /opt/
-RUN rm spark-3.5.1-bin-hadoop3.tgz
-RUN mv /opt/spark-3.5.1-bin-hadoop3 /opt/spark
+RUN tar -xf spark-3.5.3-bin-hadoop3.tgz -C /opt/
+RUN rm spark-3.5.3-bin-hadoop3.tgz
+RUN mv /opt/spark-3.5.3-bin-hadoop3 /opt/spark
 
 
 COPY log4j2.properties /opt/spark/conf
 
 
@@ -26,16 +26,14 @@
 RUN useradd -d /app -s /bin/bash -G sudo -u 1001 openaire
 
 WORKDIR /app
-
 RUN chown -R openaire /app
 USER openaire
 
-RUN pip install jupyter notebook
+# Prepare environment for python
 COPY requirements.txt .
-RUN pip install -r requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt --break-system-packages
 
 EXPOSE 8889
-RUN pip install jupyter notebook
 ENV PATH="$PATH:/opt/spark/bin:/app/.local/bin"
 ENV PYSPARK_DRIVER_PYTHON='jupyter'
 ENV PYSPARK_DRIVER_PYTHON_OPTS='lab --ip 0.0.0.0 --no-browser --port=8889'
diff --git a/requirements.txt b/requirements.txt
index b770d22..65ad5c5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+jupyter
 numpy
 pandas
 coverage