From 6978980fcda921fa77c75109f781b94b61928aeb Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Tue, 21 Jan 2025 15:49:46 -0600 Subject: [PATCH 01/10] fix: update dockerfile and requirements.txt to bring in line with index.commoncrawl.org --- Dockerfile | 21 ++++++++++++++------- requirements.txt | 7 +++++-- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2d0be0f..fe4985a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,18 +1,25 @@ -FROM python:3.9 +# as of Jan 2025, python3 on production is on 3.8.10 +FROM python:3.8.10 RUN apt-get -qq update && apt-get -qqy install awscli +# Create a virtualenv for the app +RUN python3 -m venv /var/venv +ENV PATH="/var/venv/bin:$PATH" + # Install dependencies COPY ./requirements.txt /tmp/requirements.txt -RUN pip install -r /tmp/requirements.txt +RUN pip install -Ur /tmp/requirements.txt # Add the cc-index-server code into the image COPY ./ /opt/webapp/ WORKDIR /opt/webapp -RUN ./install-collections.sh -# Note: to avoid that collections are fetched anew on every image build, -# you may install collections locally on the host in the build directory -# and remove this command +VOLUME /opt/webapp/collections + +ARG INSTALL_COLLECTIONS=true +RUN if [ "$INSTALL_COLLECTIONS" = "true" ]; then \ + ./install-collections.sh; \ + fi -CMD /usr/local/bin/pywb +CMD uwsgi --ini uwsgi.ini diff --git a/requirements.txt b/requirements.txt index 58528bf..3fad909 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,9 +1,12 @@ # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2 git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb boto3 -gevent uwsgi +greenlet==1.1.2 +gevent==20.9.0 +werkzeug==2.0.3 +markupsafe==2.0.1 # AWS CLI (aws s3 cp ...) is used by install-collections.sh # to fetch cluster.idx and metadata.yaml -#awscli +awscli \ No newline at end of file From 93b6ad6074e171a6392a34d29943a06841f25881 Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Fri, 24 Jan 2025 13:59:11 -0600 Subject: [PATCH 02/10] fix: remove data from docker, mount with volume instead, remove awscli from docker and requirements.txt - data doesn't belong directly in docker image --- Dockerfile | 7 ------- README.md | 7 +++---- install-collections.sh | 2 +- requirements.txt | 6 +----- 4 files changed, 5 insertions(+), 17 deletions(-) diff --git a/Dockerfile b/Dockerfile index fe4985a..0b15b9f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,8 +1,6 @@ # as of Jan 2025, python3 on production is on 3.8.10 FROM python:3.8.10 -RUN apt-get -qq update && apt-get -qqy install awscli - # Create a virtualenv for the app RUN python3 -m venv /var/venv ENV PATH="/var/venv/bin:$PATH" @@ -17,9 +15,4 @@ WORKDIR /opt/webapp VOLUME /opt/webapp/collections -ARG INSTALL_COLLECTIONS=true -RUN if [ "$INSTALL_COLLECTIONS" = "true" ]; then \ - ./install-collections.sh; \ - fi - CMD uwsgi --ini uwsgi.ini diff --git a/README.md b/README.md index a5859cc..5b0858b 100644 --- a/README.md +++ b/README.md @@ -28,10 +28,9 @@ If you have docker installed in your system, you can run index server with docke git clone https://github.com/commoncrawl/cc-index-server.git cd cc-index-server docker build . -t cc-index -docker run --rm --publish 8080:8080 -ti cc-index -``` - -You can use `install-collections.sh` to download indexes to your system and mount it on docker. +# optional/one time - big download of data to local collections folder... +./install-collections.sh +docker run --rm -v $PWD/collections/:/opt/webapp/collections/ --publish 8080:8080 -ti cc-index ## CDX Server API diff --git a/install-collections.sh b/install-collections.sh index 7a2bd45..12e3b81 100755 --- a/install-collections.sh +++ b/install-collections.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/bash if [ ! -d "collections" ]; then mkdir collections diff --git a/requirements.txt b/requirements.txt index 3fad909..ba699b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,4 @@ uwsgi greenlet==1.1.2 gevent==20.9.0 werkzeug==2.0.3 -markupsafe==2.0.1 - -# AWS CLI (aws s3 cp ...) is used by install-collections.sh -# to fetch cluster.idx and metadata.yaml -awscli \ No newline at end of file +markupsafe==2.0.1 \ No newline at end of file From 71f7cb443945d4f6002ad7a492606b93a3bcd24f Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Tue, 4 Feb 2025 19:22:50 +0000 Subject: [PATCH 03/10] chore: adding docker ignore for collections folder --- .dockerignore | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 .dockerignore diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..06deb0b --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +collections + From 4749423ba9d468fad50c2db13de427b0ebe7a22f Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Mon, 24 Feb 2025 16:57:18 -0600 Subject: [PATCH 04/10] attempting a python 3.11 upgrade, and move to latest pywb --- Dockerfile | 2 +- requirements.txt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0b15b9f..f1addd4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # as of Jan 2025, python3 on production is on 3.8.10 -FROM python:3.8.10 +FROM python:3.11-bookworm # Create a virtualenv for the app RUN python3 -m venv /var/venv diff --git a/requirements.txt b/requirements.txt index ba699b9..61ee40d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2 -git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb +pywb boto3 uwsgi -greenlet==1.1.2 -gevent==20.9.0 -werkzeug==2.0.3 -markupsafe==2.0.1 \ No newline at end of file +greenlet +gevent +werkzeug +markupsafe \ No newline at end of file From e81f028f2bb0ae898b066207d5ff9abc2a72f5ec Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Mon, 24 Feb 2025 18:15:16 -0600 Subject: [PATCH 05/10] fix: revert python version to 3.8 for now. --- Dockerfile | 2 +- requirements.txt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index f1addd4..0b15b9f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # as of Jan 2025, python3 on production is on 3.8.10 -FROM python:3.11-bookworm +FROM python:3.8.10 # Create a virtualenv for the app RUN python3 -m venv /var/venv diff --git a/requirements.txt b/requirements.txt index 61ee40d..ba699b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2 -pywb +git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb boto3 uwsgi -greenlet -gevent -werkzeug -markupsafe \ No newline at end of file +greenlet==1.1.2 +gevent==20.9.0 +werkzeug==2.0.3 +markupsafe==2.0.1 \ No newline at end of file From b63cc7178b33e77b3d852bd3d4e2eaeaf53a8511 Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Tue, 25 Feb 2025 10:09:19 -0600 Subject: [PATCH 06/10] fix: going back to 3.11, tests are passing. --- Dockerfile | 5 ++++- requirements.txt | 10 +++++----- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0b15b9f..27a55f5 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,13 @@ # as of Jan 2025, python3 on production is on 3.8.10 -FROM python:3.8.10 +FROM python:3.11-bullseye # Create a virtualenv for the app RUN python3 -m venv /var/venv ENV PATH="/var/venv/bin:$PATH" +RUN apt install libpcre3-dev +RUN pip install --upgrade pip setuptools + # Install dependencies COPY ./requirements.txt /tmp/requirements.txt RUN pip install -Ur /tmp/requirements.txt diff --git a/requirements.txt b/requirements.txt index ba699b9..61ee40d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2 -git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb +pywb boto3 uwsgi -greenlet==1.1.2 -gevent==20.9.0 -werkzeug==2.0.3 -markupsafe==2.0.1 \ No newline at end of file +greenlet +gevent +werkzeug +markupsafe \ No newline at end of file From b68a7a9f575aeaef18397d86476e905b2fb244c9 Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Tue, 25 Feb 2025 19:35:23 -0600 Subject: [PATCH 07/10] fix: reverting back to 3.8 due to a few failing tests, and our collinfo template --- Dockerfile | 2 +- requirements.txt | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Dockerfile b/Dockerfile index 27a55f5..350ae1c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,5 +1,5 @@ # as of Jan 2025, python3 on production is on 3.8.10 -FROM python:3.11-bullseye +FROM python:3.8.10 # Create a virtualenv for the app RUN python3 -m venv /var/venv diff --git a/requirements.txt b/requirements.txt index 61ee40d..ba699b9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2 -pywb +git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb boto3 uwsgi -greenlet -gevent -werkzeug -markupsafe \ No newline at end of file +greenlet==1.1.2 +gevent==20.9.0 +werkzeug==2.0.3 +markupsafe==2.0.1 \ No newline at end of file From 360f7006d86809033a10a897c7458777883eea02 Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Wed, 26 Feb 2025 10:56:34 -0600 Subject: [PATCH 08/10] fix: getting this to build on arm --- Dockerfile | 43 +++++++++++++++++++++++++++++++++++++++---- requirements.txt | 6 +++++- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 350ae1c..b45c444 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,15 +1,50 @@ # as of Jan 2025, python3 on production is on 3.8.10 -FROM python:3.8.10 +FROM ubuntu:20.04 + +ENV DEBIAN_FRONTEND=noninteractive + +RUN apt-get update && apt-get install -y \ + build-essential \ + linux-headers-virtual \ + make \ + gcc \ + git \ + curl \ + nano \ + libev-dev \ + libssl-dev \ + libc-dev \ + libffi-dev \ + libpcre3-dev \ + python3.9-full \ + python3.9-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2 && \ + update-alternatives --set python3 /usr/bin/python3.9 + +RUN apt-get update && apt-get install -y \ + python3-venv \ + python3-dev \ + python3-pip \ + python3-setuptools \ + python3-wheel \ + python3-cffi \ + python3-pytest \ + && rm -rf /var/lib/apt/lists/* -# Create a virtualenv for the app RUN python3 -m venv /var/venv ENV PATH="/var/venv/bin:$PATH" -RUN apt install libpcre3-dev -RUN pip install --upgrade pip setuptools +#RUN pip install --upgrade pip setuptools wheel +#RUN pip install "Cython<3" "setuptools<58" # Install dependencies COPY ./requirements.txt /tmp/requirements.txt +#RUN pip download --no-binary :all: --no-deps gevent==20.9.0 +#RUN pip download --no-binary :all: --no-deps gevent==20.9.0 +#RUN pip install gevent-20.9.0.tar.gz RUN pip install -Ur /tmp/requirements.txt # Add the cc-index-server code into the image diff --git a/requirements.txt b/requirements.txt index ba699b9..40e7d3e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,12 @@ +# this is the lowest version that seems to build on newer oses on arm. +# currently, our pywb tries to get 20.9.0, we should update that... +gevent==20.12.0 + # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2 git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb + boto3 uwsgi greenlet==1.1.2 -gevent==20.9.0 werkzeug==2.0.3 markupsafe==2.0.1 \ No newline at end of file From 67b00fe81cd068544a321a58e524c0bc2a0e6a7c Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Wed, 26 Feb 2025 14:36:29 -0600 Subject: [PATCH 09/10] chore: removing venv since docker gives us isolation, and fixing requirements and versions for 3.9 on intel --- Dockerfile | 22 ++++++++++------------ requirements.txt | 3 ++- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index b45c444..488ebde 100644 --- a/Dockerfile +++ b/Dockerfile @@ -18,13 +18,6 @@ RUN apt-get update && apt-get install -y \ libpcre3-dev \ python3.9-full \ python3.9-dev \ - && rm -rf /var/lib/apt/lists/* - -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ - update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2 && \ - update-alternatives --set python3 /usr/bin/python3.9 - -RUN apt-get update && apt-get install -y \ python3-venv \ python3-dev \ python3-pip \ @@ -34,17 +27,22 @@ RUN apt-get update && apt-get install -y \ python3-pytest \ && rm -rf /var/lib/apt/lists/* -RUN python3 -m venv /var/venv -ENV PATH="/var/venv/bin:$PATH" +#RUN python3 -m venv /var/venv +#ENV PATH="/var/venv/bin:$PATH" +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1 && \ + update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2 && \ + update-alternatives --set python3 /usr/bin/python3.9 + +# might come in handy later if we can upgrade gevent further... #RUN pip install --upgrade pip setuptools wheel #RUN pip install "Cython<3" "setuptools<58" - -# Install dependencies -COPY ./requirements.txt /tmp/requirements.txt #RUN pip download --no-binary :all: --no-deps gevent==20.9.0 #RUN pip download --no-binary :all: --no-deps gevent==20.9.0 #RUN pip install gevent-20.9.0.tar.gz + +# Install dependencies +COPY ./requirements.txt /tmp/requirements.txt RUN pip install -Ur /tmp/requirements.txt # Add the cc-index-server code into the image diff --git a/requirements.txt b/requirements.txt index 40e7d3e..e24f2f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,8 +5,9 @@ gevent==20.12.0 # Modified version of PyWB (pywb>=2.5.0), API compatible with PyWB 0.33.2 git+https://github.com/commoncrawl/pywb.git@common-crawl-cdx-index#egg=pywb +cffi boto3 uwsgi greenlet==1.1.2 werkzeug==2.0.3 -markupsafe==2.0.1 \ No newline at end of file +markupsafe==2.0.1 From 47717918c81dcb145ea8f9156ca8135822225305 Mon Sep 17 00:00:00 2001 From: Jason Grey Date: Fri, 28 Feb 2025 17:36:53 -0600 Subject: [PATCH 10/10] doc: make branch clear for pywb --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 5b0858b..1d71039 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,7 @@ This project is a deployment of the [pywb](https://github.com/webrecorder/pywb) web archive replay and index server to provide an index query mechanism for datasets provided by [Common Crawl](https://commoncrawl.org) +We depend on a fork of pywb, [maintained on this branch](https://github.com/commoncrawl/pywb/tree/common-crawl-cdx-index). It is a modified version of PyWB (pywb>=2.5.0), which is API compatible with PyWB 0.33.2. ## Usage & Installation To run locally, please install with `pip install -r requirements.txt`