Added the submission

Coda-Research-Group · Jul 31, 2024 · fdd13da · fdd13da
1 parent 61d8cff
commit fdd13da
Show file tree

Hide file tree

Showing 9 changed files with 1,575 additions and 0 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,87 @@
+name: CI
+
+on:
+  push:
+    # Sequence of patterns matched against refs/heads
+    branches:
+      # Push events on the challenge submission branch
+      - paper-sisap24-indexing-challenge
+    # Sequence of patterns matched against refs/tags
+    tags: "*"
+
+jobs:
+  test:
+    name: ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
+    runs-on: ${{ matrix.os }}
+    env:
+      # Dataset size used by both the download and the benchmark steps.
+      # Declared at job level because step-level `env` is not visible to
+      # later steps.
+      DBSIZE: 300K
+    strategy:
+      fail-fast: false
+      matrix:
+        version:
+          - "1.10"
+        os:
+          - ubuntu-latest
+        arch:
+          - x64
+        python-version: ["3.11"]
+    steps:
+      - uses: actions/checkout@v4.1.7
+        with:
+          submodules: "true"
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies for downloading the dataset
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y curl libcurl4-openssl-dev
+      - name: Download database and queries
+        run: |
+          mkdir data2024
+          cd data2024
+          # --fail aborts the step on an HTTP error instead of saving the
+          # error page under the .h5 file name.
+          curl --fail -L -O https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=$DBSIZE.h5
+          curl --fail -L -O http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5
+          curl --fail -L -O http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=$DBSIZE--public-queries-2024-laion2B-en-clip768v2-n=10k.h5
+          cd ..
+      - uses: conda-incubator/setup-miniconda@v3
+        with:
+          auto-update-conda: true
+          python-version: ${{ matrix.python-version }}
+      - name: Install LMI dependencies
+        shell: bash -el {0}
+        run: |
+          conda create -n lmi -y python=3.11
+          conda activate lmi
+          conda install -c pytorch -y faiss-cpu=1.8.0
+          conda install -y h5py=3.11.0
+          pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1
+          pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
+      - name: Benchmark LMI
+        shell: bash -el {0}
+        run: |
+          pwd
+          ls -l
+          ls -l data2024
+          conda activate lmi
+          # Parameters for 300K:
+          python3 task1.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task1.log
+          python3 task2.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task2.log
+          python3 task3.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task3.log
+          # Parameters for 100M:
+          # python3 task1.py &>task1.log
+          # python3 task2.py &>task2.log
+          # python3 task3.py &>task3.log
+          python3 eval.py --results result res.csv
+      - uses: actions/upload-artifact@v4.3.4
+        # Upload even when the benchmark fails: all task output is redirected
+        # into the log files, so they are the only record of the failure.
+        if: always()
+        with:
+          name: Results
+          path: |
+            res.csv
+            task1.log
+            task2.log
+            task3.log
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,16 @@
+FROM continuumio/miniconda3:24.5.0-0
+
+# Build the `lmi` conda environment with pinned dependencies in a single layer.
+# Every conda command passes `-y` because the build has no terminal to answer
+# confirmation prompts. The last line makes shells that source /root/.bashrc
+# activate the environment.
+RUN conda init bash && \
+    . /root/.bashrc && \
+    conda create -n lmi -y python=3.11 && \
+    conda activate lmi && \
+    conda install -c pytorch -y faiss-cpu=1.8.0 && \
+    conda install -y h5py=3.11.0 && \
+    pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1 && \
+    pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu && \
+    echo 'conda activate lmi' >> /root/.bashrc
+
+WORKDIR /app
+COPY . .
+
+# `-l` starts bash as a login shell and `-c` runs the container command through it.
+# NOTE(review): the activation above relies on the login startup files sourcing
+# /root/.bashrc in this base image - confirm.
+ENTRYPOINT ["/bin/bash", "-l", "-c" ]
diff --git a/README.md b/README.md
@@ -0,0 +1,72 @@
+# SISAP 2024 Indexing Challenge
+
+This branch contains the code for our submission to the SISAP 2024 Indexing Challenge.
+
+## Setup
+
+See also `.github/workflows/ci.yml`. Note the different parameters for 300K and 100M datasets when running the experiments.
+
+### Using Docker
+
+```shell
+docker build -t sisap24 -f Dockerfile .
+docker run -it --rm sisap24 bash
+```
+
+### Using Conda
+
+```shell
+conda create -n lmi -y python=3.11
+conda activate lmi
+conda install -c pytorch -y faiss-cpu=1.8.0
+conda install -y h5py=3.11.0
+pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1
+pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
+```
+
+## Running Experiments
+
+### 300K dataset
+
+```shell
+DBSIZE=300K
+
+# Download data
+mkdir data2024 && cd data2024
+wget https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=${DBSIZE}.h5
+wget http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5
+wget http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=${DBSIZE}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5
+cd ..
+
+# Run experiments on 300K dataset
+python3 task1.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task1.log
+python3 task2.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task2.log
+python3 task3.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task3.log
+```
+
+### 100M dataset
+
+```shell
+DBSIZE=100M
+
+# Download data
+mkdir data2024 && cd data2024
+wget https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=${DBSIZE}.h5
+wget http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5
+wget http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=${DBSIZE}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5
+cd ..
+
+# Run experiments on 100M dataset
+python3 task1.py &>task1.log
+python3 task2.py &>task2.log
+python3 task3.py &>task3.log
+```
+
+### Evaluating Results
+
+```shell
+# Calculate recall
+python3 eval.py --results result res.csv
+
+# Show the results
+cat res.csv
+```
diff --git a/eval.py b/eval.py
@@ -0,0 +1,98 @@
+# Adapted from https://github.com/sisap-challenges/sisap23-laion-challenge-evaluation/blob/0a6f90debe73365abee210d3950efc07223c846d/eval.py
+
+import argparse
+import csv
+import glob
+import os
+from pathlib import Path
+from typing import Generator
+
+import numpy as np
+
+os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'  # Solves: Errno 121
+
+import h5py
+
+
+def get_groundtruth(size: str = '300K'):
+    out_fn = Path(f'data2024/gold-standard-dbsize={size}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5')
+    gt_f = h5py.File(out_fn, 'r')
+    true_I = np.array(gt_f['knns'])
+    gt_f.close()
+    return true_I
+
+
+def get_all_results(dirname: str) -> Generator[h5py.File, None, None]:
+    mask = [dirname + '/*/*/*.h5', dirname + '/*/result/*/*/*.h5', dirname + '/*/result/*/*/*/*.h5']
+    print('search for results matching:')
+    print('\n'.join(mask))
+    for m in mask:
+        for fn in glob.iglob(m):
+            print(fn)
+            f = h5py.File(fn, 'r')
+            if 'knns' not in f or not ('size' in f or 'size' in f.attrs):
+                print('Ignoring ' + fn)
+                f.close()
+                continue
+            yield f
+            f.close()
+
+
+def get_recall(I, gt, k: int) -> float:
+    assert k <= I.shape[1]
+    assert len(I) == len(gt)
+
+    n = len(I)
+    recall = 0
+    for i in range(n):
+        recall += len(set(I[i, :k]) & set(gt[i, :k]))
+    return recall / (n * k)
+
+
+def return_h5_str(f, param):
+    if param not in f:
+        return 0
+    x = f[param][()]
+    if type(x) == np.bytes_:
+        return x.decode()
+    return x
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--results', help='directory in which results are stored', default='results')
+    parser.add_argument('csvfile')
+    args = parser.parse_args()
+    true_I_cache = {}  # noqa: N816
+    test_sizes = ['300K', '10M', '100M']
+
+    columns = [
+        'size',
+        'algo',
+        'modelingtime',
+        'encdatabasetime',
+        'encqueriestime',
+        'buildtime',
+        'querytime',
+        'params',
+        'recall',
+    ]
+
+    with Path.open(args.csvfile, 'w', newline='') as csvfile:
+        writer = csv.DictWriter(csvfile, fieldnames=columns)
+        writer.writeheader()
+        for res in get_all_results(args.results):
+            try:
+                size = res.attrs['size']
+                d = dict(res.attrs)
+            except:
+                size = res['size'][()].decode()
+                d = {k: return_h5_str(res, k) for k in columns}
+            if size not in test_sizes:
+                continue
+            if size not in true_I_cache:
+                true_I_cache[size] = get_groundtruth(size)
+            recall = get_recall(np.array(res['knns']), true_I_cache[size], 30)
+            d['recall'] = recall
+            print(d['algo'], d['params'], '=>', recall)
+            writer.writerow(d)
diff --git a/pyproject.toml b/pyproject.toml
@@ -0,0 +1,95 @@
+# Ruff configuration: formatting and lint rule selection for this repository.
+[tool.ruff]
+line-length = 140
+extend-include = ["*.ipynb"]
+
+[tool.ruff.format]
+quote-style = "single"
+
+[tool.ruff.lint]
+# Rule families that are enabled.
+select = [
+    "F",     # Pyflakes
+    "E",     # pycodestyle - error
+    "W",     # pycodestyle - warning
+    "C90",   # mccabe
+    "I",     # isort
+    "N",     # pep8-naming
+    "D",     # pydocstyle
+    "UP",    # pyupgrade
+    "YTT",   # flake8-2020
+    "ANN",   # flake8-annotations
+    "ASYNC", # flake8-async
+    "TRIO",  # flake8-trio
+    "S",     # flake8-bandit
+    "BLE",   # flake8-blind-except
+    "FBT",   # flake8-boolean-trap
+    "B",     # flake8-bugbear
+    "A",     # flake8-builtins
+    "COM",   # flake8-commas
+    "CPY",   # flake8-copyright
+    "C4",    # flake8-comprehensions
+    "DTZ",   # flake8-datetimez
+    "T10",   # flake8-debugger
+    "DJ",    # flake8-django
+    "EM",    # flake8-errmsg
+    "EXE",   # flake8-executable
+    "FA",    # flake8-future-annotations
+    "ISC",   # flake8-implicit-str-concat
+    "ICN",   # flake8-import-conventions
+    "G",     # flake8-logging-format
+    "INP",   # flake8-no-pep420
+    "PIE",   # flake8-pie
+    "T20",   # flake8-print
+    "PYI",   # flake8-pyi
+    "PT",    # flake8-pytest-style
+    "Q",     # flake8-quotes
+    "RSE",   # flake8-raise
+    "RET",   # flake8-return
+    "SLF",   # flake8-self
+    "SLOT",  # flake8-slots
+    "SIM",   # flake8-simplify
+    "TID",   # flake8-tidy-imports
+    "TCH",   # flake8-type-checking
+    "INT",   # flake8-gettext
+    "ARG",   # flake8-unused-arguments
+    "PTH",   # flake8-use-pathlib
+    "TD",    # flake8-todos
+    "FIX",   # flake8-fixme
+    "ERA",   # eradicate
+    "PD",    # pandas-vet
+    "PGH",   # pygrep-hooks
+    "PL",    # Pylint
+    "TRY",   # tryceratops
+    "FLY",   # flynt
+    "NPY",   # NumPy-specific rules
+    "AIR",   # Airflow
+    "PERF",  # Perflint
+    "FURB",  # refurb
+    "LOG",   # flake8-logging
+    "RUF",   # Ruff-specific rules
+]
+# Individual rules switched off, sorted by rule code.
+# https://docs.astral.sh/ruff/rules/
+ignore = [
+    "ANN101", # Missing type annotation for {name} in method
+    "ANN204", # Missing return type annotation for special method {name}
+    "D100",   # Missing docstring in public module
+    "D101",   # Missing docstring in public class
+    "D102",   # Missing docstring in public method
+    "D103",   # Missing docstring in public function
+    "D105",   # Missing docstring in magic method
+    "D107",   # Missing docstring in __init__
+    "E741",   # Ambiguous variable name: {name}
+    "ERA001", # Found commented-out code
+    "EXE002", # The file is executable but no shebang is present
+    "FIX002", # Line contains TODO, consider resolving the issue
+    "INP001", # File {filename} is part of an implicit namespace package. Add an __init__.py.
+    "N803",   # Argument name {name} should be lowercase
+    "N806",   # Variable {name} in function should be lowercase
+    "PGH003", # Use specific rule codes when ignoring type issues
+    "Q000",   # Single quotes found but double quotes preferred
+    "S101",   # Use of assert detected
+    "T201",   # print found
+    "TD003",  # Missing issue link on the line following this TODO
+]
+
+[tool.ruff.lint.pylint]
+max-args = 7