Commit fdd13da: Added the submission

ProchazkaDavid committed Jul 31, 2024 (1 parent 61d8cff, commit fdd13da)
Showing 9 changed files with 1,575 additions and 0 deletions.
87 changes: 87 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,87 @@
name: CI

on:
  push:
    # Sequence of patterns matched against refs/heads
    branches:
      # Push events on the submission branch
      - paper-sisap24-indexing-challenge
    # Sequence of patterns matched against refs/tags
    tags: "*"

jobs:
  test:
    name: ${{ matrix.python-version }} - ${{ matrix.os }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    # DBSIZE is used by both the download and the benchmark steps,
    # so it is defined once at the job level.
    env:
      DBSIZE: 300K
    strategy:
      fail-fast: false
      matrix:
        os:
          - ubuntu-latest
        python-version: ["3.11"]
    steps:
      - uses: actions/checkout@v4.1.7
        with:
          submodules: "true"
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies for downloading the dataset
        run: |
          sudo apt-get install curl libcurl4-openssl-dev
      - name: Download database and queries
        run: |
          mkdir data2024
          cd data2024
          curl -O https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=$DBSIZE.h5
          curl -O http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5
          curl -O http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=$DBSIZE--public-queries-2024-laion2B-en-clip768v2-n=10k.h5
          cd ..
      - uses: conda-incubator/setup-miniconda@v3
        with:
          auto-update-conda: true
          python-version: ${{ matrix.python-version }}
      - name: Install LMI dependencies
        shell: bash -el {0}
        run: |
          conda create -n lmi -y python=3.11
          conda activate lmi
          conda install -c pytorch -y faiss-cpu=1.8.0
          conda install -y h5py=3.11.0
          pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1
          pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
      - name: Benchmark LMI
        shell: bash -el {0}
        run: |
          pwd
          ls -l
          ls -l data2024
          conda activate lmi
          # Parameters for 300K:
          python3 task1.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task1.log
          python3 task2.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task2.log
          python3 task3.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task3.log
          # Parameters for 100M:
          # python3 task1.py &>task1.log
          # python3 task2.py &>task2.log
          # python3 task3.py &>task3.log
          python3 eval.py --results result res.csv
      - uses: actions/upload-artifact@v4.3.4
        with:
          name: Results
          path: |
            res.csv
            task1.log
            task2.log
            task3.log
16 changes: 16 additions & 0 deletions Dockerfile
@@ -0,0 +1,16 @@
FROM continuumio/miniconda3:24.5.0-0

RUN conda init bash && \
    . /root/.bashrc && \
    conda create -n lmi -y python=3.11 && \
    conda activate lmi && \
    conda install -c pytorch -y faiss-cpu=1.8.0 && \
    conda install -y h5py=3.11.0 && \
    pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1 && \
    pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu && \
    echo 'conda activate lmi' >> /root/.bashrc

WORKDIR /app
COPY . .

ENTRYPOINT ["/bin/bash", "-l", "-c"]
72 changes: 72 additions & 0 deletions README.md
@@ -0,0 +1,72 @@
# SISAP 2024 Indexing Challenge

This branch contains the code for our submission to the SISAP 2024 Indexing Challenge.

## Setup

See also `.github/workflows/ci.yml`. Note that the 300K and 100M datasets use different parameters when running the experiments.

### Using Docker

```shell
docker build -t sisap24 -f Dockerfile .
docker run -it --rm sisap24 bash
```

### Using Conda

```shell
conda create -n lmi -y python=3.11
conda activate lmi
conda install -c pytorch -y faiss-cpu=1.8.0
conda install -y h5py=3.11.0
pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1
pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
```
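
As a quick, optional sanity check that the pinned packages are importable (a minimal sketch; the printed versions should match the pins above):

```python
# Verify the lmi environment: every pinned dependency should import cleanly.
import faiss
import h5py
import numpy
import sklearn
import torch

print(faiss.__version__, h5py.__version__, numpy.__version__, sklearn.__version__, torch.__version__)
```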

## Running Experiments

### 300K dataset

```shell
DBSIZE=300K

# Download data
mkdir data2024 && cd data2024
wget https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=${DBSIZE}.h5
wget http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5
wget http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=${DBSIZE}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5
cd ..

# Run experiments on 300K dataset
python3 task1.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task1.log
python3 task2.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task2.log
python3 task3.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task3.log
```

### 100M dataset

```shell
DBSIZE=100M

# Download data
mkdir data2024 && cd data2024
wget https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=${DBSIZE}.h5
wget http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5
wget http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=${DBSIZE}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5
cd ..

# Run experiments on 100M dataset
python3 task1.py &>task1.log
python3 task2.py &>task2.log
python3 task3.py &>task3.log
```

### Evaluating Results

```shell
# Calculate recall
python3 eval.py --results result res.csv

# Show the results
cat res.csv
```
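
The resulting `res.csv` contains one row per result file, with the columns `size`, `algo`, `modelingtime`, `encdatabasetime`, `encqueriestime`, `buildtime`, `querytime`, `params`, and `recall`; the recall column is computed by `eval.py` as recall@30 against the downloaded ground truth.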
98 changes: 98 additions & 0 deletions eval.py
@@ -0,0 +1,98 @@
# Adapted from https://github.com/sisap-challenges/sisap23-laion-challenge-evaluation/blob/0a6f90debe73365abee210d3950efc07223c846d/eval.py

import argparse
import csv
import glob
import os
from pathlib import Path
from typing import Generator

import numpy as np

os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE'  # Solves: Errno 121

import h5py


def get_groundtruth(size: str = '300K'):
    """Load the ground-truth nearest neighbors for the given database size."""
    out_fn = Path(f'data2024/gold-standard-dbsize={size}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5')
    gt_f = h5py.File(out_fn, 'r')
    true_I = np.array(gt_f['knns'])
    gt_f.close()
    return true_I


def get_all_results(dirname: str) -> Generator[h5py.File, None, None]:
    """Yield every valid result file found under the given directory."""
    mask = [dirname + '/*/*/*.h5', dirname + '/*/result/*/*/*.h5', dirname + '/*/result/*/*/*/*.h5']
    print('search for results matching:')
    print('\n'.join(mask))
    for m in mask:
        for fn in glob.iglob(m):
            print(fn)
            f = h5py.File(fn, 'r')
            if 'knns' not in f or not ('size' in f or 'size' in f.attrs):
                print('Ignoring ' + fn)
                f.close()
                continue
            yield f
            f.close()


def get_recall(I, gt, k: int) -> float:
    """Compute recall@k: the mean overlap between retrieved and true neighbors."""
    assert k <= I.shape[1]
    assert len(I) == len(gt)

    n = len(I)
    recall = 0
    for i in range(n):
        recall += len(set(I[i, :k]) & set(gt[i, :k]))
    return recall / (n * k)


def return_h5_str(f, param):
    """Return a dataset value from an HDF5 file, decoding byte strings."""
    if param not in f:
        return 0
    x = f[param][()]
    if isinstance(x, np.bytes_):
        return x.decode()
    return x


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--results', help='directory in which results are stored', default='results')
    parser.add_argument('csvfile')
    args = parser.parse_args()
    true_I_cache = {}  # noqa: N816
    test_sizes = ['300K', '10M', '100M']

    columns = [
        'size',
        'algo',
        'modelingtime',
        'encdatabasetime',
        'encqueriestime',
        'buildtime',
        'querytime',
        'params',
        'recall',
    ]

    with Path(args.csvfile).open('w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=columns)
        writer.writeheader()
        for res in get_all_results(args.results):
            try:
                # Newer result files store metadata in HDF5 attributes.
                size = res.attrs['size']
                d = dict(res.attrs)
            except KeyError:
                # Older result files store metadata as datasets.
                size = res['size'][()].decode()
                d = {k: return_h5_str(res, k) for k in columns}
            if size not in test_sizes:
                continue
            if size not in true_I_cache:
                true_I_cache[size] = get_groundtruth(size)
            recall = get_recall(np.array(res['knns']), true_I_cache[size], 30)
            d['recall'] = recall
            print(d['algo'], d['params'], '=>', recall)
            writer.writerow(d)
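
For intuition about the recall computation above, a minimal sketch on hypothetical toy data (the arrays below are illustrative, not challenge data; assumes `eval.py` is importable from the working directory):

```python
import numpy as np

from eval import get_recall  # assumes eval.py is on the Python path

# Two queries, k=3: each row holds one query's neighbor IDs.
I = np.array([[1, 2, 3], [4, 5, 6]])   # retrieved neighbors
gt = np.array([[1, 2, 9], [4, 7, 8]])  # ground-truth neighbors

# Query 0 overlaps in {1, 2} (2 hits), query 1 in {4} (1 hit),
# so recall@3 = (2 + 1) / (2 * 3) = 0.5.
print(get_recall(I, gt, k=3))  # 0.5
```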
95 changes: 95 additions & 0 deletions pyproject.toml
@@ -0,0 +1,95 @@
[tool.ruff]
line-length = 140
extend-include = ["*.ipynb"]

[tool.ruff.format]
quote-style = "single"

[tool.ruff.lint]
ignore = [
    # https://docs.astral.sh/ruff/rules/
    "T201",   # print found
    "Q000",   # Single quotes found but double quotes preferred
    "ERA001", # Found commented-out code
    "PGH003", # Use specific rule codes when ignoring type issues
    "D100",   # Missing docstring in public module
    "S101",   # Use of assert detected
    "N806",   # Variable {name} in function should be lowercase
    "EXE002", # The file is executable but no shebang is present
    "INP001", # File {filename} is part of an implicit namespace package. Add an __init__.py.
    "D101",   # Missing docstring in public class
    "D102",   # Missing docstring in public method
    "D103",   # Missing docstring in public function
    "ANN101", # Missing type annotation for {name} in method
    "FIX002", # Line contains TODO, consider resolving the issue
    "TD003",  # Missing issue link on the line following this TODO
    "D107",   # Missing docstring in __init__
    "N803",   # Argument name {name} should be lowercase
    "ANN204", # Missing return type annotation for special method {name}
    "D105",   # Missing docstring in magic method
    "E741",   # Ambiguous variable name: {name}
]
select = [
    "F",     # Pyflakes
    "E",     # pycodestyle - error
    "W",     # pycodestyle - warning
    "C90",   # mccabe
    "I",     # isort
    "N",     # pep8-naming
    "D",     # pydocstyle
    "UP",    # pyupgrade
    "YTT",   # flake8-2020
    "ANN",   # flake8-annotations
    "ASYNC", # flake8-async
    "TRIO",  # flake8-trio
    "S",     # flake8-bandit
    "BLE",   # flake8-blind-except
    "FBT",   # flake8-boolean-trap
    "B",     # flake8-bugbear
    "A",     # flake8-builtins
    "COM",   # flake8-commas
    "CPY",   # flake8-copyright
    "C4",    # flake8-comprehensions
    "DTZ",   # flake8-datetimez
    "T10",   # flake8-debugger
    "DJ",    # flake8-django
    "EM",    # flake8-errmsg
    "EXE",   # flake8-executable
    "FA",    # flake8-future-annotations
    "ISC",   # flake8-implicit-str-concat
    "ICN",   # flake8-import-conventions
    "G",     # flake8-logging-format
    "INP",   # flake8-no-pep420
    "PIE",   # flake8-pie
    "T20",   # flake8-print
    "PYI",   # flake8-pyi
    "PT",    # flake8-pytest-style
    "Q",     # flake8-quotes
    "RSE",   # flake8-raise
    "RET",   # flake8-return
    "SLF",   # flake8-self
    "SLOT",  # flake8-slots
    "SIM",   # flake8-simplify
    "TID",   # flake8-tidy-imports
    "TCH",   # flake8-type-checking
    "INT",   # flake8-gettext
    "ARG",   # flake8-unused-arguments
    "PTH",   # flake8-use-pathlib
    "TD",    # flake8-todos
    "FIX",   # flake8-fixme
    "ERA",   # eradicate
    "PD",    # pandas-vet
    "PGH",   # pygrep-hooks
    "PL",    # Pylint
    "TRY",   # tryceratops
    "FLY",   # flynt
    "NPY",   # NumPy-specific rules
    "AIR",   # Airflow
    "PERF",  # Perflint
    "FURB",  # refurb
    "LOG",   # flake8-logging
    "RUF",   # Ruff-specific rules
]

[tool.ruff.lint.pylint]
max-args = 7
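
Because this configuration lives in `pyproject.toml`, running `ruff check .` (and `ruff format .`) from the repository root picks it up automatically.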