-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
61d8cff
commit fdd13da
Showing
9 changed files
with
1,575 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# CI workflow: builds the `lmi` conda environment, downloads the 300K data set,
# runs the three task scripts and the recall evaluation, then uploads the logs.
name: CI

on:
  push:
    # Sequence of patterns matched against refs/heads
    branches:
      # Push events on main branch
      - paper-sisap24-indexing-challenge
    # Sequence of patterns matched against refs/tags
    tags: "*"

jobs:
  test:
    name: ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
    runs-on: ${{ matrix.os }}
    # Declared at job level so that BOTH the download step and the benchmark
    # step see the same value; a step-level `env` is not visible to later steps.
    env:
      DBSIZE: 300K
    strategy:
      fail-fast: false
      matrix:
        version:
          - "1.10"
        os:
          - ubuntu-latest
        arch:
          - x64
        exclude:
          - os: macOS-latest
            arch: x86
        python-version: ["3.11"]
    steps:
      - uses: actions/checkout@v4.1.7
        with:
          submodules: "true"
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies for downloading the dataset
        run: |
          # -y: the runner is non-interactive, so apt must not wait for confirmation
          sudo apt-get install -y curl libcurl4-openssl-dev
      - name: Download database and queries
        # NOTE(review): no step with id `cache-data2024` is defined in this workflow,
        # so this expression is always true and the data is downloaded on every run.
        if: steps.cache-data2024.outputs.cache-hit != 'true'
        run: |
          mkdir data2024
          cd data2024
          curl -O https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=$DBSIZE.h5
          curl -O http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5
          curl -O http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=$DBSIZE--public-queries-2024-laion2B-en-clip768v2-n=10k.h5
          cd ..
      - uses: conda-incubator/setup-miniconda@v3
        with:
          auto-update-conda: true
          python-version: ${{ matrix.python-version }}
      - name: Install LMI dependencies
        shell: bash -el {0}
        run: |
          conda create -n lmi -y python=3.11
          conda activate lmi
          conda install -c pytorch -y faiss-cpu=1.8.0
          conda install -y h5py=3.11.0
          pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1
          pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu
      - name: Benchmark LMI
        shell: bash -el {0}
        run: |
          pwd
          ls -l
          ls -l data2024
          conda activate lmi
          # Parameters for 300K:
          python3 task1.py --dataset-size "${DBSIZE}" --sample-size 100000 --chunk-size 100000 &>task1.log
          python3 task2.py --dataset-size "${DBSIZE}" --sample-size 100000 --chunk-size 100000 &>task2.log
          python3 task3.py --dataset-size "${DBSIZE}" --sample-size 100000 --chunk-size 100000 &>task3.log
          # Parameters for 100M:
          # python3 task1.py &>task1.log
          # python3 task2.py &>task2.log
          # python3 task3.py &>task3.log
          python3 eval.py --results result res.csv
      - uses: actions/upload-artifact@v4.3.4
        with:
          name: Results
          path: |
            res.csv
            task1.log
            task2.log
            task3.log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
# Image providing the `lmi` conda environment together with a copy of the repository.
FROM continuumio/miniconda3:24.5.0-0

# Create the `lmi` environment and install the pinned dependencies in one layer.
# Every conda command passes `-y` because an image build cannot answer prompts,
# and the conda package cache is removed in the same layer so it is not shipped.
RUN conda init bash && \
    . /root/.bashrc && \
    conda create -n lmi -y python=3.11 && \
    conda activate lmi && \
    conda install -c pytorch -y faiss-cpu=1.8.0 && \
    conda install -y h5py=3.11.0 && \
    pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1 && \
    pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu && \
    conda clean -afy && \
    echo 'conda activate lmi' >> /root/.bashrc

WORKDIR /app
COPY . .

# The container command is executed as `bash -l -c <command>`.
# NOTE(review): presumably a login shell is used so that the `conda activate lmi`
# line appended to /root/.bashrc above takes effect; this relies on the image's
# login profile sourcing .bashrc - confirm.
ENTRYPOINT ["/bin/bash", "-l", "-c"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
# SISAP 2024 Indexing Challenge | ||
|
||
This branch contains the code for our submission to the SISAP 2024 Indexing Challenge. | ||
|
||
## Setup | ||
|
||
See also `.github/workflows/ci.yml`. Note the different parameters for 300K and 100M datasets when running the experiments. | ||
|
||
### Using Docker | ||
|
||
```shell | ||
docker build -t sisap24 -f Dockerfile . | ||
docker run -it --rm sisap24 bash | ||
``` | ||
|
||
### Using Conda | ||
|
||
```shell | ||
conda create -n lmi -y python=3.11 | ||
conda activate lmi | ||
conda install -c pytorch -y faiss-cpu=1.8.0 | ||
conda install -y h5py=3.11.0 | ||
pip install --no-cache-dir numpy==1.26.4 tqdm==4.66.4 loguru==0.7.2 scikit-learn==1.5.1 | ||
pip install --no-cache-dir torch==2.3.1 --index-url https://download.pytorch.org/whl/cpu | ||
``` | ||
|
||
## Running Experiments | ||
|
||
### 300K dataset | ||
|
||
```shell | ||
DBSIZE=300K | ||
|
||
# Download data | ||
mkdir data2024 && cd data2024 | ||
wget https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=${DBSIZE}.h5 | ||
wget http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5 | ||
wget http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=${DBSIZE}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5 | ||
cd .. | ||
|
||
# Run experiments on 300K dataset | ||
python3 task1.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task1.log | ||
python3 task2.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task2.log | ||
python3 task3.py --dataset-size ${DBSIZE} --sample-size 100000 --chunk-size 100000 &>task3.log | ||
``` | ||
|
||
### 100M dataset | ||
|
||
```shell | ||
DBSIZE=100M | ||
|
||
# Download data | ||
mkdir data2024 && cd data2024 | ||
wget https://sisap-23-challenge.s3.amazonaws.com/SISAP23-Challenge/laion2B-en-clip768v2-n=${DBSIZE}.h5 | ||
wget http://ingeotec.mx/~sadit/sisap2024-data/public-queries-2024-laion2B-en-clip768v2-n=10k.h5 | ||
wget http://ingeotec.mx/~sadit/sisap2024-data/gold-standard-dbsize=${DBSIZE}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5 | ||
cd .. | ||
|
||
# Run experiments on 100M dataset | ||
python3 task1.py &>task1.log | ||
python3 task2.py &>task2.log | ||
python3 task3.py &>task3.log | ||
``` | ||
|
||
### Evaluating Results | ||
|
||
```shell | ||
# Calculate recall | ||
python3 eval.py --results result res.csv | ||
|
||
# Show the results | ||
cat res.csv | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
# Adapted from https://github.com/sisap-challenges/sisap23-laion-challenge-evaluation/blob/0a6f90debe73365abee210d3950efc07223c846d/eval.py | ||
|
||
import argparse | ||
import csv | ||
import glob | ||
import os | ||
from pathlib import Path | ||
from typing import Generator | ||
|
||
import numpy as np | ||
|
||
os.environ['HDF5_USE_FILE_LOCKING'] = 'FALSE' # Solves: Errno 121 | ||
|
||
import h5py | ||
|
||
|
||
def get_groundtruth(size: str = '300K'):
    """Load the gold-standard nearest-neighbour ids for one database size.

    Args:
        size: Database-size label that is substituted into the gold-standard
            file name under ``data2024/`` (for example ``'300K'``).

    Returns:
        The ``knns`` dataset of the gold-standard file as a NumPy array.
    """
    out_fn = Path(f'data2024/gold-standard-dbsize={size}--public-queries-2024-laion2B-en-clip768v2-n=10k.h5')
    # The context manager closes the file even if reading ``knns`` raises.
    with h5py.File(out_fn, 'r') as gt_f:
        return np.array(gt_f['knns'])
|
||
|
||
def get_all_results(dirname: str) -> Generator[h5py.File, None, None]:
    """Yield every usable result file found below ``dirname``.

    Files are located with three glob patterns rooted at ``dirname``. A file is
    skipped (and reported) unless it contains a ``knns`` entry and a ``size``
    entry, the latter either as a dataset or as an attribute.

    Args:
        dirname: Root directory that is searched for ``*.h5`` result files.

    Yields:
        Each matching file opened read-only. The file is closed as soon as the
        consumer asks for the next item, raises, or closes the generator, so it
        must not be used after that point.
    """
    mask = [dirname + '/*/*/*.h5', dirname + '/*/result/*/*/*.h5', dirname + '/*/result/*/*/*/*.h5']
    print('search for results matching:')
    print('\n'.join(mask))
    for m in mask:
        for fn in glob.iglob(m):
            print(fn)
            # ``with`` guarantees the handle is released on every exit path,
            # including an exception raised by the consumer while suspended at ``yield``.
            with h5py.File(fn, 'r') as f:
                if 'knns' not in f or not ('size' in f or 'size' in f.attrs):
                    print('Ignoring ' + fn)
                    continue
                yield f
|
||
|
||
def get_recall(I, gt, k: int) -> float:
    """Compute the average recall@k of retrieved ids against ground-truth ids.

    Args:
        I: Array of retrieved ids indexed as ``I[row, :k]``; needs at least ``k`` columns.
        gt: Array of ground-truth ids indexed the same way, with as many rows as ``I``.
        k: Number of leading ids per row to compare; must be positive.

    Returns:
        The summed per-row overlap of the first ``k`` ids divided by ``n * k``,
        where ``n`` is the number of rows. ``0.0`` when there are no rows.

    Raises:
        ValueError: If ``k`` is not positive (this used to surface as a ``ZeroDivisionError``).
    """
    if k <= 0:
        msg = f'k must be positive, got {k}'
        raise ValueError(msg)
    assert k <= I.shape[1]
    assert len(I) == len(gt)

    n = len(I)
    if n == 0:
        # No queries: avoid dividing by zero below.
        return 0.0
    hits = sum(len(set(I[i, :k]) & set(gt[i, :k])) for i in range(n))
    return hits / (n * k)
|
||
|
||
def return_h5_str(f, param):
    """Read the scalar entry ``param`` from ``f``, decoding byte strings to ``str``.

    Args:
        f: Mapping-like container (an opened HDF5 file in this script) that
            supports ``param in f`` and ``f[param][()]``.
        param: Name of the entry to read.

    Returns:
        ``0`` when ``param`` is missing, the decoded text when the stored value
        is a byte string, and the stored value unchanged otherwise.
    """
    if param not in f:
        return 0
    x = f[param][()]
    # ``np.bytes_`` is a subclass of ``bytes``, so one isinstance check covers
    # both the NumPy scalar and a plain Python ``bytes`` value.
    if isinstance(x, bytes):
        return x.decode()
    return x
|
||
|
||
if __name__ == '__main__':
    # Command line: eval.py [--results DIR] CSVFILE
    # Writes one CSV row per result file, with the recall against the gold standard.
    parser = argparse.ArgumentParser()
    parser.add_argument('--results', help='directory in which results are stored', default='results')
    parser.add_argument('csvfile')
    args = parser.parse_args()
    true_I_cache = {}  # noqa: N816
    test_sizes = ['300K', '10M', '100M']

    columns = [
        'size',
        'algo',
        'modelingtime',
        'encdatabasetime',
        'encqueriestime',
        'buildtime',
        'querytime',
        'params',
        'recall',
    ]

    # ``args.csvfile`` is a str, so build a Path instance before calling ``open`` on it.
    with Path(args.csvfile).open('w', newline='') as csvfile:
        # Result attributes that are not listed in ``columns`` are dropped instead of
        # aborting the whole evaluation with a ValueError.
        writer = csv.DictWriter(csvfile, fieldnames=columns, extrasaction='ignore')
        writer.writeheader()
        for res in get_all_results(args.results):
            # Metadata is stored either as file attributes or as datasets;
            # get_all_results only yields files that have ``size`` in one of the two.
            if 'size' in res.attrs:
                size = res.attrs['size']
                d = dict(res.attrs)
            else:
                size = res['size'][()].decode()
                d = {k: return_h5_str(res, k) for k in columns}
            if isinstance(size, bytes):
                # A byte-string attribute would never match the str labels in ``test_sizes``.
                size = size.decode()
            if size not in test_sizes:
                continue
            if size not in true_I_cache:
                # Load each gold standard only once per database size.
                true_I_cache[size] = get_groundtruth(size)
            recall = get_recall(np.array(res['knns']), true_I_cache[size], 30)
            d['recall'] = recall
            # ``get`` keeps the progress line from failing on results without these fields.
            print(d.get('algo'), d.get('params'), '=>', recall)
            writer.writerow(d)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
# Ruff configuration: linter and formatter settings for the project.
[tool.ruff]
line-length = 140
extend-include = ["*.ipynb"]

[tool.ruff.format]
quote-style = "single"

[tool.ruff.lint]
ignore = [
    # https://docs.astral.sh/ruff/rules/
    "T201", # print found
    "Q000", # Single quotes found but double quotes preferred
    "ERA001", # Found commented-out code
    "PGH003", # Use specific rule codes when ignoring type issues
    "D100", # Missing docstring in public module
    "S101", # Use of assert detected
    "N806", # Variable {name} in function should be lowercase
    "EXE002", # The file is executable but no shebang is present
    "INP001", # File {filename} is part of an implicit namespace package. Add an __init__.py.
    "D101", # Missing docstring in public class
    "D102", # Missing docstring in public method
    "D103", # Missing docstring in public function
    "ANN101", # Missing type annotation for {name} in method
    "FIX002", # Line contains TODO, consider resolving the issue
    "TD003", # Missing issue link on the line following this TODO
    "D107", # Missing docstring in __init__
    "N803", # Argument name {name} should be lowercase
    "ANN204", # Missing return type annotation for special method {name}
    "D105", # Missing docstring in magic method
    "E741", # Ambiguous variable name: {name}
    # The formatter is configured above; these two rules are pulled in by the
    # "COM" and "ISC" selectors below and can conflict with `ruff format`.
    "COM812", # Trailing comma missing
    "ISC001", # Implicitly concatenated string literals on one line
]
select = [
    "F", # Pyflakes
    "E", # pycodestyle - error
    "W", # pycodestyle - warning
    "C90", # mccabe
    "I", # isort
    "N", # pep8-naming
    "D", # pydocstyle
    "UP", # pyupgrade
    "YTT", # flake8-2020
    "ANN", # flake8-annotations
    "ASYNC", # flake8-async
    "TRIO", # flake8-trio
    "S", # flake8-bandit
    "BLE", # flake8-blind-except
    "FBT", # flake8-boolean-trap
    "B", # flake8-bugbear
    "A", # flake8-builtins
    "COM", # flake8-commas
    "CPY", # flake8-copyright
    "C4", # flake8-comprehensions
    "DTZ", # flake8-datetimez
    "T10", # flake8-debugger
    "DJ", # flake8-django
    "EM", # flake8-errmsg
    "EXE", # flake8-executable
    "FA", # flake8-future-annotations
    "ISC", # flake8-implicit-str-concat
    "ICN", # flake8-import-conventions
    "G", # flake8-logging-format
    "INP", # flake8-no-pep420
    "PIE", # flake8-pie
    "T20", # flake8-print
    "PYI", # flake8-pyi
    "PT", # flake8-pytest-style
    "Q", # flake8-quotes
    "RSE", # flake8-raise
    "RET", # flake8-return
    "SLF", # flake8-self
    "SLOT", # flake8-slots
    "SIM", # flake8-simplify
    "TID", # flake8-tidy-imports
    "TCH", # flake8-type-checking
    "INT", # flake8-gettext
    "ARG", # flake8-unused-arguments
    "PTH", # flake8-use-pathlib
    "TD", # flake8-todos
    "FIX", # flake8-fixme
    "ERA", # eradicate
    "PD", # pandas-vet
    "PGH", # pygrep-hooks
    "PL", # Pylint
    "TRY", # tryceratops
    "FLY", # flynt
    "NPY", # NumPy-specific rules
    "AIR", # Airflow
    "PERF", # Perflint
    "FURB", # refurb
    "LOG", # flake8-logging
    "RUF", # Ruff-specific rules
]

[tool.ruff.lint.pylint]
max-args = 7
Oops, something went wrong.