Skip to content

Commit 0309358

Browse files
first commit
0 parents  commit 0309358

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

70 files changed

+14550
-0
lines changed

.dockerignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# ignore .git folders
2+
.git
3+
4+
# ignore specific .json files
5+
arxiv-metadata-oai-snapshot.json

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
arxiv-metadata-oai-snapshot.json
2+
*.pkl
3+
*.DS_STORE
4+
*.log

CONTRIBUTING.md

Whitespace-only changes.

Dockerfile

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
FROM node:17.2-alpine AS ReactImage
2+
3+
WORKDIR /app/frontend
4+
5+
ENV NODE_PATH=/app/frontend/node_modules
6+
ENV PATH=$PATH:/app/frontend/node_modules/.bin
7+
8+
COPY ./frontend/package.json ./
9+
RUN yarn install --no-optional
10+
11+
ADD ./frontend ./
12+
RUN yarn build
13+
14+
15+
FROM python:3.8-slim-buster AS ApiImage
16+
17+
ENV PYTHONUNBUFFERED 1
18+
ENV PYTHONDONTWRITEBYTECODE 1
19+
20+
RUN python3 -m pip install --upgrade pip setuptools wheel
21+
22+
WORKDIR /app/
23+
COPY ./data/ ./data
24+
25+
RUN mkdir -p /app/backend
26+
WORKDIR /app/backend
27+
28+
COPY ./backend/ .
29+
RUN pip install -e .
30+
31+
# add static react files to fastapi image
32+
COPY --from=ReactImage /app/frontend/build /app/backend/vecsim_app/templates/build
33+
34+
WORKDIR /app/backend/vecsim_app
35+
36+
CMD ["sh", "./entrypoint.sh"]

LICENSE

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
BSD 3-Clause License
2+
3+
Copyright (c) 2022, Sam Partee
4+
All rights reserved.
5+
6+
Redistribution and use in source and binary forms, with or without
7+
modification, are permitted provided that the following conditions are met:
8+
9+
* Redistributions of source code must retain the above copyright notice, this
10+
list of conditions and the following disclaimer.
11+
12+
* Redistributions in binary form must reproduce the above copyright notice,
13+
this list of conditions and the following disclaimer in the documentation
14+
and/or other materials provided with the distribution.
15+
16+
* Neither the name of the copyright holder nor the names of its
17+
contributors may be used to endorse or promote products derived from
18+
this software without specific prior written permission.
19+
20+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
24+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
25+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
27+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
28+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Makefile

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
MAKEFLAGS += --no-print-directory
2+
3+
# Do not remove this block. It is used by the 'help' rule when
4+
# constructing the help output.
5+
# help:
6+
# help: Redis arXiv Search App Makefile help
7+
# help:
8+
9+
SHELL:=/bin/bash
10+
CONDA_ACTIVATE=source $$(conda info --base)/etc/profile.d/conda.sh ; conda activate ; conda activate
11+
12+
# help: help - display this makefile's help information
13+
.PHONY: help
14+
help:
15+
@grep "^# help\:" Makefile | grep -v grep | sed 's/\# help\: //' | sed 's/\# help\://'
16+
17+
# help:
18+
# help: Conda Environment Setup
19+
# help: -------------
20+
21+
# help: env - setup a Python conda env for this application
22+
.PHONY: env
23+
env:
	@conda create -n arXiv python=3.9 -y
# Each recipe line runs in its own shell, so the activation and the install
# must share one line; otherwise pip installs into whichever environment was
# active before `make env` was invoked.
	$(CONDA_ACTIVATE) arXiv && cd backend/ && pip install -r requirements.txt
27+
28+
# help:
29+
# help:

README.md

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
2+
<div align="center">
3+
<a href="https://github.com/spartee/redis-vector-search"><img src="https://github.com/RedisVentures/redis-arXiv-search/blob/master/app/vecsim_app/data/redis-logo.png?raw=true" width="30%"></a>
4+
<br />
5+
<br />
6+
<div display="inline-block">
7+
<a href="https://github.com/RedisVentures/redis-arXiv-search"><b>Code</b></a>&nbsp;&nbsp;&nbsp;
8+
<a href="https://redis.io/docs/stack/search/reference/vectors/"><b>Redis VSS Documentation</b></a>&nbsp;&nbsp;&nbsp;
9+
</div>
10+
<br />
11+
<br />
12+
</div>
13+
14+
# Redis arXiv Search Demo
15+
16+
https://www.kaggle.com/code/foolofatook/zero-shot-classification-with-huggingface-pipeline
17+
https://huggingface.co/allenai/scibert_scivocab_uncased
18+
19+
20+
This arXiv demo showcases the vector similarity search (VSS) capability within Redis Stack and Redis Enterprise.
21+
Through the RediSearch module, vector types and indexes can be added to Redis. This turns Redis into
22+
a highly performant vector database which can be used for all types of applications.
23+
24+
25+
26+
## Application
27+
28+
This app was built as a Single Page Application (SPA) with the following components:
29+
30+
- **[Redis Stack](https://redis.io/docs/stack/)**: Vector database + JSON storage
31+
- **[FastAPI](https://fastapi.tiangolo.com/)** (Python 3.8)
32+
- JWT authentication using [OAuth2 "password
33+
flow"](https://fastapi.tiangolo.com/tutorial/security/simple-oauth2/) and
34+
PyJWT
35+
- **[Pydantic](https://pydantic-docs.helpmanual.io/)** for schema and validation
36+
- **[React](https://reactjs.org/)** (with Typescript)
37+
- **[Redis OM](https://redis.io/docs/stack/get-started/tutorials/stack-python/)** for ORM
38+
- **[Docker Compose](https://docs.docker.com/compose/)** for development
39+
- **[MaterialUI](https://material-ui.com/)** for some UI elements
40+
- **[React-Bootstrap](https://react-bootstrap.github.io/)** for some UI elements
41+
- **[Huggingface Sentence Transformers](https://huggingface.co/sentence-transformers)** for vector embedding creation
42+
43+
Some inspiration was taken from this [Cookiecutter project](https://github.com/Buuntu/fastapi-react)
44+
and turned into a SPA application instead of a separate front-end server approach.
45+
46+
47+
### Datasets
48+
49+
The dataset was taken from the following [Kaggle link](https://www.kaggle.com/Cornell-University/arxiv).
50+
51+
Download and extract the zip file and place the json file in the `data/` directory.
52+
53+
## Running Locally
54+
55+
### Using pre-built containers
56+
57+
The easiest option to run locally is to use the following docker-compose file to launch the
58+
prebuilt container hosted on GitHub.
59+
60+
```yaml
61+
version: '3.7'
62+
services:
63+
64+
redis-vector-db:
65+
image: redis/redis-stack:latest
66+
ports:
67+
- 6379:6379
68+
- 8001:8001
69+
70+
backend:
71+
image: ghcr.io/spartee/redis-vss-fashion:v0.2.0
72+
environment:
73+
DEPLOYMENT: "dev"
74+
REDIS_DATA_URL: 'redis://redis-vector-db:6379'
75+
REDIS_OM_URL: 'redis://redis-vector-db:6379'
76+
REDIS_HOST: 'redis-vector-db'
77+
REDIS_PORT: 6379
78+
expose:
79+
- "8888"
80+
ports:
81+
- "8888:8888"
82+
depends_on:
83+
- "redis-vector-db"
84+
85+
```
86+
87+
To launch, run the following
88+
- ``docker compose up`` in the same directory as ``docker-compose.yml``
89+
- Navigate to ``0.0.0.0:8888`` in a browser
90+
91+
### Building the containers
92+
93+
More to come here
94+
95+
### Running outside docker
96+
97+
More to come here

backend/requirements.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
fastapi
2+
httpx
3+
requests
4+
uvicorn
5+
aiofiles
6+
redis-om
7+
pandas
8+
sentence-transformers

backend/setup.cfg

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
[metadata]
2+
name = arXiv_vecsim
3+
version=0.0.1
4+
description = Workshop demo of Redis vector similarity over arXiv papers
5+
long_description = file: README.md
6+
long_description_content_type=text/markdown
7+
8+
url = https://github.com/RedisVentures/redis-arXiv-search
9+
project_urls =
10+
Source = https://github.com/RedisVentures/redis-arXiv-search
11+
12+
author = Tyler Hutcherson
13+
author_email = tyler.hutcherson@redis.com
14+
contact = Tyler Hutcherson
15+
contact_email = tyler.hutcherson@redis.com
16+
license = BSD-3-Clause
17+
keywords = redis, vector similarity, ai, machine learning
18+
classifiers =
19+
Programming Language :: Python :: 3.7
20+
Programming Language :: Python :: 3.8
21+
Programming Language :: Python :: 3.9
22+
23+
[options]
24+
packages = find:
25+
setup_requires =
26+
setuptools>=39.2
27+
include_package_data = True
28+
python_requires = >=3.7
29+
install_requires =
30+
fastapi
31+
httpx
32+
requests
33+
uvicorn
34+
aiofiles
35+
redis-om
36+
pandas
37+
ipython
38+
numpy
39+
jinja2
40+
passlib
41+
bcrypt
42+
python-multipart
43+
pyjwt
44+
sentence-transformers

backend/setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
2+
from setuptools import setup

# setup() is called with no arguments: the package metadata and options are
# declared in the setup.cfg that sits next to this file.
setup()

backend/vecsim_app/__init__.py

Whitespace-only changes.

backend/vecsim_app/api/__init__.py

Whitespace-only changes.

backend/vecsim_app/api/routes.py

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
import typing as t
2+
import redis.asyncio as redis
3+
4+
from fastapi import APIRouter
5+
from vecsim_app import config
6+
from vecsim_app.schema import (
7+
SimilarityRequest,
8+
UserTextSimilarityRequest
9+
)
10+
from vecsim_app.models import Paper
11+
from vecsim_app.query import create_query
12+
13+
14+
# loaded once here at start up
from sentence_transformers import SentenceTransformer
# Module-level sentence encoder; created at import time so the request
# handlers in this module reuse one instance instead of building it per call.
TEXT_MODEL = SentenceTransformer('allenai/scibert_scivocab_uncased')

# ``r`` is a short alias for ``paper_router``; the route decorators use ``r``.
paper_router = r = APIRouter()
# Async Redis client (``redis.asyncio``) built from the URL assembled in config.
redis_client = redis.from_url(config.REDIS_URL)
20+
21+
async def papers_from_results(results) -> list:
    """Load the ``Paper`` record for every document in a search result.

    Records are fetched one at a time, in the order of ``results.docs``,
    using each document's ``paper_pk`` attribute as the lookup key.
    """
    papers = []
    for doc in results.docs:
        papers.append(await Paper.get(doc.paper_pk))
    return papers
23+
24+
@r.get("/", response_model=t.List[Paper],
       name="paper:get_paper_samples",
       operation_id="get_papers_samples")
async def get_papers(limit: int = 20, skip: int = 0):
    """Return one page of papers.

    Primary keys are consumed in whatever order ``Paper.all_pks()`` yields
    them; the first ``skip`` keys are passed over and the next ``limit``
    keys are loaded as ``Paper`` records.

    Args:
        limit: Maximum number of papers to return. A value of zero or less
            yields an empty list without querying Redis.
        skip: Number of leading keys to pass over. Negative values are
            treated as 0 so they cannot shrink the returned page.

    Returns:
        A list of at most ``limit`` ``Paper`` records (possibly empty).
    """
    # A non-positive page size can never produce a result, so avoid walking
    # the key space at all.
    if limit <= 0:
        return []
    # Without this clamp a negative offset would move part of the
    # [skip, skip + limit) window below index 0 and return a short page.
    skip = max(skip, 0)

    pks = await Paper.all_pks()
    if not pks:
        return []

    # The keys arrive as an async iterator, which cannot be sliced, so the
    # position is tracked by hand.
    papers = []
    position = 0
    async for pk in pks:
        if position >= skip:
            papers.append(await Paper.get(pk))
            if len(papers) == limit:
                break
        position += 1
    return papers
41+
42+
@r.post("/vectorsearch/text",
        response_model=t.List[Paper],
        name="paper:find_similar_by_text",
        operation_id="compute_text_similarity")
async def find_papers_by_text(similarity_request: SimilarityRequest) -> t.List[Paper]:
    """Find papers similar to an already-indexed paper.

    The vector stored in the ``paper_vector:<paper_id>`` hash is used as the
    query vector for the search built by ``create_query``.

    Args:
        similarity_request: Request body providing ``paper_id``,
            ``search_type`` and ``number_of_results``.

    Returns:
        The ``Paper`` records for the documents returned by the search.

    Raises:
        HTTPException: 404 when no vector is stored for ``paper_id``.
    """
    # Local import: the module's top-level fastapi import only brings in
    # APIRouter.
    from fastapi import HTTPException

    q = create_query(
        similarity_request.search_type,
        similarity_request.number_of_results
    )

    # find the vector of the Paper listed in the request
    paper_vector_key = "paper_vector:" + str(similarity_request.paper_id)
    vector = await redis_client.hget(paper_vector_key, "vector")
    if vector is None:
        # Unknown paper id (or hash without a "vector" field): report it
        # explicitly instead of passing None on as a query parameter.
        raise HTTPException(
            status_code=404,
            detail=f"No vector found for paper {similarity_request.paper_id}"
        )

    # obtain results of the query
    results = await redis_client.ft().search(q, query_params={"vec_param": vector})

    # Get Paper records of those results
    return await papers_from_results(results)
61+
62+
63+
@r.post("/vectorsearch/text/user",
        response_model=t.List[Paper],
        name="paper:find_similar_by_user_text",
        operation_id="compute_user_text_similarity")
async def find_papers_by_user_text(similarity_request: UserTextSimilarityRequest) -> t.List[Paper]:
    """Find papers similar to free-form text supplied by the user.

    The text is encoded with the module-level ``TEXT_MODEL`` and the
    resulting vector, serialised with ``tobytes()``, is passed to the search
    built by ``create_query``.

    Args:
        similarity_request: Request body providing ``user_text``,
            ``search_type`` and ``number_of_results``.

    Returns:
        The ``Paper`` records for the documents returned by the search.
    """
    query = create_query(
        similarity_request.search_type,
        similarity_request.number_of_results
    )

    # Embed the user's text and hand the raw bytes to the search as the
    # ``vec_param`` query parameter.
    embedding = TEXT_MODEL.encode(similarity_request.user_text)
    search_params = {"vec_param": embedding.tobytes()}

    search_results = await redis_client.ft().search(query, query_params=search_params)

    # Resolve the matching documents into Paper records.
    return await papers_from_results(search_results)

backend/vecsim_app/config.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
import os

# --- Application / API metadata -------------------------------------------
PROJECT_NAME = "vecsim_app"
API_V1_STR = "/api/v1"
API_DOCS = "/api/docs"
OPENAPI_DOCS = "/api/openapi.json"

# --- Vector index ----------------------------------------------------------
INDEX_TYPE = os.getenv("VECSIM_INDEX_TYPE", "HNSW")

# --- Redis connection ------------------------------------------------------
# Every value can be overridden through the environment. Values read from
# the environment are strings, while the port/db defaults are ints.
REDIS_HOST = os.getenv("REDIS_HOST", "redis-vector-db")
REDIS_PORT = os.getenv("REDIS_PORT", 6379)
REDIS_DB = os.getenv("REDIS_DB", 0)
REDIS_PASSWORD = os.getenv("REDIS_PASSWORD", "testing123")
REDIS_URL = f"redis://:{REDIS_PASSWORD}@{REDIS_HOST}:{REDIS_PORT}/{REDIS_DB}"

# Publish the assembled URL under both variable names, overwriting any value
# already present (presumably these are read by redis-om; verify).
os.environ.update(REDIS_DATA_URL=REDIS_URL, REDIS_OM_URL=REDIS_URL)

# --- Data ------------------------------------------------------------------
DATA_LOCATION = os.getenv("DATA_LOCATION", "../../data")
40.2 KB
Loading
12.5 KB
Loading

backend/vecsim_app/entrypoint.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
#!/bin/sh
2+
3+
python load_data.py
4+
5+
python main.py

0 commit comments

Comments
 (0)