diff --git a/database/config.py b/database/config.py deleted file mode 100644 index e3bdef7..0000000 --- a/database/config.py +++ /dev/null @@ -1 +0,0 @@ -EMBEDDING_MODEL = "text-embedding-3-small" diff --git a/database/db-dev.ipynb b/database/db-dev.ipynb deleted file mode 100644 index b70751c..0000000 --- a/database/db-dev.ipynb +++ /dev/null @@ -1,859 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 1, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import json\n", - "import os\n", - "import subprocess\n", - "from pathlib import Path\n", - "from pprint import pprint\n", - "from typing import Dict, List\n", - "\n", - "import tiktoken\n", - "import yaml\n", - "from config import EMBEDDING_MODEL\n", - "from dotenv import find_dotenv, load_dotenv\n", - "from loguru import logger\n", - "from openai import OpenAI\n", - "from qdrant_client import QdrantClient\n", - "from qdrant_client.http.models import PointStruct\n", - "from tqdm.auto import tqdm\n", - "from utils import (\n", - " create_collection,\n", - " embed_text,\n", - " get_collection_info,\n", - " get_count,\n", - " search,\n", - " upsert,\n", - ")\n", - "\n", - "load_dotenv(find_dotenv())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get config data" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "config_path = Path(\"../config.yaml\")\n", - "\n", - "with config_path.open(\"r\") as file:\n", - " config = yaml.safe_load(file)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load raw scraped data" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "raw_data_path = Path(\"../scraper/srb_labor_law_data.json\")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "with open(raw_data_path, \"r\", encoding=\"utf-8\") as file:\n", - " raw_data = json.loads(file.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Embedding text chunks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create JSONL for parallel embedding.
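As an aside on the request format used here: each scraped record is expected to carry `title`, `texts`, and `link` fields (the fields the notebook reads from `raw_data`), and each record becomes exactly one embedding request. A minimal self-contained sketch of that JSONL construction, with a made-up record:

```python
import json

# Hypothetical scraped record; the title/texts/link schema is assumed from
# how the notebook accesses raw_data in the cells below.
record = {
    "title": "Član 1",
    "texts": ["Ovim zakonom ...", "..."],
    "link": "https://example.com/zakon#clan-1",
}

# One request per article: the title and its text chunks joined into one input.
job = {
    "model": "text-embedding-3-small",
    "input": ". ".join([record["title"], " ".join(record["texts"])]),
}

# Each request becomes a single JSONL line.
print(json.dumps(job))
```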
\n", - "The script for parallel processing is taken from [OpenAI CookBook](https://github.com/openai/openai-cookbook/blob/main/examples/api_request_parallel_processor.py)." - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "filename = Path(\"./requests_to_parallel_process.jsonl\")\n", - "jobs = [\n", - " {\n", - " \"model\": config[\"openai\"][\"embedding_model\"][\"name\"],\n", - " \"input\": \". \".join([sample[\"title\"], \" \".join(sample[\"texts\"])]),\n", - " }\n", - " for sample in raw_data\n", - "]\n", - "with open(filename, \"w\") as f:\n", - " for job in jobs:\n", - " json_string = json.dumps(job)\n", - " f.write(json_string + \"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Terminal command to run parallel processing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "! python api_request_parallel_processor.py \\\n", - " --requests_filepath requests_to_parallel_process.jsonl \\\n", - " --save_filepath requests_to_parallel_process_results.jsonl \\\n", - " --request_url https://api.openai.com/v1/embeddings \\\n", - " --max_requests_per_minute 2500 \\\n", - " --max_tokens_per_minute 900000 \\\n", - " --token_encoding_name cl100k_base \\\n", - " --max_attempts 5 \\\n", - " --logging_level 20" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Format the data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "embeddings_path = Path(\"./requests_to_parallel_process_results.jsonl\")\n", - "with open(embeddings_path, \"r\", encoding=\"utf-8\") as file:\n", - " embeddings = []\n", - " for line in file:\n", - " embeddings.append(json.loads(line))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a look-up table of: article_name --> (embedding, text)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "embeddings_lookup = {}\n", - "for item in embeddings:\n", - " text = item[0][\"input\"]\n", - " article_name = text.split(\". \")[0]\n", - " embedding = item[1][\"data\"][0][\"embedding\"]\n", - " embeddings_lookup[article_name] = {\"embedding\": embedding, \"text\": text}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create PointStructures for Qdrant database." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "points = []\n", - "\n", - "for id, dictionary in enumerate(raw_data):\n", - " title = dictionary[\"title\"]\n", - " link = dictionary[\"link\"]\n", - " if title in embeddings_lookup:\n", - " embedding, text = (\n", - " embeddings_lookup[title][\"embedding\"],\n", - " embeddings_lookup[title][\"text\"],\n", - " )\n", - " points.append(\n", - " PointStruct(\n", - " id=id,\n", - " vector=embedding,\n", - " payload={\"title\": title, \"text\": text, \"link\": link},\n", - " )\n", - " )\n", - " else:\n", - " print(\n", - " f\"Warning: No embedding found for title '{title}'. 
This item will be skipped.\"\n", - "        )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Create Vector database" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "qdrant_client = QdrantClient(\n", - "    url=os.environ[\"QDRANT_CLUSTER_URL\"],\n", - "    api_key=os.environ[\"QDRANT_API_KEY\"],\n", - ")\n", - "\n", - "openai_client = OpenAI(api_key=os.environ[\"OPENAI_API_KEY\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Create a collection" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\u001b[32m2024-04-28 23:31:47.156\u001b[0m | \u001b[1mINFO    \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mCreating collection: labor_law with vector size: 1536.\u001b[0m\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "collection_name = \"labor_law\"\n", - "create_collection(client=qdrant_client, name=collection_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Upload data to the collection" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "upsert(client=qdrant_client, collection=collection_name, points=points)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check collection information" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "get_collection_info(client=qdrant_client, collection=collection_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get collection points count" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "313" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "get_count(client=qdrant_client, collection=collection_name)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Search the Vector database" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "path_to_tests = Path(\"./test_queries.json\")\n", - "with open(path_to_tests, \"r\", encoding=\"utf-8\") as file:\n", - "    test_samples = json.loads(file.read())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "test_samples[\"hard\"]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Get embeddings for tests" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "for level in test_samples.keys():\n", - "    for i, sample in enumerate(test_samples[level]):\n", - "        response = embed_text(\n", - "            client=openai_client,\n", - "            text=sample[\"query\"],\n", - "            model=config[\"openai\"][\"embedding_model\"][\"name\"],\n", - "        )\n", - "        embedding = response.data[0].embedding\n", - "        test_samples[level][i][\"embedding\"] = embedding" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Save tests with embeddings" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "with open(path_to_tests, \"w\", encoding=\"utf-8\") as file:\n", - "    file.write(json.dumps(test_samples, indent=4))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'Koliko traje porodiljsko odsustvo?'" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "query = test_samples[\"hard\"][3][\"query\"]\n", - "embedding = test_samples[\"hard\"][3][\"embedding\"]\n", - "query" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "collection_name = \"zakon_o_radu\"" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "response = search(\n", - "    client=qdrant_client,\n", - "    collection=collection_name,\n", - "    query_vector=embedding,\n", - "    with_vectors=True,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Updated for multiple laws" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Getting Embeddings" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Initial settings" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "laws_dir = Path(\"../scraper/laws\")\n", - "if not laws_dir.exists():\n", - "    logger.error(\"No laws directory found.\")\n", - "\n", - "law_paths = list(laws_dir.iterdir())\n", - "if not len(law_paths):\n", - "    logger.error(\"No laws found in directory.\")\n", - "\n", - "embeddings_dir = Path(\"./embeddings\")\n", - "embeddings_dir.mkdir(exist_ok=True)\n", - "\n", - "to_process_dir = Path(\"./to_process\")\n", - "to_process_dir.mkdir(exist_ok=True)\n", - "\n", - "max_num_tokens_per_chunk = 8191" - ] - },
- { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [], - "source": [ - "def load_json(path: Path) -> List[Dict]:\n", - "    if not path.exists():\n", - "        logger.error(f\"File: {path} does not exist.\")\n", - "    with open(path, \"r\", encoding=\"utf-8\") as file:\n", - "        data = json.loads(file.read())\n", - "    return data\n", - "\n", - "\n", - "def prepare_for_embedding(\n", - "    output_path: Path, scraped_data: List[Dict], embedding_model: str = EMBEDDING_MODEL\n", - ") -> None:\n", - "    jobs = [\n", - "        {\n", - "            \"model\": embedding_model,\n", - "            \"input\": \"[\" + sample[\"title\"] + \"]: \" + \" \".join(sample[\"texts\"]),\n", - "        }\n", - "        for sample in scraped_data\n", - "    ]\n", - "    with open(output_path, \"w\", encoding=\"utf-8\") as file:\n", - "        for job in jobs:\n", - "            json_string = json.dumps(job)\n", - "            file.write(json_string + \"\\n\")\n", - "\n", - "\n", - "def get_token_num(text: str, model_name: str = EMBEDDING_MODEL) -> int:\n", - "    enc = tiktoken.encoding_for_model(model_name)\n", - "    return len(enc.encode(text))\n", - "\n", - "\n", - "def run_api_request_processor(\n", - "    requests_filepath: Path,\n", - "    save_path: Path,\n", - "    max_requests_per_minute: int = 2500,\n", - "    max_tokens_per_minute: int = 900000,\n", - "    token_encoding_name: str = \"cl100k_base\",\n", - "    max_attempts: int = 5,\n", - "    logging_level: int = 20,\n", - ") -> None:\n", - "    if not requests_filepath.exists():\n", - "        logger.error(f\"File {requests_filepath} does not exist.\")\n", - "    if save_path.suffix != \".jsonl\":\n", - "        logger.error(f\"Save path {save_path} must be JSONL.\")\n", - "\n", - "    command = [\n", - "        \"python\",\n", - "        \"api_request_parallel_processor.py\",\n", - "        \"--requests_filepath\",\n", - "        requests_filepath,\n", - "        \"--save_filepath\",\n", - "        save_path,\n", - "        \"--request_url\",\n", - "        \"https://api.openai.com/v1/embeddings\",\n", - "        \"--max_requests_per_minute\",\n", - "        str(max_requests_per_minute),\n", - "        \"--max_tokens_per_minute\",\n", - "        str(max_tokens_per_minute),\n", - "        \"--token_encoding_name\",\n", - "        token_encoding_name,\n", - "        \"--max_attempts\",\n", - "        str(max_attempts),\n", - "        \"--logging_level\",\n", - "        str(logging_level),\n", - "    ]\n", - "    result = subprocess.run(command, text=True, capture_output=True)\n", - "\n", - "    if result.returncode == 0:\n", - "        logger.info(\"Embedding executed successfully.\")\n", - "        logger.info(f\"Embeddings saved to: {save_path}\")\n", - "    else:\n", - "        logger.error(\"Error in embedding execution!\")\n", - "        logger.error(f\"Error: {result.stderr}\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Check number of tokens per chunk.
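Since text-embedding-3-small accepts at most 8191 input tokens, any chunk over `max_num_tokens_per_chunk` has to be shortened or split before embedding; the warning that follows flags exactly this as future work. A rough sketch of a splitter that could be integrated (the `split_long_text` helper is hypothetical, not part of `utils`):

```python
import tiktoken

MAX_TOKENS = 8191  # input limit of text-embedding-3-small

def split_long_text(text: str, max_tokens: int = MAX_TOKENS) -> list[str]:
    # Hypothetical helper: cut an over-long chunk into token-bounded pieces.
    enc = tiktoken.encoding_for_model("text-embedding-3-small")
    tokens = enc.encode(text)
    return [
        enc.decode(tokens[start : start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]
```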
\n", - "⚠️ Integrate this into processing." - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "a96fbe906d654281a786ade22d58a76c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Checking tokens length: 0%| | 0/5 [00:00 max_num_tokens_per_chunk:\n", - " print(i, element)" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4ac361c4e2804841a627da657f3b2f91", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Embedding scraped laws: 0%| | 0/5 [00:00 List[Dict]:\n", - " if not path.exists():\n", - " logger.error(f\"File: {path} does not exist.\")\n", - "\n", - " with open(path, \"r\", encoding=\"utf-8\") as file:\n", - " embedded_data = []\n", - " for line in file:\n", - " embedded_data.append(json.loads(line))\n", - "\n", - " return embedded_data\n", - "\n", - "\n", - "def get_embedings_article_lookup(embedded_data: List[Dict]) -> Dict:\n", - " embeddings_lookup = {}\n", - " for item in embedded_data:\n", - " text = item[0][\"input\"]\n", - " article_name = text.split(\"]: \")[0][1:]\n", - " embedding = item[1][\"data\"][0][\"embedding\"]\n", - " embeddings_lookup[article_name] = {\"embedding\": embedding, \"text\": text}\n", - "\n", - " return embeddings_lookup\n", - "\n", - "\n", - "def get_data_points(raw_data: List[Dict], embeddings_lookup: Dict) -> List[PointStruct]:\n", - " points = []\n", - "\n", - " for id, dictionary in enumerate(raw_data):\n", - " title = dictionary[\"title\"]\n", - " link = dictionary[\"link\"]\n", - " if title in embeddings_lookup:\n", - " embedding, text = (\n", - " embeddings_lookup[title][\"embedding\"],\n", - " embeddings_lookup[title][\"text\"],\n", - " )\n", - " points.append(\n", - " PointStruct(\n", - " id=id,\n", - " vector=embedding,\n", - " payload={\"title\": title, \"text\": text, \"link\": link},\n", - " )\n", - " )\n", - " else:\n", - " logger.warning(\n", - " f\"Warning: No embedding found for title '{title}'. 
This item will be skipped.\"\n", - " )\n", - "\n", - " return points" - ] - }, - { - "cell_type": "code", - "execution_count": 95, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f6a9a69294f143768641ecc26ed4268d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Creating vector database collections: 0%| | 0/5 [00:00\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"porodicni_zakon\" collection with 364 data points.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:33.658\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_radu with vector size: 1536.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:36.784\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_radu\" collection with 313 data points.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:36.820\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_zastiti_podataka_o_licnosti with vector size: 1536.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:38.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_zastiti_podataka_o_licnosti\" collection with 102 data points.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:38.166\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_zastiti_potrosaca with vector size: 1536.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:40.287\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_zastiti_potrosaca\" collection with 198 data points.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:40.348\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_porezu_na_dohodak_gradjana with vector size: 1536.\u001b[0m\n", - "\u001b[32m2024-04-21 22:29:43.001\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_porezu_na_dohodak_gradjana\" collection with 256 data points.\u001b[0m\n" - ] - } - ], - "source": [ - "for file_path in tqdm(\n", - " embedding_paths,\n", - " desc=\"Creating vector database collections\",\n", - " total=len(embedding_paths),\n", - "):\n", - " embedded_data = load_embeddings(path=file_path)\n", - " embeddings_lookup = get_embedings_article_lookup(embedded_data)\n", - "\n", - " raw_data_path = laws_dir / file_path.with_suffix(\".json\").name\n", - " raw_data = load_json(path=raw_data_path)\n", - "\n", - " points = get_data_points(raw_data=raw_data, embeddings_lookup=embeddings_lookup)\n", - "\n", - " collection_name = file_path.stem.replace(\"-\", \"_\")\n", - " create_collection(client=qdrant_client, name=collection_name)\n", - " upsert(client=qdrant_client, collection=collection_name, points=points)\n", - "\n", - " if not get_count(client=qdrant_client, collection=collection_name) == len(raw_data):\n", - " logger.error(f\"There are missing points in {collection_name} collection.\")\n", - "\n", - " logger.info(\n", - " f'Created \"{collection_name}\" collection with 
{get_count(client=qdrant_client, collection=collection_name)} data points.'\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
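For context, since `utils.py` is not part of this diff: a minimal sketch of what the helpers imported at the top of the notebook presumably wrap. The `qdrant_client` and `openai` calls are real APIs, and the `create_collection` log line matches the notebook's output; the wrapper bodies and the cosine-distance choice are assumptions (the logs only confirm the 1536-dimension vector size):

```python
from loguru import logger
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

EMBEDDING_MODEL = "text-embedding-3-small"

def create_collection(client: QdrantClient, name: str, vector_size: int = 1536) -> bool:
    # Assumed wrapper: (re)create a collection sized for the embedding model.
    logger.info(f"Creating collection: {name} with vector size: {vector_size}.")
    return client.recreate_collection(
        collection_name=name,
        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),
    )

def upsert(client: QdrantClient, collection: str, points: list):
    return client.upsert(collection_name=collection, points=points)

def get_count(client: QdrantClient, collection: str) -> int:
    return client.count(collection_name=collection).count

def embed_text(client: OpenAI, text: str, model: str = EMBEDDING_MODEL):
    # Returns the raw response; callers read response.data[0].embedding.
    return client.embeddings.create(input=text, model=model)

def search(client: QdrantClient, collection: str, query_vector, limit: int = 5, with_vectors: bool = False):
    return client.search(
        collection_name=collection,
        query_vector=query_vector,
        limit=limit,
        with_vectors=with_vectors,
    )
```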