diff --git a/database/config.py b/database/config.py
deleted file mode 100644
index e3bdef7..0000000
--- a/database/config.py
+++ /dev/null
@@ -1 +0,0 @@
-EMBEDDING_MODEL = "text-embedding-3-small"
diff --git a/database/db-dev.ipynb b/database/db-dev.ipynb
deleted file mode 100644
index b70751c..0000000
--- a/database/db-dev.ipynb
+++ /dev/null
@@ -1,859 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 1,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "import json\n",
- "import os\n",
- "import subprocess\n",
- "from pathlib import Path\n",
- "from pprint import pprint\n",
- "from typing import Dict, List\n",
- "\n",
- "import tiktoken\n",
- "import yaml\n",
- "from config import EMBEDDING_MODEL\n",
- "from dotenv import find_dotenv, load_dotenv\n",
- "from loguru import logger\n",
- "from openai import OpenAI\n",
- "from qdrant_client import QdrantClient\n",
- "from qdrant_client.http.models import PointStruct\n",
- "from tqdm.auto import tqdm\n",
- "from utils import (\n",
- " create_collection,\n",
- " embed_text,\n",
- " get_collection_info,\n",
- " get_count,\n",
- " search,\n",
- " upsert,\n",
- ")\n",
- "\n",
- "load_dotenv(find_dotenv())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Get config data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "config_path = Path(\"../config.yaml\")\n",
- "\n",
- "with config_path.open(\"r\") as file:\n",
- " config = yaml.safe_load(file)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Load raw scraped data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "raw_data_path = Path(\"../scraper/srb_labor_law_data.json\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open(raw_data_path, \"r\", encoding=\"utf-8\") as file:\n",
- " raw_data = json.loads(file.read())"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Embedding text chunks"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "Create JSONL for parallel embedding.\n",
-    "The script for parallel processing is taken from the [OpenAI Cookbook](https://github.com/openai/openai-cookbook/blob/main/examples/api_request_parallel_processor.py)."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [],
- "source": [
- "filename = Path(\"./requests_to_parallel_process.jsonl\")\n",
- "jobs = [\n",
- " {\n",
- " \"model\": config[\"openai\"][\"embedding_model\"][\"name\"],\n",
- " \"input\": \". \".join([sample[\"title\"], \" \".join(sample[\"texts\"])]),\n",
- " }\n",
- " for sample in raw_data\n",
- "]\n",
- "with open(filename, \"w\") as f:\n",
- " for job in jobs:\n",
- " json_string = json.dumps(job)\n",
- " f.write(json_string + \"\\n\")"
- ]
- },
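-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Sanity-check the request file before submitting it: each line should be a JSON object with `model` and `input` keys, as written above."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Quick sanity check of the JSONL request file created above.\n",
-    "with open(filename, \"r\", encoding=\"utf-8\") as f:\n",
-    "    first = json.loads(f.readline())\n",
-    "\n",
-    "assert set(first) == {\"model\", \"input\"}\n",
-    "print(first[\"model\"], \"|\", first[\"input\"][:80])"
-   ]
-  },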
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Terminal command to run parallel processing."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "! python api_request_parallel_processor.py \\\n",
- " --requests_filepath requests_to_parallel_process.jsonl \\\n",
- " --save_filepath requests_to_parallel_process_results.jsonl \\\n",
- " --request_url https://api.openai.com/v1/embeddings \\\n",
- " --max_requests_per_minute 2500 \\\n",
- " --max_tokens_per_minute 900000 \\\n",
- " --token_encoding_name cl100k_base \\\n",
- " --max_attempts 5 \\\n",
- " --logging_level 20"
- ]
- },
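-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Before formatting, check the results file for failed requests. A minimal sketch, assuming the Cookbook script's output format of `[request, response]` pairs per line, where a successful response is a dict with a `data` field:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Count usable vs. failed embedding results (format assumption: each\n",
-    "# line is a [request, response] JSON array, per the Cookbook processor).\n",
-    "results_path = Path(\"./requests_to_parallel_process_results.jsonl\")\n",
-    "\n",
-    "ok, failed = 0, 0\n",
-    "with open(results_path, \"r\", encoding=\"utf-8\") as file:\n",
-    "    for line in file:\n",
-    "        request, response = json.loads(line)[:2]\n",
-    "        if isinstance(response, dict) and \"data\" in response:\n",
-    "            ok += 1\n",
-    "        else:\n",
-    "            failed += 1\n",
-    "\n",
-    "print(f\"ok={ok}, failed={failed}\")"
-   ]
-  },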
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Format the data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Load the embeddings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "embeddings_path = Path(\"./requests_to_parallel_process_results.jsonl\")\n",
- "with open(embeddings_path, \"r\", encoding=\"utf-8\") as file:\n",
- " embeddings = []\n",
- " for line in file:\n",
- " embeddings.append(json.loads(line))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create a look-up table of: article_name --> (embedding, text)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "embeddings_lookup = {}\n",
- "for item in embeddings:\n",
- " text = item[0][\"input\"]\n",
- " article_name = text.split(\". \")[0]\n",
- " embedding = item[1][\"data\"][0][\"embedding\"]\n",
- " embeddings_lookup[article_name] = {\"embedding\": embedding, \"text\": text}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "Create PointStruct objects for the Qdrant database."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "points = []\n",
- "\n",
- "for id, dictionary in enumerate(raw_data):\n",
- " title = dictionary[\"title\"]\n",
- " link = dictionary[\"link\"]\n",
- " if title in embeddings_lookup:\n",
- " embedding, text = (\n",
- " embeddings_lookup[title][\"embedding\"],\n",
- " embeddings_lookup[title][\"text\"],\n",
- " )\n",
- " points.append(\n",
- " PointStruct(\n",
- " id=id,\n",
- " vector=embedding,\n",
- " payload={\"title\": title, \"text\": text, \"link\": link},\n",
- " )\n",
- " )\n",
- " else:\n",
- " print(\n",
- " f\"Warning: No embedding found for title '{title}'. This item will be skipped.\"\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Create Vector database"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "qdrant_client = QdrantClient(\n",
- " url=os.environ[\"QDRANT_CLUSTER_URL\"],\n",
- " api_key=os.environ[\"QDRANT_API_KEY\"],\n",
- ")\n",
- "\n",
- "openai_client = OpenAI(api_key=os.environ[\"OPENAI_API_KEY\"])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Create a collection"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[32m2024-04-28 23:31:47.156\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m31\u001b[0m - \u001b[1mCreating collection: labor_law with vector size: 1536.\u001b[0m\n"
- ]
- },
- {
- "data": {
- "text/plain": [
- "True"
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "collection_name = \"labor_law\"\n",
- "create_collection(client=qdrant_client, name=collection_name)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Upload data to collection"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
-       "UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)"
- ]
- },
- "execution_count": 18,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "upsert(client=qdrant_client, collection=collection_name, points=points)"
- ]
- },
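-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The `create_collection` and `upsert` helpers come from `utils.py`, which is not part of this diff. A minimal sketch of what they might wrap, assuming cosine distance and the 1536-dimensional vectors of `text-embedding-3-small` (suffixed `_sketch` to avoid shadowing the real imports):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Hypothetical sketch of the utils.py helpers used above -- not the\n",
-    "# actual implementation from this repo.\n",
-    "from qdrant_client.http.models import Distance, VectorParams\n",
-    "\n",
-    "\n",
-    "def create_collection_sketch(client, name, vector_size=1536):\n",
-    "    logger.info(f\"Creating collection: {name} with vector size: {vector_size}.\")\n",
-    "    return client.recreate_collection(\n",
-    "        collection_name=name,\n",
-    "        vectors_config=VectorParams(size=vector_size, distance=Distance.COSINE),\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "def upsert_sketch(client, collection, points):\n",
-    "    # wait=True blocks until the points are persisted.\n",
-    "    return client.upsert(collection_name=collection, points=points, wait=True)"
-   ]
-  },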
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "Check collection information"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "get_collection_info(client=qdrant_client, collection=collection_name)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Get collection points count"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "313"
- ]
- },
- "execution_count": 20,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "get_count(client=qdrant_client, collection=collection_name)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Search the Vector database "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "path_to_tests = Path(\"./test_queries.json\")\n",
- "with open(path_to_tests, \"r\", encoding=\"utf-8\") as file:\n",
- " test_samples = json.loads(file.read())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "test_samples[\"hard\"]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Get embeddings for tests"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "for level in test_samples.keys():\n",
- " for i, sample in enumerate(test_samples[level]):\n",
- " response = embed_text(\n",
- " client=openai_client,\n",
- " text=sample[\"query\"],\n",
- " model=config[\"openai\"][\"embedding_model\"][\"name\"],\n",
- " )\n",
- " embedding = response.data[0].embedding\n",
- " test_samples[level][i][\"embedding\"] = embedding"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Save tests with embeddings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "with open(path_to_tests, \"w\", encoding=\"utf-8\") as file:\n",
- " file.write(json.dumps(test_samples, indent=4))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'Koliko traje porodiljsko odsustvo?'"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "query = test_samples[\"hard\"][3][\"query\"]\n",
- "embedding = test_samples[\"hard\"][3][\"embedding\"]\n",
- "query"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "collection_name = \"zakon_o_radu\""
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "response = search(\n",
- " client=qdrant_client,\n",
- " collection=collection_name,\n",
- " query_vector=embedding,\n",
- " with_vectors=True,\n",
- ")"
- ]
- },
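-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "`search` is another `utils.py` helper not shown in this diff. A plausible sketch over `client.search`, plus a peek at the returned payloads (stored at upload time as `title`, `text`, `link`):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Hypothetical sketch of the utils.py search helper -- not the actual\n",
-    "# implementation (suffixed _sketch to avoid shadowing the import).\n",
-    "def search_sketch(client, collection, query_vector, limit=5, with_vectors=False):\n",
-    "    return client.search(\n",
-    "        collection_name=collection,\n",
-    "        query_vector=query_vector,\n",
-    "        limit=limit,\n",
-    "        with_vectors=with_vectors,\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "for hit in response:\n",
-    "    # Each hit is a ScoredPoint carrying the payload stored at upload time.\n",
-    "    print(round(hit.score, 3), hit.payload[\"title\"])"
-   ]
-  },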
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "## Updated for multiple laws"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Getting Embeddings"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Initial settings"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 67,
- "metadata": {},
- "outputs": [],
- "source": [
-    "laws_dir = Path(\"../scraper/laws\")\n",
-    "\n",
-    "embeddings_dir = Path(\"./embeddings\")\n",
-    "embeddings_dir.mkdir(exist_ok=True)\n",
-    "\n",
-    "to_process_dir = Path(\"./to_process\")\n",
-    "to_process_dir.mkdir(exist_ok=True)\n",
-    "\n",
-    "max_num_tokens_per_chunk = 8191\n",
-    "\n",
-    "# Check the directory before listing it, so iterdir() cannot raise.\n",
-    "if not laws_dir.exists():\n",
-    "    logger.error(\"No laws directory found.\")\n",
-    "\n",
-    "law_paths = list(laws_dir.iterdir()) if laws_dir.exists() else []\n",
-    "if not law_paths:\n",
-    "    logger.error(\"No laws found in directory.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 87,
- "metadata": {},
- "outputs": [],
- "source": [
- "def load_json(path: Path) -> List[Dict]:\n",
- " if not path.exists():\n",
- " logger.error(f\"File: {path} does not exist.\")\n",
- " with open(path, \"r\", encoding=\"utf-8\") as file:\n",
- " data = json.loads(file.read())\n",
- " return data\n",
- "\n",
- "\n",
- "def prepare_for_embedding(\n",
- " output_path: Path, scraped_data: List[Dict], embedding_model: str = EMBEDDING_MODEL\n",
- ") -> None:\n",
- " jobs = [\n",
- " {\n",
- " \"model\": embedding_model,\n",
- " \"input\": \"[\" + sample[\"title\"] + \"]: \" + \" \".join(sample[\"texts\"]),\n",
- " }\n",
- " for sample in scraped_data\n",
- " ]\n",
- " with open(output_path, \"w\", encoding=\"utf-8\") as file:\n",
- " for job in jobs:\n",
- " json_string = json.dumps(job)\n",
- " file.write(json_string + \"\\n\")\n",
- "\n",
- "\n",
- "def get_token_num(text: str, model_name: str = EMBEDDING_MODEL) -> int:\n",
-    "    enc = tiktoken.encoding_for_model(model_name)\n",
- " return len(enc.encode(text))\n",
- "\n",
- "\n",
- "def run_api_request_processor(\n",
- " requests_filepath: Path,\n",
- " save_path: Path,\n",
- " max_requests_per_minute: int = 2500,\n",
- " max_tokens_per_minute: int = 900000,\n",
- " token_encoding_name: str = \"cl100k_base\",\n",
- " max_attempts: int = 5,\n",
- " logging_level: int = 20,\n",
- ") -> None:\n",
- " if not requests_filepath.exists():\n",
- " logger.error(f\"File {requests_filepath} does not exist.\")\n",
- " if save_path.suffix != \".jsonl\":\n",
- " logger.error(f\"Save path {save_path} must be JSONL.\")\n",
- "\n",
- " command = [\n",
- " \"python\",\n",
- " \"api_request_parallel_processor.py\",\n",
- " \"--requests_filepath\",\n",
- " requests_filepath,\n",
- " \"--save_filepath\",\n",
- " save_path,\n",
- " \"--request_url\",\n",
- " \"https://api.openai.com/v1/embeddings\",\n",
- " \"--max_requests_per_minute\",\n",
- " str(max_requests_per_minute),\n",
- " \"--max_tokens_per_minute\",\n",
- " str(max_tokens_per_minute),\n",
- " \"--token_encoding_name\",\n",
- " token_encoding_name,\n",
- " \"--max_attempts\",\n",
- " str(max_attempts),\n",
- " \"--logging_level\",\n",
- " str(logging_level),\n",
- " ]\n",
- " result = subprocess.run(command, text=True, capture_output=True)\n",
- "\n",
- " if result.returncode == 0:\n",
- " logger.info(\"Embedding executed successfully.\")\n",
- " logger.info(f\"Embeddings saved to: {save_path}\")\n",
- " else:\n",
- " logger.error(\"Error in Embedding execution!\")\n",
-    "        logger.error(f\"Error: {result.stderr}\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
-    "Check number of tokens per chunk.\n",
-    "⚠️ TODO: integrate this check into the processing pipeline; a chunking sketch follows below."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 70,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "a96fbe906d654281a786ade22d58a76c",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
-       "Checking tokens length:   0%|          | 0/5 [00:00<?, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "for file_path in tqdm(law_paths, desc=\"Checking tokens length\", total=len(law_paths)):\n",
- " scraped_data = load_json(path=file_path)\n",
- "\n",
- " for i, element in enumerate(scraped_data):\n",
- " full_text = \" \".join(element[\"texts\"])\n",
- " num_tokens = get_token_num(text=full_text)\n",
- " if num_tokens > max_num_tokens_per_chunk:\n",
- " print(i, element)"
- ]
- },
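-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "For articles that exceed the limit, a token-based chunking sketch (not wired into the pipeline yet, per the TODO above; a real version might prefer sentence boundaries):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Sketch of token-based chunking for over-long articles (see TODO above).\n",
-    "def split_into_chunks(\n",
-    "    text: str, max_tokens: int = max_num_tokens_per_chunk\n",
-    ") -> List[str]:\n",
-    "    enc = tiktoken.encoding_for_model(EMBEDDING_MODEL)\n",
-    "    tokens = enc.encode(text)\n",
-    "    # Decode back to text in max_tokens-sized windows.\n",
-    "    return [\n",
-    "        enc.decode(tokens[i : i + max_tokens])\n",
-    "        for i in range(0, len(tokens), max_tokens)\n",
-    "    ]"
-   ]
-  },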
- {
- "cell_type": "code",
- "execution_count": 88,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "4ac361c4e2804841a627da657f3b2f91",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
-       "Embedding scraped laws:   0%|          | 0/5 [00:00<?, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[32m2024-04-21 22:23:46.169\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1mEmbedding executed successfully.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:46.171\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mEmbeddings saved to: embeddings/zakon-o-porezu-na-dohodak-gradjana.jsonl\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:47.885\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1mEmbedding executed successfully.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:47.886\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mEmbeddings saved to: embeddings/zakon_o_zastiti_potrosaca.jsonl\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:49.781\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1mEmbedding executed successfully.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:49.782\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mEmbeddings saved to: embeddings/porodicni_zakon.jsonl\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:51.651\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1mEmbedding executed successfully.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:51.652\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mEmbeddings saved to: embeddings/zakon_o_zastiti_podataka_o_licnosti.jsonl\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:53.465\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m67\u001b[0m - \u001b[1mEmbedding executed successfully.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:23:53.467\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36mrun_api_request_processor\u001b[0m:\u001b[36m68\u001b[0m - \u001b[1mEmbeddings saved to: embeddings/zakon_o_radu.jsonl\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "for file_path in tqdm(law_paths, desc=\"Embedding scraped laws\", total=len(law_paths)):\n",
- " scraped_data = load_json(path=file_path)\n",
- "\n",
- " requests_filepath = to_process_dir / (file_path.stem + \".jsonl\")\n",
- " prepare_for_embedding(\n",
- " output_path=requests_filepath,\n",
- " scraped_data=scraped_data,\n",
- " )\n",
- "\n",
- " processed_filepath = embeddings_dir / requests_filepath.name\n",
- " run_api_request_processor(\n",
- " requests_filepath=requests_filepath, save_path=processed_filepath\n",
- " )"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "### Creating vector database"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 71,
- "metadata": {},
- "outputs": [],
- "source": [
-    "embeddings_dir = Path(\"./embeddings\")\n",
-    "\n",
-    "# Check the directory before listing it, so iterdir() cannot raise.\n",
-    "if not embeddings_dir.exists():\n",
-    "    logger.error(\"No embeddings directory found.\")\n",
-    "\n",
-    "embedding_paths = list(embeddings_dir.iterdir()) if embeddings_dir.exists() else []\n",
-    "if not embedding_paths:\n",
-    "    logger.error(\"No embedding files found in directory.\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 94,
- "metadata": {},
- "outputs": [],
- "source": [
- "def load_embeddings(path: Path) -> List[Dict]:\n",
- " if not path.exists():\n",
- " logger.error(f\"File: {path} does not exist.\")\n",
- "\n",
- " with open(path, \"r\", encoding=\"utf-8\") as file:\n",
- " embedded_data = []\n",
- " for line in file:\n",
- " embedded_data.append(json.loads(line))\n",
- "\n",
- " return embedded_data\n",
- "\n",
- "\n",
-    "def get_embeddings_article_lookup(embedded_data: List[Dict]) -> Dict:\n",
- " embeddings_lookup = {}\n",
- " for item in embedded_data:\n",
- " text = item[0][\"input\"]\n",
- " article_name = text.split(\"]: \")[0][1:]\n",
- " embedding = item[1][\"data\"][0][\"embedding\"]\n",
- " embeddings_lookup[article_name] = {\"embedding\": embedding, \"text\": text}\n",
- "\n",
- " return embeddings_lookup\n",
- "\n",
- "\n",
- "def get_data_points(raw_data: List[Dict], embeddings_lookup: Dict) -> List[PointStruct]:\n",
- " points = []\n",
- "\n",
- " for id, dictionary in enumerate(raw_data):\n",
- " title = dictionary[\"title\"]\n",
- " link = dictionary[\"link\"]\n",
- " if title in embeddings_lookup:\n",
- " embedding, text = (\n",
- " embeddings_lookup[title][\"embedding\"],\n",
- " embeddings_lookup[title][\"text\"],\n",
- " )\n",
- " points.append(\n",
- " PointStruct(\n",
- " id=id,\n",
- " vector=embedding,\n",
- " payload={\"title\": title, \"text\": text, \"link\": link},\n",
- " )\n",
- " )\n",
- " else:\n",
-    "            logger.warning(\n",
-    "                f\"No embedding found for title '{title}'. This item will be skipped.\"\n",
-    "            )\n",
- "\n",
- " return points"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 95,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.jupyter.widget-view+json": {
- "model_id": "f6a9a69294f143768641ecc26ed4268d",
- "version_major": 2,
- "version_minor": 0
- },
- "text/plain": [
-       "Creating vector database collections:   0%|          | 0/5 [00:00<?, ?it/s]"
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- },
- {
- "name": "stderr",
- "output_type": "stream",
- "text": [
- "\u001b[32m2024-04-21 22:29:29.809\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: porodicni_zakon with vector size: 1536.\u001b[0m\n",
-      "\u001b[32m2024-04-21 22:29:33.578\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"porodicni_zakon\" collection with 364 data points.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:29:33.658\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_radu with vector size: 1536.\u001b[0m\n",
-      "\u001b[32m2024-04-21 22:29:36.784\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_radu\" collection with 313 data points.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:29:36.820\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_zastiti_podataka_o_licnosti with vector size: 1536.\u001b[0m\n",
-      "\u001b[32m2024-04-21 22:29:38.111\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_zastiti_podataka_o_licnosti\" collection with 102 data points.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:29:38.166\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_zastiti_potrosaca with vector size: 1536.\u001b[0m\n",
-      "\u001b[32m2024-04-21 22:29:40.287\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_zastiti_potrosaca\" collection with 198 data points.\u001b[0m\n",
- "\u001b[32m2024-04-21 22:29:40.348\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mutils\u001b[0m:\u001b[36mcreate_collection\u001b[0m:\u001b[36m30\u001b[0m - \u001b[1mCreating collection: zakon_o_porezu_na_dohodak_gradjana with vector size: 1536.\u001b[0m\n",
-      "\u001b[32m2024-04-21 22:29:43.001\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36m__main__\u001b[0m:\u001b[36m<module>\u001b[0m:\u001b[36m21\u001b[0m - \u001b[1mCreated \"zakon_o_porezu_na_dohodak_gradjana\" collection with 256 data points.\u001b[0m\n"
- ]
- }
- ],
- "source": [
- "for file_path in tqdm(\n",
- " embedding_paths,\n",
- " desc=\"Creating vector database collections\",\n",
- " total=len(embedding_paths),\n",
- "):\n",
- " embedded_data = load_embeddings(path=file_path)\n",
-    "    embeddings_lookup = get_embeddings_article_lookup(embedded_data)\n",
- "\n",
- " raw_data_path = laws_dir / file_path.with_suffix(\".json\").name\n",
- " raw_data = load_json(path=raw_data_path)\n",
- "\n",
- " points = get_data_points(raw_data=raw_data, embeddings_lookup=embeddings_lookup)\n",
- "\n",
- " collection_name = file_path.stem.replace(\"-\", \"_\")\n",
- " create_collection(client=qdrant_client, name=collection_name)\n",
- " upsert(client=qdrant_client, collection=collection_name, points=points)\n",
- "\n",
-    "    count = get_count(client=qdrant_client, collection=collection_name)\n",
-    "    if count != len(raw_data):\n",
-    "        logger.error(f\"There are missing points in {collection_name} collection.\")\n",
-    "\n",
-    "    logger.info(\n",
-    "        f'Created \"{collection_name}\" collection with {count} data points.'\n",
-    "    )"
- ]
- },
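-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Finally, a quick check that one collection per law file now exists in the cluster:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Verify that every embedded law ended up as a Qdrant collection.\n",
-    "existing = {c.name for c in qdrant_client.get_collections().collections}\n",
-    "expected = {p.stem.replace(\"-\", \"_\") for p in embedding_paths}\n",
-    "\n",
-    "missing = expected - existing\n",
-    "if missing:\n",
-    "    logger.error(f\"Missing collections: {missing}\")\n",
-    "else:\n",
-    "    logger.info(f\"All {len(expected)} collections are present.\")"
-   ]
-  },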
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}