diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index baac3f5..dad3d5e 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -6,7 +6,7 @@ on: - main jobs: - changed_files: + test_router: runs-on: ubuntu-latest # windows-latest || macos-latest name: Test Router steps: @@ -19,28 +19,42 @@ jobs: with: files: router/** # files_ignore: docs/static.js + + - name: Install Dependencies + if: steps.changed-files-router.outputs.any_changed == 'true' + run: pip install -r requirements.txt - - name: Run step if any file(s) in the router folder change + - name: Run Tests if any file(s) in the router folder change if: steps.changed-files-router.outputs.any_changed == 'true' env: ALL_CHANGED_FILES: ${{ steps.changed-files-router.outputs.all_changed_files }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: | - pip install -r requirements.txt - python -m unittest tests/test_router.py - + LANGFUSE_SECRET_KEY: ${{ secrets.LANGFUSE_SECRET_KEY }} + LANGFUSE_PUBLIC_KEY: ${{ secrets.LANGFUSE_PUBLIC_KEY }} + LANGFUSE_HOST: ${{ secrets.LANGFUSE_HOST }} + run: python -m unittest tests/test_router.py + + test_database: + runs-on: ubuntu-latest # windows-latest || macos-latest + name: Test Database + steps: + - uses: actions/checkout@v4 # Test Database - name: Get changed files in the database folder id: changed-files-database uses: tj-actions/changed-files@v44 with: files: database/** + + - name: Install Dependencies + if: steps.changed-files-database.outputs.any_changed == 'true' + run: pip install -r requirements.txt - - name: Run step if any file(s) in the database folder change + - name: Run Tests if any file(s) in the database folder change if: steps.changed-files-database.outputs.any_changed == 'true' env: ALL_CHANGED_FILES: ${{ steps.changed-files-database.outputs.all_changed_files }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - run: | - pip install -r requirements.txt - python -m unittest tests/test_database.py \ No newline at end of file + QDRANT_API_KEY: ${{ secrets.QDRANT_API_KEY }} + QDRANT_CLUSTER_URL: ${{ secrets.QDRANT_CLUSTER_URL }} + run: python -m unittest tests/test_database.py \ No newline at end of file diff --git a/README.md b/README.md index cd3a789..8b0466f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,10 @@ -# Legal ChatBot Documentation +# Legal ChatBot 👩⚖️ + +Legal ChatBot is an innovative project designed to assist users in navigating the complex world of legal documents. + +Utilizing a combination of RAG (Retrieval-Augmented Generation) technology and a deep knowledge base of law articles, this bot can intelligently reference relevant legal texts during interactions. It offers an interactive platform for querying legal information, making it a valuable tool for professionals, students, and anyone needing quick insights into legal matters. + +Setup involves **Poetry** for dependency management, **Qdrant** for vector database functionality, and **Langfuse** for enhancing chatbot performance, ensuring a robust and efficient user experience. 
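For reference, the split CI jobs in the updated `tests.yaml` above each export their own secrets before running a single unittest target. Below is a minimal local sketch of the same idea; the helper script itself is hypothetical and not part of this PR, but the environment variable names and unittest targets are taken directly from the workflow.

```python
# Hypothetical local runner (not part of this PR): mirrors the env vars each
# CI job exports before invoking the same unittest targets.
import os
import subprocess
import sys

REQUIRED_ENV = {
    "tests/test_router.py": [
        "OPENAI_API_KEY",
        "LANGFUSE_SECRET_KEY",
        "LANGFUSE_PUBLIC_KEY",
        "LANGFUSE_HOST",
    ],
    "tests/test_database.py": [
        "OPENAI_API_KEY",
        "QDRANT_API_KEY",
        "QDRANT_CLUSTER_URL",
    ],
}

for target, variables in REQUIRED_ENV.items():
    missing = [v for v in variables if not os.environ.get(v)]
    if missing:
        # The workflow skips a suite when its folder is unchanged; locally we
        # skip when its credentials are absent.
        print(f"Skipping {target}: missing {', '.join(missing)}")
        continue
    # Same invocation the workflow uses: python -m unittest <target>
    subprocess.run([sys.executable, "-m", "unittest", target], check=True)
```

This mirrors the per-suite gating in the workflow, where router and database tests only run when files in their respective folders change.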
## Setting Up the Project diff --git a/app.py b/app.py index 4209d57..236a642 100644 --- a/app.py +++ b/app.py @@ -56,6 +56,7 @@ def response_generator(query: str): # Rout query collections = semantic_query_router( client=openai_client, + model=config["openai"]["gpt_model"]["router"], query=query, prompt=ROUTER_PROMPT, temperature=config["openai"]["gpt_model"]["temperature"], @@ -82,7 +83,7 @@ def response_generator(query: str): stream = get_answer( client=openai_client, - model=config["openai"]["gpt_model"]["name"], + model=config["openai"]["gpt_model"]["llm"], temperature=config["openai"]["gpt_model"]["temperature"], messages=get_messages( context=context, query=query, conversation=st.session_state.messages diff --git a/chat-dev.ipynb b/chat-dev.ipynb index a97a568..9d43204 100644 --- a/chat-dev.ipynb +++ b/chat-dev.ipynb @@ -225,7 +225,7 @@ "outputs": [], "source": [ "response = openai_client.chat.completions.create(\n", - " model=config[\"openai\"][\"gpt_model\"][\"name_light\"],\n", + " model=config[\"openai\"][\"gpt_model\"][\"router\"],\n", " temperature=config[\"openai\"][\"gpt_model\"][\"temperature\"],\n", " messages=messages,\n", ")" diff --git a/llm/prompts.py b/llm/prompts.py index 76c40c1..c6b5ec1 100644 --- a/llm/prompts.py +++ b/llm/prompts.py @@ -17,6 +17,7 @@ Tvoj zadatak je da identifikuješ potrebe klijenta i na osnovu toga pružite najrelevantnije informacije. Kada pružaš odgovore ili savete, naglasiti iz kojeg tačno pravnog člana dolazi informacija i obavezno obezbedi link ka tom članu kako bi klijent mogao dodatno da se informiše. Cilj je da komunikacija bude efikasna i da klijent oseti da je u dobrim rukama. +Korisnik može da postavi pitanje na bilo kom jeziku i tvoj zadatak je da na pitanje odgovriš na istom jeziku kao i pitanje korisnika. Format odgovora: - Ispod naslova **Sažetak** prvo odgovori kratko i direktno na pitanje klijenta koristeći laičke izraze bez složene pravne terminologije. diff --git a/router/router_prompt.py b/router/router_prompt.py index 6d901f0..8adb1d7 100644 --- a/router/router_prompt.py +++ b/router/router_prompt.py @@ -11,15 +11,17 @@ - Zakon o zaštiti potrošača osigurava da potrošači u Srbiji imaju prava na sigurnost i kvalitet proizvoda i usluga. Zakon propisuje obaveze trgovaca u pogledu pravilnog informisanja potrošača o proizvodima, uslugama, cenama i pravu na reklamaciju. Takođe, uključuje prava potrošača na odustanak od kupovine unutar određenog roka i prava u slučaju neispravnosti proizvoda. - porodicni_zakon - Porodični zakon reguliše pravne odnose unutar porodice, uključujući brak, roditeljstvo, starateljstvo, hraniteljstvo i usvojenje. Zakon definiše prava i obaveze bračnih partnera, kao i prava dece i roditeljske odgovornosti. Takođe se bavi pitanjima nasleđivanja i alimentacije. +- nema_zakona + - Korisnikovo pitanje ne odgovara ni jednom zakonu. **FORMAT ODGOVORA:** - Odgovor vratiti u JSON formatu. - Odgovor treba da sadrzi samo JSON output, bez dodataka. - Odgovor mora da bude string koji moze da se ucita uz pmoc komande json.loads(). -- Imena zakona mogu biti samo sledeca: zakon_o_radu, zakon_o_porezu_na_dohodak_gradjana, zakon_o_zastiti_podataka_o_licnosti, zakon_o_zastiti_potrosaca, porodicni_zakon. +- Imena zakona mogu biti samo sledeca: zakon_o_radu, zakon_o_porezu_na_dohodak_gradjana, zakon_o_zastiti_podataka_o_licnosti, zakon_o_zastiti_potrosaca, porodicni_zakon, nema_zakona. - Jedno pitanje korisnika moze da se odnosi na vise zakona. 
- Ukoliko mislis da zakon odgovara korisnikovom pitanju ali nisi 100% siguran onda ga svakako stavi u odgovor. -- Ukoliko korisnikovo pitanje ne odgovara ni jednom zakonu vrati genericki string: "nema_zakona". +- Ukoliko korisnikovo pitanje ne odgovara ni jednom zakonu vrati listu sa generickim stringom: ["nema_zakona"]. - Zakone uvek moras vracati kao listu stringova bez obzira da li ih je 1 ili vise. - Primer JSON odgovora: diff --git a/scraper/README.md b/scraper/README.md new file mode 100644 index 0000000..c3af142 --- /dev/null +++ b/scraper/README.md @@ -0,0 +1,31 @@ +# Scraper + +This script scrapes law articles from a list of URLs and saves them as JSON files. + +## Usage + +To run the script, use the following command: + +```bash +python scraper/scraper.py --file scraper/urls.txt --output-dir laws_test +``` + +## Arguments +- `--url`: A single URL to scrape. +- `--file`: Path to a text file containing URLs separated by newlines. +- `--output-dir`: Directory to save the JSON files (default is scraper/laws). + +## Example +To scrape law articles from a single URL (example: Serbian Labor Law) and save the output in the `scraper/laws` directory: +```bash +python scraper/scraper.py --url "https://www.paragraf.rs/propisi/zakon_o_radu.html" --output-dir scraper/laws +``` + +To scrape law articles from a list of URLs in urls.txt and save the output in the `scraper/laws` directory: +```bash +python scraper/scraper.py --file scraper/urls.txt --output-dir scraper/laws +``` +> ⚠️ _**Note**: Ensure you are in the root directory of the project before running the script._ + +## Output +The output JSON files will be saved in the specified output directory, with each file named after the corresponding URL's stem. diff --git a/scraper/scraper-dev.ipynb b/scraper/scraper-dev.ipynb deleted file mode 100644 index ace3d39..0000000 --- a/scraper/scraper-dev.ipynb +++ /dev/null @@ -1,275 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "from pathlib import Path\n", - "from typing import Dict, List, Literal\n", - "\n", - "import requests\n", - "from bs4 import BeautifulSoup\n", - "from tqdm.auto import tqdm" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "URL = \"https://www.paragraf.rs/propisi/zakon_o_radu.html\"\n", - "output_path = Path(\"./srb_labor_law_data.json\")\n", - "page = requests.get(URL)" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "soup = BeautifulSoup(page.content, \"lxml\")" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "def check_class_element(element, class_name: Literal[\"normal\", \"clan\"]) -> bool:\n", - " \"\"\"Check if the element has a class 'normal'.\"\"\"\n", - " if class_name == \"normal\":\n", - " return element.get(\"class\") == [\"normal\"]\n", - " else:\n", - " return element.get(\"class\") == [\"clan\"]\n", - "\n", - "\n", - "def run_scraper(soup, url: str) -> List[Dict]:\n", - " law_articles = []\n", - " article_title = None\n", - " article_texts = []\n", - " article_link = None\n", - "\n", - " elements = soup.find_all(\"p\")\n", - " for el in tqdm(elements):\n", - " if check_class_element(el, \"clan\"):\n", - " if article_title:\n", - " law_articles.append(\n", - " {\n", - " \"title\": article_title,\n", - " \"texts\": article_texts,\n", - " \"link\": article_link,\n", - " }\n", - 
" )\n", - " article_texts = []\n", - " article_title = el.get_text(strip=True)\n", - "\n", - " name_attr = el.find(\"a\").get(\"name\") if el.find(\"a\") else None\n", - " article_link = f\"{url}#{name_attr}\" if name_attr else None\n", - " elif article_title and check_class_element(el, \"normal\"):\n", - " article_texts.append(el.get_text(strip=True))\n", - "\n", - " if article_title and article_texts:\n", - " law_articles.append(\n", - " {\"title\": article_title, \"texts\": article_texts, \"link\": article_link}\n", - " )\n", - "\n", - " return law_articles" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Write json" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "with open(output_path, \"w\", encoding=\"utf-8\") as file:\n", - " file.write(json.dumps(labor_law, indent=4))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Read json" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "with open(output_path, \"r\", encoding=\"utf-8\") as file:\n", - " data = json.loads(file.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Scrape multiple laws" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "urls = [\n", - " \"https://www.paragraf.rs/propisi/zakon_o_radu.html\",\n", - " \"https://www.paragraf.rs/propisi/zakon-o-porezu-na-dohodak-gradjana.html\",\n", - " \"https://www.paragraf.rs/propisi/zakon_o_zastiti_podataka_o_licnosti.html\",\n", - " \"https://www.paragraf.rs/propisi/zakon_o_zastiti_potrosaca.html\",\n", - " \"https://www.paragraf.rs/propisi/porodicni_zakon.html\",\n", - "]\n", - "output_dir = Path(\"./laws\")\n", - "if not output_dir.exists():\n", - " output_dir.mkdir(exist_ok=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "59fce09e0bde4c908d938957948630f5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Scraping laws: 0%| | 0/5 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "9cdadfd6d4cd40dda7ba5b2e4eba82cc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1451 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "72614c7bb65444f9afd0a38cfe32d24d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1358 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "119afe3a2daf48cda94dbe887996643a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1095 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ad7e1bb8691a4a02b26d3aeef8a7a26d", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1645 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ec1b314068ff477680d727fa0b3ed5a0", - "version_major": 2, - "version_minor": 0 - }, - 
"text/plain": [ - " 0%| | 0/1808 [00:00, ?it/s]" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "for url in tqdm(urls, desc=\"Scraping laws\", total=len(urls)):\n", - " save_path = output_dir / Path(url).with_suffix(\".json\").name\n", - " page = requests.get(url)\n", - " soup = BeautifulSoup(page.content, \"lxml\")\n", - "\n", - " law_articles = run_scraper(soup, url)\n", - "\n", - " with open(save_path, \"w\", encoding=\"utf-8\") as file:\n", - " file.write(json.dumps(law_articles, indent=4))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/scraper/scraper.py b/scraper/scraper.py new file mode 100644 index 0000000..3b439df --- /dev/null +++ b/scraper/scraper.py @@ -0,0 +1,143 @@ +import argparse +import json +from pathlib import Path +from typing import Dict, List, Literal + +import requests +from bs4 import BeautifulSoup +from loguru import logger +from tqdm.auto import tqdm +from tqdm.contrib.logging import tqdm_logging_redirect + + +def check_class_element(element, class_name: Literal["normal", "clan"]) -> bool: + """Check if the element has a class 'normal'.""" + return element.get("class") == [class_name] + + +def run_scraper(soup: BeautifulSoup, url: str) -> List[Dict]: + """ + Scrape law articles from the provided BeautifulSoup object. + + This function processes the HTML content parsed by BeautifulSoup to extract law articles. + Each article is identified by a specific class and contains a title, a list of text paragraphs, + and a link to the article section. + + Args: + soup (BeautifulSoup): The BeautifulSoup object containing the parsed HTML content. + url (str): The base URL of the website to construct full article links. + + Returns: + List[Dict]: A list of dictionaries, each representing a law article with the following keys: + - "title" (str): The title of the article. + - "texts" (List[str]): A list of text paragraphs within the article. + - "link" (str): The URL link to the specific article section. + """ + law_articles = [] + article_title = None + article_texts = [] + article_link = None + + # Find all
elements in the HTML + elements = soup.find_all("p") + for el in elements: + # Determine the class name of the element + class_name = ( + "clan" if check_class_element(element=el, class_name="clan") else "normal" + ) + + # If the element is a title (class "clan"), start a new article + if class_name == "clan": + if article_title: + # Save the previous article + law_articles.append( + { + "title": article_title, + "texts": article_texts, + "link": article_link, + } + ) + article_texts = [] + # Get the article title + article_title = el.get_text(strip=True) + + # Get the link to the article section + name_attr = el.find("a").get("name") if el.find("a") else None + article_link = f"{url}#{name_attr}" if name_attr else None + # If the element is part of an article's text, add it to the current article + elif article_title and class_name == "normal": + article_texts.append(el.get_text(strip=True)) + + # Save the last article + if article_title and article_texts: + law_articles.append( + {"title": article_title, "texts": article_texts, "link": article_link} + ) + + return law_articles + + +def main(urls: List[str], output_dir: Path) -> None: + """ + Scrape law articles from a list of URLs and save them as JSON files. + + Args: + urls (List[str]): A list of URLs to scrape. + output_dir (Path): The directory where the JSON files will be saved. + """ + # Ensure the output directory exists + output_dir.mkdir(parents=True, exist_ok=True) + + with tqdm_logging_redirect(): + for url in tqdm(urls, desc="Scraping laws", total=len(urls)): + save_path = output_dir / f"{Path(url).stem}.json" + + try: + response = requests.get(url) + # Ensure we handle HTTP errors + response.raise_for_status() + except requests.RequestException as e: + logger.error(f'Failed to fetch URL: "{url}" - {e}') + continue + + soup = BeautifulSoup(response.content, "lxml") + + try: + law_articles = run_scraper(soup=soup, url=url) + except Exception as e: + logger.error(f'Failed to scrape data from URL: "{url}" - {e}') + continue + + try: + with open(save_path, "w", encoding="utf-8") as file: + json.dump(law_articles, file, indent=4, ensure_ascii=False) + logger.info(f'Successfully saved data to "{save_path}"') + except Exception as e: + logger.error(f'Failed to save data to "{save_path}" - {e}') + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Scrape law articles from URLs.") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument("--url", type=str, help="Single URL to scrape.") + group.add_argument( + "--file", + type=Path, + help="Path to text file containing URLs separated by newlines.", + ) + parser.add_argument( + "--output-dir", + type=Path, + default=Path("scraper/laws"), + help="Directory to save the JSON files.", + ) + + args = parser.parse_args() + + if args.url: + urls = [args.url] + elif args.file: + with open(args.file, "r", encoding="utf-8") as file: + urls = [line.strip() for line in file if line.strip()] + + main(urls=urls, output_dir=args.output_dir) diff --git a/scraper/urls.txt b/scraper/urls.txt new file mode 100644 index 0000000..5af874d --- /dev/null +++ b/scraper/urls.txt @@ -0,0 +1,5 @@ +https://www.paragraf.rs/propisi/zakon_o_radu.html +https://www.paragraf.rs/propisi/zakon-o-porezu-na-dohodak-gradjana.html +https://www.paragraf.rs/propisi/zakon_o_zastiti_podataka_o_licnosti.html +https://www.paragraf.rs/propisi/zakon_o_zastiti_potrosaca.html +https://www.paragraf.rs/propisi/porodicni_zakon.html \ No newline at end of file
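The revised router prompt above now guarantees a JSON list of collection names, with `["nema_zakona"]` as the explicit no-match case instead of a bare string. A minimal sketch of how a caller might defensively parse that output is shown below; the helper name and fallback behaviour are illustrative assumptions, not part of `semantic_query_router`.

```python
import json
from typing import List


def parse_router_output(raw: str) -> List[str]:
    """Parse the router's JSON reply into a list of collection names.

    Assumes the contract from router_prompt.py: the reply is a string
    loadable with json.loads() and is always a list, e.g. ["zakon_o_radu"]
    or ["nema_zakona"] when no law applies.
    """
    try:
        collections = json.loads(raw)
    except json.JSONDecodeError:
        # Defensive fallback: treat an unparseable reply as "no matching law".
        return ["nema_zakona"]

    # Earlier prompt versions returned a bare string for the no-match case.
    if isinstance(collections, str):
        return [collections]
    return [str(c) for c in collections]


# Example: skip retrieval entirely when the router finds no relevant law.
collections = parse_router_output('["nema_zakona"]')
if collections == ["nema_zakona"]:
    print("No matching law; answer without retrieved context.")
```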