Merge pull request #41 from milistu/scraper

Scraper Update
milistu · May 16, 2024 · dcba319 · dcba319
2 parents a37770d + d25ea6e
commit dcba319
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 1 deletion.
diff --git a/scraper/README.md b/scraper/README.md
@@ -16,8 +16,12 @@ python scraper/scraper.py --file scraper/urls.txt --output-dir laws_test
 - `--output-dir`: Directory to save the JSON files (default is scraper/laws).
 
 ## Example
-To scrape law articles from a list of URLs in urls.txt and save the output in the `scraper/laws` directory:
+To scrape law articles from a single URL (example: Serbian Labor Law) and save the output in the `scraper/laws` directory:
+```bash
+python scraper/scraper.py --url "https://www.paragraf.rs/propisi/zakon_o_radu.html" --output-dir scraper/laws
+```
 
+To scrape law articles from a list of URLs in urls.txt and save the output in the `scraper/laws` directory:
 ```bash
 python scraper/scraper.py --file scraper/urls.txt --output-dir scraper/laws
 ```

diff --git a/scraper/scraper.py b/scraper/scraper.py
@@ -38,14 +38,18 @@ def run_scraper(soup: BeautifulSoup, url: str) -> List[Dict]:
     article_texts = []
     article_link = None
 
+    # Find all <p> elements in the HTML
     elements = soup.find_all("p")
     for el in elements:
+        # Determine the class name of the element
         class_name = (
             "clan" if check_class_element(element=el, class_name="clan") else "normal"
         )
 
+        # If the element is a title (class "clan"), start a new article
         if class_name == "clan":
             if article_title:
+                # Save the previous article
                 law_articles.append(
                     {
                         "title": article_title,
@@ -54,13 +58,17 @@ def run_scraper(soup: BeautifulSoup, url: str) -> List[Dict]:
                     }
                 )
                 article_texts = []
+            # Get the article title
             article_title = el.get_text(strip=True)
 
+            # Get the link to the article section
             name_attr = el.find("a").get("name") if el.find("a") else None
             article_link = f"{url}#{name_attr}" if name_attr else None
+        # If the element is part of an article's text, add it to the current article
         elif article_title and class_name == "normal":
             article_texts.append(el.get_text(strip=True))
 
+    # Save the last article
     if article_title and article_texts:
         law_articles.append(
             {"title": article_title, "texts": article_texts, "link": article_link}