Skip to content

Commit

Permalink
Add comments to run_scraper.
Browse files Browse the repository at this point in the history
  • Loading branch information
milistu committed May 16, 2024
1 parent 995b55e commit d25ea6e
Showing 1 changed file with 8 additions and 0 deletions.
8 changes: 8 additions & 0 deletions scraper/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,18 @@ def run_scraper(soup: BeautifulSoup, url: str) -> List[Dict]:
article_texts = []
article_link = None

# Find all <p> elements in the HTML
elements = soup.find_all("p")
for el in elements:
# Determine the class name of the element
class_name = (
"clan" if check_class_element(element=el, class_name="clan") else "normal"
)

# If the element is a title (class "clan"), start a new article
if class_name == "clan":
if article_title:
# Save the previous article
law_articles.append(
{
"title": article_title,
Expand All @@ -54,13 +58,17 @@ def run_scraper(soup: BeautifulSoup, url: str) -> List[Dict]:
}
)
article_texts = []
# Get the article title
article_title = el.get_text(strip=True)

# Get the link to the article section
name_attr = el.find("a").get("name") if el.find("a") else None
article_link = f"{url}#{name_attr}" if name_attr else None
# If the element is part of an article's text, add it to the current article
elif article_title and class_name == "normal":
article_texts.append(el.get_text(strip=True))

# Save the last article
if article_title and article_texts:
law_articles.append(
{"title": article_title, "texts": article_texts, "link": article_link}
Expand Down

0 comments on commit d25ea6e

Please sign in to comment.