From b43ad2df811e1bc253b5e88c1ff4740160b4e68f Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Mon, 23 Dec 2024 20:19:58 +0000 Subject: [PATCH 1/2] [sp] Fix download paging. - should fetch more results based on the search returns, rather than the number of headers those link to. --- pyscraper/sp_2024/download.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyscraper/sp_2024/download.py b/pyscraper/sp_2024/download.py index 4bf87dc9..a671606f 100644 --- a/pyscraper/sp_2024/download.py +++ b/pyscraper/sp_2024/download.py @@ -72,7 +72,7 @@ def get_debate_groupings(start_date: str, end_date: str) -> list[DebateGrouping] start_date, end_date, search_page ) meeting_urls.extend(page_result_urls) - if heading_count < 10: + if len(page_result_urls) < 10: keep_fetching = False else: search_page += 1 From b15d1b541be221d5dfb734d543fb3ebe727faf34 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Mon, 23 Dec 2024 20:24:41 +0000 Subject: [PATCH 2/2] [sp] Structure one sided divisions correctly. SP can have divisions where everyone votes the same way - so there can be just one msplist --- pyscraper/sp_2024/parse.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyscraper/sp_2024/parse.py b/pyscraper/sp_2024/parse.py index 390cd4f6..5ea23b9b 100644 --- a/pyscraper/sp_2024/parse.py +++ b/pyscraper/sp_2024/parse.py @@ -189,9 +189,11 @@ def process_raw_html(raw_html: Tag, agenda_item_url: str) -> BeautifulSoup: speaker.append(vote_div) vote_tag.decompose() - # now we want to wrap any sequential msplists tags in a division tag + # now we want to wrap any group of msplists tags in a division tag + # in the scottish parliament all the votes *can* be on one side + # and there will be just one msp list vote_tags = speaker.find_all("msplist") - if len(vote_tags) > 1: + if len(vote_tags) > 0: division_tag = soup.new_tag("division") vote_tags[0].insert_before(division_tag) for vote_tag in vote_tags: