Skip to content

Commit

Permalink
Fix spider: prevent global playwright usage
Browse files Browse the repository at this point in the history
Configures Playwright only in the custom settings of the cuya_northeast_ohio_coordinating spider instead of in the global settings, which ensures the other spiders don't enable the Playwright download handlers and reactor they don't need.
  • Loading branch information
SimmonsRitchie committed Aug 13, 2024
1 parent 1f16c81 commit b91b4b2
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 9 deletions.
8 changes: 0 additions & 8 deletions city_scrapers/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,3 @@
SPIDER_MIDDLEWARES = {}

logging.getLogger("pdfminer").propagate = False

# Playwright settings
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
DOWNLOAD_HANDLERS = {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
}
PLAYWRIGHT_BROWSER_TYPE = "firefox"
9 changes: 8 additions & 1 deletion city_scrapers/spiders/cuya_northeast_ohio_coordinating.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,15 @@ class CuyaNortheastOhioCoordinatingSpider(CityScrapersSpider):
"https://www.noaca.org/board-committees/noaca-board-and-committees/agendas-and-presentations/-toggle-all", # noqa
"https://www.noaca.org/board-committees/noaca-board-and-committees/agendas-and-presentations/-toggle-all/-npage-2", # noqa
]
# intended to avoid being blocked by bot detection
custom_settings = {
# Playwright is used to help avoid bot detection
"TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
"DOWNLOAD_HANDLERS": {
"http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
"https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
},
"PLAYWRIGHT_BROWSER_TYPE": "firefox",
# other scrapy settings to help avoid bot detection
"DOWNLOAD_DELAY": 1,
"ROBOTSTXT_OBEY": False,
"USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0", # noqa
Expand Down

0 comments on commit b91b4b2

Please sign in to comment.