From 4ceb7139a5b218bad73a80d96dfa014e20eaa1b2 Mon Sep 17 00:00:00 2001 From: Oleg Valter Date: Sun, 25 Aug 2024 08:41:05 +0300 Subject: [PATCH 1/7] added a proper extractor config & made uBO download url configurable --- config.example.json | 13 +++++--- sedd/config.py | 65 ++++++++++++++++++++++++++++++++++++++ sedd/main.py | 40 ++++++++++++----------- sedd/meta/notifications.py | 17 ++++++---- 4 files changed, 106 insertions(+), 29 deletions(-) create mode 100644 sedd/config.py diff --git a/config.example.json b/config.example.json index 2b55ffa..7f3778e 100644 --- a/config.example.json +++ b/config.example.json @@ -1,7 +1,10 @@ { - "email": "The email you use to log into SE", - "password": "The password you use to log into SE", - "notifications": { - "provider": "string, any of: native,. Can be null, which disables notifications" - } + "email": "The email you use to log into SE", + "password": "The password you use to log into SE", + "notifications": { + "provider": "string, any of: native,. Can be null, which disables notifications" + }, + "ubo": { + "download_url": "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi" + } } diff --git a/sedd/config.py b/sedd/config.py new file mode 100644 index 0000000..051661f --- /dev/null +++ b/sedd/config.py @@ -0,0 +1,65 @@ +from typing import TypedDict, Literal + +from json import load +from os import path, getcwd + + +class SEDDNotificationsConfig(TypedDict): + provider: Literal['native'] | None + + +class SEDDUboConfig(TypedDict): + download_url: str + + +default_ubo_url = "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi" + +default_notifications_config: SEDDNotificationsConfig = { + 'provider': None +} + +default_ubo_config: SEDDUboConfig = { + "download_url": default_ubo_url +} + + +class SEDDConfig: + email: str + password: str + notifications: SEDDNotificationsConfig + ubo: SEDDUboConfig + + def __init__(self, email: str, pwd: str, notifications: SEDDNotificationsConfig, ubo: SEDDUboConfig): + self.email = email + self.password = pwd + self.notifications = notifications + self.ubo = ubo + + def get_notifications_provider(self) -> Literal['native'] | None: + notifications_config = self.notifications + return notifications_config['provider'] if hasattr(notifications_config, 'provider') else None + + def get_ubo_download_url(self) -> str: + ubo_config = self.ubo + return ubo_config["download_url"] if hasattr(ubo_config, 'download_url') else default_ubo_url + + +def load_sedd_config() -> SEDDConfig: + config_path = path.join(getcwd(), 'config.json') + + config: SEDDConfig = None + + with open(config_path, "r") as f: + config = load(f) + + email = config["email"] + password = config["password"] + + notifications = config['notifications'] if hasattr( + config, 'notifications') else default_notifications_config + + ubo = config['ubo'] if hasattr(config, 'ubo') else default_ubo_config + + config = SEDDConfig(email, password, notifications, ubo) + + return config diff --git a/sedd/main.py b/sedd/main.py index 3a5748c..11f02f0 100644 --- a/sedd/main.py +++ b/sedd/main.py @@ -5,20 +5,22 @@ from selenium.common.exceptions import NoSuchElementException from typing import Dict -from .watcher.observer import register_pending_downloads_observer, Observer -from sedd.data import sites from time import sleep -import json import urllib.request -from .meta import notifications import re import os import sys from traceback import print_exception import argparse + + +from .config import load_sedd_config +from .data import sites +from .meta import notifications +from .watcher.observer import register_pending_downloads_observer from . import utils parser = argparse.ArgumentParser( @@ -71,21 +73,17 @@ def get_download_dir(): browser = webdriver.Firefox( options=options ) -if not os.path.exists("ubo.xpi"): - print("Downloading uBO") - urllib.request.urlretrieve( - "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi", - "ubo.xpi" - ) +sedd_config = load_sedd_config() -ubo_id = browser.install_addon("ubo.xpi", temporary=True) +ubo_download_url = sedd_config.get_ubo_download_url() + +if not os.path.exists("ubo.xpi"): + print(f"Downloading uBO from: {ubo_download_url}") + urllib.request.urlretrieve(ubo_download_url, "ubo.xpi") -with open("config.json", "r") as f: - config = json.load(f) -email = config["email"] -password = config["password"] +ubo_id = browser.install_addon("ubo.xpi", temporary=True) def kill_cookie_shit(browser: WebDriver): @@ -117,8 +115,8 @@ def login_or_create(browser: WebDriver, site: str): email_elem = browser.find_element(By.ID, "email") password_elem = browser.find_element(By.ID, "password") - email_elem.send_keys(email) - password_elem.send_keys(password) + email_elem.send_keys(sedd_config.email) + password_elem.send_keys(sedd_config.password) curr_url = browser.current_url browser.find_element(By.ID, "submit-button").click() @@ -130,7 +128,10 @@ def login_or_create(browser: WebDriver, site: str): if not captcha_walled: captcha_walled = True - notifications.notify("Captcha wall hit during login", config) + notifications.notify( + "Captcha wall hit during login", sedd_config + ) + sleep(10) if captcha_walled: @@ -239,6 +240,9 @@ def _exec_download(browser: WebDriver): etags ) + while True is True: + sleep(1) + if observer: pending = state.size() diff --git a/sedd/meta/notifications.py b/sedd/meta/notifications.py index 64d0527..6dddcaf 100644 --- a/sedd/meta/notifications.py +++ b/sedd/meta/notifications.py @@ -1,24 +1,29 @@ from desktop_notifier import DesktopNotifier import asyncio +from ..config import SEDDConfig + + def native(message: str, _): asyncio.run( DesktopNotifier().send( - title = "The data dump downloader needs attention", - message = f"{message}" + title="The data dump downloader needs attention", + message=f"{message}" ) ) + notification_providers = { "native": native } -def notify(message: str, config): - provider = config["notifications"]["provider"] + +def notify(message: str, config: SEDDConfig): + + provider = config.get_notifications_provider() + if provider is None: print(message) return notification_providers[provider](message, config["notifications"]) - - From 543069cb225df3e4bfb0a74e1c77018c7e4ee655 Mon Sep 17 00:00:00 2001 From: Oleg Valter Date: Sun, 25 Aug 2024 09:00:04 +0300 Subject: [PATCH 2/7] laxed .gitignore checks for config files --- .gitignore | 4 +++- sedd/main.py | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index c365219..54dd676 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ -/config.json +# configuration files +*config.json + compile_commands.json *.xpi /stackexchange_*/ diff --git a/sedd/main.py b/sedd/main.py index 11f02f0..6aa2dec 100644 --- a/sedd/main.py +++ b/sedd/main.py @@ -240,9 +240,6 @@ def _exec_download(browser: WebDriver): etags ) - while True is True: - sleep(1) - if observer: pending = state.size() From 9894bd9487257c41a045f39d0f1198c52fc1fccc Mon Sep 17 00:00:00 2001 From: Oleg Valter Date: Mon, 26 Aug 2024 04:04:12 +0300 Subject: [PATCH 3/7] moved driver initialization to its own module --- sedd/driver.py | 40 ++++++++++++++++++++++++++++++++++++++++ sedd/main.py | 37 ++++--------------------------------- 2 files changed, 44 insertions(+), 33 deletions(-) create mode 100644 sedd/driver.py diff --git a/sedd/driver.py b/sedd/driver.py new file mode 100644 index 0000000..a6c36d3 --- /dev/null +++ b/sedd/driver.py @@ -0,0 +1,40 @@ +from os import path, makedirs +from urllib import request + +from selenium import webdriver +from selenium.webdriver.firefox.options import Options + +from .config import SEDDConfig + + +def init_output_dir(output_dir: str): + if not path.exists(output_dir): + makedirs(output_dir) + + print(output_dir) + + return output_dir + + +def init_firefox_driver(config: SEDDConfig, output_dir: str): + options = Options() + options.enable_downloads = True + options.set_preference("browser.download.folderList", 2) + options.set_preference("browser.download.manager.showWhenStarting", False) + options.set_preference("browser.download.dir", output_dir) + options.set_preference( + "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip") + + browser = webdriver.Firefox( + options=options + ) + + ubo_download_url = config.get_ubo_download_url() + + if not path.exists("ubo.xpi"): + print(f"Downloading uBO from: {ubo_download_url}") + request.urlretrieve(ubo_download_url, "ubo.xpi") + + ubo_id = browser.install_addon("ubo.xpi", temporary=True) + + return browser, ubo_id diff --git a/sedd/main.py b/sedd/main.py index 6aa2dec..d7738aa 100644 --- a/sedd/main.py +++ b/sedd/main.py @@ -1,13 +1,10 @@ -from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.firefox.webdriver import WebDriver -from selenium.webdriver.firefox.options import Options from selenium.common.exceptions import NoSuchElementException from typing import Dict from time import sleep -import urllib.request import re import os @@ -23,6 +20,8 @@ from .watcher.observer import register_pending_downloads_observer from . import utils +from .driver import init_output_dir, init_firefox_driver + parser = argparse.ArgumentParser( prog="sedd", description="Automatic (unofficial) SE data dump downloader for the anti-community data dump format", @@ -51,39 +50,11 @@ args = parser.parse_args() -def get_download_dir(): - download_dir = args.output_dir - - if not os.path.exists(download_dir): - os.makedirs(download_dir) - - print(download_dir) - - return download_dir - - -options = Options() -options.enable_downloads = True -options.set_preference("browser.download.folderList", 2) -options.set_preference("browser.download.manager.showWhenStarting", False) -options.set_preference("browser.download.dir", get_download_dir()) -options.set_preference( - "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip") - -browser = webdriver.Firefox( - options=options -) - sedd_config = load_sedd_config() -ubo_download_url = sedd_config.get_ubo_download_url() - -if not os.path.exists("ubo.xpi"): - print(f"Downloading uBO from: {ubo_download_url}") - urllib.request.urlretrieve(ubo_download_url, "ubo.xpi") - +output_dir = init_output_dir(args.output_dir) -ubo_id = browser.install_addon("ubo.xpi", temporary=True) +browser, ubo_id = init_firefox_driver(sedd_config, output_dir) def kill_cookie_shit(browser: WebDriver): From 0f44fb089cfc8dabd30f3c463f14f934d210648c Mon Sep 17 00:00:00 2001 From: Oleg Valter Date: Mon, 26 Aug 2024 04:08:15 +0300 Subject: [PATCH 4/7] properly typed & moved CLI def in its own module --- sedd/cli.py | 42 ++++++++++++++++++++++++++++++++++++++++++ sedd/driver.py | 4 +--- sedd/main.py | 32 ++------------------------------ 3 files changed, 45 insertions(+), 33 deletions(-) create mode 100644 sedd/cli.py diff --git a/sedd/cli.py b/sedd/cli.py new file mode 100644 index 0000000..b5d53ac --- /dev/null +++ b/sedd/cli.py @@ -0,0 +1,42 @@ +import argparse + +from os import getcwd +from os.path import join + +from typing import TypedDict + + +class SEDDCLIArgs(TypedDict): + skip_loaded: bool + output_dir: str + dry_run: bool + + +parser = argparse.ArgumentParser( + prog="sedd", + description="Automatic (unofficial) SE data dump downloader for the anti-community data dump format", +) +parser.add_argument( + "-s", "--skip-loaded", + required=False, + default=False, + action="store_true", + dest="skip_loaded" +) +parser.add_argument( + "-o", "--outputDir", + required=False, + dest="output_dir", + default=join(getcwd(), "downloads") +) +parser.add_argument( + "--dry-run", + required=False, + default=False, + action="store_true", + dest="dry_run" +) + + +def parse_cli_args() -> SEDDCLIArgs: + return parser.parse_args() diff --git a/sedd/driver.py b/sedd/driver.py index a6c36d3..359ab0b 100644 --- a/sedd/driver.py +++ b/sedd/driver.py @@ -25,9 +25,7 @@ def init_firefox_driver(config: SEDDConfig, output_dir: str): options.set_preference( "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip") - browser = webdriver.Firefox( - options=options - ) + browser = webdriver.Firefox(options=options) ubo_download_url = config.get_ubo_download_url() diff --git a/sedd/main.py b/sedd/main.py index d7738aa..cd24d33 100644 --- a/sedd/main.py +++ b/sedd/main.py @@ -7,13 +7,11 @@ from time import sleep import re -import os import sys from traceback import print_exception -import argparse - +from .cli import parse_cli_args from .config import load_sedd_config from .data import sites from .meta import notifications @@ -22,33 +20,7 @@ from .driver import init_output_dir, init_firefox_driver -parser = argparse.ArgumentParser( - prog="sedd", - description="Automatic (unofficial) SE data dump downloader for the anti-community data dump format", -) -parser.add_argument( - "-s", "--skip-loaded", - required=False, - default=False, - action="store_true", - dest="skip_loaded" -) -parser.add_argument( - "-o", "--outputDir", - required=False, - dest="output_dir", - default=os.path.join(os.getcwd(), "downloads") -) -parser.add_argument( - "--dry-run", - required=False, - default=False, - action="store_true", - dest="dry_run" -) - -args = parser.parse_args() - +args = parse_cli_args() sedd_config = load_sedd_config() From 7e8df178554e7f7eb291a79bfc9b16636c1bf319 Mon Sep 17 00:00:00 2001 From: Oleg Valter Date: Mon, 26 Aug 2024 12:57:29 +0300 Subject: [PATCH 5/7] made uBlock configurable --- config.example.json | 34 ++++++++++++++- sedd/config/__init__.py | 3 ++ sedd/{ => config}/config.py | 35 +++++----------- sedd/config/defaults.py | 44 +++++++++++++++++++ sedd/config/typings.py | 25 +++++++++++ sedd/driver.py | 16 ++++++- sedd/main.py | 4 +- sedd/ubo/__init__.py | 2 + sedd/ubo/ubo.py | 31 ++++++++++++++ sedd/ubo/utils.py | 84 +++++++++++++++++++++++++++++++++++++ 10 files changed, 249 insertions(+), 29 deletions(-) create mode 100644 sedd/config/__init__.py rename sedd/{ => config}/config.py (52%) create mode 100644 sedd/config/defaults.py create mode 100644 sedd/config/typings.py create mode 100644 sedd/ubo/__init__.py create mode 100644 sedd/ubo/ubo.py create mode 100644 sedd/ubo/utils.py diff --git a/config.example.json b/config.example.json index 7f3778e..453f7bd 100644 --- a/config.example.json +++ b/config.example.json @@ -5,6 +5,38 @@ "provider": "string, any of: native,. Can be null, which disables notifications" }, "ubo": { - "download_url": "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi" + "download_url": "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi", + "settings": { + "userSettings": { + "advancedUserEnabled": true + }, + "hiddenSettings": {}, + "selectedFilterLists": [ + "user-filters", + "ublock-filters", + "ublock-badware", + "ublock-privacy", + "ublock-unbreak", + "ublock-quick-fixes", + "easylist", + "easyprivacy", + "urlhaus-1", + "plowe-0", + "fanboy-cookiemonster", + "ublock-cookies-easylist", + "adguard-cookies", + "ublock-cookies-adguard" + ], + "whitelist": ["chrome-extension-scheme", "moz-extension-scheme"], + "dynamicFilters": { + "toAdd": [], + "toRemove": [] + }, + "userFilters": { + "enabled": true, + "trusted": true, + "toOverwrite": [] + } + } } } diff --git a/sedd/config/__init__.py b/sedd/config/__init__.py new file mode 100644 index 0000000..a3f6f85 --- /dev/null +++ b/sedd/config/__init__.py @@ -0,0 +1,3 @@ +from .config import * +from .defaults import * +from .typings import * diff --git a/sedd/config.py b/sedd/config/config.py similarity index 52% rename from sedd/config.py rename to sedd/config/config.py index 051661f..aaee3c2 100644 --- a/sedd/config.py +++ b/sedd/config/config.py @@ -1,26 +1,10 @@ -from typing import TypedDict, Literal +from typing import Literal from json import load from os import path, getcwd - -class SEDDNotificationsConfig(TypedDict): - provider: Literal['native'] | None - - -class SEDDUboConfig(TypedDict): - download_url: str - - -default_ubo_url = "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi" - -default_notifications_config: SEDDNotificationsConfig = { - 'provider': None -} - -default_ubo_config: SEDDUboConfig = { - "download_url": default_ubo_url -} +from .defaults import default_ubo_url, default_ubo_settings, default_notifications_config, default_ubo_config +from .typings import SEDDNotificationsConfig, SEDDUboConfig, SEDDUboSettings class SEDDConfig: @@ -37,11 +21,15 @@ def __init__(self, email: str, pwd: str, notifications: SEDDNotificationsConfig, def get_notifications_provider(self) -> Literal['native'] | None: notifications_config = self.notifications - return notifications_config['provider'] if hasattr(notifications_config, 'provider') else None + return notifications_config['provider'] if 'provider' in notifications_config else None def get_ubo_download_url(self) -> str: ubo_config = self.ubo - return ubo_config["download_url"] if hasattr(ubo_config, 'download_url') else default_ubo_url + return ubo_config['download_url'] if 'download_url' in ubo_config else default_ubo_url + + def get_ubo_settings(self) -> SEDDUboSettings: + ubo_config = self.ubo + return ubo_config['settings'] if 'settings' in ubo_config else default_ubo_settings def load_sedd_config() -> SEDDConfig: @@ -55,10 +43,9 @@ def load_sedd_config() -> SEDDConfig: email = config["email"] password = config["password"] - notifications = config['notifications'] if hasattr( - config, 'notifications') else default_notifications_config + notifications = config['notifications'] if 'notifications' in config else default_notifications_config - ubo = config['ubo'] if hasattr(config, 'ubo') else default_ubo_config + ubo = config['ubo'] if 'ubo' in config else default_ubo_config config = SEDDConfig(email, password, notifications, ubo) diff --git a/sedd/config/defaults.py b/sedd/config/defaults.py new file mode 100644 index 0000000..1b279a8 --- /dev/null +++ b/sedd/config/defaults.py @@ -0,0 +1,44 @@ +from .typings import SEDDNotificationsConfig, SEDDUboSettings, SEDDUboConfig + +default_ubo_url = "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi" + +default_notifications_config: SEDDNotificationsConfig = { + 'provider': None +} + +default_ubo_settings: SEDDUboSettings = { + 'userSettings': { + 'advancedUserEnabled': True, + }, + "hiddenSettings": {}, + "selectedFilterLists": [ + "user-filters", + "ublock-filters", + "ublock-badware", + "ublock-privacy", + "ublock-unbreak", + "ublock-quick-fixes", + "easylist", + "easyprivacy", + "urlhaus-1", + "plowe-0" + ], + "whitelist": [ + "chrome-extension-scheme", + "moz-extension-scheme" + ], + "dynamicFilters": { + "toAdd": [], + "toRemove": [] + }, + "userFilters": { + "enabled": False, + "trusted": False, + "toOverwrite": [] + } +} + +default_ubo_config: SEDDUboConfig = { + 'download_url': default_ubo_url, + 'settings': default_ubo_settings +} diff --git a/sedd/config/typings.py b/sedd/config/typings.py new file mode 100644 index 0000000..4ba034c --- /dev/null +++ b/sedd/config/typings.py @@ -0,0 +1,25 @@ +from typing import TypedDict, Literal + + +class SEDDNotificationsConfig(TypedDict): + provider: Literal['native'] | None + + +class SEDDUboUserFiltersSettings(TypedDict): + enabled: bool + trusted: bool + toOverwrite: list[str] + + +class SEDDUboSettings(TypedDict): + userSettings: dict[str, str | bool | int] + hiddenSettings: dict[str, str | bool | int] + selectedFilterLists: list[str] + whitelist: list[str] + dynamicFilters: dict[Literal['toAdd'] | Literal['toRemove'], list[str]] + userFilters: SEDDUboUserFiltersSettings + + +class SEDDUboConfig(TypedDict): + download_url: str + settings: SEDDUboSettings | None diff --git a/sedd/driver.py b/sedd/driver.py index 359ab0b..167ab19 100644 --- a/sedd/driver.py +++ b/sedd/driver.py @@ -1,10 +1,13 @@ from os import path, makedirs from urllib import request +from json import dumps +from uuid import uuid4 from selenium import webdriver from selenium.webdriver.firefox.options import Options from .config import SEDDConfig +from .ubo import init_ubo_settings def init_output_dir(output_dir: str): @@ -23,7 +26,14 @@ def init_firefox_driver(config: SEDDConfig, output_dir: str): options.set_preference("browser.download.manager.showWhenStarting", False) options.set_preference("browser.download.dir", output_dir) options.set_preference( - "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip") + "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip" + ) + + # our own uuid for uBO so as we don't need to do the dance of inspecing internals + ubo_internal_uuid = f"{uuid4()}" + + options.set_preference("extensions.webextensions.uuids", dumps( + {"uBlock0@raymondhill.net": ubo_internal_uuid})) browser = webdriver.Firefox(options=options) @@ -35,4 +45,6 @@ def init_firefox_driver(config: SEDDConfig, output_dir: str): ubo_id = browser.install_addon("ubo.xpi", temporary=True) - return browser, ubo_id + ubo_status = init_ubo_settings(browser, config, ubo_internal_uuid) + + return browser, ubo_id, ubo_status diff --git a/sedd/main.py b/sedd/main.py index cd24d33..4cd2dd9 100644 --- a/sedd/main.py +++ b/sedd/main.py @@ -162,8 +162,6 @@ def _exec_download(browser: WebDriver): state, observer = register_pending_downloads_observer(args.output_dir) for site in sites.sites: - print(f"Extracting from {site}...") - if site not in ["https://meta.stackexchange.com", "https://stackapps.com"]: # https://regex101.com/r/kG6nTN/1 meta_url = re.sub( @@ -175,6 +173,8 @@ def _exec_download(browser: WebDriver): if args.skip_loaded and main_loaded and meta_loaded: pass else: + print(f"Extracting from {site}...") + login_or_create(browser, site) download_data_dump( browser, diff --git a/sedd/ubo/__init__.py b/sedd/ubo/__init__.py new file mode 100644 index 0000000..39575e8 --- /dev/null +++ b/sedd/ubo/__init__.py @@ -0,0 +1,2 @@ +from .ubo import * +from .utils import * \ No newline at end of file diff --git a/sedd/ubo/ubo.py b/sedd/ubo/ubo.py new file mode 100644 index 0000000..69d3aa0 --- /dev/null +++ b/sedd/ubo/ubo.py @@ -0,0 +1,31 @@ +from selenium.webdriver import Firefox +from sys import exc_info +from time import sleep + +from ..config import SEDDConfig +from .utils import ubo_set_user_settings, \ + ubo_set_advanced_settings, ubo_set_selected_filters, \ + ubo_set_whitelist, ubo_set_dynamic_rules, ubo_set_user_filters + + +def init_ubo_settings(browser: Firefox, config: SEDDConfig, ubo_id: str) -> bool: + try: + browser.get( + f'moz-extension://{ubo_id}/dashboard.html#settings.html' + ) + + settings = config.get_ubo_settings() + + ubo_set_user_settings(browser, settings) + ubo_set_advanced_settings(browser, settings) + + # idk why, but applyFilterListSelection only works after a delay + sleep(1) + + ubo_set_selected_filters(browser, settings) + ubo_set_whitelist(browser, settings) + ubo_set_dynamic_rules(browser, settings) + ubo_set_user_filters(browser, settings) + except: + print('Failed to set uBLock config, using defaults') + print(exc_info()) diff --git a/sedd/ubo/utils.py b/sedd/ubo/utils.py new file mode 100644 index 0000000..296cd4b --- /dev/null +++ b/sedd/ubo/utils.py @@ -0,0 +1,84 @@ +from selenium.webdriver import Firefox +from ..config.typings import SEDDUboSettings + + +def ubo_set_user_settings(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'userSettings' message to set uBO's user settings + # see: https://github.com/gorhill/uBlock/blob/master/src/js/settings.js#L215 + for key, val in settings['userSettings'].items(): + browser.execute_script(""" + const name = arguments[0]; + const value = arguments[1]; + + globalThis.vAPI.messaging.send('dashboard', { + what: 'userSettings', name, value, + }); + """, key, val) + + +def ubo_set_advanced_settings(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'writeHiddenSettings' message to set uBO's advanced settings + # see: https://github.com/gorhill/uBlock/blob/master/src/js/advanced-settings.js#L177 + browser.execute_script(""" + const settings = arguments[0]; + + const content = Object.entries(settings) + .map(([k,v]) => `${k} ${v}`) + .join('\\n'); + + globalThis.vAPI.messaging.send('dashboard', { + what: 'writeHiddenSettings', content, + }); + """, settings['hiddenSettings']) + + +def ubo_set_selected_filters(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'applyFilterListSelection' message to set uBO's selected filter lists + # see: https://github.com/gorhill/uBlock/blob/master/src/js/storage.js#L486 + browser.execute_script(""" + const toSelect = arguments[0]; + + globalThis.vAPI.messaging.send('dashboard', { + what: 'applyFilterListSelection', toSelect, + }) + """, settings['selectedFilterLists']) + + +def ubo_set_whitelist(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'setWhitelist' message to set uBO's trusted sites list + # see: https://github.com/gorhill/uBlock/blob/master/src/js/messaging.js#L225 + browser.execute_script(""" + const list = arguments[0]; + + globalThis.vAPI.messaging.send('dashboard', { + what: 'setWhitelist', whitelist: list.join('\\n') + }) + """, settings["whitelist"]) + + +def ubo_set_dynamic_rules(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'modifyRuleset' message to set uBO's dynamic rules + # see: https://github.com/gorhill/uBlock/blob/master/src/js/dyna-rules.js#L279 + browser.execute_script(""" + const { toAdd = [], toRemove = [] } = arguments[0] + + globalThis.vAPI.messaging.send('dashboard', { + what: 'modifyRuleset', permanent: true, + toAdd: toAdd.join('\\n'), + toRemove: toRemove.join('\\n'), + }) + + """, settings["dynamicFilters"]) + + +def ubo_set_user_filters(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'writeUserFilters' message to set uBO's user filters + # see: https://github.com/gorhill/uBlock/blob/master/src/js/storage.js#L582 + browser.execute_script(""" + const { trusted, enabled, toOverwrite = [] } = arguments[0] + + globalThis.vAPI.messaging.send('dashboard', { + what: 'writeUserFilters', trusted, enabled, + content: toOverwrite.join('\\n') + }) + """, settings['userFilters']) From 3859890efaf4b50f060397563ec1b5d18af8c7fb Mon Sep 17 00:00:00 2001 From: Oleg Valter Date: Mon, 26 Aug 2024 13:51:03 +0300 Subject: [PATCH 6/7] fixed args typings & added keep-consent CLI option --- README.md | 9 +++++---- sedd/cli.py | 16 +++++++++++++--- sedd/driver.py | 4 ++-- sedd/main.py | 6 +++++- 4 files changed, 25 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 5d46602..0c31455 100644 --- a/README.md +++ b/README.md @@ -92,11 +92,12 @@ The downloader does **not** support Docker due to the display requirement. Exractor CLI supports the following configuration options: -| Short | Long | Type | Default | Description | -| ----- | ---------------------- | -------- | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `-o` | `--outputDir ` | Optional | `/downloads` | Specifies the directory to download the archives to. | +| Short | Long | Type | Default | Description | +| ----- | ---------------------- | -------- | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `-o` | `--outputDir ` | Optional | `/downloads` | Specifies the directory to download the archives to. | +| `-k` | `--keep-consent` | Optional | `false` | Whether to keep OneTrust's consent dialog. If set, you are responsible for getting rid of it yourself (uBlock can handle that for you too). | | `-s` | `--skip-loaded ` | Optional | - | Whether to skip over archives that have already been downloaded. An archive is considered to be downloaded if the output directory has one already & the file is not empty. | -| - | `--dry-run` | Optional | - | Whether to actually download the archives. If set, only traverses the network's sites. | +| - | `--dry-run` | Optional | - | Whether to actually download the archives. If set, only traverses the network's sites. | #### Captchas and other misc. barriers diff --git a/sedd/cli.py b/sedd/cli.py index b5d53ac..66bd62e 100644 --- a/sedd/cli.py +++ b/sedd/cli.py @@ -3,11 +3,10 @@ from os import getcwd from os.path import join -from typing import TypedDict - -class SEDDCLIArgs(TypedDict): +class SEDDCLIArgs(argparse.Namespace): skip_loaded: bool + keep_consent: bool output_dir: str dry_run: bool @@ -16,6 +15,7 @@ class SEDDCLIArgs(TypedDict): prog="sedd", description="Automatic (unofficial) SE data dump downloader for the anti-community data dump format", ) + parser.add_argument( "-s", "--skip-loaded", required=False, @@ -23,12 +23,22 @@ class SEDDCLIArgs(TypedDict): action="store_true", dest="skip_loaded" ) + +parser.add_argument( + "-k", "--keep-consent", + required=False, + dest="keep_consent", + action="store_true", + default=False +) + parser.add_argument( "-o", "--outputDir", required=False, dest="output_dir", default=join(getcwd(), "downloads") ) + parser.add_argument( "--dry-run", required=False, diff --git a/sedd/driver.py b/sedd/driver.py index 167ab19..4621a30 100644 --- a/sedd/driver.py +++ b/sedd/driver.py @@ -45,6 +45,6 @@ def init_firefox_driver(config: SEDDConfig, output_dir: str): ubo_id = browser.install_addon("ubo.xpi", temporary=True) - ubo_status = init_ubo_settings(browser, config, ubo_internal_uuid) + init_ubo_settings(browser, config, ubo_internal_uuid) - return browser, ubo_id, ubo_status + return browser, ubo_id diff --git a/sedd/main.py b/sedd/main.py index 4cd2dd9..9fdcbcc 100644 --- a/sedd/main.py +++ b/sedd/main.py @@ -90,7 +90,11 @@ def download_data_dump(browser: WebDriver, site: str, meta_url: str, etags: Dict print(f"Downloading data dump from {site}") def _exec_download(browser: WebDriver): - kill_cookie_shit(browser) + if args.keep_consent: + print('Consent dialog will not be auto-removed') + else: + kill_cookie_shit(browser) + try: checkbox = browser.find_element(By.ID, "datadump-agree-checkbox") btn = browser.find_element(By.ID, "datadump-download-button") From 4c507bbd17de94858953ec7c619539a7fe044179 Mon Sep 17 00:00:00 2001 From: Oleg Valter Date: Mon, 26 Aug 2024 14:18:28 +0300 Subject: [PATCH 7/7] filters must be reloaded after setting, or there's a chance they won't be ready before Selenium navigates --- sedd/ubo/ubo.py | 5 ++++- sedd/ubo/utils.py | 10 ++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/sedd/ubo/ubo.py b/sedd/ubo/ubo.py index 69d3aa0..a889b3f 100644 --- a/sedd/ubo/ubo.py +++ b/sedd/ubo/ubo.py @@ -5,7 +5,8 @@ from ..config import SEDDConfig from .utils import ubo_set_user_settings, \ ubo_set_advanced_settings, ubo_set_selected_filters, \ - ubo_set_whitelist, ubo_set_dynamic_rules, ubo_set_user_filters + ubo_set_whitelist, ubo_set_dynamic_rules, ubo_set_user_filters, \ + ubo_reload_all_filters def init_ubo_settings(browser: Firefox, config: SEDDConfig, ubo_id: str) -> bool: @@ -26,6 +27,8 @@ def init_ubo_settings(browser: Firefox, config: SEDDConfig, ubo_id: str) -> bool ubo_set_whitelist(browser, settings) ubo_set_dynamic_rules(browser, settings) ubo_set_user_filters(browser, settings) + + ubo_reload_all_filters(browser) except: print('Failed to set uBLock config, using defaults') print(exc_info()) diff --git a/sedd/ubo/utils.py b/sedd/ubo/utils.py index 296cd4b..7bcf73f 100644 --- a/sedd/ubo/utils.py +++ b/sedd/ubo/utils.py @@ -82,3 +82,13 @@ def ubo_set_user_filters(browser: Firefox, settings: SEDDUboSettings) -> None: content: toOverwrite.join('\\n') }) """, settings['userFilters']) + + +def ubo_reload_all_filters(browser: Firefox) -> None: + browser.execute_async_script(""" + const done = arguments[0] + + globalThis.vAPI.messaging.send('dashboard', { + what: 'reloadAllFilters', + }).then(done) + """)