diff --git a/.gitignore b/.gitignore index c365219..54dd676 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ -/config.json +# configuration files +*config.json + compile_commands.json *.xpi /stackexchange_*/ diff --git a/README.md b/README.md index 5d46602..0c31455 100644 --- a/README.md +++ b/README.md @@ -92,11 +92,12 @@ The downloader does **not** support Docker due to the display requirement. Exractor CLI supports the following configuration options: -| Short | Long | Type | Default | Description | -| ----- | ---------------------- | -------- | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `-o` | `--outputDir ` | Optional | `/downloads` | Specifies the directory to download the archives to. | +| Short | Long | Type | Default | Description | +| ----- | ---------------------- | -------- | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `-o` | `--outputDir ` | Optional | `/downloads` | Specifies the directory to download the archives to. | +| `-k` | `--keep-consent` | Optional | `false` | Whether to keep OneTrust's consent dialog. If set, you are responsible for getting rid of it yourself (uBlock can handle that for you too). | | `-s` | `--skip-loaded ` | Optional | - | Whether to skip over archives that have already been downloaded. An archive is considered to be downloaded if the output directory has one already & the file is not empty. | -| - | `--dry-run` | Optional | - | Whether to actually download the archives. If set, only traverses the network's sites. | +| - | `--dry-run` | Optional | - | Whether to actually download the archives. If set, only traverses the network's sites. | #### Captchas and other misc. 
barriers diff --git a/config.example.json b/config.example.json index 2b55ffa..453f7bd 100644 --- a/config.example.json +++ b/config.example.json @@ -1,7 +1,42 @@ { - "email": "The email you use to log into SE", - "password": "The password you use to log into SE", - "notifications": { - "provider": "string, any of: native,. Can be null, which disables notifications" + "email": "The email you use to log into SE", + "password": "The password you use to log into SE", + "notifications": { + "provider": "string, any of: native,. Can be null, which disables notifications" + }, + "ubo": { + "download_url": "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi", + "settings": { + "userSettings": { + "advancedUserEnabled": true + }, + "hiddenSettings": {}, + "selectedFilterLists": [ + "user-filters", + "ublock-filters", + "ublock-badware", + "ublock-privacy", + "ublock-unbreak", + "ublock-quick-fixes", + "easylist", + "easyprivacy", + "urlhaus-1", + "plowe-0", + "fanboy-cookiemonster", + "ublock-cookies-easylist", + "adguard-cookies", + "ublock-cookies-adguard" + ], + "whitelist": ["chrome-extension-scheme", "moz-extension-scheme"], + "dynamicFilters": { + "toAdd": [], + "toRemove": [] + }, + "userFilters": { + "enabled": true, + "trusted": true, + "toOverwrite": [] + } } + } } diff --git a/sedd/cli.py b/sedd/cli.py new file mode 100644 index 0000000..66bd62e --- /dev/null +++ b/sedd/cli.py @@ -0,0 +1,52 @@ +import argparse + +from os import getcwd +from os.path import join + + +class SEDDCLIArgs(argparse.Namespace): + skip_loaded: bool + keep_consent: bool + output_dir: str + dry_run: bool + + +parser = argparse.ArgumentParser( + prog="sedd", + description="Automatic (unofficial) SE data dump downloader for the anti-community data dump format", +) + +parser.add_argument( + "-s", "--skip-loaded", + required=False, + default=False, + action="store_true", + dest="skip_loaded" +) + +parser.add_argument( + "-k", "--keep-consent", + 
required=False, + dest="keep_consent", + action="store_true", + default=False +) + +parser.add_argument( + "-o", "--outputDir", + required=False, + dest="output_dir", + default=join(getcwd(), "downloads") +) + +parser.add_argument( + "--dry-run", + required=False, + default=False, + action="store_true", + dest="dry_run" +) + + +def parse_cli_args() -> SEDDCLIArgs: + return parser.parse_args(namespace=SEDDCLIArgs()) diff --git a/sedd/config/__init__.py b/sedd/config/__init__.py new file mode 100644 index 0000000..a3f6f85 --- /dev/null +++ b/sedd/config/__init__.py @@ -0,0 +1,3 @@ +from .config import * +from .defaults import * +from .typings import * diff --git a/sedd/config/config.py b/sedd/config/config.py new file mode 100644 index 0000000..aaee3c2 --- /dev/null +++ b/sedd/config/config.py @@ -0,0 +1,52 @@ +from typing import Literal + +from json import load +from os import path, getcwd + +from .defaults import default_ubo_url, default_ubo_settings, default_notifications_config, default_ubo_config +from .typings import SEDDNotificationsConfig, SEDDUboConfig, SEDDUboSettings + + +class SEDDConfig: + email: str + password: str + notifications: SEDDNotificationsConfig + ubo: SEDDUboConfig + + def __init__(self, email: str, pwd: str, notifications: SEDDNotificationsConfig, ubo: SEDDUboConfig): + self.email = email + self.password = pwd + self.notifications = notifications + self.ubo = ubo + + def get_notifications_provider(self) -> Literal['native'] | None: + notifications_config = self.notifications or {} + return notifications_config.get('provider') + + def get_ubo_download_url(self) -> str: + ubo_config = self.ubo or {} + return ubo_config.get('download_url') or default_ubo_url + + def get_ubo_settings(self) -> SEDDUboSettings: + ubo_config = self.ubo or {} + return ubo_config.get('settings') or default_ubo_settings + + +def load_sedd_config() -> SEDDConfig: + config_path = path.join(getcwd(), 
'config.json') + + raw_config: dict = {} + + with open(config_path, "r", encoding="utf-8") as f: + raw_config = load(f) + + email = raw_config["email"] + password = raw_config["password"] + + notifications = raw_config.get('notifications') or default_notifications_config + + ubo = raw_config.get('ubo') or default_ubo_config + + config = SEDDConfig(email, password, notifications, ubo) + + return config diff --git a/sedd/config/defaults.py b/sedd/config/defaults.py new file mode 100644 index 0000000..1b279a8 --- /dev/null +++ b/sedd/config/defaults.py @@ -0,0 +1,44 @@ +from .typings import SEDDNotificationsConfig, SEDDUboSettings, SEDDUboConfig + +default_ubo_url = "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi" + +default_notifications_config: SEDDNotificationsConfig = { + 'provider': None +} + +default_ubo_settings: SEDDUboSettings = { + 'userSettings': { + 'advancedUserEnabled': True, + }, + "hiddenSettings": {}, + "selectedFilterLists": [ + "user-filters", + "ublock-filters", + "ublock-badware", + "ublock-privacy", + "ublock-unbreak", + "ublock-quick-fixes", + "easylist", + "easyprivacy", + "urlhaus-1", + "plowe-0" + ], + "whitelist": [ + "chrome-extension-scheme", + "moz-extension-scheme" + ], + "dynamicFilters": { + "toAdd": [], + "toRemove": [] + }, + "userFilters": { + "enabled": False, + "trusted": False, + "toOverwrite": [] + } +} + +default_ubo_config: SEDDUboConfig = { + 'download_url': default_ubo_url, + 'settings': default_ubo_settings +} diff --git a/sedd/config/typings.py b/sedd/config/typings.py new file mode 100644 index 0000000..4ba034c --- /dev/null +++ b/sedd/config/typings.py @@ -0,0 +1,25 @@ +from typing import TypedDict, Literal + + +class SEDDNotificationsConfig(TypedDict): + provider: Literal['native'] | None + + +class SEDDUboUserFiltersSettings(TypedDict): + enabled: bool + trusted: bool + toOverwrite: list[str] + + +class SEDDUboSettings(TypedDict): + userSettings: dict[str, str | 
bool | int] + hiddenSettings: dict[str, str | bool | int] + selectedFilterLists: list[str] + whitelist: list[str] + dynamicFilters: dict[Literal['toAdd'] | Literal['toRemove'], list[str]] + userFilters: SEDDUboUserFiltersSettings + + +class SEDDUboConfig(TypedDict): + download_url: str + settings: SEDDUboSettings | None diff --git a/sedd/driver.py b/sedd/driver.py new file mode 100644 index 0000000..4621a30 --- /dev/null +++ b/sedd/driver.py @@ -0,0 +1,50 @@ +from os import path, makedirs +from urllib import request +from json import dumps +from uuid import uuid4 + +from selenium import webdriver +from selenium.webdriver.firefox.options import Options + +from .config import SEDDConfig +from .ubo import init_ubo_settings + + +def init_output_dir(output_dir: str): + # exist_ok makes creation idempotent and avoids a check-then-create race + makedirs(output_dir, exist_ok=True) + + print(output_dir) + + return output_dir + + +def init_firefox_driver(config: SEDDConfig, output_dir: str): + options = Options() + options.enable_downloads = True + options.set_preference("browser.download.folderList", 2) + options.set_preference("browser.download.manager.showWhenStarting", False) + options.set_preference("browser.download.dir", output_dir) + options.set_preference( + "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip" + ) + + # our own uuid for uBO so that we don't need to do the dance of inspecting internals + ubo_internal_uuid = f"{uuid4()}" + + options.set_preference("extensions.webextensions.uuids", dumps( + {"uBlock0@raymondhill.net": ubo_internal_uuid})) + + browser = webdriver.Firefox(options=options) + + ubo_download_url = config.get_ubo_download_url() + + if not path.exists("ubo.xpi"): + print(f"Downloading uBO from: {ubo_download_url}") + request.urlretrieve(ubo_download_url, "ubo.xpi") + + ubo_id = browser.install_addon("ubo.xpi", temporary=True) + + init_ubo_settings(browser, config, ubo_internal_uuid) + + return browser, ubo_id diff --git a/sedd/main.py b/sedd/main.py index 3a5748c..9fdcbcc 100644 --- 
a/sedd/main.py +++ b/sedd/main.py @@ -1,91 +1,32 @@ -from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.firefox.webdriver import WebDriver -from selenium.webdriver.firefox.options import Options from selenium.common.exceptions import NoSuchElementException from typing import Dict -from .watcher.observer import register_pending_downloads_observer, Observer -from sedd.data import sites from time import sleep -import json -import urllib.request -from .meta import notifications import re -import os import sys from traceback import print_exception -import argparse + +from .cli import parse_cli_args +from .config import load_sedd_config +from .data import sites +from .meta import notifications +from .watcher.observer import register_pending_downloads_observer from . import utils -parser = argparse.ArgumentParser( - prog="sedd", - description="Automatic (unofficial) SE data dump downloader for the anti-community data dump format", -) -parser.add_argument( - "-s", "--skip-loaded", - required=False, - default=False, - action="store_true", - dest="skip_loaded" -) -parser.add_argument( - "-o", "--outputDir", - required=False, - dest="output_dir", - default=os.path.join(os.getcwd(), "downloads") -) -parser.add_argument( - "--dry-run", - required=False, - default=False, - action="store_true", - dest="dry_run" -) - -args = parser.parse_args() - - -def get_download_dir(): - download_dir = args.output_dir - - if not os.path.exists(download_dir): - os.makedirs(download_dir) - - print(download_dir) - - return download_dir - - -options = Options() -options.enable_downloads = True -options.set_preference("browser.download.folderList", 2) -options.set_preference("browser.download.manager.showWhenStarting", False) -options.set_preference("browser.download.dir", get_download_dir()) -options.set_preference( - "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip") - -browser = webdriver.Firefox( - options=options -) -if not 
os.path.exists("ubo.xpi"): - print("Downloading uBO") - urllib.request.urlretrieve( - "https://github.com/gorhill/uBlock/releases/download/1.59.0/uBlock0_1.59.0.firefox.signed.xpi", - "ubo.xpi" - ) - - -ubo_id = browser.install_addon("ubo.xpi", temporary=True) - -with open("config.json", "r") as f: - config = json.load(f) - -email = config["email"] -password = config["password"] +from .driver import init_output_dir, init_firefox_driver + +args = parse_cli_args() + +sedd_config = load_sedd_config() + +output_dir = init_output_dir(args.output_dir) + +browser, ubo_id = init_firefox_driver(sedd_config, output_dir) def kill_cookie_shit(browser: WebDriver): @@ -117,8 +58,8 @@ def login_or_create(browser: WebDriver, site: str): email_elem = browser.find_element(By.ID, "email") password_elem = browser.find_element(By.ID, "password") - email_elem.send_keys(email) - password_elem.send_keys(password) + email_elem.send_keys(sedd_config.email) + password_elem.send_keys(sedd_config.password) curr_url = browser.current_url browser.find_element(By.ID, "submit-button").click() @@ -130,7 +71,10 @@ def login_or_create(browser: WebDriver, site: str): if not captcha_walled: captcha_walled = True - notifications.notify("Captcha wall hit during login", config) + notifications.notify( + "Captcha wall hit during login", sedd_config + ) + sleep(10) if captcha_walled: @@ -146,7 +90,11 @@ def download_data_dump(browser: WebDriver, site: str, meta_url: str, etags: Dict print(f"Downloading data dump from {site}") def _exec_download(browser: WebDriver): - kill_cookie_shit(browser) + if args.keep_consent: + print('Consent dialog will not be auto-removed') + else: + kill_cookie_shit(browser) + try: checkbox = browser.find_element(By.ID, "datadump-agree-checkbox") btn = browser.find_element(By.ID, "datadump-download-button") @@ -218,8 +166,6 @@ def _exec_download(browser: WebDriver): state, observer = register_pending_downloads_observer(args.output_dir) for site in sites.sites: - print(f"Extracting 
from {site}...") - if site not in ["https://meta.stackexchange.com", "https://stackapps.com"]: # https://regex101.com/r/kG6nTN/1 meta_url = re.sub( @@ -231,6 +177,8 @@ def _exec_download(browser: WebDriver): if args.skip_loaded and main_loaded and meta_loaded: pass else: + print(f"Extracting from {site}...") + login_or_create(browser, site) download_data_dump( browser, diff --git a/sedd/meta/notifications.py b/sedd/meta/notifications.py index 64d0527..6dddcaf 100644 --- a/sedd/meta/notifications.py +++ b/sedd/meta/notifications.py @@ -1,24 +1,29 @@ from desktop_notifier import DesktopNotifier import asyncio +from ..config import SEDDConfig + + def native(message: str, _): asyncio.run( DesktopNotifier().send( - title = "The data dump downloader needs attention", - message = f"{message}" + title="The data dump downloader needs attention", + message=f"{message}" ) ) + notification_providers = { "native": native } -def notify(message: str, config): - provider = config["notifications"]["provider"] + +def notify(message: str, config: SEDDConfig): + + provider = config.get_notifications_provider() + if provider is None: print(message) return - notification_providers[provider](message, config["notifications"]) + notification_providers[provider](message, config.notifications) - - diff --git a/sedd/ubo/__init__.py b/sedd/ubo/__init__.py new file mode 100644 index 0000000..39575e8 --- /dev/null +++ b/sedd/ubo/__init__.py @@ -0,0 +1,2 @@ +from .ubo import * +from .utils import * \ No newline at end of file diff --git a/sedd/ubo/ubo.py b/sedd/ubo/ubo.py new file mode 100644 index 0000000..a889b3f --- /dev/null +++ b/sedd/ubo/ubo.py @@ -0,0 +1,34 @@ +from selenium.webdriver import Firefox +from sys import exc_info +from time import sleep + +from ..config import SEDDConfig +from .utils import ubo_set_user_settings, \ + ubo_set_advanced_settings, ubo_set_selected_filters, \ + ubo_set_whitelist, ubo_set_dynamic_rules, ubo_set_user_filters, \ + ubo_reload_all_filters + + +def init_ubo_settings(browser: Firefox, config: SEDDConfig, ubo_id: str) -> 
None: + try: + browser.get( + f'moz-extension://{ubo_id}/dashboard.html#settings.html' + ) + + settings = config.get_ubo_settings() + + ubo_set_user_settings(browser, settings) + ubo_set_advanced_settings(browser, settings) + + # applyFilterListSelection only takes effect after a short delay (reason unknown) + sleep(1) + + ubo_set_selected_filters(browser, settings) + ubo_set_whitelist(browser, settings) + ubo_set_dynamic_rules(browser, settings) + ubo_set_user_filters(browser, settings) + + ubo_reload_all_filters(browser) + except Exception: + print('Failed to set uBLock config, using defaults') + print(exc_info()) diff --git a/sedd/ubo/utils.py b/sedd/ubo/utils.py new file mode 100644 index 0000000..7bcf73f --- /dev/null +++ b/sedd/ubo/utils.py @@ -0,0 +1,94 @@ +from selenium.webdriver import Firefox +from ..config.typings import SEDDUboSettings + + +def ubo_set_user_settings(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'userSettings' message to set uBO's user settings + # see: https://github.com/gorhill/uBlock/blob/master/src/js/settings.js#L215 + for key, val in settings['userSettings'].items(): + browser.execute_script(""" + const name = arguments[0]; + const value = arguments[1]; + + globalThis.vAPI.messaging.send('dashboard', { + what: 'userSettings', name, value, + }); + """, key, val) + + +def ubo_set_advanced_settings(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'writeHiddenSettings' message to set uBO's advanced settings + # see: https://github.com/gorhill/uBlock/blob/master/src/js/advanced-settings.js#L177 + browser.execute_script(""" + const settings = arguments[0]; + + const content = Object.entries(settings) + .map(([k,v]) => `${k} ${v}`) + .join('\\n'); + + globalThis.vAPI.messaging.send('dashboard', { + what: 'writeHiddenSettings', content, + }); + """, settings['hiddenSettings']) + + +def ubo_set_selected_filters(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'applyFilterListSelection' message to set 
uBO's selected filter lists + # see: https://github.com/gorhill/uBlock/blob/master/src/js/storage.js#L486 + browser.execute_script(""" + const toSelect = arguments[0]; + + globalThis.vAPI.messaging.send('dashboard', { + what: 'applyFilterListSelection', toSelect, + }) + """, settings['selectedFilterLists']) + + +def ubo_set_whitelist(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'setWhitelist' message to set uBO's trusted sites list + # see: https://github.com/gorhill/uBlock/blob/master/src/js/messaging.js#L225 + browser.execute_script(""" + const list = arguments[0]; + + globalThis.vAPI.messaging.send('dashboard', { + what: 'setWhitelist', whitelist: list.join('\\n') + }) + """, settings["whitelist"]) + + +def ubo_set_dynamic_rules(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'modifyRuleset' message to set uBO's dynamic rules + # see: https://github.com/gorhill/uBlock/blob/master/src/js/dyna-rules.js#L279 + browser.execute_script(""" + const { toAdd = [], toRemove = [] } = arguments[0] + + globalThis.vAPI.messaging.send('dashboard', { + what: 'modifyRuleset', permanent: true, + toAdd: toAdd.join('\\n'), + toRemove: toRemove.join('\\n'), + }) + + """, settings["dynamicFilters"]) + + +def ubo_set_user_filters(browser: Firefox, settings: SEDDUboSettings) -> None: + # using vAPI 'writeUserFilters' message to set uBO's user filters + # see: https://github.com/gorhill/uBlock/blob/master/src/js/storage.js#L582 + browser.execute_script(""" + const { trusted, enabled, toOverwrite = [] } = arguments[0] + + globalThis.vAPI.messaging.send('dashboard', { + what: 'writeUserFilters', trusted, enabled, + content: toOverwrite.join('\\n') + }) + """, settings['userFilters']) + + +def ubo_reload_all_filters(browser: Firefox) -> None: + browser.execute_async_script(""" + const done = arguments[0] + + globalThis.vAPI.messaging.send('dashboard', { + what: 'reloadAllFilters', + }).then(done) + """)