Commit 164b06f

Merge pull request #8 from Oaphi/caching

Option for skipping download of files that are already on disk

LunarWatcher authored Aug 24, 2024
2 parents 66f1938 + 7d974f2
Showing 9 changed files with 329 additions and 75 deletions.
113 changes: 64 additions & 49 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions requirements.txt
@@ -1,2 +1,3 @@
 selenium==4.23.1
 desktop-notifier==5.0.1
+watchdog==4.0.2
24 changes: 24 additions & 0 deletions sedd/data/files_map.py
@@ -0,0 +1,24 @@
# For most sites, dump names correspond to the domain name.
# However, due to some subdomain shenanigans, a couple of sites differ:
files_map: dict[str, str] = {
    'alcohol.stackexchange.com': 'beer.stackexchange.com',
    'alcohol.meta.stackexchange.com': 'beer.meta.stackexchange.com',
    'mattermodeling.stackexchange.com': 'materials.stackexchange.com',
    'mattermodeling.meta.stackexchange.com': 'materials.meta.stackexchange.com',
    'communitybuilding.stackexchange.com': 'moderators.stackexchange.com',
    'communitybuilding.meta.stackexchange.com': 'moderators.meta.stackexchange.com',
    'medicalsciences.stackexchange.com': 'health.stackexchange.com',
    'medicalsciences.meta.stackexchange.com': 'health.meta.stackexchange.com',
    'psychology.stackexchange.com': 'cogsci.stackexchange.com',
    'psychology.meta.stackexchange.com': 'cogsci.meta.stackexchange.com',
    'writing.stackexchange.com': 'writers.stackexchange.com',
    'writing.meta.stackexchange.com': 'writers.meta.stackexchange.com',
    'video.stackexchange.com': 'avp.stackexchange.com',
    'video.meta.stackexchange.com': 'avp.meta.stackexchange.com',
    'meta.es.stackoverflow.com': 'es.meta.stackoverflow.com',
    'meta.ja.stackoverflow.com': 'ja.meta.stackoverflow.com',
    'meta.pt.stackoverflow.com': 'pt.meta.stackoverflow.com',
    'meta.ru.stackoverflow.com': 'ru.meta.stackoverflow.com',
}

inverse_files_map: dict[str, str] = {v: k for k, v in files_map.items()}
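
As a quick illustration of how this map is consumed (the real lookup helper, get_file_name, lands in sedd/utils.py below): renamed sites resolve through the map, everything else falls back to its own domain. The helper name dump_file_for here is hypothetical, not part of the commit:

from sedd.data.files_map import files_map

# Hypothetical mirror of the fallback in utils.get_file_name:
def dump_file_for(domain: str) -> str:
    return f"{files_map.get(domain, domain)}.7z"

print(dump_file_for('psychology.stackexchange.com'))  # cogsci.stackexchange.com.7z
print(dump_file_for('scifi.stackexchange.com'))       # scifi.stackexchange.com.7z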
126 changes: 100 additions & 26 deletions sedd/main.py
@@ -5,6 +5,8 @@
 from selenium.common.exceptions import NoSuchElementException
 from typing import Dict
 
+from .watcher.observer import register_pending_downloads_observer, Observer
+
 from sedd.data import sites
 from time import sleep
 import json
@@ -13,6 +15,8 @@
 from .meta import notifications
 import re
 import os
+import sys
+from traceback import print_exception
 
 import argparse
 from . import utils
@@ -21,7 +25,13 @@
     prog="sedd",
     description="Automatic (unofficial) SE data dump downloader for the anti-community data dump format",
 )
-
+parser.add_argument(
+    "-s", "--skip-loaded",
+    required=False,
+    default=False,
+    action="store_true",
+    dest="skip_loaded"
+)
 parser.add_argument(
     "-o", "--outputDir",
     required=False,
@@ -38,6 +48,7 @@
 
 args = parser.parse_args()
 
+
 def get_download_dir():
     download_dir = args.output_dir
 
@@ -48,15 +59,17 @@ def get_download_dir():
 
     return download_dir
 
+
 options = Options()
 options.enable_downloads = True
 options.set_preference("browser.download.folderList", 2)
 options.set_preference("browser.download.manager.showWhenStarting", False)
 options.set_preference("browser.download.dir", get_download_dir())
-options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip")
+options.set_preference(
+    "browser.helperApps.neverAsk.saveToDisk", "application/x-gzip")
 
 browser = webdriver.Firefox(
-    options = options
+    options=options
 )
 if not os.path.exists("ubo.xpi"):
     print("Downloading uBO")
@@ -74,18 +87,22 @@ def get_download_dir():
 email = config["email"]
 password = config["password"]
 
+
 def kill_cookie_shit(browser: WebDriver):
     sleep(3)
-    browser.execute_script("""let elem = document.getElementById("onetrust-banner-sdk"); if (elem) { elem.parentNode.removeChild(elem); }""")
+    browser.execute_script(
+        """let elem = document.getElementById("onetrust-banner-sdk"); if (elem) { elem.parentNode.removeChild(elem); }""")
     sleep(1)
 
+
 def is_logged_in(browser: WebDriver, site: str):
     url = f"{site}/users/current"
     browser.get(url)
     sleep(1)
 
     return "/users/" in browser.current_url
 
+
 def login_or_create(browser: WebDriver, site: str):
     if is_logged_in(browser, site):
         print("Already logged in")
@@ -125,7 +142,7 @@ def login_or_create(browser: WebDriver, site: str):
             break
 
 
-def download_data_dump(browser: WebDriver, site: str, etags: Dict[str, str]):
+def download_data_dump(browser: WebDriver, site: str, meta_url: str, etags: Dict[str, str]):
     print(f"Downloading data dump from {site}")
 
     def _exec_download(browser: WebDriver):
@@ -168,32 +185,89 @@ def _exec_download(browser):
         url = browser.execute_script("return window.extractedUrl;")
         utils.extract_etag(url, etags)
 
-        sleep(5);
+        sleep(5)
 
-    browser.get(f"{site}/users/data-dump-access/current")
-
-    _exec_download(browser)
-
-    if site not in ["https://meta.stackexchange.com", "https://stackapps.com"]:
-        # https://regex101.com/r/kG6nTN/1
-        meta_url = re.sub(r"(https://(?:[^.]+\.(?=stackexchange))?)", r"\1meta.", site)
-        print(meta_url)
-        browser.get(f"{meta_url}/users/data-dump-access/current")
-        _exec_download(browser)
+    main_loaded = utils.is_file_downloaded(args.output_dir, site)
+    meta_loaded = utils.is_file_downloaded(args.output_dir, meta_url)
+
+    if not args.skip_loaded or not main_loaded or not meta_loaded:
+        if args.skip_loaded and main_loaded:
+            pass
+        else:
+            browser.get(f"{site}/users/data-dump-access/current")
+
+            if not args.dry_run:
+                utils.archive_file(args.output_dir, site)
+
+            _exec_download(browser)
+
+        if args.skip_loaded and meta_loaded:
+            pass
+        else:
+            browser.get(f"{meta_url}/users/data-dump-access/current")
+
+            if not args.dry_run:
+                utils.archive_file(args.output_dir, meta_url)
+
+            _exec_download(browser)
+
 
 etags: Dict[str, str] = {}
 
-for site in sites.sites:
-    print(f"Extracting from {site}...")
-
-    login_or_create(browser, site)
-    download_data_dump(
-        browser,
-        site,
-        etags
-    )
-
-    # TODO: replace with validation once downloading is verified done
-    # (or export for separate, later verification)
-    # Though keeping it here, removing files and re-running downloads feels like a better idea
-    print(etags)
+try:
+    state, observer = register_pending_downloads_observer(args.output_dir)
+
+    for site in sites.sites:
+        print(f"Extracting from {site}...")
+
+        if site not in ["https://meta.stackexchange.com", "https://stackapps.com"]:
+            # https://regex101.com/r/kG6nTN/1
+            meta_url = re.sub(
+                r"(https://(?:[^.]+\.(?=stackexchange))?)", r"\1meta.", site)
+
+        main_loaded = utils.is_file_downloaded(args.output_dir, site)
+        meta_loaded = utils.is_file_downloaded(args.output_dir, meta_url)
+
+        if args.skip_loaded and main_loaded and meta_loaded:
+            pass
+        else:
+            login_or_create(browser, site)
+            download_data_dump(
+                browser,
+                site,
+                meta_url,
+                etags
+            )
+
+    if observer:
+        pending = state.size()
+
+        print(f"Waiting for {pending} download{'s'[:pending^1]} to complete")
+
+        while True:
+            if state.empty():
+                observer.stop()
+                browser.quit()
+
+                utils.cleanup_archive(args.output_dir)
+                break
+            else:
+                sleep(1)
+
+except KeyboardInterrupt:
+    pass
+
+except:
+    exception = sys.exc_info()
+
+    try:
+        print_exception(exception)
+    except:
+        print(exception)
+
+    browser.quit()
finally:
+    # TODO: replace with validation once downloading is verified done
+    # (or export for separate, later verification)
+    # Though keeping it here, removing files and re-running downloads feels like a better idea
+    print(etags)
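
One line in the new wait loop is easy to misread: 's'[:pending^1]. The XOR flips the low bit, so pending ^ 1 is 0 exactly when pending == 1 (empty slice, singular "download") and nonzero otherwise (slice yields "s", plural). A quick standalone check, not part of the commit:

# Pluralisation trick from the wait loop: 'pending ^ 1' is 0 only when
# pending == 1, and Python slices clamp, so 's'[:0] == '' and 's'[:n] == 's' for n > 0.
for pending in (0, 1, 2, 5):
    print(f"{pending} download{'s'[:pending^1]}")
# 0 downloads / 1 download / 2 downloads / 5 downloads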
62 changes: 62 additions & 0 deletions sedd/utils.py
@@ -2,6 +2,12 @@
 import requests as r
 from urllib.parse import urlparse
 import os.path
+import re
+import sys
+
+from .data.files_map import files_map, inverse_files_map
+from .data.sites import sites
+
 
 def extract_etag(url: str, etags: Dict[str, str]):
     res = r.get(
@@ -21,3 +27,59 @@ def extract_etag(url: str, etags: Dict[str, str]):
     etags[filename] = etag
 
     print(f"ETag for {filename}: {etag}")
+
+
+def get_file_name(site_or_url: str) -> str:
+    domain = re.sub(r'https://', '', site_or_url)
+
+    try:
+        file_name = files_map[domain]
+        return f'{file_name}.7z'
+    except KeyError:
+        return f'{domain}.7z'
+
+
+def is_dump_file(file_name: str) -> bool:
+    file_name = re.sub(r'\.7z$', '', file_name)
+
+    try:
+        inverse_files_map[file_name]
+    except KeyError:
+        origin = f'https://{file_name}'
+        return origin in sites
+
+    return True
+
+
+def check_file(base_path: str, file_name: str) -> bool:
+    try:
+        res = os.stat(os.path.join(base_path, file_name))
+        return res.st_size > 0
+    except FileNotFoundError:
+        return False
+
+
+def archive_file(base_path: str, site_or_url: str) -> None:
+    try:
+        file_name = get_file_name(site_or_url)
+        file_path = os.path.join(base_path, file_name)
+        os.rename(file_path, f"{file_path}.old")
+    except FileNotFoundError:
+        pass
+
+
+def cleanup_archive(base_path: str) -> None:
+    try:
+        file_entries = os.listdir(base_path)
+
+        for entry in file_entries:
+            if entry.endswith('.old'):
+                entry_path = os.path.join(base_path, entry)
+                os.remove(entry_path)
+    except:
+        print(sys.exc_info())
+
+
+def is_file_downloaded(base_path: str, site_or_url: str) -> bool:
+    file_name = get_file_name(site_or_url)
+    return check_file(base_path, file_name)
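
Taken together, these helpers implement the caching cycle: is_file_downloaded decides whether a dump can be skipped, archive_file parks the previous dump as a .old file before a re-download, and cleanup_archive deletes the parked copies once all downloads finish. A minimal sketch of the flow, with a hypothetical output directory (not part of the commit):

from sedd import utils

output_dir = "downloads"          # hypothetical output directory
site = "https://writing.stackexchange.com"

# Resolves through files_map: writing.* is stored as writers.stackexchange.com.7z
if utils.is_file_downloaded(output_dir, site):
    print("dump already on disk; --skip-loaded would skip it")
else:
    utils.archive_file(output_dir, site)   # rename old dump to *.7z.old, if any
    # ... download the new dump here ...
    utils.cleanup_archive(output_dir)      # drop *.old files once downloads settle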
Empty file added sedd/watcher/__init__.py
Empty file.
40 changes: 40 additions & 0 deletions sedd/watcher/handler.py
@@ -0,0 +1,40 @@
import os

from watchdog.observers.api import BaseObserverSubclassCallable
from watchdog.events import FileSystemEventHandler

from .state import DownloadState
from ..utils import is_dump_file


class CleanupHandler(FileSystemEventHandler):
    download_state: DownloadState
    observer: BaseObserverSubclassCallable

    def __init__(self, observer: BaseObserverSubclassCallable, state: DownloadState):
        super()

        self.download_state = state
        self.observer = observer

    def on_created(self, event):
        file_name = os.path.basename(event.src_path)

        # we can safely ignore part file creations
        if file_name.endswith('.part'):
            return

        if is_dump_file(file_name):
            print(f"Download started: {file_name}")
            self.download_state.add(file_name)

    def on_moved(self, event):
        file_name: str = os.path.basename(event.dest_path)

        # we can safely ignore part file removals
        if file_name.endswith('.part'):
            return

        if is_dump_file(file_name):
            print(f"Download finished: {file_name}")
            self.download_state.remove(file_name)
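
For context on why on_created marks a start and on_moved marks a finish: Firefox creates the final file name up front as an empty placeholder alongside a .part file that receives the data, then renames the .part file over the placeholder when the transfer completes. Roughly the event sequence the handler sees (illustrative trace, not part of the commit):

# created: writers.stackexchange.com.7z       -> placeholder; tracked as "started"
# created: writers.stackexchange.com.7z.part  -> in-progress data; ignored
# moved:   writers.stackexchange.com.7z.part
#          -> writers.stackexchange.com.7z    -> rename on completion; "finished"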
17 changes: 17 additions & 0 deletions sedd/watcher/observer.py
@@ -0,0 +1,17 @@
from threading import current_thread, main_thread
from watchdog.observers import Observer

from .handler import CleanupHandler
from .state import DownloadState


def register_pending_downloads_observer(output_dir: str):
    if current_thread() is main_thread():
        observer = Observer()
        state = DownloadState()
        handler = CleanupHandler(observer, state)

        observer.schedule(handler, output_dir, recursive=True)
        observer.start()

        return state, observer
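
Note the fall-through: off the main thread the function implicitly returns None rather than a (state, observer) pair, so a caller that unpacks the result, as main.py does, only works on the main thread. A hedged sketch of a caller that tolerates both cases (hypothetical, not part of the commit):

result = register_pending_downloads_observer("downloads")  # hypothetical dir
state, observer = result if result else (None, None)

if observer is None:
    print("No watcher registered; downloads will not be tracked")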
21 changes: 21 additions & 0 deletions sedd/watcher/state.py
@@ -0,0 +1,21 @@
from typing import Set


class DownloadState:
    # list of filenames pending download
    pending: Set[str] = set()

    def size(self):
        return len(self.pending)

    def empty(self):
        return self.size() == 0

    def add(self, file: str):
        self.pending.add(file)

    def remove(self, file: str):
        self.pending.remove(file)


download_state = DownloadState()
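
One Python subtlety worth flagging: pending is declared as a class attribute and never reassigned per instance, so every DownloadState, including the module-level download_state singleton, shares the same set. This is harmless here because main.py only ever creates one state, but it is easy to trip over; a quick illustration (not part of the commit):

a = DownloadState()
b = DownloadState()
a.add("writers.stackexchange.com.7z")
print(b.size())  # 1 -- both instances share the class-level 'pending' set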
