diff --git a/browsertrix-crawler/configs/tg24.yaml b/browsertrix-crawler/configs/tg24.yaml index f4eb0e5..bbd3ac4 100644 --- a/browsertrix-crawler/configs/tg24.yaml +++ b/browsertrix-crawler/configs/tg24.yaml @@ -1,4 +1,3 @@ -# TODO: Adjust for new TG24 (and beyond) site url structure seeds: # Crawl content available via navigation and frontpage - url: https://www.gathering.org diff --git a/browsertrix-crawler/configs/tgno.yaml b/browsertrix-crawler/configs/tgno.yaml new file mode 100644 index 0000000..9c25ccc --- /dev/null +++ b/browsertrix-crawler/configs/tgno.yaml @@ -0,0 +1,34 @@ +# Config intended to be used on new tg.no once launched. This page differs from +# previous iterations (in practice, even if not in theory) by being a single +# site gradually updated with new content and styling, rather than a new site +# each year. +seeds: + # Crawl content available via navigation and frontpage + - url: https://www.tg.no +include: + # Basic pages + - www.tg.no + +# Block calls to our tracking service +blockRules: + - url: matomo.gathering.org + +collection: tgno + +behaviors: autoscroll,autoplay,autofetch,siteSpecific +waitUntil: load,networkidle0 +generateCDX: true +combineWARCs: true +saveState: always +workers: 4 +# TODO: Remove if not needed, hopefully we won't need consent flow on new site +# Minimal profile that includes consent answers +# profile: /crawls/profiles/tg24.tar.gz + +# Make "live" crawling view available at 9037 +newContext: window +screencastPort: 9037 + +warcinfo: + operator: The Gathering + hostname: www.tg.no diff --git a/wayback/startup.sh b/wayback/startup.sh index 1c7242e..90156e2 100755 --- a/wayback/startup.sh +++ b/wayback/startup.sh @@ -16,6 +16,7 @@ git clone https://github.com/gathering/go-archive-tg21 || (cd go-archive-tg21 ; git clone https://github.com/gathering/go-archive-tg22 || (cd go-archive-tg22 ; git pull ; git lfs pull ; cd ..) 
git clone https://github.com/gathering/go-archive-tg23 || (cd go-archive-tg23 ; git pull ; git lfs pull ; cd ..) git clone https://github.com/gathering/go-archive-tg24 || (cd go-archive-tg24 ; git pull ; git lfs pull ; cd ..) +git clone https://github.com/gathering/go-archive-tgno || (cd go-archive-tgno ; git pull ; git lfs pull ; cd ..) cd "$WORKDIR" @@ -25,5 +26,6 @@ cp -r "$SOURCES/go-archive-tg21/browsertrix-crawler/crawls/collections/tg21/" "$ cp -r "$SOURCES/go-archive-tg22/browsertrix-crawler/tg22/" "$COLLECTIONS/" cp -r "$SOURCES/go-archive-tg23/tg23/" "$COLLECTIONS/" cp -r "$SOURCES/go-archive-tg24/tg24/" "$COLLECTIONS/" +cp -r "$SOURCES/go-archive-tgno/tgno/" "$COLLECTIONS/" exec /docker-entrypoint.sh $@