# Reconstructed from the patch "Create recipes for TED by topics"
# (benoit74, 2024-02-21), which adds
# dispatcher/backend/maint-scripts/create_ted_topics_recipes.py.
"""Create one recipe (schedule) per TED topic.

The list of topics is read from the TED search API (the ``tags`` facet), and
for every selected topic a schedule named ``ted_topic_<topic>`` is created
through the API reachable via ``get_url`` unless it already exists.

Usage: ``create_ted_topics_recipes.py <username> <password>``

The ``TED_OPTIM_CACHE_URL`` environment variable must be set; its value is
copied into the ``optimization-cache`` flag of every created recipe.
"""
import json
import logging
import os
import sys
from http import HTTPStatus

import requests

from get_token import get_token, get_token_headers, get_url

logging.basicConfig(
    level=logging.DEBUG, format="[%(asctime)s: %(levelname)s] %(message)s"
)
logger = logging.getLogger(__name__)

SEARCH_URL = "https://zenith-prod-alt.ted.com/api/search"
# Timeout, in seconds, applied to every HTTP request made by this script.
REQUESTS_TIMEOUT = 10
TED_INDEX_NAME = "coyote_models_acme_videos_alias_38ce41d1f97ca56a38068f613af166da"


def get_ted_topics():
    """Return the list of TED topic names.

    A single search query is sent to ``SEARCH_URL`` asking for the
    ``subtitle_languages`` and ``tags`` facets; the keys of the ``tags`` facet
    of the first result are the topic names.

    Raises:
        requests.HTTPError: if the search API answers with an error status.
    """
    data = [
        {
            "indexName": TED_INDEX_NAME,
            "params": {
                "attributeForDistinct": "objectID",
                "distinct": 1,
                "facets": ["subtitle_languages", "tags"],
                "highlightPostTag": "__/ais-highlight__",
                "highlightPreTag": "__ais-highlight__",
                # Only the facets are used, so a single hit is requested.
                "hitsPerPage": 1,
                "maxValuesPerFacet": 500,
                "page": 0,
                "query": "",
                "tagFilters": "",
            },
        }
    ]
    req = requests.post(
        SEARCH_URL,
        headers={"User-Agent": "Mozilla/5.0"},
        json=data,
        timeout=REQUESTS_TIMEOUT,
    )
    req.raise_for_status()
    topics = list(json.loads(req.content)["results"][0]["facets"]["tags"])
    logger.info("%d topics found", len(topics))
    return topics


def _build_recipe(schedule_name, ted_topic_name, clean_ted_topic_name):
    """Return the JSON payload describing the schedule for one TED topic."""
    return {
        "category": "ted",
        "config": {
            "flags": {
                "description": f"A collection of TED videos about {ted_topic_name}",
                "title": f"TED {ted_topic_name}",
                "topics": ted_topic_name,
                "name": f"ted_mul_{clean_ted_topic_name}",
                # NOTE(review): "esp" and "chi" do not look like ISO 639-3
                # codes ("spa" / "zho") -- confirm what the scraper expects.
                "languages": "eng,fra,esp,deu,chi",
                "format": "webm",
                "low-quality": True,
                "optimization-cache": os.environ["TED_OPTIM_CACHE_URL"],
                "output": "/output",
                "tmp-dir": "/output",
                "subtitles": "all",
                "subtitles-enough": True,
                "debug": True,
                "tags": "ted",
            },
            "image": {
                "name": "ghcr.io/openzim/ted",
                "tag": "2.1.0",
            },
            "monitor": False,
            "platform": "ted",
            "resources": {
                "cpu": 3,
                "disk": 32212254720,  # 30 GiB
                "memory": 2147483648,  # 2 GiB
            },
            "task_name": "ted",
            "warehouse_path": "/.hidden/dev",
        },
        "enabled": True,
        "language": {
            "code": "mul",
            "name_en": "Multiple Languages",
            "name_native": "Multiple Languages",
        },
        "name": schedule_name,
        "periodicity": "monthly",
        "tags": [
            "ted-by-topic",
        ],
    }


def create_recipe(ted_topic_name: str, access_token):
    """Create the schedule for ``ted_topic_name`` unless it already exists.

    The schedule is named ``ted_topic_<topic>`` where spaces in the topic are
    replaced by dashes.

    Args:
        ted_topic_name: topic name as returned by ``get_ted_topics``.
        access_token: token passed to ``get_token_headers`` to authenticate
            the API calls.

    Raises:
        requests.HTTPError: if the existence check or the creation request
            answers with an error status.
        KeyError: if ``TED_OPTIM_CACHE_URL`` is not set in the environment.
    """
    clean_ted_topic_name = ted_topic_name.replace(" ", "-")
    schedule_name = f"ted_topic_{clean_ted_topic_name}"
    response = requests.get(
        get_url(f"/schedules/{schedule_name}"),
        headers=get_token_headers(access_token),
        timeout=REQUESTS_TIMEOUT,
    )
    if response.status_code == HTTPStatus.OK:
        logger.warning(f"Recipe {schedule_name} already exists, ignoring.")
        return

    # Anything other than "not found" is unexpected: surface HTTP errors.
    if response.status_code != HTTPStatus.NOT_FOUND:
        response.raise_for_status()

    logger.info(f"Creating recipe for {ted_topic_name}")

    data = _build_recipe(schedule_name, ted_topic_name, clean_ted_topic_name)
    response = requests.post(
        get_url("/schedules/"),
        headers=get_token_headers(access_token),
        json=data,
        timeout=REQUESTS_TIMEOUT,
    )
    if response.status_code != HTTPStatus.CREATED:
        # The error body is not guaranteed to be JSON; fall back to the raw
        # text so a decoding failure never hides the HTTP error itself.
        try:
            details = json.loads(response.content)
        except ValueError:
            details = response.text
        logger.error(details)
        response.raise_for_status()


def main(zf_username, zf_password):
    """Creates recipes for TED by topics"""

    access_token, refresh_token = get_token(zf_username, zf_password)

    ted_topics = get_ted_topics()
    logger.debug(ted_topics)
    for topic in ted_topics:
        # NOTE(review): only the "street art" topic is processed; this looks
        # like a testing restriction -- confirm before removing it, since
        # doing so creates one recipe per TED topic.
        if topic != "street art":
            continue
        create_recipe(ted_topic_name=topic, access_token=access_token)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        logger.error(
            "Incorrect number of arguments\n"
            f"Usage: {sys.argv[0]} <username> <password>\n"
            "TED_OPTIM_CACHE_URL environment variable must be set to the optimization"
            " cache URL to use"
        )
        sys.exit(1)

    if "TED_OPTIM_CACHE_URL" not in os.environ:
        logger.error(
            "TED_OPTIM_CACHE_URL environment variable must be set to the optimization"
            " cache URL to use in recipes"
        )
        sys.exit(2)
    args = sys.argv[1:]
    main(*args)