From 0c2ba613347b24e6d67c700ef702ae8e09aed8f5 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 21 Feb 2024 12:03:00 +0100 Subject: [PATCH 01/14] Upgrade QA tools + fix code to match new black conventions --- .github/workflows/QA.yml | 2 +- .../maint-scripts/report_youtube_api_keys.py | 16 ++++++++++------ dispatcher/backend/src/common/emailing.py | 14 ++++++++------ dispatcher/backend/src/common/schemas/models.py | 8 +++++--- dispatcher/backend/src/common/schemas/orms.py | 6 +++--- ...c77f5d8_add_unique_constraint_on_schedule_.py | 1 + .../0ad2bf164dae_alter_config_columns_types.py | 1 + ...7c_nullable_schedule_id_in_requested_tasks.py | 1 + .../43f385b318d4_add_original_schedule_name.py | 1 + .../versions/4de2adfc3d11_create_some_indexes.py | 1 + .../62227054989d_remove_mongo_backup_columns.py | 1 + .../6970d8681400_nullable_shedule_id_on_task.py | 1 + ...074cc939b_drop_task_id_in_scheduleduration.py | 1 + ...d6dc_create_schedules_tasks_and_requested_.py | 1 + .../versions/96203f95bc36_add_worker_table.py | 1 + ...2aa_keep_deleted_users_and_workers_in_the_.py | 1 + .../cbedcefd6059_store_mongo_id_as_well.py | 1 + .../ceae21f592b7_remove_ssh_key_last_used.py | 1 + ...a178652d5_apply_a_single_naming_convention.py | 1 + .../dc811d96975c_added_user_sshkey_tables.py | 1 + .../e1d3894be9ea_set_fields_as_non_nullable.py | 1 + ...eae2c347_user_username_is_mandatory_unique.py | 1 + .../fe65a1b0f953_store_refresh_tokens.py | 1 + .../integration/routes/users/test_password.py | 6 +++--- dispatcher/backend/src/utils/offliners.py | 16 ++++++++++------ watcher/watcher.py | 6 +++--- workers/app/common/worker.py | 8 ++++---- workers/app/task/worker.py | 8 +++++--- 28 files changed, 70 insertions(+), 38 deletions(-) diff --git a/.github/workflows/QA.yml b/.github/workflows/QA.yml index 1b07c74df..43a1b5fe3 100644 --- a/.github/workflows/QA.yml +++ b/.github/workflows/QA.yml @@ -19,7 +19,7 @@ jobs: - name: Install dependencies run: | - pip install black==23.3.0 flake8==6.0.0 isort==5.12.0 + pip install black==24.2.0 flake8==7.0.0 isort==5.13.2 black --version flake8 --version isort --version diff --git a/dispatcher/backend/maint-scripts/report_youtube_api_keys.py b/dispatcher/backend/maint-scripts/report_youtube_api_keys.py index c61cc6dcd..874c214b7 100755 --- a/dispatcher/backend/maint-scripts/report_youtube_api_keys.py +++ b/dispatcher/backend/maint-scripts/report_youtube_api_keys.py @@ -55,9 +55,11 @@ def report_youtube_api_keys(session: so.Session, *, display_unknown_secrets=Fals if hashed_api_key not in schedules_by_api_key.keys(): schedules_by_api_key[hashed_api_key] = { "api_key": api_key, - "key_name": known_api_keys[hashed_api_key] - if hashed_api_key in known_api_keys - else "unknown", + "key_name": ( + known_api_keys[hashed_api_key] + if hashed_api_key in known_api_keys + else "unknown" + ), "schedules": [], } schedules_by_api_key[hashed_api_key]["schedules"].append(schedule.name) @@ -69,9 +71,11 @@ def report_youtube_api_keys(session: so.Session, *, display_unknown_secrets=Fals for hashed_api_key, data in schedules_by_api_key.items(): report_data["keys"].append( { - "name": known_api_keys[hashed_api_key] - if hashed_api_key in known_api_keys.keys() - else "unknown", + "name": ( + known_api_keys[hashed_api_key] + if hashed_api_key in known_api_keys.keys() + else "unknown" + ), "schedules": sorted(data["schedules"]), } ) diff --git a/dispatcher/backend/src/common/emailing.py b/dispatcher/backend/src/common/emailing.py index faa0cf951..04b784b3b 100644 --- a/dispatcher/backend/src/common/emailing.py +++ b/dispatcher/backend/src/common/emailing.py @@ -51,12 +51,14 @@ def send_email_via_mailgun( url=f"{MAILGUN_API_URL}/messages", auth=("api", MAILGUN_API_KEY), data=data, - files=[ - ("attachment", (fpath.name, open(fpath, "rb").read())) - for fpath in attachments - ] - if attachments - else [], + files=( + [ + ("attachment", (fpath.name, open(fpath, "rb").read())) + for fpath in attachments + ] + if attachments + else [] + ), timeout=REQ_TIMEOUT_NOTIFICATIONS, ) resp.raise_for_status() diff --git a/dispatcher/backend/src/common/schemas/models.py b/dispatcher/backend/src/common/schemas/models.py index 61e4e64cb..05d2be474 100644 --- a/dispatcher/backend/src/common/schemas/models.py +++ b/dispatcher/backend/src/common/schemas/models.py @@ -86,9 +86,11 @@ def get_offliner_schema(offliner): Offliner.nautilus: NautilusFlagsSchema, Offliner.ted: TedFlagsSchema, Offliner.openedx: OpenedxFlagsSchema, - Offliner.zimit: ZimitFlagsSchemaRelaxed - if constants.ZIMIT_USE_RELAXED_SCHEMA - else ZimitFlagsSchema, + Offliner.zimit: ( + ZimitFlagsSchemaRelaxed + if constants.ZIMIT_USE_RELAXED_SCHEMA + else ZimitFlagsSchema + ), Offliner.kolibri: KolibriFlagsSchema, Offliner.wikihow: WikihowFlagsSchema, Offliner.ifixit: IFixitFlagsSchema, diff --git a/dispatcher/backend/src/common/schemas/orms.py b/dispatcher/backend/src/common/schemas/orms.py index 9db22593c..4938404d0 100644 --- a/dispatcher/backend/src/common/schemas/orms.py +++ b/dispatcher/backend/src/common/schemas/orms.py @@ -180,9 +180,9 @@ def get_duration(schedule: dbm.Schedule): duration_res["default"] = ScheduleFullSchema.get_one_duration(duration) if duration.worker: duration_res["available"] = True - duration_res["workers"][ - duration.worker.name - ] = ScheduleFullSchema.get_one_duration(duration) + duration_res["workers"][duration.worker.name] = ( + ScheduleFullSchema.get_one_duration(duration) + ) return duration_res def get_is_requested(schedule: dbm.Schedule): diff --git a/dispatcher/backend/src/migrations/versions/0055fc77f5d8_add_unique_constraint_on_schedule_.py b/dispatcher/backend/src/migrations/versions/0055fc77f5d8_add_unique_constraint_on_schedule_.py index 6bac021db..92be5cd7d 100644 --- a/dispatcher/backend/src/migrations/versions/0055fc77f5d8_add_unique_constraint_on_schedule_.py +++ b/dispatcher/backend/src/migrations/versions/0055fc77f5d8_add_unique_constraint_on_schedule_.py @@ -5,6 +5,7 @@ Create Date: 2023-05-04 12:36:14.802432 """ + from alembic import op # revision identifiers, used by Alembic. diff --git a/dispatcher/backend/src/migrations/versions/0ad2bf164dae_alter_config_columns_types.py b/dispatcher/backend/src/migrations/versions/0ad2bf164dae_alter_config_columns_types.py index 729f1779c..8426d6da1 100644 --- a/dispatcher/backend/src/migrations/versions/0ad2bf164dae_alter_config_columns_types.py +++ b/dispatcher/backend/src/migrations/versions/0ad2bf164dae_alter_config_columns_types.py @@ -5,6 +5,7 @@ Create Date: 2023-04-26 12:21:57.205579 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/migrations/versions/3c4d546fde7c_nullable_schedule_id_in_requested_tasks.py b/dispatcher/backend/src/migrations/versions/3c4d546fde7c_nullable_schedule_id_in_requested_tasks.py index 5c5ecd715..554de35bb 100644 --- a/dispatcher/backend/src/migrations/versions/3c4d546fde7c_nullable_schedule_id_in_requested_tasks.py +++ b/dispatcher/backend/src/migrations/versions/3c4d546fde7c_nullable_schedule_id_in_requested_tasks.py @@ -5,6 +5,7 @@ Create Date: 2023-05-18 13:35:13.362937 """ + import sqlalchemy as sa from alembic import op diff --git a/dispatcher/backend/src/migrations/versions/43f385b318d4_add_original_schedule_name.py b/dispatcher/backend/src/migrations/versions/43f385b318d4_add_original_schedule_name.py index db20c90f5..01923dd36 100644 --- a/dispatcher/backend/src/migrations/versions/43f385b318d4_add_original_schedule_name.py +++ b/dispatcher/backend/src/migrations/versions/43f385b318d4_add_original_schedule_name.py @@ -5,6 +5,7 @@ Create Date: 2023-09-26 07:56:45.008277 """ + import sqlalchemy as sa from alembic import op diff --git a/dispatcher/backend/src/migrations/versions/4de2adfc3d11_create_some_indexes.py b/dispatcher/backend/src/migrations/versions/4de2adfc3d11_create_some_indexes.py index 5428430ef..7d0ed0b23 100644 --- a/dispatcher/backend/src/migrations/versions/4de2adfc3d11_create_some_indexes.py +++ b/dispatcher/backend/src/migrations/versions/4de2adfc3d11_create_some_indexes.py @@ -5,6 +5,7 @@ Create Date: 2023-04-06 05:13:40.448095 """ + from alembic import op # revision identifiers, used by Alembic. diff --git a/dispatcher/backend/src/migrations/versions/62227054989d_remove_mongo_backup_columns.py b/dispatcher/backend/src/migrations/versions/62227054989d_remove_mongo_backup_columns.py index aee45b9dc..2d9f8fc9f 100644 --- a/dispatcher/backend/src/migrations/versions/62227054989d_remove_mongo_backup_columns.py +++ b/dispatcher/backend/src/migrations/versions/62227054989d_remove_mongo_backup_columns.py @@ -5,6 +5,7 @@ Create Date: 2023-05-12 15:29:06.706797 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/migrations/versions/6970d8681400_nullable_shedule_id_on_task.py b/dispatcher/backend/src/migrations/versions/6970d8681400_nullable_shedule_id_on_task.py index 6c80bbb0b..115d90410 100644 --- a/dispatcher/backend/src/migrations/versions/6970d8681400_nullable_shedule_id_on_task.py +++ b/dispatcher/backend/src/migrations/versions/6970d8681400_nullable_shedule_id_on_task.py @@ -5,6 +5,7 @@ Create Date: 2023-05-18 10:53:05.286686 """ + import sqlalchemy as sa from alembic import op diff --git a/dispatcher/backend/src/migrations/versions/76a074cc939b_drop_task_id_in_scheduleduration.py b/dispatcher/backend/src/migrations/versions/76a074cc939b_drop_task_id_in_scheduleduration.py index 6403a1952..826bee2aa 100644 --- a/dispatcher/backend/src/migrations/versions/76a074cc939b_drop_task_id_in_scheduleduration.py +++ b/dispatcher/backend/src/migrations/versions/76a074cc939b_drop_task_id_in_scheduleduration.py @@ -5,6 +5,7 @@ Create Date: 2023-05-04 11:27:30.248671 """ + import sqlalchemy as sa from alembic import op diff --git a/dispatcher/backend/src/migrations/versions/8279abb2d6dc_create_schedules_tasks_and_requested_.py b/dispatcher/backend/src/migrations/versions/8279abb2d6dc_create_schedules_tasks_and_requested_.py index e5f257b77..fefeb2460 100644 --- a/dispatcher/backend/src/migrations/versions/8279abb2d6dc_create_schedules_tasks_and_requested_.py +++ b/dispatcher/backend/src/migrations/versions/8279abb2d6dc_create_schedules_tasks_and_requested_.py @@ -5,6 +5,7 @@ Create Date: 2023-04-13 10:10:52.820099 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/migrations/versions/96203f95bc36_add_worker_table.py b/dispatcher/backend/src/migrations/versions/96203f95bc36_add_worker_table.py index 5a9d710e8..1f50cff69 100644 --- a/dispatcher/backend/src/migrations/versions/96203f95bc36_add_worker_table.py +++ b/dispatcher/backend/src/migrations/versions/96203f95bc36_add_worker_table.py @@ -5,6 +5,7 @@ Create Date: 2023-04-06 15:56:02.435956 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/migrations/versions/acf1446fa2aa_keep_deleted_users_and_workers_in_the_.py b/dispatcher/backend/src/migrations/versions/acf1446fa2aa_keep_deleted_users_and_workers_in_the_.py index 3e1f2b597..fec3bbce0 100644 --- a/dispatcher/backend/src/migrations/versions/acf1446fa2aa_keep_deleted_users_and_workers_in_the_.py +++ b/dispatcher/backend/src/migrations/versions/acf1446fa2aa_keep_deleted_users_and_workers_in_the_.py @@ -5,6 +5,7 @@ Create Date: 2023-04-28 11:29:30.878410 """ + import sqlalchemy as sa from alembic import op diff --git a/dispatcher/backend/src/migrations/versions/cbedcefd6059_store_mongo_id_as_well.py b/dispatcher/backend/src/migrations/versions/cbedcefd6059_store_mongo_id_as_well.py index 20633aaa2..9042b5597 100644 --- a/dispatcher/backend/src/migrations/versions/cbedcefd6059_store_mongo_id_as_well.py +++ b/dispatcher/backend/src/migrations/versions/cbedcefd6059_store_mongo_id_as_well.py @@ -5,6 +5,7 @@ Create Date: 2023-04-03 18:01:18.286681 """ + import sqlalchemy as sa from alembic import op diff --git a/dispatcher/backend/src/migrations/versions/ceae21f592b7_remove_ssh_key_last_used.py b/dispatcher/backend/src/migrations/versions/ceae21f592b7_remove_ssh_key_last_used.py index c6324e5ad..1c08b3c1b 100644 --- a/dispatcher/backend/src/migrations/versions/ceae21f592b7_remove_ssh_key_last_used.py +++ b/dispatcher/backend/src/migrations/versions/ceae21f592b7_remove_ssh_key_last_used.py @@ -5,6 +5,7 @@ Create Date: 2023-09-29 10:59:39.739351 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/migrations/versions/d13a178652d5_apply_a_single_naming_convention.py b/dispatcher/backend/src/migrations/versions/d13a178652d5_apply_a_single_naming_convention.py index 735b824e5..701738220 100644 --- a/dispatcher/backend/src/migrations/versions/d13a178652d5_apply_a_single_naming_convention.py +++ b/dispatcher/backend/src/migrations/versions/d13a178652d5_apply_a_single_naming_convention.py @@ -5,6 +5,7 @@ Create Date: 2023-04-06 04:57:11.449333 """ + from alembic import op # revision identifiers, used by Alembic. diff --git a/dispatcher/backend/src/migrations/versions/dc811d96975c_added_user_sshkey_tables.py b/dispatcher/backend/src/migrations/versions/dc811d96975c_added_user_sshkey_tables.py index a5072a002..fa636346d 100644 --- a/dispatcher/backend/src/migrations/versions/dc811d96975c_added_user_sshkey_tables.py +++ b/dispatcher/backend/src/migrations/versions/dc811d96975c_added_user_sshkey_tables.py @@ -5,6 +5,7 @@ Create Date: 2023-03-31 14:03:52.822993 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/migrations/versions/e1d3894be9ea_set_fields_as_non_nullable.py b/dispatcher/backend/src/migrations/versions/e1d3894be9ea_set_fields_as_non_nullable.py index 81950556c..0d33cb6dc 100644 --- a/dispatcher/backend/src/migrations/versions/e1d3894be9ea_set_fields_as_non_nullable.py +++ b/dispatcher/backend/src/migrations/versions/e1d3894be9ea_set_fields_as_non_nullable.py @@ -5,6 +5,7 @@ Create Date: 2023-05-04 14:08:19.014269 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/migrations/versions/f1a7eae2c347_user_username_is_mandatory_unique.py b/dispatcher/backend/src/migrations/versions/f1a7eae2c347_user_username_is_mandatory_unique.py index 12c901fc0..482e4cca6 100644 --- a/dispatcher/backend/src/migrations/versions/f1a7eae2c347_user_username_is_mandatory_unique.py +++ b/dispatcher/backend/src/migrations/versions/f1a7eae2c347_user_username_is_mandatory_unique.py @@ -5,6 +5,7 @@ Create Date: 2023-03-31 18:05:44.896451 """ + import sqlalchemy as sa from alembic import op diff --git a/dispatcher/backend/src/migrations/versions/fe65a1b0f953_store_refresh_tokens.py b/dispatcher/backend/src/migrations/versions/fe65a1b0f953_store_refresh_tokens.py index fae560f7f..22df9228d 100644 --- a/dispatcher/backend/src/migrations/versions/fe65a1b0f953_store_refresh_tokens.py +++ b/dispatcher/backend/src/migrations/versions/fe65a1b0f953_store_refresh_tokens.py @@ -5,6 +5,7 @@ Create Date: 2023-04-03 21:12:33.745734 """ + import sqlalchemy as sa from alembic import op from sqlalchemy.dialects import postgresql diff --git a/dispatcher/backend/src/tests/integration/routes/users/test_password.py b/dispatcher/backend/src/tests/integration/routes/users/test_password.py index b3d316cc5..88b7ec160 100644 --- a/dispatcher/backend/src/tests/integration/routes/users/test_password.py +++ b/dispatcher/backend/src/tests/integration/routes/users/test_password.py @@ -76,9 +76,9 @@ def test_update_password( url, json=document, headers={ - "Authorization": access_token - if use_admin_token - else make_access_token(username) + "Authorization": ( + access_token if use_admin_token else make_access_token(username) + ) }, ) assert response.status_code == expected_status_code diff --git a/dispatcher/backend/src/utils/offliners.py b/dispatcher/backend/src/utils/offliners.py index f738fb8e5..febe6c012 100644 --- a/dispatcher/backend/src/utils/offliners.py +++ b/dispatcher/backend/src/utils/offliners.py @@ -47,15 +47,19 @@ def command_for(offliner, flags, mount_point): cmd = offliner_def.cmd if offliner_def.std_output: flags[ - offliner_def.std_output - if isinstance(offliner_def.std_output, str) - else "output" + ( + offliner_def.std_output + if isinstance(offliner_def.std_output, str) + else "output" + ) ] = str(mount_point) if offliner_def.std_stats: flags[ - offliner_def.std_stats - if isinstance(offliner_def.std_stats, str) - else "stats-filename" + ( + offliner_def.std_stats + if isinstance(offliner_def.std_stats, str) + else "stats-filename" + ) ] = str(mount_point_for(offliner) / "task_progress.json") if offliner == Offliner.gutenberg: diff --git a/watcher/watcher.py b/watcher/watcher.py index 8af6263fd..57eb59c47 100644 --- a/watcher/watcher.py +++ b/watcher/watcher.py @@ -563,9 +563,9 @@ def entrypoint(): "Defaults to 1 inside Docker as we can't guess available CPUs", dest="nb_threads", type=int, - default=1 - if is_running_inside_container() - else multiprocessing.cpu_count() - 1 or 1, + default=( + 1 if is_running_inside_container() else multiprocessing.cpu_count() - 1 or 1 + ), ) parser.add_argument( diff --git a/workers/app/common/worker.py b/workers/app/common/worker.py index 2c9df0d2d..41d4e5b70 100644 --- a/workers/app/common/worker.py +++ b/workers/app/common/worker.py @@ -144,10 +144,10 @@ def authenticate(self, webapi_uri=None, force=False): options={"verify_signature": False}, ) self.connections[webapi_uri][authenticated_on] = datetime.datetime.now() - self.connections[webapi_uri][ - authentication_expires_on - ] = datetime.datetime.fromtimestamp( - self.connections[webapi_uri][token_payload]["exp"] + self.connections[webapi_uri][authentication_expires_on] = ( + datetime.datetime.fromtimestamp( + self.connections[webapi_uri][token_payload]["exp"] + ) ) return True except Exception as exc: diff --git a/workers/app/task/worker.py b/workers/app/task/worker.py index cca7477c0..7d17a154e 100644 --- a/workers/app/task/worker.py +++ b/workers/app/task/worker.py @@ -469,9 +469,11 @@ def refresh_files_list(self): { fpath.name: { UP: PENDING, - CHK: PENDING - if self.task["upload"]["zim"]["zimcheck"] - else SKIPPED, + CHK: ( + PENDING + if self.task["upload"]["zim"]["zimcheck"] + else SKIPPED + ), } } ) From 28d38671657d83f1dce39b34f4817c91d3a22470 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 21 Feb 2024 12:01:55 +0100 Subject: [PATCH 02/14] Upload artifacts after scraper completion --- dispatcher/backend/src/common/constants.py | 7 ++ dispatcher/backend/src/common/utils.py | 18 +++- .../src/tests/unit/routes/test_utils.py | 9 ++ dispatcher/backend/src/utils/scheduling.py | 6 ++ workers/app/common/docker.py | 7 +- workers/app/task/worker.py | 88 +++++++++++++++++++ 6 files changed, 127 insertions(+), 8 deletions(-) diff --git a/dispatcher/backend/src/common/constants.py b/dispatcher/backend/src/common/constants.py index 918b6355e..edfca5d60 100644 --- a/dispatcher/backend/src/common/constants.py +++ b/dispatcher/backend/src/common/constants.py @@ -33,6 +33,13 @@ LOGS_EXPIRATION = int(os.getenv("LOGS_EXPIRATION", "30")) except Exception: LOGS_EXPIRATION = 30 +ARTIFACTS_UPLOAD_URI = os.getenv( + "ARTIFACTS_UPLOAD_URI", "sftp://uploader@warehouse.farm.openzim.org:1522/artifacts" +) +try: + ARTIFACTS_EXPIRATION = int(os.getenv("ARTIFACTS_EXPIRATION", "30")) +except Exception: + ARTIFACTS_EXPIRATION = 30 # empty ZIMCHECK_OPTION means no zimcheck ZIMCHECK_OPTION = os.getenv("ZIMCHECK_OPTION", "") diff --git a/dispatcher/backend/src/common/utils.py b/dispatcher/backend/src/common/utils.py index 418482bc8..567833fda 100644 --- a/dispatcher/backend/src/common/utils.py +++ b/dispatcher/backend/src/common/utils.py @@ -117,6 +117,9 @@ def add_to_debug_if_present( task=task, kwargs_key="timeout", container_key="timeout" ) add_to_container_if_present(task=task, kwargs_key="log", container_key="log") + add_to_container_if_present( + task=task, kwargs_key="artifacts", container_key="artifacts" + ) add_to_debug_if_present(task=task, kwargs_key="task_log", debug_key="log") add_to_debug_if_present(task=task, kwargs_key="task_name", debug_key="task_name") add_to_debug_if_present(task=task, kwargs_key="task_args", debug_key="task_args") @@ -348,10 +351,17 @@ def task_checked_file_event_handler(session: so.Session, task_id: UUID, payload: def task_update_event_handler(session: so.Session, task_id: UUID, payload: dict): timestamp = get_timestamp_from_event(payload) - log = payload.get("log") # filename / S3 key of text file at upload_uri[logs] - logger.info(f"Task update: {task_id}, log: {log}") - - save_event(session, task_id, TaskStatus.update, timestamp, log=log) + if "log" in payload: + log = payload.get("log") # filename / S3 key of text file at upload_uri[logs] + logger.info(f"Task update: {task_id}, log: {log}") + save_event(session, task_id, TaskStatus.update, timestamp, log=log) + + if "artifacts" in payload: + artifacts = payload.get( + "artifacts" + ) # filename / S3 key of text file at upload_uri[logs] + logger.info(f"Task update: {task_id}, artifacts: {artifacts}") + save_event(session, task_id, TaskStatus.update, timestamp, artifacts=artifacts) def handle_others(task_id: UUID, event: str, payload: dict): diff --git a/dispatcher/backend/src/tests/unit/routes/test_utils.py b/dispatcher/backend/src/tests/unit/routes/test_utils.py index ed0b4df33..7b410e418 100644 --- a/dispatcher/backend/src/tests/unit/routes/test_utils.py +++ b/dispatcher/backend/src/tests/unit/routes/test_utils.py @@ -211,6 +211,15 @@ "&bucketName=org-kiwix-zimfarm-logs" ), }, + "artifacts": { + "expiration": 20, + "upload_uri": ( + "s3://s3.us-west-1.wasabisys.com/" + "?keyId=this_is_super_secret" + "&secretAccessKey=this_is_super_secret" + "&bucketName=org-kiwix-zimfarm-artifacts" + ), + }, }, }, ], diff --git a/dispatcher/backend/src/utils/scheduling.py b/dispatcher/backend/src/utils/scheduling.py index cf878ea7b..08f31708b 100644 --- a/dispatcher/backend/src/utils/scheduling.py +++ b/dispatcher/backend/src/utils/scheduling.py @@ -14,6 +14,8 @@ import db.models as dbm from common import getnow from common.constants import ( + ARTIFACTS_EXPIRATION, + ARTIFACTS_UPLOAD_URI, DEFAULT_SCHEDULE_DURATION, ENABLED_SCHEDULER, LOGS_EXPIRATION, @@ -165,6 +167,10 @@ def request_a_schedule( "upload_uri": LOGS_UPLOAD_URI, "expiration": LOGS_EXPIRATION, }, + "artifacts": { + "upload_uri": ARTIFACTS_UPLOAD_URI, + "expiration": ARTIFACTS_EXPIRATION, + }, }, notification=schedule.notification if schedule.notification else {}, updated_at=now, diff --git a/workers/app/common/docker.py b/workers/app/common/docker.py index cc8635d2f..7b64867e2 100644 --- a/workers/app/common/docker.py +++ b/workers/app/common/docker.py @@ -252,13 +252,12 @@ def get_scraper_container_name(task): ) -def upload_container_name(task_id, filename, unique): - ident = "zimup" if filename.endswith(".zim") else "logup" +def upload_container_name(task_id, filename, kind, unique): if unique: filename = f"{uuid.uuid4().hex}{pathlib.Path(filename).suffix}" else: filename = re.sub(r"[^a-zA-Z0-9_.-]", "_", filename) - return f"{short_id(task_id)}_{ident}_{filename}" + return f"{short_id(task_id)}_{kind}up_{filename}" def get_ip_address(docker_client, name): @@ -562,7 +561,7 @@ def start_uploader( resume, watch=False, ): - container_name = upload_container_name(task["_id"], filename, False) + container_name = upload_container_name(task["_id"], filename, kind, False) # remove container should it exists (should not) try: diff --git a/workers/app/task/worker.py b/workers/app/task/worker.py index 7d17a154e..162de8833 100644 --- a/workers/app/task/worker.py +++ b/workers/app/task/worker.py @@ -8,6 +8,7 @@ import shutil import signal import sys +import tarfile import time from typing import Any, Dict @@ -104,6 +105,7 @@ def __init__(self, **kwargs): self.scraper = None # scraper container self.log_uploader = None # scraper log uploader container + self.artifacts_uploader = None # scraper artifacts uploader container self.host_logsdir = None # path on host where logs are stored self.scraper_succeeded = None # whether scraper succeeded @@ -338,6 +340,7 @@ def stop(self, timeout=5): "dnscache", "scraper", "log_uploader", + "artifacts_uploader", "uploader", "checker", ): @@ -461,6 +464,87 @@ def check_scraper_log_upload(self): } ) + def upload_scraper_artifacts(self): + if not self.scraper: + logger.error("No scraper to upload its artifacts…") + return # scraper gone, we can't access artifacts + + artifacts_globs = self.task["config"].get("artifacts_globs", None) + if not artifacts_globs: + logger.debug("No artifacts configured for upload") + return + else: + logger.debug(f"Archiving files matching {artifacts_globs}") + + logger.debug("Creating a tar of requested artifacts") + filename = f"{self.task['_id']}_{self.task['config']['task_name']}.tar" + try: + files_to_tar = [ + file + for pattern in artifacts_globs + for file in self.task_workdir.glob(pattern) + ] + if len(files_to_tar) == 0: + logger.debug("No files found to archive") + return + + fpath = self.task_workdir / filename + with tarfile.open(fpath, "w") as tar: + for file in files_to_tar: + tar.add(file, arcname=file.relative_to(self.task_workdir)) + try: + file.unlink() + except Exception as exc: + logger.debug( + "Unable to delete file after archiving", exc_info=exc + ) + except Exception as exc: + logger.error(f"Unable to archive artifacts to {fpath}") + logger.exception(exc) + return False + + logger.debug("Starting artifacts uploader container…") + self.artifacts_uploader = start_uploader( + self.docker, + self.task, + "artifacts", + self.username, + host_workdir=self.host_task_workdir, + upload_dir="", + filename=filename, + move=False, + delete=True, + compress=True, + resume=True, + ) + + def check_scraper_artifacts_upload(self): + if not self.artifacts_uploader or self.container_running("artifacts_uploader"): + return + + try: + self.artifacts_uploader.reload() + exit_code = self.artifacts_uploader.attrs["State"]["ExitCode"] + filename = self.artifacts_uploader.labels["filename"] + except docker.errors.NotFound: + # prevent race condition if re-entering between this and container removal + return + logger.info(f"Scraper artifacts upload complete: {exit_code}") + if exit_code != 0: + logger.error( + f"Artifacts Uploader:: " + f"{get_container_logs(self.docker, self.artifacts_uploader.name)}" + ) + self.stop_container("artifacts_uploader") + + logger.info(f"Sending scraper artifacts filename: {filename}") + self.patch_task( + { + "event": "update", + "payload": {"artifacts": filename}, + } + ) + def refresh_files_list(self): for fpath in self.task_workdir.glob("*.zim"): if fpath.name not in self.zim_files.keys(): @@ -619,6 +703,7 @@ def handle_stopped_scraper(self): self.mark_scraper_completed(exit_code, stdout, stderr) self.scraper_succeeded = exit_code == 0 self.upload_scraper_log() + self.upload_scraper_artifacts() def sleep(self): time.sleep(1) @@ -670,6 +755,7 @@ def run(self): or self.container_running("uploader") or self.container_running("checker") or self.container_running("log_uploader") + or self.container_running("artifacts_uploader") ): now = datetime.datetime.now() if (now - last_check).total_seconds() < SLEEP_INTERVAL: @@ -679,10 +765,12 @@ def run(self): last_check = now self.handle_files() self.check_scraper_log_upload() + self.check_scraper_artifacts_upload() # make sure we submit upload status for last zim and scraper log self.handle_files() self.check_scraper_log_upload() + self.check_scraper_artifacts_upload() # done with processing, cleaning-up and exiting self.shutdown("succeeded" if self.scraper_succeeded else "failed") From 252e832687642977131b3699e9f102367319cf19 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Wed, 21 Feb 2024 16:18:29 +0100 Subject: [PATCH 03/14] Always keep zimit temporary files For convenience, we always keep zimit temporary files, so that they can be easily archived with the `artifacts_globs` configuration. --- dispatcher/backend/src/utils/offliners.py | 1 + 1 file changed, 1 insertion(+) diff --git a/dispatcher/backend/src/utils/offliners.py b/dispatcher/backend/src/utils/offliners.py index febe6c012..91783b461 100644 --- a/dispatcher/backend/src/utils/offliners.py +++ b/dispatcher/backend/src/utils/offliners.py @@ -78,6 +78,7 @@ def command_for(offliner, flags, mount_point): if offliner == Offliner.zimit: if "adminEmail" not in flags: flags["adminEmail"] = "contact+zimfarm@kiwix.org" + flags["keep"] = True # always keep temporary files, they will be deleted anyway _command_for_set_default_publisher(flags, offliner_def) From 22e7bfeb3b475678225b765648130165e051b5f2 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 22 Feb 2024 08:41:47 +0100 Subject: [PATCH 04/14] Ensure image name is aligned with offliner + make test assertions more precise and fix wrong fixtures --- dispatcher/backend/src/prestart.sh | 0 .../backend/src/routes/schedules/schedule.py | 23 ++++- .../routes/schedules/test_schedule.py | 96 ++++++++++++++----- .../backend/src/tests/utils_for_tests.py | 21 ++++ 4 files changed, 114 insertions(+), 26 deletions(-) create mode 100755 dispatcher/backend/src/prestart.sh create mode 100644 dispatcher/backend/src/tests/utils_for_tests.py diff --git a/dispatcher/backend/src/prestart.sh b/dispatcher/backend/src/prestart.sh new file mode 100755 index 000000000..e69de29bb diff --git a/dispatcher/backend/src/routes/schedules/schedule.py b/dispatcher/backend/src/routes/schedules/schedule.py index bbdd07f8f..d726b92c2 100644 --- a/dispatcher/backend/src/routes/schedules/schedule.py +++ b/dispatcher/backend/src/routes/schedules/schedule.py @@ -12,6 +12,7 @@ import db.models as dbm from common.constants import REQ_TIMEOUT_GHCR +from common.enum import Offliner from common.schemas.models import ScheduleConfigSchema, ScheduleSchema from common.schemas.orms import ScheduleFullSchema, ScheduleLightSchema from common.schemas.parameters import CloneSchema, SchedulesSchema, UpdateSchema @@ -239,12 +240,21 @@ def patch( raise_if(not request.get_json(), ValidationError, "Update can't be empty") # ensure we test flags according to new task_name if present - if "task_name" in update: + if ( + "task_name" in update + and update["task_name"] != schedule.config["task_name"] + ): raise_if( "flags" not in update, ValidationError, "Can't update offliner without updating flags", ) + raise_if( + "image" not in update or "name" not in update["image"], + ValidationError, + "Image name must be updated when offliner is changed", + ) + flags_schema = ScheduleConfigSchema.get_offliner_schema( update["task_name"] ) @@ -255,6 +265,17 @@ def patch( if "flags" in update: flags_schema().load(update["flags"]) + + if "image" in update and "name" in update["image"]: + if "task_name" in update: + future_task_name = update["task_name"] + else: + future_task_name = schedule.config["task_name"] + + if Offliner.get_image_prefix(future_task_name) + update["image"][ + "name" + ] != Offliner.get_image_name(future_task_name): + raise ValidationError("Image name must match selected offliner") except ValidationError as e: raise InvalidRequestJSON(e.messages) diff --git a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py index c5c033159..0ac2a7516 100644 --- a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py +++ b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py @@ -1,7 +1,9 @@ import json +from copy import deepcopy from uuid import uuid4 import pytest +from utils_for_tests import patch_dict from utils.offliners import expanded_config @@ -24,76 +26,103 @@ def schedule(make_schedule): {"tags": ["full"]}, {"tags": ["full", "small"]}, {"tags": ["full", "small"], "category": "vikidia", "enabled": False}, - {"task_name": "phet", "flags": {}}, + { + "task_name": "mwoffliner", + }, + { + "task_name": "phet", + "flags": {}, + "image": {"name": "openzim/phet", "tag": "1.1.2"}, + }, {"flags": {"mwUrl": "https://fr.wikipedia.org", "adminEmail": "hello@test.de"}}, {"warehouse_path": "/phet"}, - {"image": {"name": "openzim/phet", "tag": "latest"}}, + {"image": {"name": "openzim/mwoffliner", "tag": "1.12.2"}}, {"resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}}, { "task_name": "gutenberg", "warehouse_path": "/gutenberg", "flags": {}, - "image": {"name": "openzim/gutenberg", "tag": "latest"}, + "image": {"name": "openzim/gutenberg", "tag": "1.1.2"}, "resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}, }, {"name": "new_name"}, ] bad_patch_updates = [ + # Empty patch not allowed {}, + # wrong languages combinations {"language": {"name_en": "Bambara", "name_native": "Bamanankan"}}, {"language": {"code": "bm", "name_en": "", "name_native": "Bamanankan"}}, {"language": {"code": "bm", "name_en": "Bambara"}}, + # enabled flag must be a boolean, not a string {"enabled": "False"}, + # illegal category {"category": "ubuntu"}, + # empty tags not allowed {"tags": ""}, + # tags must be strings {"tags": ["full", 1]}, + # tags must be a list {"tags": "full,small"}, + # name cannot be empty {"name": ""}, + # config cannot be empty {"config": ""}, + # mwoffliner does not supports empty flags {"flags": {}}, + # cannot change only offliner, image must be changed as well + {"task_name": "phet", "flags": {}}, + # cannot change only image name, offliner must be changed as well + {"flags": {}, "image": {"name": "openzim/phet", "tag": "latest"}}, + # illegal offliner name { "task_name": "hop", - "warehouse_path": "/phet", "flags": {}, - "image": {"name": "openzim/phet", "tag": "latest"}, - "resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}, + "image": {"name": "openzim/hop", "tag": "latest"}, }, + # wrong warehouse_path { - "task_name": "phet", "warehouse_path": "/ubuntu", - "flags": {}, - "image": {"name": "openzim/phet", "tag": "latest"}, - "resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}, }, + # wrong mwUrl flag for phet { "task_name": "phet", - "warehouse_path": "/phet", "flags": {"mwUrl": "http://fr.wikipedia.org"}, "image": {"name": "openzim/phet", "tag": "latest"}, - "resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}, - }, - { - "task_name": "phet", - "warehouse_path": "/phet", - "flags": {}, - "image": {"name": "openzim/youtuba", "tag": "latest"}, - "resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}, }, + # image name not aligned with task name { "task_name": "gutenberg", - "warehouse_path": "/gutenberg", "flags": {}, "image": {"name": "openzim/youtube", "tag": "latest"}, + }, + # bad cpu value + { "resources": {"cpu": -1, "memory": MIN_RAM, "disk": ONE_GiB}, }, + # bad mem value + { + "resources": {"cpu": 3, "memory": -1, "disk": ONE_GiB}, + }, + # bad disk value + { + "resources": {"cpu": 3, "memory": MIN_RAM, "disk": -1}, + }, + # one resource missing + { + "resources": {"cpu": 3, "memory": MIN_RAM}, + }, + # illegal characters in name {"name": "new\u0000name"}, + # illegal characters in mwUrl { "flags": { "mwUrl": "https://fr.wiki\u0000pedia.org", "adminEmail": "hello@test.de", } }, + # illegal characters in adminEmail { "flags": { "mwUrl": "https://fr.wikipedia.org", @@ -455,6 +484,8 @@ def test_patch_schedule_via_name_with(self, client, access_token, update, schedu # let's reapply manually the changes that should have been done by the patch # so that we can confirm it has been done document = response.get_json() + + update_patch = {} config_keys = [ "task_name", "warehouse_path", @@ -464,11 +495,26 @@ def test_patch_schedule_via_name_with(self, client, access_token, update, schedu "flags", "monitor", ] - # these keys must not be applied since they are somewhere else is the document - for key in config_keys: - update.pop(key, None) - document.update(update) - assert response.get_json() == document + for key, value in update.items(): + if key in config_keys: + if "config" not in update_patch: + update_patch["config"] = {} + update_patch["config"][key] = value + else: + update_patch[key] = value + + # handle special situation for config image name + if ( + "config" in update_patch + and "image" in update_patch["config"] + and "name" in update_patch["config"]["image"] + ): + update_patch["config"]["image"]["name"] = ( + "ghcr.io/" + update_patch["config"]["image"]["name"] + ) + + patch_result = patch_dict(deepcopy(document), update_patch) + assert document == patch_result @pytest.mark.parametrize("update", bad_patch_updates) def test_patch_schedule_via_name_with_errors( diff --git a/dispatcher/backend/src/tests/utils_for_tests.py b/dispatcher/backend/src/tests/utils_for_tests.py new file mode 100644 index 000000000..85ffd350d --- /dev/null +++ b/dispatcher/backend/src/tests/utils_for_tests.py @@ -0,0 +1,21 @@ +def patch_dict(data, patch): + for key, patch_value in patch.items(): + if key in data: + if patch_value is None: + # If the patch value is None, remove the key from the original + # dictionary + del data[key] + else: + original_value = data[key] + if isinstance(original_value, dict) and isinstance(patch_value, dict): + # If both values are dictionaries, recursively patch the nested + # dictionaries + patch_dict(original_value, patch_value) + else: + # Otherwise, update the value in the original dictionary + data[key] = patch_value + else: + # If the key is not present in the original dictionary, set it with the + # patch value + data[key] = patch_value + return data From 1a27984679915a9425c58948345cf298dd4711b8 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 22 Feb 2024 08:42:50 +0100 Subject: [PATCH 05/14] Reinforce removal of S3 secrets in response data --- dispatcher/backend/src/routes/utils.py | 39 +++++++++++-------- .../src/tests/unit/routes/test_utils.py | 19 +++++++++ 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/dispatcher/backend/src/routes/utils.py b/dispatcher/backend/src/routes/utils.py index 04a835971..71ac1c2e8 100644 --- a/dispatcher/backend/src/routes/utils.py +++ b/dispatcher/backend/src/routes/utils.py @@ -26,7 +26,7 @@ def has_dict_sub_key(data: Dict[str, Any], keys: List[str]): def remove_secrets_from_response(response: dict): """replaces or removes (in-place) all occurences of secrets""" remove_secret_flags(response) - remove_upload_secrets(response) + remove_s3_secrets(response) def remove_secret_flags(response: dict): @@ -87,19 +87,24 @@ def remove_secret_flag_from_command( ] = f'--{field_name}="{SECRET_REPLACEMENT}"' -def remove_upload_secrets(response: dict): - """remove keyId and secretAccessKey upload_uri, since we upload logs to - S3 but still need the rest of the URL to download scraper logs""" - if not has_dict_sub_key(response, ["upload", "logs", "upload_uri"]): - return - url = urlparse(response["upload"]["logs"]["upload_uri"]) - response["upload"]["logs"]["upload_uri"] = url._replace( - query="&".join( - [ - param - for param in url.query.split("&") - if not param.lower().startswith("keyid") - and not param.lower().startswith("secretaccesskey") - ] - ) - ).geturl() +def remove_s3_secrets(response: dict): + """remove keyId and secretAccessKey query params from any URL we might find""" + for key in response.keys(): + if not response[key]: + continue + if isinstance(response[key], dict): + remove_s3_secrets(response[key]) + else: + if not isinstance(response[key], str) or "://" not in response[key]: + continue + url = urlparse(response[key]) + response[key] = url._replace( + query="&".join( + [ + param + for param in url.query.split("&") + if not param.lower().startswith("keyid") + and not param.lower().startswith("secretaccesskey") + ] + ) + ).geturl() diff --git a/dispatcher/backend/src/tests/unit/routes/test_utils.py b/dispatcher/backend/src/tests/unit/routes/test_utils.py index 7b410e418..2e8dff7db 100644 --- a/dispatcher/backend/src/tests/unit/routes/test_utils.py +++ b/dispatcher/backend/src/tests/unit/routes/test_utils.py @@ -222,6 +222,25 @@ }, }, }, + { + "config": { + "task_name": "kolibri", + "flags": { + "name": "khanacademy_en_all", + "optimization-cache": "this_is_super_secret", + }, + }, + "i_am_not_a_real": { + "response_but": { + "please_clean_me": ( + "s3://s3.us-west-1.wasabisys.com/" + "?keyId=this_is_super_secret" + "&secretAccessKey=this_is_super_secret" + "&bucketName=org-kiwix-zimfarm-logs" + ), + }, + }, + }, ], ) def test_remove_secrets(response): From 2e4b478cc60f81f79ae97cfaf1b14de94351630b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 22 Feb 2024 09:00:50 +0100 Subject: [PATCH 06/14] Handle the case where a key config is patched to a null value This situtaion is for now possible only for config keys, all "root" schedule keys cannot be set to null. When a schedule patch requires to set a config key to null, we want to: - remove the key from the config dictionary if already present - not set the key to null if the key was already absent from the config --- .../backend/src/routes/schedules/schedule.py | 8 ++- .../routes/schedules/test_schedule.py | 54 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/dispatcher/backend/src/routes/schedules/schedule.py b/dispatcher/backend/src/routes/schedules/schedule.py index d726b92c2..8bc347bd9 100644 --- a/dispatcher/backend/src/routes/schedules/schedule.py +++ b/dispatcher/backend/src/routes/schedules/schedule.py @@ -291,12 +291,18 @@ def patch( for key, value in update.items(): if key in config_keys: - schedule.config[key] = value + if value is None: + if key in schedule.config: + del schedule.config[key] + else: + schedule.config[key] = value elif key == "language": schedule.language_code = value["code"] schedule.language_name_en = value["name_en"] schedule.language_name_native = value["name_native"] else: + # we do not handle yet the case where a key is set to null because it is + # not allowed (yet) in UpdateSchema, only config keys can be set to null setattr(schedule, key, value) try: diff --git a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py index 0ac2a7516..63a2a6753 100644 --- a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py +++ b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py @@ -541,6 +541,60 @@ def test_unauthorized(self, client, access_token, schedules): assert response.status_code == 401 assert response.get_json() == {"error": "token invalid"} + def test_patch_schedule_does_not_set_null_config_keys( + self, client, access_token, schedule + ): + + update = {"platform": None} + url = "/schedules/{}".format(schedule["name"]) + response = client.patch( + url, json=update, headers={"Authorization": access_token} + ) + assert response.status_code == 204 + + response = client.get(url, headers={"Authorization": access_token}) + assert response.status_code == 200 + + document = response.get_json() + + assert "config" in document + assert "platform" not in document["config"] + + def test_patch_schedule_remove_null_config_keys( + self, client, access_token, schedule + ): + + update = {"platform": "ifixit"} + url = "/schedules/{}".format(schedule["name"]) + response = client.patch( + url, json=update, headers={"Authorization": access_token} + ) + assert response.status_code == 204 + + response = client.get(url, headers={"Authorization": access_token}) + assert response.status_code == 200 + + document = response.get_json() + + assert "config" in document + assert "platform" in document["config"] + assert document["config"]["platform"] == "ifixit" + + update = {"platform": None} + url = "/schedules/{}".format(schedule["name"]) + response = client.patch( + url, json=update, headers={"Authorization": access_token} + ) + assert response.status_code == 204 + + response = client.get(url, headers={"Authorization": access_token}) + assert response.status_code == 200 + + document = response.get_json() + + assert "config" in document + assert "platform" not in document["config"] + class TestScheduleDelete: def test_delete_schedule(self, client, access_token, schedule): From adf9f8dcee7def05b37a0f3e98a848c0256142dd Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 22 Feb 2024 09:13:18 +0100 Subject: [PATCH 07/14] Add support for manipulation of config artifacts_globs in backend API --- dispatcher/backend/src/common/schemas/models.py | 1 + dispatcher/backend/src/common/schemas/parameters.py | 3 +++ dispatcher/backend/src/routes/schedules/schedule.py | 1 + .../src/tests/integration/routes/schedules/test_schedule.py | 3 +++ 4 files changed, 8 insertions(+) diff --git a/dispatcher/backend/src/common/schemas/models.py b/dispatcher/backend/src/common/schemas/models.py index 05d2be474..d882e35eb 100644 --- a/dispatcher/backend/src/common/schemas/models.py +++ b/dispatcher/backend/src/common/schemas/models.py @@ -73,6 +73,7 @@ class ScheduleConfigSchema(SerializableSchema): resources = fields.Nested(ResourcesSchema(), required=True) flags = fields.Dict(required=True) platform = String(required=True, allow_none=True, validate=validate_platform) + artifacts_globs = fields.List(String(validate=validate_not_empty), required=False) monitor = fields.Boolean(required=True, truthy=[True], falsy=[False]) @staticmethod diff --git a/dispatcher/backend/src/common/schemas/parameters.py b/dispatcher/backend/src/common/schemas/parameters.py index 123dff943..b9bfb29f1 100644 --- a/dispatcher/backend/src/common/schemas/parameters.py +++ b/dispatcher/backend/src/common/schemas/parameters.py @@ -107,6 +107,9 @@ class UpdateSchema(Schema): resources = fields.Nested(ResourcesSchema, required=False) monitor = fields.Boolean(required=False, truthy={True}, falsy={False}) flags = fields.Dict(required=False) + artifacts_globs = fields.List( + String(validate=validate_not_empty), required=False, allow_none=True + ) # schedule clone diff --git a/dispatcher/backend/src/routes/schedules/schedule.py b/dispatcher/backend/src/routes/schedules/schedule.py index 8bc347bd9..d0facf290 100644 --- a/dispatcher/backend/src/routes/schedules/schedule.py +++ b/dispatcher/backend/src/routes/schedules/schedule.py @@ -287,6 +287,7 @@ def patch( "platform", "flags", "monitor", + "artifacts_globs", ] for key, value in update.items(): diff --git a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py index 63a2a6753..b9778c11b 100644 --- a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py +++ b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py @@ -37,6 +37,7 @@ def schedule(make_schedule): {"flags": {"mwUrl": "https://fr.wikipedia.org", "adminEmail": "hello@test.de"}}, {"warehouse_path": "/phet"}, {"image": {"name": "openzim/mwoffliner", "tag": "1.12.2"}}, + {"artifacts_globs": ["*.json", "**/*.json"]}, {"resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}}, { "task_name": "gutenberg", @@ -238,6 +239,7 @@ class TestSchedulePost: "platform": None, "resources": {"cpu": 3, "memory": MIN_RAM, "disk": ONE_GiB}, "monitor": False, + "artifacts_globs": ["**/*.json"], }, "notification": {}, }, @@ -494,6 +496,7 @@ def test_patch_schedule_via_name_with(self, client, access_token, update, schedu "platform", "flags", "monitor", + "artifacts_globs", ] for key, value in update.items(): if key in config_keys: From 0a1e805357b8c642c151c3ba0b75bee6c54efb01 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 22 Feb 2024 09:14:04 +0100 Subject: [PATCH 08/14] Add support for artifacts in UI --- dispatcher/frontend-ui/src/App.vue | 5 +++++ .../src/components/ScheduleEditor.vue | 20 +++++++++++++++++++ dispatcher/frontend-ui/src/constants.js | 15 +++++++++++--- .../frontend-ui/src/views/ScheduleView.vue | 1 + dispatcher/frontend-ui/src/views/TaskView.vue | 2 ++ 5 files changed, 40 insertions(+), 3 deletions(-) diff --git a/dispatcher/frontend-ui/src/App.vue b/dispatcher/frontend-ui/src/App.vue index 2a2315ad8..3b68d1ddf 100644 --- a/dispatcher/frontend-ui/src/App.vue +++ b/dispatcher/frontend-ui/src/App.vue @@ -141,6 +141,11 @@ .then(function (response) { // parent.error = null; let schedule = response.data; + + if (schedule.config.artifacts_globs) { + schedule.config.artifacts_globs_str = schedule.config.artifacts_globs.join("\n") + } + parent.$store.dispatch('setSchedule', schedule); if (on_success) { on_success(); } diff --git a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue index e619e2f5f..dca4c6ecb 100644 --- a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue +++ b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue @@ -193,6 +193,17 @@ + + + + + + + +

{{ edit_task_name}} command flags

@@ -435,6 +446,15 @@ payload.resources = parent.edit_schedule.config.resources; } + // artifacts globs needs to be transformed into a real list + let new_artifacts_globs = null; + if (parent.edit_schedule.config.artifacts_globs_str && parent.edit_schedule.config.artifacts_globs_str.trim() !== "") { + new_artifacts_globs = parent.edit_schedule.config.artifacts_globs_str.split(/\r?\n/).map(s => s.trim()).filter(Boolean); + } + if (new_artifacts_globs != parent.schedule.config.artifacts_globs) { + payload.artifacts_globs = new_artifacts_globs; + } + if (this.flags_payload) Object.assign(payload, {"flags": this.flags_payload}); diff --git a/dispatcher/frontend-ui/src/constants.js b/dispatcher/frontend-ui/src/constants.js index eb337a2a4..0b422e066 100644 --- a/dispatcher/frontend-ui/src/constants.js +++ b/dispatcher/frontend-ui/src/constants.js @@ -113,7 +113,15 @@ function image_url(config) { } function logs_url(task) { - let url = new URL(task.upload.logs.upload_uri); + return upload_url(task.upload.logs.upload_uri, task.container.log) +} + +function artifacts_url(task) { + return upload_url(task.upload.artifacts.upload_uri, task.container.artifacts) +} + +function upload_url(uri, filename) { + let url = new URL(uri); let scheme = url.protocol.replace(/:$/, ""); if (["http", "https"].indexOf(scheme) == -1) @@ -124,10 +132,10 @@ function logs_url(task) { let bucketName = url.searchParams.get("bucketName"); if (bucketName) log_url += bucketName + "/"; - return log_url + task.container.log; + return log_url + filename; } - return task.container.log; + return filename; } function build_command_without(config, secret_fields) { @@ -520,6 +528,7 @@ export default { image_human: image_human, image_url: image_url, logs_url: logs_url, + artifacts_url: artifacts_url, build_docker_command: build_docker_command, build_command_without: build_command_without, trim_command: trim_command, diff --git a/dispatcher/frontend-ui/src/views/ScheduleView.vue b/dispatcher/frontend-ui/src/views/ScheduleView.vue index 6574dddbc..fc1c32d6e 100644 --- a/dispatcher/frontend-ui/src/views/ScheduleView.vue +++ b/dispatcher/frontend-ui/src/views/ScheduleView.vue @@ -131,6 +131,7 @@ monitored + Artifacts{{ schedule.config.artifacts_globs }} Config Command {{ command }} Offliner Command {{ offliner_command }} diff --git a/dispatcher/frontend-ui/src/views/TaskView.vue b/dispatcher/frontend-ui/src/views/TaskView.vue index ca430c699..24f0e2df2 100644 --- a/dispatcher/frontend-ui/src/views/TaskView.vue +++ b/dispatcher/frontend-ui/src/views/TaskView.vue @@ -127,6 +127,7 @@
{{ task_container.stderr }}
Scraper LogDownload log + Scraper ArtifactsDownload artifacts Exception
{{ task_debug.exception }}
Traceback
{{ task_debug.traceback }}
Task-worker Log
{{ task_debug.log }}
@@ -210,6 +211,7 @@ secret_fields() { return Constants.secret_fields_for(this.offliner_def); }, pipe_duration() { return Constants.format_duration_between(this.task.timestamp.requested, this.task.timestamp.started); }, zimfarm_logs_url() { return Constants.logs_url(this.task); }, + zimfarm_artifacts_url() { return Constants.artifacts_url(this.task); }, kiwix_download_url() { return Constants.kiwix_download_url; }, webapi_url() { return Constants.zimfarm_webapi; }, command() { return this.task_container.command.join(" "); }, From f5237af1a200569718924dfc2da81c0a679add06 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 22 Feb 2024 11:02:47 +0100 Subject: [PATCH 09/14] Enhance descriptions for docker image name and tag --- dispatcher/frontend-ui/src/components/ScheduleEditor.vue | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue index dca4c6ecb..61e57a7af 100644 --- a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue +++ b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue @@ -120,7 +120,7 @@ - + - + Date: Thu, 22 Feb 2024 11:03:01 +0100 Subject: [PATCH 10/14] Add semicolons for consistency --- dispatcher/frontend-ui/src/components/ScheduleEditor.vue | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue index 61e57a7af..2d4e6b00e 100644 --- a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue +++ b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue @@ -131,7 +131,7 @@ - + - + {{ edit_schedule.config.monitor|yes_no("Enabled", "Disabled") }} From b56caab1663a75c3980aecfb248508f3cbede82e Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 22 Feb 2024 11:19:41 +0100 Subject: [PATCH 11/14] Rewrite code to simplify method --- .../backend/src/routes/schedules/schedule.py | 107 ++++++++++-------- 1 file changed, 60 insertions(+), 47 deletions(-) diff --git a/dispatcher/backend/src/routes/schedules/schedule.py b/dispatcher/backend/src/routes/schedules/schedule.py index d0facf290..49dcb0a4a 100644 --- a/dispatcher/backend/src/routes/schedules/schedule.py +++ b/dispatcher/backend/src/routes/schedules/schedule.py @@ -225,60 +225,56 @@ def get(self, schedule_name: str, token: AccessToken.Payload, session: so.Sessio return jsonify(schedule) - @authenticate - @require_perm("schedules", "update") - @dbsession - def patch( - self, schedule_name: str, token: AccessToken.Payload, session: so.Session - ): - """Update all properties of a schedule but _id and most_recent_task""" + def _get_and_validate_patch(self, schedule, request): + """Check patch request is valid + + Current schedule is needed to check validity since the patch contains only + changes and some combination of fields are needed under some conditions, e.g. + you cannot change the task_name (offliner) without changing flags and image name + """ + update = UpdateSchema().load(request.get_json()) + raise_if(not request.get_json(), ValidationError, "Update can't be empty") + + # ensure we test flags according to new task_name if present + if ( + "task_name" in update + and update["task_name"] != schedule.config["task_name"] + ): + raise_if( + "flags" not in update, + ValidationError, + "Can't update offliner without updating flags", + ) + raise_if( + "image" not in update or "name" not in update["image"], + ValidationError, + "Image name must be updated when offliner is changed", + ) - schedule = dbm.Schedule.get(session, schedule_name, ScheduleNotFound) + flags_schema = ScheduleConfigSchema.get_offliner_schema(update["task_name"]) + else: + flags_schema = ScheduleConfigSchema.get_offliner_schema( + schedule.config["task_name"] + ) - try: - update = UpdateSchema().load(request.get_json()) - raise_if(not request.get_json(), ValidationError, "Update can't be empty") - - # ensure we test flags according to new task_name if present - if ( - "task_name" in update - and update["task_name"] != schedule.config["task_name"] - ): - raise_if( - "flags" not in update, - ValidationError, - "Can't update offliner without updating flags", - ) - raise_if( - "image" not in update or "name" not in update["image"], - ValidationError, - "Image name must be updated when offliner is changed", - ) + if "flags" in update: + flags_schema().load(update["flags"]) - flags_schema = ScheduleConfigSchema.get_offliner_schema( - update["task_name"] - ) + if "image" in update and "name" in update["image"]: + if "task_name" in update: + future_task_name = update["task_name"] else: - flags_schema = ScheduleConfigSchema.get_offliner_schema( - schedule.config["task_name"] - ) - - if "flags" in update: - flags_schema().load(update["flags"]) + future_task_name = schedule.config["task_name"] - if "image" in update and "name" in update["image"]: - if "task_name" in update: - future_task_name = update["task_name"] - else: - future_task_name = schedule.config["task_name"] + if Offliner.get_image_prefix(future_task_name) + update["image"][ + "name" + ] != Offliner.get_image_name(future_task_name): + raise ValidationError("Image name must match selected offliner") - if Offliner.get_image_prefix(future_task_name) + update["image"][ - "name" - ] != Offliner.get_image_name(future_task_name): - raise ValidationError("Image name must match selected offliner") - except ValidationError as e: - raise InvalidRequestJSON(e.messages) + return update + def _apply_patch_to_schedule(self, schedule, update): + """Apply the patch update to the schedule""" config_keys = [ "task_name", "warehouse_path", @@ -306,6 +302,23 @@ def patch( # not allowed (yet) in UpdateSchema, only config keys can be set to null setattr(schedule, key, value) + @authenticate + @require_perm("schedules", "update") + @dbsession + def patch( + self, schedule_name: str, token: AccessToken.Payload, session: so.Session + ): + """Update all properties of a schedule but _id and most_recent_task""" + + schedule = dbm.Schedule.get(session, schedule_name, ScheduleNotFound) + + try: + update = self._get_and_validate_patch(schedule=schedule, request=request) + except ValidationError as e: + raise InvalidRequestJSON(e.messages) + + self._apply_patch_to_schedule(schedule=schedule, update=update) + try: session.flush() except IntegrityError: From 0bcca031be8d1645f34ead665bc9b37e1741ac19 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 23 Feb 2024 14:32:27 +0100 Subject: [PATCH 12/14] Small changes following review --- dispatcher/backend/src/common/constants.py | 3 +++ .../integration/routes/schedules/test_schedule.py | 3 ++- dispatcher/backend/src/tests/utils_for_tests.py | 11 ++++++++++- .../frontend-ui/src/components/ScheduleEditor.vue | 2 +- workers/app/task/worker.py | 10 +++++----- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/dispatcher/backend/src/common/constants.py b/dispatcher/backend/src/common/constants.py index edfca5d60..c9b617402 100644 --- a/dispatcher/backend/src/common/constants.py +++ b/dispatcher/backend/src/common/constants.py @@ -23,6 +23,7 @@ "ZIM_UPLOAD_URI", "sftp://uploader@warehouse.farm.openzim.org:1522/zim" ) try: + # ZIM files expiration, 0 to disable expiration ZIM_EXPIRATION = int(os.getenv("ZIM_EXPIRATION", "0")) except Exception: ZIM_EXPIRATION = 0 @@ -30,6 +31,7 @@ "LOGS_UPLOAD_URI", "sftp://uploader@warehouse.farm.openzim.org:1522/logs" ) try: + # log files expiration, 0 to disable expiration LOGS_EXPIRATION = int(os.getenv("LOGS_EXPIRATION", "30")) except Exception: LOGS_EXPIRATION = 30 @@ -37,6 +39,7 @@ "ARTIFACTS_UPLOAD_URI", "sftp://uploader@warehouse.farm.openzim.org:1522/artifacts" ) try: + # artifact files expiration, 0 to disable expiration ARTIFACTS_EXPIRATION = int(os.getenv("ARTIFACTS_EXPIRATION", "30")) except Exception: ARTIFACTS_EXPIRATION = 30 diff --git a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py index b9778c11b..c8effa03c 100644 --- a/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py +++ b/dispatcher/backend/src/tests/integration/routes/schedules/test_schedule.py @@ -516,7 +516,8 @@ def test_patch_schedule_via_name_with(self, client, access_token, update, schedu "ghcr.io/" + update_patch["config"]["image"]["name"] ) - patch_result = patch_dict(deepcopy(document), update_patch) + patch_result = deepcopy(document) + patch_dict(patch_result, update_patch) assert document == patch_result @pytest.mark.parametrize("update", bad_patch_updates) diff --git a/dispatcher/backend/src/tests/utils_for_tests.py b/dispatcher/backend/src/tests/utils_for_tests.py index 85ffd350d..0c2593411 100644 --- a/dispatcher/backend/src/tests/utils_for_tests.py +++ b/dispatcher/backend/src/tests/utils_for_tests.py @@ -1,4 +1,14 @@ def patch_dict(data, patch): + """Apply a patch to a dictionnary + + - data is the dictionnary to modify (in-place) + - patch is a dictionnary of modifications to apply (could contain nested dictionnary + when the value to modify is deep inside the data dictionnary) + + E.g. if data is { "key1": { "subkey1": "value1", subkey2": "value2" } } and patch is + { "key1": { "subkey2": "newvalue2"}} then after the operation data will become + { "key1": { "subkey1": "value1", subkey2": "newvalue2" } } + """ for key, patch_value in patch.items(): if key in data: if patch_value is None: @@ -18,4 +28,3 @@ def patch_dict(data, patch): # If the key is not present in the original dictionary, set it with the # patch value data[key] = patch_value - return data diff --git a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue index 2d4e6b00e..47ffb11f0 100644 --- a/dispatcher/frontend-ui/src/components/ScheduleEditor.vue +++ b/dispatcher/frontend-ui/src/components/ScheduleEditor.vue @@ -131,7 +131,7 @@ - + Date: Fri, 23 Feb 2024 14:32:57 +0100 Subject: [PATCH 13/14] Enhance parsing of query string for s3 secrets removal --- dispatcher/backend/src/routes/utils.py | 10 +-- .../src/tests/unit/routes/test_utils.py | 62 +++++++++++++++++++ 2 files changed, 67 insertions(+), 5 deletions(-) diff --git a/dispatcher/backend/src/routes/utils.py b/dispatcher/backend/src/routes/utils.py index 71ac1c2e8..b59aa9faa 100644 --- a/dispatcher/backend/src/routes/utils.py +++ b/dispatcher/backend/src/routes/utils.py @@ -1,6 +1,6 @@ import logging from typing import Any, Dict, List -from urllib.parse import urlparse +from urllib.parse import parse_qs, urlparse from common.constants import SECRET_REPLACEMENT from common.schemas.models import ScheduleConfigSchema @@ -101,10 +101,10 @@ def remove_s3_secrets(response: dict): response[key] = url._replace( query="&".join( [ - param - for param in url.query.split("&") - if not param.lower().startswith("keyid") - and not param.lower().startswith("secretaccesskey") + f"{key}={value}" + for key, values in parse_qs(url.query).items() + if str(key).lower() not in ["keyid", "secretaccesskey"] + for value in values ] ) ).geturl() diff --git a/dispatcher/backend/src/tests/unit/routes/test_utils.py b/dispatcher/backend/src/tests/unit/routes/test_utils.py index 2e8dff7db..fbb488136 100644 --- a/dispatcher/backend/src/tests/unit/routes/test_utils.py +++ b/dispatcher/backend/src/tests/unit/routes/test_utils.py @@ -249,6 +249,68 @@ def test_remove_secrets(response): assert "this_is_super_secret" not in str(response) +@pytest.mark.parametrize( + "response_before,response_after", + [ + ( + { + "please_clean_me1": ( + "s3://s3.us-west-1.wasabisys.com/" + "?keyId=this_is_super_secret" + "&secretAccessKey=this_is_super_secret" + "&bucketName=org-kiwix-zimfarm-logs" + ), + "please_clean_me2": ( + "s3://s3.us-west-1.wasabisys.com/" + "?bucketName=org-kiwix-zimfarm-logs" + "&keyId=this_is_super_secret" + "&secretAccessKey=this_is_super_secret" + ), + "please_clean_me3": ( + "s3://s3.us-west-1.wasabisys.com/" + "?bucketName=org-kiwix-zimfarm-logs" + "&keyId=this_is_super_secret" + "&secretAccessKey=this_is_super_secret" + "&something=somevalue" + ), + "please_clean_me4": ( + "s3://s3.us-west-1.wasabisys.com/" + "?bucketName=org-kiwix-zimfarm-logs" + "&secretAccessKey=this_is_super_secret" + "&something=somevalue" + "&keyId=this_is_super_secret" + "&something2=somevalue2" + ), + }, + { + "please_clean_me1": ( + "s3://s3.us-west-1.wasabisys.com/" + "?bucketName=org-kiwix-zimfarm-logs" + ), + "please_clean_me2": ( + "s3://s3.us-west-1.wasabisys.com/" + "?bucketName=org-kiwix-zimfarm-logs" + ), + "please_clean_me3": ( + "s3://s3.us-west-1.wasabisys.com/" + "?bucketName=org-kiwix-zimfarm-logs" + "&something=somevalue" + ), + "please_clean_me4": ( + "s3://s3.us-west-1.wasabisys.com/" + "?bucketName=org-kiwix-zimfarm-logs" + "&something=somevalue" + "&something2=somevalue2" + ), + }, + ), + ], +) +def test_remove_secrets_url_kept(response_before, response_after): + remove_secrets_from_response(response_before) + assert response_before == response_after + + @pytest.mark.parametrize( "data, keys, has", [ From bc235210e8572285f6f1f31809d473135f777645 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Fri, 23 Feb 2024 14:43:24 +0100 Subject: [PATCH 14/14] By default, artifacts functionnality is disabled --- dispatcher/backend/src/common/constants.py | 4 +--- workers/app/task/worker.py | 4 ++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/dispatcher/backend/src/common/constants.py b/dispatcher/backend/src/common/constants.py index c9b617402..3be97e5b3 100644 --- a/dispatcher/backend/src/common/constants.py +++ b/dispatcher/backend/src/common/constants.py @@ -35,9 +35,7 @@ LOGS_EXPIRATION = int(os.getenv("LOGS_EXPIRATION", "30")) except Exception: LOGS_EXPIRATION = 30 -ARTIFACTS_UPLOAD_URI = os.getenv( - "ARTIFACTS_UPLOAD_URI", "sftp://uploader@warehouse.farm.openzim.org:1522/artifacts" -) +ARTIFACTS_UPLOAD_URI = os.getenv("ARTIFACTS_UPLOAD_URI", None) try: # artifact files expiration, 0 to disable expiration ARTIFACTS_EXPIRATION = int(os.getenv("ARTIFACTS_EXPIRATION", "30")) diff --git a/workers/app/task/worker.py b/workers/app/task/worker.py index 5296fdea3..89ee7bc55 100644 --- a/workers/app/task/worker.py +++ b/workers/app/task/worker.py @@ -469,6 +469,10 @@ def upload_scraper_artifacts(self): logger.error("No scraper to upload its artifacts…") return # scraper gone, we can't access artifacts + if not self.task["upload"]["artifacts"]["upload_uri"]: + logger.debug("No artifacts upload URI configured") + return + artifacts_globs = self.task["config"].get("artifacts_globs", None) if not artifacts_globs: logger.debug("No artifacts configured for upload")