From 0dbd2ac170b53d288681c70a7efd1d46947de9c0 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 18 Dec 2024 17:44:34 -0600 Subject: [PATCH 001/327] `git merge --squash replay` --- .github/workflows/image-publish.yml | 3 +- .github/workflows/wipac-cicd.yml | 58 +- .gitignore | 1 + README.md | 165 +++--- dependencies-from-Dockerfile.log | 148 +----- resources/prod_tester/test_getter.py | 72 ++- resources/prod_tester/test_runner.py | 4 +- resources/prod_tester/test_suit_prod.py | 15 +- setup.cfg | 2 +- skydriver/config.py | 2 +- skydriver/database/__init__.py | 10 +- skydriver/database/interface.py | 190 ++++--- skydriver/database/schema.py | 37 +- skydriver/database/utils.py | 17 +- skydriver/images.py | 40 +- skydriver/k8s/scan_backlog.py | 117 +++- skydriver/rest_handlers.py | 644 ++++++++++++++--------- skydriver/server.py | 3 +- tests/integration/test_backlog_runner.py | 8 +- tests/integration/test_rest_routes.py | 207 ++++++-- tests/unit/test_sanity.py | 35 -- 21 files changed, 1045 insertions(+), 733 deletions(-) delete mode 100644 tests/unit/test_sanity.py diff --git a/.github/workflows/image-publish.yml b/.github/workflows/image-publish.yml index cecb3462..7db7d618 100644 --- a/.github/workflows/image-publish.yml +++ b/.github/workflows/image-publish.yml @@ -1,4 +1,4 @@ -name: docker releases +name: image publish on: push: @@ -18,7 +18,6 @@ env: jobs: docker: - name: "Docker Image" runs-on: ubuntu-latest steps: - name: Checkout Project diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index c38517e6..16b79cca 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -1,6 +1,6 @@ name: wipac ci/cd -on: [push] +on: [ push ] env: CI_TEST: 'yes' @@ -19,27 +19,29 @@ jobs: outputs: matrix: ${{ steps.versions.outputs.matrix }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - id: versions uses: WIPACrepo/wipac-dev-py-versions-action@v2.1 flake8: runs-on: ubuntu-latest steps: - - uses: 
actions/checkout@v3 - - uses: actions/setup-python@v3 - - uses: WIPACrepo/wipac-dev-flake8-action@v1.0 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + - uses: WIPACrepo/wipac-dev-flake8-action@v1.2 + with: + max-complexity: 10 mypy: - needs: [py-versions] + needs: [ py-versions ] runs-on: ubuntu-latest strategy: fail-fast: false matrix: py3: ${{ fromJSON(needs.py-versions.outputs.matrix) }} steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.py3 }} - uses: WIPACrepo/wipac-dev-mypy-action@v2.0 @@ -53,7 +55,7 @@ jobs: github.actor != 'dependabot[bot]' && github.ref_type == 'branch' name: checkout (only for non-dependabot non-default branches) - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} - if: | @@ -65,15 +67,15 @@ jobs: base-keywords: WIPAC IceCube unit-tests: - needs: [py-versions] + needs: [ py-versions ] runs-on: ubuntu-latest strategy: fail-fast: false matrix: py3: ${{ fromJSON(needs.py-versions.outputs.matrix) }} steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v3 + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: python-version: ${{ matrix.py3 }} @@ -92,7 +94,7 @@ jobs: cat pytest.logs || true integration-tests: - needs: [py-versions] + needs: [ py-versions ] runs-on: ubuntu-latest strategy: fail-fast: false @@ -102,9 +104,9 @@ jobs: mongo: image: bitnami/mongodb:4 ports: - - 27017:27017 + - 27017:27017 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: docker/setup-buildx-action@v2 - uses: docker/build-push-action@v3 with: @@ -130,7 +132,7 @@ jobs: echo "#!/bin/bash" >> $DIR/test-script.sh echo "set -xe" >> $DIR/test-script.sh echo "pip install .[tests]" >> $DIR/test-script.sh - echo "python -m pytest -vvv tests/integration" >> $DIR/test-script.sh + echo "python -m pytest -vvv tests/integration --exitfirst" 
>> $DIR/test-script.sh chmod +x $DIR/test-script.sh cat $DIR/test-script.sh @@ -159,7 +161,7 @@ jobs: test-build-docker: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: docker/setup-buildx-action@v2 - uses: docker/build-push-action@v3 with: @@ -170,17 +172,17 @@ jobs: release: # only run on main/master/default if: format('refs/heads/{0}', github.event.repository.default_branch) == github.ref - needs: [flake8, mypy, py-setup, unit-tests, integration-tests, test-build-docker] + needs: [ flake8, mypy, py-setup, unit-tests, integration-tests, test-build-docker ] runs-on: ubuntu-latest concurrency: release steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 0 - token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} - - name: Python Semantic Release - uses: python-semantic-release/python-semantic-release@v7.34.6 - with: - github_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} - # repository_username: __token__ - # repository_password: ${{ secrets.PYPI_TOKEN }} + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} + - name: Python Semantic Release + uses: python-semantic-release/python-semantic-release@v7.34.6 + with: + github_token: ${{ secrets.PERSONAL_ACCESS_TOKEN }} + # repository_username: __token__ + # repository_password: ${{ secrets.PYPI_TOKEN }} diff --git a/.gitignore b/.gitignore index cd31f730..2012806d 100644 --- a/.gitignore +++ b/.gitignore @@ -142,3 +142,4 @@ gke-cluster-config.yaml .idea/ +test-suit-sandbox* diff --git a/README.md b/README.md index fdd2b17b..088271b4 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,17 @@ [![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub 
issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) + # SkyDriver v1 + A SaaS Solution for Neutrino Event Reconstruction using the Skymap Scanner ## Overview + SkyDriver automates the entire scanning of an event: starting all servers and workers, transferring all needed data, and finally, all tear-down. SkyDriver also includes a database for storing scan requests, progress reports, and results. The computational engine for a scan is the [Skymap Scanner](https://github.com/icecube/skymap_scanner). The main interface is a REST server with several [routes and methods](#rest-api). One of many workflows may be: + 1. Request a scan ([POST @ `/scan`](#scan---post)) 1. Monitor the scanning status ([GET @ `/scan/SCAN_ID/status`](#scanscan_idstatus---get)) 2. Check for progress updates ([GET @ `/scan/SCAN_ID/manifest`](#scanscan_idmanifest---get)) @@ -16,17 +20,20 @@ One of many workflows may be: 5. [Make plots](#making-plots-with-a-scans-result-using-the-scan_id) Another workflow: + 1. Find a scan id for a particular run and event ([GET @ `/scans/find`](#scansfind---post)) 2. Get the scan's manifest and result ([GET @ `/scan/SCAN_ID`](#scanscan_id---get)) - -   + ## Getting Started + Users interface with SkyDriver via REST calls, so first, you will need to get a connection. This example uses [wipac-rest-tools](https://pypi.org/project/wipac-rest-tools/): + ```python from rest_tools.client import RestClient, SavedDeviceGrantAuth + def get_rest_client() -> RestClient: """Get REST client for talking to SkyDriver. 
@@ -45,9 +52,12 @@ def get_rest_client() -> RestClient: retries=0, ) + rc = get_rest_client() ``` + Now, you can make all the REST calls needed: + ```python rc.request_seq(method, path, args_dict) ``` @@ -55,12 +65,14 @@ rc.request_seq(method, path, args_dict) ### Two Quick Examples To request a new scan (see [POST @ `/scan`](#scan---post)): + ```python manifest = rc.request_seq("POST", "/scan", {"docker_tag": ...}) print(json.dumps(manifest)) ``` To see your scan's status (see [GET @ `/scan/SCAN_ID/status`](#scanscan_idstatus---get)): + ```python status = rc.request_seq("GET", f"/scan/{scan_id}/status") print(json.dumps(status)) @@ -70,14 +82,11 @@ Refer to the [REST API](#rest-api) section for comprehensive documentation detai Also, see [Using a Scan Result Outside of SkyDriver](#using-a-scan-result-outside-of-skydriver). - -   -## REST API -Documentation for the public-facing routes and method - +## REST API +Documentation for the public-facing routes and method   ### `/scan` - POST @@ -85,34 +94,35 @@ Documentation for the public-facing routes and method _Launch a new scan of an event_ #### Arguments -| Argument | Type | Required/Default | Description | -| --------------------------------- | ------------ | ---------------- | -------------------- | -| `"docker_tag"` | str | *[REQUIRED]* | the docker tag of the Skymap Scanner image (must be in CVMFS). Ex: `v3.1.4`, `v3.5`, `v3`, `latest`, `eqscan-6207146` (branch-based tag) -| `"cluster"` | dict or list | *[REQUIRED]* | the worker cluster(s) to use *along with the number of workers for each:* Example: `{"sub-2": 1234}`. 
NOTE: To request a schedd more than once, provide a list of 2-lists instead (Ex: `[ ["sub-2", 56], ["sub-2", 1234] ]`) -| `"reco_algo"` | bool | *[REQUIRED]* | which reco algorithm to use (see [Skymap Scanner](https://github.com/icecube/skymap_scanner/tree/main/skymap_scanner/recos)) -| `"event_i3live_json"` | dict or str | *[REQUIRED]* | Realtime's JSON event format -| `"nsides"` | dict | *[REQUIRED]* | the nside progression to use (see [Skymap Scanner](https://github.com/icecube/skymap_scanner)) -| `"real_or_simulated_event"` | str | *[REQUIRED]* | whether this event is real or simulated. Ex: `real`, `simulated` -| `"max_pixel_reco_time"` | int | *[REQUIRED]* | the max amount of time (seconds) each pixel's reco should take (accurate values will evict pixels from slow workers thereby re-delivering to faster workers -- slow workers are unavoidable due to non-deterministic errors) -| `"max_worker_runtime"` | int | default: `4*60*60` | the max amount of time (second) each client worker can work for (larger values are needed as the event size increases AND the workforce size decreases) -| `"skyscan_mq_client_timeout_wait_for_first_message"` | int | default: image's default value | how long a client can wait for its first message (pixel) before giving up and exiting -| `"scanner_server_memory"` | str | default: `1024M` | how much memory for the scanner server to request -| `"worker_memory"` | str | default: `8G` | how much memory per client worker to request -| `"worker_disk"` | str | default: `1G` | how much disk per client worker to request -| `"debug_mode"` | str or list | default: None | what debug mode(s) to use: `"client-logs"` collects the scanner clients' stderr/stdout including icetray logs (scans are limited in # of workers) -| `"predictive_scanning_threshold"` | float | default: `1.0` | the predictive scanning threshold `[0.1, 1.0]` (see [Skymap Scanner](https://github.com/icecube/skymap_scanner)) -| `"priority"` | int | default: `0` | the relative priority 
of this scan -- higher values indicate higher priority. **NOTE: Values `>= 10` are reserved for Realtime alert scans (these scan requests are not throttled).** Also, see [HTCondor jobs](https://htcondor.readthedocs.io/en/latest/users-manual/priorities-and-preemption.html#job-priority) -| `"classifiers"` | dict[str, str | bool | float | int] | default: `{}` | a user-defined collection of labels, attributes, etc. -- this is constrained in size and is intended for user-defined metadata only -| `"manifest_projection"` | list | default: all fields but [these](#manifest-fields-excluded-by-default-in-response) | which `Manifest` fields to include in the response (include `*` to include all fields) +| Argument | Type | Required/Default | Description | +|------------------------------------------------------|-----------------------------------------------------------------|-----------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `"docker_tag"` | str | *[REQUIRED]* | the docker tag of the Skymap Scanner image (must be in CVMFS). Ex: `v3.1.4`, `v3.5`, `v3`, `latest`, `eqscan-6207146` (branch-based tag) +| `"cluster"` | dict or list | *[REQUIRED]* | the worker cluster(s) to use *along with the number of workers for each:* Example: `{"sub-2": 1234}`. 
NOTE: To request a schedd more than once, provide a list of 2-lists instead (Ex: `[ ["sub-2", 56], ["sub-2", 1234] ]`) +| `"reco_algo"` | bool | *[REQUIRED]* | which reco algorithm to use (see [Skymap Scanner](https://github.com/icecube/skymap_scanner/tree/main/skymap_scanner/recos)) +| `"event_i3live_json"` | dict or str | *[REQUIRED]* | Realtime's JSON event format +| `"nsides"` | dict | *[REQUIRED]* | the nside progression to use (see [Skymap Scanner](https://github.com/icecube/skymap_scanner)) +| `"real_or_simulated_event"` | str | *[REQUIRED]* | whether this event is real or simulated. Ex: `real`, `simulated` +| `"max_pixel_reco_time"` | int | *[REQUIRED]* | the max amount of time (seconds) each pixel's reco should take (accurate values will evict pixels from slow workers thereby re-delivering to faster workers -- slow workers are unavoidable due to non-deterministic errors) +| `"max_worker_runtime"` | int | default: `4*60*60` | the max amount of time (seconds) each client worker can work for (larger values are needed as the event size increases AND the workforce size decreases) +| `"skyscan_mq_client_timeout_wait_for_first_message"` | int | default: image's default value | how long a client can wait for its first message (pixel) before giving up and exiting +| `"scanner_server_memory"` | str | default: `1024M` | how much memory for the scanner server to request +| `"worker_memory"` | str | default: `8G` | how much memory per client worker to request +| `"worker_disk"` | str | default: `1G` | how much disk per client worker to request +| `"debug_mode"` | str or list | default: None | what debug mode(s) to use: `"client-logs"` collects the scanner clients' stderr/stdout including icetray logs (scans are limited in # of workers) +| `"predictive_scanning_threshold"` | float | default: `1.0` | the predictive scanning threshold `[0.1, 1.0]` (see [Skymap Scanner](https://github.com/icecube/skymap_scanner)) +| `"priority"` | int | default: `0` | the relative priority
of this scan -- higher values indicate higher priority. **NOTE: Values `>= 100` are reserved for Realtime alert scans (these scan requests are not throttled).** Also, see [HTCondor jobs](https://htcondor.readthedocs.io/en/latest/users-manual/priorities-and-preemption.html#job-priority) +| `"classifiers"` | dict[str, str | bool | float | int] | default: `{}` | a user-defined collection of labels, attributes, etc. -- this is constrained in size and is intended for user-defined metadata only +| `"manifest_projection"` | list | default: all fields but [these](#manifest-fields-excluded-by-default-in-response) | which `Manifest` fields to include in the response (include `*` to include all fields) #### SkyDriver Effects + - Creates and starts a new Skymap Scanner instance spread across many client workers - The new scanner will send updates routinely and when the scan completes (see [GET (manifest)](#scanscan_idmanifest-get) and [GET (result)](#scanscan_idresult-get)) #### Returns -dict - [Manifest](#manifest) +dict - [Manifest](#manifest)   ### `/scan/SCAN_ID/manifest` - GET @@ -120,18 +130,19 @@ dict - [Manifest](#manifest) _Retrieve the manifest of a scan_ #### Arguments -| Argument | Type | Required/Default | Description | -| ------------------- | ----------- | ---------------- | -------------------- | -| `"include_deleted"` | bool | default: `False` | *Not normally needed* -- `True` prevents a 404 error if the scan was deleted (aborted) - +| Argument | Type | Required/Default | Description | +|---------------------|-------------------------|------------------|----------------------------------------------------------------------------------------| +| `"include_deleted"` | bool | default: `False` | *Not normally needed* -- `True` prevents a 404 error if the scan was deleted (aborted) + #### SkyDriver Effects + None #### Returns -dict - [Manifest](#manifest) +dict - [Manifest](#manifest)   ### `/scan/SCAN_ID/result` - GET @@ -139,16 +150,18 @@ dict - 
[Manifest](#manifest) _Retrieve the result of a scan_ #### Arguments -| Argument | Type | Required/Default | Description | -| ------------------- | ----------- | ---------------- | -------------------- | -| `"include_deleted"` | bool | default: `False` | *Not normally needed* -- `True` prevents a 404 error if the scan was deleted (aborted) + +| Argument | Type | Required/Default | Description | +|---------------------|------|------------------|----------------------------------------------------------------------------------------| +| `"include_deleted"` | bool | default: `False` | *Not normally needed* -- `True` prevents a 404 error if the scan was deleted (aborted) #### SkyDriver Effects + None #### Returns -dict - [Result](#result) +dict - [Result](#result)   ### `/scan/SCAN_ID` - GET @@ -156,67 +169,74 @@ dict - [Result](#result) _Retrieve the manifest and result of a scan_ #### Arguments -| Argument | Type | Required/Default | Description | -| ------------------- | ----------- | ---------------- | -------------------- | -| `"include_deleted"` | bool | default: `False` | *Not normally needed* -- `True` prevents a 404 error if the scan was deleted (aborted) - +| Argument | Type | Required/Default | Description | +|---------------------|-------------------------|------------------|----------------------------------------------------------------------------------------| +| `"include_deleted"` | bool | default: `False` | *Not normally needed* -- `True` prevents a 404 error if the scan was deleted (aborted) + #### SkyDriver Effects + None #### Returns + ``` { "manifest": Manifest dict, "result": Result dict, } ``` + - See [Manifest](#manifest) - See [Result](#result) -   ### `/scan/SCAN_ID` - DELETE ------------------------------------------------------------------------------- _Abort a scan and/or mark scan (manifest and result) as "deleted"_ #### Arguments -| Argument | Type | Required/Default | Description | -| ------------------------- | ----------- | 
---------------- | -------------------- | -| `"delete_completed_scan"` | bool | default: `False` | whether to mark a completed scan as "deleted" -- *this is not needed for aborting an ongoing scan* -| `"manifest_projection"` | list | default: all fields but [these](#manifest-fields-excluded-by-default-in-response) | which `Manifest` fields to include in the response (include `*` to include all fields) + +| Argument | Type | Required/Default | Description | +|---------------------------|------|-----------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------| +| `"delete_completed_scan"` | bool | default: `False` | whether to mark a completed scan as "deleted" -- *this is not needed for aborting an ongoing scan* +| `"manifest_projection"` | list | default: all fields but [these](#manifest-fields-excluded-by-default-in-response) | which `Manifest` fields to include in the response (include `*` to include all fields) #### SkyDriver Effects + - The Skymap Scanner instance is stopped and removed - The scan's manifest and result are marked as "deleted" in the database #### Returns + ``` { "manifest": Manifest dict, "result": Result dict, } ``` + - See [Manifest](#manifest) - See [Result](#result) -   ### `/scans/find` - POST ------------------------------------------------------------------------------- _Retrieve scan manifests corresponding to a specific search query_ #### Arguments -| Argument | Type | Required/Default | Description | -| ------------------- | ----------- | ---------------- | -------------------- | -| `"filter"` | dict | *[REQUIRED]* | a MongoDB-syntax filter for `Manifest` -| `"include_deleted"` | bool | default: `False` | whether to include deleted scans (overwritten by `filter`'s `is_deleted`) -| `"manifest_projection"` | list | default: all fields but [these](#manifest-fields-excluded-by-default-in-response) | which `Manifest` 
fields to include in the response (include `*` to include all fields) +| Argument | Type | Required/Default | Description | +|-------------------------|------|-----------------------------------------------------------------------------------|----------------------------------------------------------------------------------------| +| `"filter"` | dict | *[REQUIRED]* | a MongoDB-syntax filter for `Manifest` +| `"include_deleted"` | bool | default: `False` | whether to include deleted scans (overwritten by `filter`'s `is_deleted`) +| `"manifest_projection"` | list | default: all fields but [these](#manifest-fields-excluded-by-default-in-response) | which `Manifest` fields to include in the response (include `*` to include all fields) ##### Example + One simple `"filter"` may be: + ``` { "filter": { @@ -226,19 +246,22 @@ One simple `"filter"` may be: } } ``` + See https://www.mongodb.com/docs/manual/tutorial/query-documents/ for more complex queries. #### SkyDriver Effects + None #### Returns + ``` { "manifests": list[Manifest dict], } ``` -- See [Manifest](#manifest) +- See [Manifest](#manifest)   ### `/scans/backlog` - GET @@ -246,12 +269,15 @@ None _Retrieve entire backlog list_ #### Arguments + None #### SkyDriver Effects + None #### Returns + ``` { "entries": [ @@ -265,22 +291,23 @@ None } ``` -   ### `/scan/SCAN_ID/status` - GET ------------------------------------------------------------------------------- _Retrieve the status of a scan_ #### Arguments -| Argument | Type | Required/Default | Description | -| ------------------------ | ------- | ---------------- | -------------------- | -| `"include_pod_statuses"` | bool | `False` | whether to include the k8s pod statuses for the clientmanager & central server -- expends additional resources +| Argument | Type | Required/Default | Description | +|--------------------------|------|------------------|----------------------------------------------------------------------------------------------------------------| 
+| `"include_pod_statuses"` | bool | `False` | whether to include the k8s pod statuses for the clientmanager & central server -- expends additional resources #### SkyDriver Effects + None #### Returns + ``` { "scan_state": str, # a short human-readable code @@ -295,7 +322,9 @@ None ``` ##### Scan State Codes + There are several codes for `scan_state`: + - Successful state * `SCAN_FINISHED_SUCCESSFULLY` - Non-finished scan states (in reverse order of occurrence) @@ -310,19 +339,21 @@ There are several codes for `scan_state`: * `STOPPED__PRESTARTUP` * *NOTE: a failed scan my not have an above code automatically, and may need a `DELETE` request to get the code. Until then, it will retain an non-finished state code.* -   ### `/scan/SCAN_ID/logs` - GET ------------------------------------------------------------------------------- _Retrieve the logs of a scan's pod: central server & client starter(s)_ #### Arguments + None #### SkyDriver Effects + None #### Returns + ``` { "pod_container_logs": str | list[ dict[str,str] ], # list @@ -330,14 +361,16 @@ None } ``` -   ### Return Types ------------------------------------------------------------------------------- + #### Manifest + _A dictionary containing non-physics metadata on a scan_ Pseudo-code: + ``` { scan_id: str, @@ -345,14 +378,14 @@ Pseudo-code: timestamp: float, is_deleted: bool, - event_i3live_json_dict: dict, + event_i3live_json_dict: dict, # the i3 event or an internal id scanner_server_args: str, priority: int, classifiers: dict[str, str | bool | float | int] - event_i3live_json_dict__hash: str, # a deterministic hash of the event json + event_i3live_json_dict__hash: str | None, # deprecated ewms_task: { tms_args: list[str], @@ -429,18 +462,23 @@ Pseudo-code: last_updated: float, } ``` + - See [skydriver/database/schema.py](https://github.com/WIPACrepo/SkyDriver/blob/main/skydriver/database/schema.py) ##### Manifest Fields Excluded by Default in Response + Some routes/methods respond with the scan's 
manifest. This is a large dictionary, so by default, all but [GET @ `/scan/SCAN_ID/manifest`](#scanscan_idmanifest---get) exclude these fields: + - `event_i3live_json_dict` See https://github.com/search?q=repo%3AWIPACrepo%2FSkyDriver+DEFAULT_EXCLUDED_MANIFEST_FIELDS&type=code #### Result + _A dictionary containing the scan result_ Pseudo-code: + ``` { scan_id: str, @@ -449,13 +487,18 @@ Pseudo-code: is_final: bool, # is this result the final result? } ``` + - See [skydriver/database/schema.py](https://github.com/WIPACrepo/SkyDriver/blob/main/skydriver/database/schema.py) - See [skyreader's SkyScanResult](https://github.com/icecube/skyreader/) -   + ## Using a Scan Result Outside of SkyDriver + ### Making Plots with a Scan's Result (using the `scan_id`) + See skyreader's [plot_skydriver_scan_result.py](https://github.com/icecube/skyreader/blob/main/examples/plot_skydriver_scan_result.py) + ### Creating a `SkyScanResult` Instance from a Scan's Result (using the `scan_id`) + Also, see skyreader's [plot_skydriver_scan_result.py](https://github.com/icecube/skyreader/blob/main/examples/plot_skydriver_scan_result.py) diff --git a/dependencies-from-Dockerfile.log b/dependencies-from-Dockerfile.log index 0ae4c3a3..4d0e63f9 100644 --- a/dependencies-from-Dockerfile.log +++ b/dependencies-from-Dockerfile.log @@ -6,39 +6,24 @@ ######################################################################## # pip freeze ######################################################################## -boto3==1.35.83 -botocore==1.35.83 +boto3==1.35.84 +botocore==1.35.84 cachetools==5.5.0 certifi==2024.12.14 cffi==1.17.1 charset-normalizer==3.4.0 -coloredlogs==15.0.1 cryptography==44.0.0 dacite==1.8.1 -Deprecated==1.2.15 dnspython==2.7.0 durationpy==0.9 google-auth==2.37.0 -googleapis-common-protos==1.56.1 -grpcio==1.68.1 htcondor==24.2.1 humanfriendly==10.0 idna==3.10 -importlib_metadata==8.5.0 jmespath==1.0.1 kubernetes==31.0.0 motor==3.3.2 oauthlib==3.2.2 -opentelemetry-api==1.29.0 
-opentelemetry-exporter-jaeger==1.21.0 -opentelemetry-exporter-jaeger-proto-grpc==1.21.0 -opentelemetry-exporter-jaeger-thrift==1.21.0 -opentelemetry-exporter-otlp-proto-common==1.29.0 -opentelemetry-exporter-otlp-proto-http==1.29.0 -opentelemetry-proto==1.29.0 -opentelemetry-sdk==1.29.0 -opentelemetry-semantic-conventions==0.50b0 -protobuf==5.29.2 pyasn1==0.6.1 pyasn1_modules==0.4.1 pycparser==2.22 @@ -53,17 +38,13 @@ requests-oauthlib==2.0.0 rsa==4.9 s3transfer==0.10.4 six==1.17.0 -thrift==0.21.0 tornado==6.4.2 typeguard==4.4.1 typing_extensions==4.12.2 urllib3==2.2.3 websocket-client==1.8.0 wipac-dev-tools==1.13.0 -wipac-rest-tools==1.5.3 -wipac-telemetry==0.3.1 -wrapt==1.17.0 -zipp==3.21.0 +wipac-rest-tools==1.8.4 ######################################################################## # pipdeptree ######################################################################## @@ -75,15 +56,15 @@ pipdeptree==2.24.0 └── pip [required: >=24.2, installed: 24.3.1] setuptools==65.5.1 skydriver-clientmanager-ewms-sidecar -├── boto3 [required: Any, installed: 1.35.83] -│ ├── botocore [required: >=1.35.83,<1.36.0, installed: 1.35.83] +├── boto3 [required: Any, installed: 1.35.84] +│ ├── botocore [required: >=1.35.84,<1.36.0, installed: 1.35.84] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.2.3] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.10.0,<0.11.0, installed: 0.10.4] -│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.83] +│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.84] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] @@ -139,7 +120,7 @@ skydriver-clientmanager-ewms-sidecar │ 
│ ├── idna [required: >=2.5,<4, installed: 3.10] │ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] │ └── typing_extensions [required: Any, installed: 4.12.2] -└── wipac-rest-tools [required: <1.6.0, installed: 1.5.3] +└── wipac-rest-tools [required: Any, installed: 1.8.4] ├── cachetools [required: Any, installed: 5.5.0] ├── PyJWT [required: !=2.6.0, installed: 2.10.1] ├── qrcode [required: Any, installed: 8.0] @@ -164,118 +145,3 @@ skydriver-clientmanager-ewms-sidecar │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] └── typing_extensions [required: Any, installed: 4.12.2] wheel==0.45.1 -wipac-telemetry==0.3.1 -├── coloredlogs [required: Any, installed: 15.0.1] -│ └── humanfriendly [required: >=9.1, installed: 10.0] -├── opentelemetry-api [required: Any, installed: 1.29.0] -│ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ └── zipp [required: >=3.20, installed: 3.21.0] -├── opentelemetry-exporter-jaeger [required: Any, installed: 1.21.0] -│ ├── opentelemetry-exporter-jaeger-proto-grpc [required: ==1.21.0, installed: 1.21.0] -│ │ ├── googleapis-common-protos [required: ~=1.52,<1.60.0, installed: 1.56.1] -│ │ │ └── protobuf [required: >=3.15.0, installed: 5.29.2] -│ │ ├── grpcio [required: >=1.0.0,<2.0.0, installed: 1.68.1] -│ │ ├── opentelemetry-api [required: ~=1.3, installed: 1.29.0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ │ └── opentelemetry-sdk [required: ~=1.11, installed: 1.29.0] -│ │ ├── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── importlib_metadata [required: 
>=6.0,<=8.5.0, installed: 8.5.0] -│ │ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ │ ├── opentelemetry-semantic-conventions [required: ==0.50b0, installed: 0.50b0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ │ └── typing_extensions [required: >=3.7.4, installed: 4.12.2] -│ └── opentelemetry-exporter-jaeger-thrift [required: ==1.21.0, installed: 1.21.0] -│ ├── opentelemetry-api [required: ~=1.3, installed: 1.29.0] -│ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ ├── opentelemetry-sdk [required: ~=1.11, installed: 1.29.0] -│ │ ├── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ │ ├── opentelemetry-semantic-conventions [required: ==0.50b0, installed: 0.50b0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ │ └── typing_extensions 
[required: >=3.7.4, installed: 4.12.2] -│ └── thrift [required: >=0.10.0, installed: 0.21.0] -│ └── six [required: >=1.7.2, installed: 1.17.0] -├── opentelemetry-exporter-otlp-proto-http [required: Any, installed: 1.29.0] -│ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ ├── googleapis-common-protos [required: ~=1.52, installed: 1.56.1] -│ │ └── protobuf [required: >=3.15.0, installed: 5.29.2] -│ ├── opentelemetry-api [required: ~=1.15, installed: 1.29.0] -│ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ ├── opentelemetry-exporter-otlp-proto-common [required: ==1.29.0, installed: 1.29.0] -│ │ └── opentelemetry-proto [required: ==1.29.0, installed: 1.29.0] -│ │ └── protobuf [required: >=5.0,<6.0, installed: 5.29.2] -│ ├── opentelemetry-proto [required: ==1.29.0, installed: 1.29.0] -│ │ └── protobuf [required: >=5.0,<6.0, installed: 5.29.2] -│ ├── opentelemetry-sdk [required: ~=1.29.0, installed: 1.29.0] -│ │ ├── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ │ ├── opentelemetry-semantic-conventions [required: ==0.50b0, installed: 0.50b0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ │ └── zipp [required: >=3.20, 
installed: 3.21.0] -│ │ └── typing_extensions [required: >=3.7.4, installed: 4.12.2] -│ └── requests [required: ~=2.7, installed: 2.32.3] -│ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] -│ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] -│ ├── idna [required: >=2.5,<4, installed: 3.10] -│ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] -├── opentelemetry-sdk [required: Any, installed: 1.29.0] -│ ├── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ ├── opentelemetry-semantic-conventions [required: ==0.50b0, installed: 0.50b0] -│ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ └── opentelemetry-api [required: ==1.29.0, installed: 1.29.0] -│ │ ├── Deprecated [required: >=1.2.6, installed: 1.2.15] -│ │ │ └── wrapt [required: >=1.10,<2, installed: 1.17.0] -│ │ └── importlib_metadata [required: >=6.0,<=8.5.0, installed: 8.5.0] -│ │ └── zipp [required: >=3.20, installed: 3.21.0] -│ └── typing_extensions [required: >=3.7.4, installed: 4.12.2] -├── protobuf [required: Any, installed: 5.29.2] -├── typing_extensions [required: Any, installed: 4.12.2] -└── wipac-dev-tools [required: Any, installed: 1.13.0] - ├── requests [required: Any, installed: 2.32.3] - │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] - │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] - │ ├── idna [required: >=2.5,<4, installed: 3.10] - │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] - └── typing_extensions [required: Any, installed: 4.12.2] diff --git a/resources/prod_tester/test_getter.py b/resources/prod_tester/test_getter.py index 3d07d3b7..8bf2cd61 100644 --- a/resources/prod_tester/test_getter.py +++ 
b/resources/prod_tester/test_getter.py @@ -48,21 +48,17 @@ def log_file(self) -> Path: return config.SANDBOX_DIR / f"logs/{self.scan_id}.log" -def fetch_file(url, mode="text"): - """Fetch a file from a URL.""" +def download_file(url: str, dest: Path) -> Path: + """Download a file from a URL.""" + if os.path.exists(dest): + return dest + dest.parent.mkdir(parents=True, exist_ok=True) print(f"downloading from {url}...") - response = requests.get(url) + response = requests.get(url, timeout=10) response.raise_for_status() - return response.text if mode == "text" else response.content - - -def download_file(url: str, dest: Path): - """Download a file from a URL.""" - if not os.path.exists(dest): - dest.parent.mkdir(parents=True, exist_ok=True) - file_content = fetch_file(url, mode="binary") - with open(dest, "wb") as f: - f.write(file_content) + with open(dest, "wb") as f: + f.write(response.content) + return dest class GHATestFetcher: @@ -76,8 +72,10 @@ class GHATestFetcher: def _read_gha_matrix(self): """Parse the 'matrix' defined in the github actions CI job.""" - yaml_content = fetch_file(config.GHA_FILE_URL) - gha_data = yaml.safe_load(yaml_content) + with open( + download_file(config.GHA_FILE_URL, config.SANDBOX_DIR / "tests.yml") + ) as f: + gha_data = yaml.safe_load(f) # Extract the matrix values for "test-run-realistic" test_run_realistic = gha_data.get("jobs", {}).get( @@ -142,34 +140,32 @@ def setup_tests() -> Iterator[TestParamSet]: # prep each test for m in matrix: event_fname = m[EVENTFILE_KEY] - test = TestParamSet( - event_file=events_dir / event_fname, - reco_algo=m[RECO_ALGO_KEY], - result_file=( - results_dir / m[RECO_ALGO_KEY] / config.EVENT_RESULT_MAP[event_fname] - ), + event_file = events_dir / event_fname + result_file = ( + results_dir / m[RECO_ALGO_KEY] / config.EVENT_RESULT_MAP[event_fname] ) - # get event file - download_file( - f"{config.EVENT_DIR_URL}{event_fname}", - test.event_file, - ) - # -> transform pkl file into json file -- 
skydriver only takes json - if test.event_file.suffix == ".pkl": - with open(test.event_file, "rb") as f: - contents = pickle.load(f) - test.event_file.unlink() # rm - test.event_file = test.event_file.with_suffix( - ".json" - ) # use a different fname - with open(test.event_file, "w") as f: - json.dump(contents, f, indent=4) + # get event file -- all event files will be saved as .json + as_json = event_file.with_suffix(".json") + if not as_json.exists(): + download_file(f"{config.EVENT_DIR_URL}{event_fname}", event_file) + # -> transform pkl file into json file -- skydriver only takes json + if event_file.suffix == ".pkl": + with open(event_file, "rb") as f: + contents = pickle.load(f) + event_file.unlink() # rm + with open(as_json, "w") as f: + json.dump(contents, f, indent=4) + event_file = as_json # use the .json filepath # get the expected-result file download_file( f"{config.RESULT_DIR_URL}{m[RECO_ALGO_KEY]}/{config.EVENT_RESULT_MAP[event_fname]}", - test.result_file, + result_file, ) - yield test + yield TestParamSet( + event_file=event_file, + reco_algo=m[RECO_ALGO_KEY], + result_file=result_file, + ) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 11bdea93..257a477f 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -23,9 +23,9 @@ def get_rest_client(skydriver_url: str) -> RestClient: This will present a QR code in the terminal for initial validation. 
""" - logging.info("connecting to skydriver...") if "://" not in skydriver_url: skydriver_url = "https://" + skydriver_url + logging.info(f"connecting to {skydriver_url}...") # NOTE: If your script will not be interactive (like a cron job), # then you need to first run your script manually to validate using @@ -58,7 +58,7 @@ async def launch_a_scan( "docker_tag": "latest", "max_pixel_reco_time": 30 * 60, # seconds "scanner_server_memory": "1G", - "priority": 99, + "priority": 100, "scanner_server_env": { "SKYSCAN_MINI_TEST": True, }, diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index aeecde47..995002e2 100644 --- a/resources/prod_tester/test_suit_prod.py +++ b/resources/prod_tester/test_suit_prod.py @@ -238,8 +238,19 @@ async def main(): "w", ) as tar: tar.add(config.SANDBOX_DIR, arcname=os.path.basename(config.SANDBOX_DIR)) - # then rm -rf the dir - shutil.rmtree(config.SANDBOX_DIR) + # then rm -rf the dir (saving the downloaded files) + for entry in config.SANDBOX_DIR.iterdir(): + if entry.name in { + "expected_results", # dir + "realtime_events", # dir + "compare_scan_results.py", # file + "tests.yml", # file + }: + continue + if entry.is_dir(): + shutil.rmtree(entry) + else: + entry.unlink() config.SANDBOX_DIR.mkdir(exist_ok=True) rc = test_runner.get_rest_client(args.skydriver_url) diff --git a/setup.cfg b/setup.cfg index 3a232f48..33bb48ba 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ install_requires = tornado typeguard wipac-dev-tools - wipac-rest-tools[telemetry]<1.6.0 # FUTURE: need to update REST arg validation to use later versions of pkg + wipac-rest-tools python_requires = >=3.10, <3.12 packages = find: diff --git a/skydriver/config.py b/skydriver/config.py index 8439b386..e88e1c5f 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -26,7 +26,7 @@ CLUSTER_STOPPER_K8S_TTL_SECONDS_AFTER_FINISHED = 1 * 60 * 60 CLUSTER_STOPPER_K8S_JOB_N_RETRIES = 6 -SCAN_MIN_PRIORITY_TO_START_NOW = 10 
+SCAN_MIN_PRIORITY_TO_START_ASAP = 100 @enum.unique diff --git a/skydriver/database/__init__.py b/skydriver/database/__init__.py index 0ee28123..be165dbd 100644 --- a/skydriver/database/__init__.py +++ b/skydriver/database/__init__.py @@ -1,12 +1,18 @@ """Init.""" - from urllib.parse import quote_plus from motor.motor_asyncio import AsyncIOMotorClient +from . import interface, mongodc, schema, utils from ..config import ENV -from . import interface, mongodc, schema # noqa: F401 # export + +__all__ = [ + "interface", + "mongodc", + "schema", + "utils", +] async def create_mongodb_client() -> AsyncIOMotorClient: # type: ignore[valid-type] diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index c1c2b480..655d0063 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -17,7 +17,7 @@ _RESULTS_COLL_NAME, _SCAN_BACKLOG_COLL_NAME, ) -from ..config import ENV +from ..config import ENV, SCAN_MIN_PRIORITY_TO_START_ASAP LOGGER = logging.getLogger(__name__) @@ -56,7 +56,7 @@ async def get(self, scan_id: str, incl_del: bool) -> schema.Manifest: async def post( self, - event_i3live_json_dict: schema.StrDict, + i3_event_id: str, scan_id: str, scanner_server_args: str, tms_args_list: list[str], @@ -72,7 +72,7 @@ async def post( scan_id=scan_id, timestamp=time.time(), is_deleted=False, - event_i3live_json_dict=event_i3live_json_dict, + i3_event_id=i3_event_id, scanner_server_args=scanner_server_args, ewms_task=schema.EWMSTaskDirective( tms_args=tms_args_list, @@ -82,7 +82,10 @@ async def post( priority=priority, ) - # db + return await self.put(manifest) + + async def put(self, manifest: schema.Manifest) -> schema.Manifest: + """Put into db.""" try: manifest = await self.collection.find_one_and_update( {"scan_id": manifest.scan_id}, @@ -94,11 +97,83 @@ async def post( except mongodc.DocumentNotFoundException as e: raise web.HTTPError( 500, - log_message=f"Failed to post {self.collection.name} document ({scan_id})", + 
log_message=f"Failed to post {self.collection.name} document ({manifest.scan_id})", ) from e return manifest + @staticmethod + def _put_once_event_metadata( + in_db: schema.Manifest, + upserting: dict, + scan_id: str, + event_metadata: schema.EventMetadata, + ) -> None: + if not event_metadata: + raise ValueError("event_metadata cannot be falsy") + elif not in_db.event_metadata: + upserting["event_metadata"] = event_metadata + elif in_db.event_metadata != event_metadata: + msg = "Cannot change an existing event_metadata" + raise web.HTTPError( + 400, + log_message=msg + f" for {scan_id=}", + reason=msg, + ) + + @staticmethod + def _put_once_scan_metadata( + in_db: schema.Manifest, + upserting: dict, + scan_id: str, + scan_metadata: schema.StrDict, + ) -> None: + if not scan_metadata: + raise ValueError("scan_metadata cannot be falsy") + elif not in_db.scan_metadata: + upserting["scan_metadata"] = scan_metadata + elif in_db.scan_metadata != scan_metadata: + msg = "Cannot change an existing scan_metadata" + raise web.HTTPError( + 400, + log_message=msg + f" for {scan_id=}", + reason=msg, + ) + + @staticmethod + def _put_ewms_task( + in_db: schema.Manifest, + upserting: dict, + cluster: schema.Cluster | None, + complete: bool | None, + ): + if not cluster and not complete: + raise ValueError("cluster and complete cannot both be falsy") + + upserting["ewms_task"] = copy.deepcopy(in_db.ewms_task) + # cluster / clusters + # TODO - when TMS is up and running, it will handle cluster updating--remove then + # NOTE - there is a race condition inherent with list attributes, don't do this in TMS + if not cluster: + pass # don't put in DB + else: + try: # find by uuid -> replace + idx = next( + i + for i, c in enumerate(in_db.ewms_task.clusters) + if cluster.uuid == c.uuid + ) + upserting["ewms_task"].clusters = ( + in_db.ewms_task.clusters[:idx] + + [cluster] + + in_db.ewms_task.clusters[idx + 1 :] + ) + except StopIteration: # not found -> append + 
upserting["ewms_task"].clusters = in_db.ewms_task.clusters + [cluster] + # complete # workforce is done + if complete is not None: + upserting["ewms_task"].complete = complete # workforce is done + async def patch( self, scan_id: str, @@ -122,71 +197,30 @@ async def patch( return await self.get(scan_id, incl_del=True) upserting: schema.StrDict = {} + if progress: + upserting["progress"] = progress - # Store/validate: event_metadata & scan_metadata - # NOTE: in theory there's a race condition (get+upsert), but it's set-once-only, so it's OK + # Validate, then store + # NOTE: in theory there's a race condition (get+upsert) in_db = await self.get(scan_id, incl_del=True) - # event_metadata - if not event_metadata: - pass # don't put in DB - elif not in_db.event_metadata: - upserting["event_metadata"] = event_metadata - elif in_db.event_metadata != event_metadata: - msg = "Cannot change an existing event_metadata" - raise web.HTTPError( - 400, - log_message=msg + f" for {scan_id=}", - reason=msg, - ) - # scan_metadata - if not scan_metadata: - pass # don't put in DB - elif not in_db.scan_metadata: - upserting["scan_metadata"] = scan_metadata - elif in_db.scan_metadata != scan_metadata: - msg = "Cannot change an existing scan_metadata" - raise web.HTTPError( - 400, - log_message=msg + f" for {scan_id=}", - reason=msg, - ) - - # tms + if event_metadata: + self._put_once_event_metadata(in_db, upserting, scan_id, event_metadata) + if scan_metadata: + self._put_once_scan_metadata(in_db, upserting, scan_id, scan_metadata) if cluster or complete is not None: - upserting["ewms_task"] = copy.deepcopy(in_db.ewms_task) - # cluster / clusters - # TODO - when TMS is up and running, it will handle cluster updating--remove then - # NOTE - there is a race condition inherent with list attributes, don't do this in TMS - if not cluster: - pass # don't put in DB - else: - try: # find by uuid -> replace - idx = next( - i - for i, c in enumerate(in_db.ewms_task.clusters) - if cluster.uuid == 
c.uuid - ) - upserting["ewms_task"].clusters = ( - in_db.ewms_task.clusters[:idx] - + [cluster] - + in_db.ewms_task.clusters[idx + 1 :] - ) - except StopIteration: # not found -> append - upserting["ewms_task"].clusters = in_db.ewms_task.clusters + [ - cluster - ] - # complete # workforce is done - if complete is not None: - upserting["ewms_task"].complete = complete # workforce is done - - # progress - if progress: - upserting["progress"] = progress + self._put_ewms_task(in_db, upserting, cluster, complete) - # validate + # Update db if not upserting: # did we actually update anything? LOGGER.debug(f"nothing to patch for manifest ({scan_id=})") return in_db + else: + return await self._patch(upserting, scan_id) + + async def _patch(self, upserting: dict, scan_id: str) -> schema.Manifest: + """Update the doc in the DB.""" + if not upserting: + raise ValueError("upserting cannot be empty") try: upserting = mongodc.typecheck_as_dc_fields(upserting, schema.Manifest) except TypeError as e: @@ -323,7 +357,10 @@ def __init__(self, motor_client: AsyncIOMotorClient) -> None: # type: ignore[va motor_client[_DB_NAME], _SCAN_BACKLOG_COLL_NAME # type: ignore[index] ) - async def fetch_next_as_pending(self) -> schema.ScanBacklogEntry: + async def fetch_next_as_pending( + self, + include_low_priority_scans: bool, + ) -> schema.ScanBacklogEntry: """Fetch the next ready entry and mark as pending. This for when the container is restarted (process is killed). 
@@ -331,17 +368,22 @@ async def fetch_next_as_pending(self) -> schema.ScanBacklogEntry: # LOGGER.debug("fetching & marking top backlog entry as a pending...") # ^^^ don't log too often - # atomically find & update + mongo_filter = { + # get entries that have never been pending (0.0) and/or + # entries that have been pending for too long (parent + # process may have died) -- younger pending entries may + # still be in flight by other processes) + "pending_timestamp": { + "$lt": time.time() - ENV.SCAN_BACKLOG_PENDING_ENTRY_TTL_REVIVE + } + } + if not include_low_priority_scans: + # iow: only include high priority scans + mongo_filter.update({"priority": {"$gte": SCAN_MIN_PRIORITY_TO_START_ASAP}}) + + # atomically find & update; raises DocumentNotFoundException if no match entry = await self.collection.find_one_and_update( - { - # get entries that have never been pending (0.0) and/or - # entries that have been pending for too long (parent - # process may have died) -- younger pending entries may - # still be in flight by other processes) - "pending_timestamp": { - "$lt": time.time() - ENV.SCAN_BACKLOG_PENDING_ENTRY_TTL_REVIVE - } - }, + mongo_filter, { "$set": {"pending_timestamp": time.time()}, "$inc": {"next_attempt": 1}, diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index ad72c735..985b18fe 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -2,8 +2,6 @@ import dataclasses as dc import enum -import hashlib -import json from typing import Any, Iterator, Literal import wipac_dev_tools as wdt @@ -236,6 +234,9 @@ def __post_init__(self) -> None: # NOTE - self.env_vars done in EnvVars +DEPRECATED_EVENT_I3LIVE_JSON_DICT = "use 'i3_event_id'" + + @typechecked @dc.dataclass class Manifest(ScanIDDataclass): @@ -244,21 +245,23 @@ class Manifest(ScanIDDataclass): timestamp: float is_deleted: bool - # grabbed by scanner central server - event_i3live_json_dict: StrDict # TODO: delete after time & replace w/ hash? 
- ewms_task: EWMSTaskDirective # args placed in k8s job obj scanner_server_args: str - priority: int = 0 # same as https://htcondor.readthedocs.io/en/latest/users-manual/priorities-and-preemption.html#job-priority + priority: int = ( + 0 # same as https://htcondor.readthedocs.io/en/latest/users-manual/priorities-and-preemption.html#job-priority + ) # open to requestor classifiers: dict[str, str | bool | float | int] = dc.field(default_factory=dict) - # special fields -- see __post_init__ - event_i3live_json_dict__hash: str = "" # possibly overwritten + # i3 event -- grabbed by scanner central server + i3_event_id: str = "" # id to i3_event coll + # -> deprecated fields -- see __post_init__ for backward compatibility logic + event_i3live_json_dict: StrDict | str = DEPRECATED_EVENT_I3LIVE_JSON_DICT + event_i3live_json_dict__hash: str | None = None # **DEPRECATED** # found/created during first few seconds of scanning event_metadata: EventMetadata | None = None @@ -270,16 +273,14 @@ class Manifest(ScanIDDataclass): last_updated: float = 0.0 def __post_init__(self) -> None: - if self.event_i3live_json_dict: - # shorten b/c this can be a LARGE dict - self.event_i3live_json_dict__hash = hashlib.md5( - json.dumps( # sort -> deterministic - self.event_i3live_json_dict, - sort_keys=True, - ensure_ascii=True, - ).encode("utf-8") - ).hexdigest() - + if ( + not self.i3_event_id + and self.event_i3live_json_dict == DEPRECATED_EVENT_I3LIVE_JSON_DICT + ): + raise ValueError( + "Manifest must define 'i3_event_id' " + "(old manifests may define 'event_i3live_json_dict' instead)" + ) self.scanner_server_args = obfuscate_cl_args(self.scanner_server_args) def get_state(self) -> ScanState: diff --git a/skydriver/database/utils.py b/skydriver/database/utils.py index e3e7a12c..908931b7 100644 --- a/skydriver/database/utils.py +++ b/skydriver/database/utils.py @@ -1,6 +1,5 @@ """General Mongo utils.""" - from motor.motor_asyncio import AsyncIOMotorClient from pymongo import ASCENDING, 
DESCENDING @@ -10,6 +9,8 @@ _MANIFEST_COLL_NAME = "Manifests" _RESULTS_COLL_NAME = "Results" _SCAN_BACKLOG_COLL_NAME = "ScanBacklog" +_SCAN_REQUEST_COLL_NAME = "ScanRequests" +_I3_EVENT_COLL_NAME = "I3Events" async def ensure_indexes(motor_client: AsyncIOMotorClient) -> None: # type: ignore[valid-type] @@ -17,6 +18,20 @@ async def ensure_indexes(motor_client: AsyncIOMotorClient) -> None: # type: ign Call on server startup. """ + # USER SCAN REQUESTS COLL + await motor_client[_DB_NAME][_SCAN_REQUEST_COLL_NAME].create_index( # type: ignore[index] + "scan_id", + name="scan_id_index", + unique=True, + ) + + # I3 EVENTS COLL + await motor_client[_DB_NAME][_I3_EVENT_COLL_NAME].create_index( # type: ignore[index] + "i3_event_id", + name="i3_event_id_index", + unique=True, + ) + # MANIFEST COLL await motor_client[_DB_NAME][_MANIFEST_COLL_NAME].create_index( # type: ignore[index] "scan_id", diff --git a/skydriver/images.py b/skydriver/images.py index a5c43998..c40d65f5 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -1,6 +1,5 @@ """Utilities for dealing with docker/cvmfs/singularity images.""" - import logging import re from pathlib import Path @@ -51,6 +50,26 @@ def get_skyscan_docker_image(tag: str) -> str: # utils +def _match_sha_to_majminpatch(sha: str) -> str | None: + """Finds the image w/ same SHA and has a version tag like '#.#.#'. 
+ + No error handling + """ + url = DOCKERHUB_API_URL + while True: + resp = requests.get(url).json() + for result in resp["results"]: + if sha != result.get("digest", result["images"][0]["digest"]): + # some old ones have their 'digest' in their 'images' list entry + continue + if VERSION_REGEX_MAJMINPATCH.fullmatch(result["name"]): + return result["name"] # type: ignore[no-any-return] + if not resp["next"]: + break + url = resp["next"] + return None + + @cachetools.func.ttl_cache(ttl=5 * 60) def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: """Get the '#.#.#' tag on Docker Hub w/ `docker_tag`'s SHA if possible. @@ -75,25 +94,6 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: if VERSION_REGEX_MAJMINPATCH.fullmatch(docker_tag): return docker_tag - def _match_sha_to_majminpatch(sha: str) -> str | None: - """Finds the image w/ same SHA and has a version tag like '#.#.#'. - - No error handling - """ - url = DOCKERHUB_API_URL - while True: - resp = requests.get(url).json() - for result in resp["results"]: - if sha != result.get("digest", result["images"][0]["digest"]): - # some old ones have their 'digest' in their 'images' list entry - continue - if VERSION_REGEX_MAJMINPATCH.fullmatch(result["name"]): - return result["name"] # type: ignore[no-any-return] - if not resp["next"]: - break - url = resp["next"] - return None - _error = ValueError("Image tag could not resolve to a full version") try: diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index eec814c0..5d6e47dc 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -8,6 +8,7 @@ import bson import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient +from tornado import web from .utils import KubeAPITools from .. 
import database @@ -16,30 +17,39 @@ LOGGER = logging.getLogger(__name__) -async def enqueue( +async def designate_for_startup( scan_id: str, job_obj: kubernetes.client.V1Job, scan_backlog: database.interface.ScanBacklogClient, priority: int, ) -> None: """Enqueue k8s job to be started by job-starter thread.""" - entry = database.schema.ScanBacklogEntry( - scan_id=scan_id, - timestamp=time.time(), - pickled_k8s_job=bson.Binary(pickle.dumps(job_obj)), - priority=priority, - ) - await scan_backlog.insert(entry) + try: + LOGGER.info(f"enqueuing k8s job for {scan_id=}") + entry = database.schema.ScanBacklogEntry( + scan_id=scan_id, + timestamp=time.time(), + pickled_k8s_job=bson.Binary(pickle.dumps(job_obj)), + priority=priority, + ) + await scan_backlog.insert(entry) + except Exception as e: + LOGGER.exception(e) + raise web.HTTPError( + 400, + log_message="Failed to enqueue Kubernetes job for Scanner instance", + ) async def get_next_backlog_entry( scan_backlog: database.interface.ScanBacklogClient, manifests: database.interface.ManifestClient, + include_low_priority_scans: bool, ) -> database.schema.ScanBacklogEntry: """Get the next entry & remove any that have been cancelled.""" while True: # get next up -- raises DocumentNotFoundException if none - entry = await scan_backlog.fetch_next_as_pending() + entry = await scan_backlog.fetch_next_as_pending(include_low_priority_scans) LOGGER.info(f"Got backlog entry ({entry.scan_id=})") if entry.next_attempt > ENV.SCAN_BACKLOG_MAX_ATTEMPTS: @@ -80,6 +90,63 @@ async def run( LOGGER.info("Restarted scan backlog runner.") +def _logging_heartbeat(last_log_time: float) -> float: + if time.time() - last_log_time > ENV.SCAN_BACKLOG_RUNNER_DELAY: + LOGGER.info("scan backlog runner is still alive") + return time.time() + else: + return last_log_time + + +class IntervalTimer: + """A utility class to track time intervals. 
+ + This class allows tracking of elapsed time between actions and provides + mechanisms to wait until a specified time interval has passed. + + TODO: Move this to dev-tools (copied from TMS). + """ + + def __init__(self, seconds: float, logger: logging.Logger) -> None: + self.seconds = seconds + self._last_time = time.time() + self.logger = logger + + def fastforward(self): + """Reset the timer so that the next call to `has_interval_elapsed` will return True. + + This effectively skips the current interval and forces the timer to indicate + that the interval has elapsed on the next check. + """ + self._last_time = float("-inf") + + async def wait_until_interval(self) -> None: + """Wait asynchronously until the specified interval has elapsed. + + This method checks the elapsed time every second, allowing cooperative + multitasking during the wait. + """ + self.logger.debug( + f"Waiting until {self.seconds}s has elapsed since the last iteration..." + ) + while not self.has_interval_elapsed(): + await asyncio.sleep(1) + + def has_interval_elapsed(self) -> bool: + """Check if the specified time interval has elapsed since the last expiration. + + If the interval has elapsed, the internal timer is reset to the current time. + """ + diff = time.time() - self._last_time + if diff >= self.seconds: + self._last_time = time.time() + self.logger.debug( + f"At least {self.seconds}s have elapsed (actually {diff}s)." 
+ ) + return True + return False + + async def _run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, @@ -88,29 +155,23 @@ async def _run( manifests = database.interface.ManifestClient(mongo_client) scan_backlog = database.interface.ScanBacklogClient(mongo_client) - short_sleep = True # don't wait for full delay after first starting up (helpful for testing new changes) - - last_log_time = 0.0 # keep track of last time a log was made so we're not annoying + last_log_heartbeat = 0.0 # log every so often, not on every iteration + long_interval_timer = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) while True: - if short_sleep: - await asyncio.sleep(ENV.SCAN_BACKLOG_RUNNER_SHORT_DELAY) - else: - await asyncio.sleep(ENV.SCAN_BACKLOG_RUNNER_DELAY) - - # like a heartbeat for the logs - if time.time() - last_log_time > ENV.SCAN_BACKLOG_RUNNER_DELAY: - LOGGER.info("scan backlog runner is still alive") - last_log_time = time.time() + await asyncio.sleep(ENV.SCAN_BACKLOG_RUNNER_SHORT_DELAY) + last_log_heartbeat = _logging_heartbeat(last_log_heartbeat) # get next entry try: - entry = await get_next_backlog_entry(scan_backlog, manifests) - short_sleep = False + entry = await get_next_backlog_entry( + scan_backlog, + manifests, + # include low priority scans only when enough time has passed + include_low_priority_scans=long_interval_timer.has_interval_elapsed(), + ) except database.mongodc.DocumentNotFoundException: - if not short_sleep: # don't log too often - LOGGER.debug("no backlog entry found") - short_sleep = True + long_interval_timer.fastforward() continue # empty queue # get k8s job object @@ -118,7 +179,7 @@ async def _run( job_obj = pickle.loads(entry.pickled_k8s_job) except Exception as e: LOGGER.exception(e) - short_sleep = True # don't wait long b/c nothing was started + long_interval_timer.fastforward() # nothing was started, so don't wait long continue LOGGER.info( @@ -133,7 +194,7 @@ async def _run( 
except kubernetes.client.exceptions.ApiException as e: # job (entry) will be revived & restarted in future iteration LOGGER.exception(e) - short_sleep = True # don't wait long b/c nothing was started + long_interval_timer.fastforward() # nothing was started, so don't wait long continue # remove from backlog now that startup succeeded diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 5d16e70a..9a8987c5 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -1,9 +1,12 @@ """Handlers for the SkyDriver REST API server interface.""" +import argparse import asyncio import dataclasses as dc import json import logging +import pickle +import re import uuid from typing import Any, Type, TypeVar @@ -11,9 +14,16 @@ import kubernetes.client # type: ignore[import-untyped] from dacite import from_dict from dacite.exceptions import DaciteError -from motor.motor_asyncio import AsyncIOMotorClient -from rest_tools.server import RestHandler, token_attribute_role_mapping_auth +from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection +from pymongo import ReturnDocument +from rest_tools.server import ( + ArgumentHandler, + ArgumentSource, + RestHandler, + token_attribute_role_mapping_auth, +) from tornado import web +from wipac_dev_tools import argparse_tools from . 
import database, images, k8s from .config import ( @@ -23,13 +33,14 @@ DebugMode, ENV, KNOWN_CLUSTERS, - SCAN_MIN_PRIORITY_TO_START_NOW, is_testing, ) +from .database import schema +from .k8s.scan_backlog import designate_for_startup +from .k8s.scanner_instance import SkymapScannerK8sWrapper LOGGER = logging.getLogger(__name__) - # ----------------------------------------------------------------------------- # constants @@ -41,11 +52,6 @@ WAIT_BEFORE_TEARDOWN = 60 -DEFAULT_EXCLUDED_MANIFEST_FIELDS = { - "event_i3live_json_dict", -} - - # ----------------------------------------------------------------------------- # REST requestor auth @@ -53,13 +59,13 @@ USER_ACCT = "user" SKYMAP_SCANNER_ACCT = "system" - if is_testing(): - def service_account_auth(**kwargs): # type: ignore + def service_account_auth(roles: list[str], **kwargs): # type: ignore def make_wrapper(method): # type: ignore[no-untyped-def] async def wrapper(self, *args, **kwargs): # type: ignore[no-untyped-def] LOGGER.warning("TESTING: auth disabled") + self.auth_roles = [roles[0]] # make as a list containing just 1st role return await method(self, *args, **kwargs) return wrapper @@ -84,7 +90,7 @@ def all_dc_fields(class_or_instance: Any) -> set[str]: return set(f.name for f in dc.fields(class_or_instance)) -def dict_projection(dicto: dict, projection: set[str]) -> dict: +def dict_projection(dicto: dict, projection: set[str] | list[str]) -> dict: """Keep only the keys in the `projection`. If `projection` is empty or includes '*', return all fields. 
@@ -96,6 +102,12 @@ def dict_projection(dicto: dict, projection: set[str]) -> dict: return {k: v for k, v in dicto.items() if k in projection} +def _arg_dict_strict(val: Any) -> dict: + if not isinstance(val, dict): + raise argparse.ArgumentTypeError("arg must be a dict") + return val + + # ----------------------------------------------------------------------------- # handlers @@ -116,6 +128,18 @@ def initialize( # type: ignore # pylint: disable=W0221 self.manifests = database.interface.ManifestClient(mongo_client) self.results = database.interface.ResultClient(mongo_client) self.scan_backlog = database.interface.ScanBacklogClient(mongo_client) + self.scan_request_coll = ( + AsyncIOMotorCollection( # in contrast, this one is accessed directly + mongo_client[database.interface._DB_NAME], # type: ignore[index] + database.utils._SCAN_REQUEST_COLL_NAME, + ) + ) + self.i3_event_coll = ( + AsyncIOMotorCollection( # in contrast, this one is accessed directly + mongo_client[database.interface._DB_NAME], # type: ignore[index] + database.utils._I3_EVENT_COLL_NAME, + ) + ) self.k8s_batch_api = k8s_batch_api @@ -144,36 +168,36 @@ class ScansFindHandler(BaseSkyDriverHandler): # pylint: disable=W0223 @service_account_auth(roles=[USER_ACCT]) # type: ignore async def post(self) -> None: """Get matching scan manifest(s) for the given search.""" - mongo_filter: dict[str, Any] = self.get_argument( + arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) + arghand.add_argument( "filter", - type=dict, - strict_type=True, + type=_arg_dict_strict, + ) + arghand.add_argument( + "include_deleted", + default=False, + type=bool, ) - incl_del = self.get_argument("include_deleted", default=False, type=bool) - # response args - manifest_projection = self.get_argument( + arghand.add_argument( "manifest_projection", - default=( - all_dc_fields(database.schema.Manifest) - - DEFAULT_EXCLUDED_MANIFEST_FIELDS - ), - type=set[str], + default=all_dc_fields(database.schema.Manifest), + 
type=str, ) + args = arghand.parse_args() - if "is_deleted" not in mongo_filter and not incl_del: - mongo_filter["is_deleted"] = False + if "is_deleted" not in args.filter and not args.include_deleted: + args.filter["is_deleted"] = False manifests = [ - dict_projection(dc.asdict(m), manifest_projection) - async for m in self.manifests.find_all(mongo_filter) + dict_projection(dc.asdict(m), args.manifest_projection) + async for m in self.manifests.find_all(args.filter) ] self.write({"manifests": manifests}) # - # NOTE - 'EventMappingHandler' needs to stay user-read-only b/c - # it's indirectly updated by the launching of a new scan + # NOTE - handler needs to stay user-read-only # @@ -201,7 +225,7 @@ async def get(self) -> None: # ----------------------------------------------------------------------------- -def cluster_lookup(name: str, n_workers: int) -> database.schema.Cluster: +def _cluster_lookup(name: str, n_workers: int) -> database.schema.Cluster: """Grab the Cluster object known using `name`.""" if cluster := KNOWN_CLUSTERS.get(name): if cluster["orchestrator"] == "condor": @@ -216,14 +240,14 @@ def cluster_lookup(name: str, n_workers: int) -> database.schema.Cluster: location=database.schema.KubernetesLocation(**cluster["location"]), n_workers=n_workers, ) - raise TypeError( + raise argparse.ArgumentTypeError( f"requested unknown cluster: {name} (available:" f" {', '.join(KNOWN_CLUSTERS.keys())})" ) def _json_to_dict(val: Any) -> dict: - _error = TypeError("must be JSON-string or JSON-friendly dict") + _error = argparse.ArgumentTypeError("must be JSON-string or JSON-friendly dict") # str -> json-dict if isinstance(val, str): try: @@ -247,9 +271,9 @@ def _json_to_dict(val: Any) -> dict: def _dict_or_list_to_request_clusters( val: dict | list, ) -> list[database.schema.Cluster]: - _error = TypeError( + _error = argparse.ArgumentTypeError( "must be a dict of cluster location and number of workers, Ex: {'sub-2': 1500, ...}" - " (to request a cluster location 
more than once, provide a list of 2-lists instead)," + " (to request a cluster location more than once, provide a list of 2-lists instead)" # TODO: make n_workers optional when using "TMS smart starter" ) if isinstance(val, dict): @@ -262,27 +286,29 @@ def _dict_or_list_to_request_clusters( if not all(isinstance(a, list | tuple) and len(a) == 2 for a in val): raise _error # - return [cluster_lookup(name, n_workers) for name, n_workers in val] + return [_cluster_lookup(name, n_workers) for name, n_workers in val] def _classifiers_validator(val: Any) -> dict[str, str | bool | float | int]: # type checks if not isinstance(val, dict): - raise TypeError("must be a dict") + raise argparse.ArgumentTypeError("must be a dict") if any(v for v in val.values() if not isinstance(v, str | bool | float | int)): - raise TypeError("entry must be 'str | bool | float | int'") + raise argparse.ArgumentTypeError("entry must be 'str | bool | float | int'") # size check if len(val) > MAX_CLASSIFIERS_LEN: - raise ValueError(f"must be at most {MAX_CLASSIFIERS_LEN} entries long") + raise argparse.ArgumentTypeError( + f"must be at most {MAX_CLASSIFIERS_LEN} entries long" + ) for key, subval in val.items(): if len(key) > MAX_CLASSIFIERS_LEN: - raise ValueError( + raise argparse.ArgumentTypeError( f"key must be at most {MAX_CLASSIFIERS_LEN} characters long" ) try: if len(subval) > MAX_CLASSIFIERS_LEN: - raise ValueError( + raise argparse.ArgumentTypeError( f"str-field must be at most {MAX_CLASSIFIERS_LEN} characters long" ) except TypeError: @@ -301,13 +327,7 @@ def _data_size_parse(val: Any) -> int: try: return humanfriendly.parse_size(str(val)) # type: ignore[no-any-return] except humanfriendly.InvalidSize: - raise ValueError("invalid data size") - - -def _validate_arg(val: Any, test: bool, exc: Exception) -> Any: - if test: - return val - raise exc + raise argparse.ArgumentTypeError("invalid data size") class ScanLauncherHandler(BaseSkyDriverHandler): # pylint: disable=W0223 @@ -318,218 
+338,306 @@ class ScanLauncherHandler(BaseSkyDriverHandler): # pylint: disable=W0223 @service_account_auth(roles=[USER_ACCT]) # type: ignore async def post(self) -> None: """Start a new scan.""" - + arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) # docker args - docker_tag = self.get_argument( # any tag on docker hub (including 'latest') -- must also be on CVMFS (but not checked here) + arghand.add_argument( + # any tag on docker hub (including 'latest') -- must also be on CVMFS (but not checked here) "docker_tag", type=images.resolve_docker_tag, - forbiddens=[r"\s*"], # no empty string / whitespace ) - # scanner server args - scanner_server_memory_bytes = self.get_argument( + arghand.add_argument( "scanner_server_memory", type=_data_size_parse, default=DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, ) - # client worker args - worker_memory_bytes = self.get_argument( + arghand.add_argument( "worker_memory", type=_data_size_parse, default=DEFAULT_WORKER_MEMORY_BYTES, ) - self.get_argument( # NOTE - DEPRECATED + arghand.add_argument( # NOTE - DEPRECATED "memory", - type=lambda x: _validate_arg( + type=lambda x: argparse_tools.validate_arg( x, not bool(x), # False if given - ValueError("argument is deprecated, please use 'worker_memory'"), + argparse.ArgumentTypeError( + "argument is deprecated--use 'worker_memory'" + ), ), default=None, - forbiddens=[r"\s*"], # no empty string / whitespace ) - worker_disk_bytes = self.get_argument( + arghand.add_argument( "worker_disk", type=_data_size_parse, default=DEFAULT_WORKER_DISK_BYTES, ) - request_clusters = self.get_argument( + arghand.add_argument( "cluster", type=_dict_or_list_to_request_clusters, ) - # scanner args - reco_algo = self.get_argument( + arghand.add_argument( "reco_algo", - type=str, - forbiddens=[r"\s*"], # no empty string / whitespace + type=lambda x: argparse_tools.validate_arg( + x, + bool(re.match(r"\S", x)), # no empty string / whitespace + argparse.ArgumentTypeError("cannot be 
empty string / whitespace"), + ), ) - event_i3live_json_dict = self.get_argument( + arghand.add_argument( "event_i3live_json", type=_json_to_dict, # JSON-string/JSON-friendly dict -> dict ) - nsides: dict[int, int] = self.get_argument( + arghand.add_argument( "nsides", - type=dict, - strict_type=True, + type=_arg_dict_strict, ) - real_or_simulated_event = self.get_argument( + arghand.add_argument( "real_or_simulated_event", # as opposed to simulation type=str, choices=REAL_CHOICES + SIM_CHOICES, ) - predictive_scanning_threshold = self.get_argument( + arghand.add_argument( "predictive_scanning_threshold", type=float, default=1.0, - strict_type=False, # allow casting from int (1) ) - max_pixel_reco_time = self.get_argument( + arghand.add_argument( "max_pixel_reco_time", type=int, ) - max_worker_runtime = self.get_argument( + arghand.add_argument( "max_worker_runtime", type=int, default=4 * 60 * 60, ) - skyscan_mq_client_timeout_wait_for_first_message: int | None = ( - self.get_argument( - # TODO - remove when TMS is handling workforce-scaling - "skyscan_mq_client_timeout_wait_for_first_message", - type=int, - default=-1, # elephant in Cairo - ) + arghand.add_argument( + # TODO - remove when TMS is handling workforce-scaling + "skyscan_mq_client_timeout_wait_for_first_message", + type=int, + default=-1, # elephant in Cairo, see below ) - if skyscan_mq_client_timeout_wait_for_first_message == -1: - skyscan_mq_client_timeout_wait_for_first_message = None - debug_mode = self.get_argument( + arghand.add_argument( "debug_mode", type=_debug_mode, default=[], ) - if DebugMode.CLIENT_LOGS in debug_mode: - for cluster in request_clusters: - cname, cinfo = cluster.to_known_cluster() - if cluster.n_workers > cinfo.get( - "max_n_clients_during_debug_mode", float("inf") - ): - raise web.HTTPError( - 400, - log_message=( - f"Too many workers: Cluster '{cname}' can only have " - f"{cinfo.get('max_n_clients_during_debug_mode')} " - f"workers when 'debug_mode' " - f"includes 
'{DebugMode.CLIENT_LOGS.value}'" - ), - ) - # other args - classifiers = self.get_argument( + arghand.add_argument( "classifiers", type=_classifiers_validator, default={}, ) - priority = self.get_argument( + arghand.add_argument( "priority", type=int, default=0, ) - scanner_server_env_from_user = self.get_argument( + arghand.add_argument( "scanner_server_env", type=_classifiers_validator, # piggy-back this validator default={}, ) - # response args - manifest_projection = self.get_argument( + arghand.add_argument( "manifest_projection", - default=( - all_dc_fields(database.schema.Manifest) - - DEFAULT_EXCLUDED_MANIFEST_FIELDS - ), - type=set[str], + default=all_dc_fields(database.schema.Manifest), + type=str, ) + args = arghand.parse_args() + + # more arg validation + if DebugMode.CLIENT_LOGS in args.debug_mode: + for cluster in args.cluster: + cname, cinfo = cluster.to_known_cluster() + if cluster.n_workers > cinfo.get( + "max_n_clients_during_debug_mode", float("inf") + ): + raise web.HTTPError( + 400, + log_message=( + f"Too many workers: Cluster '{cname}' can only have " + f"{cinfo.get('max_n_clients_during_debug_mode')} " + f"workers when 'debug_mode' " + f"includes '{DebugMode.CLIENT_LOGS.value}'" + ), + ) # generate unique scan_id scan_id = uuid.uuid4().hex - # get the container info ready - scanner_wrapper = k8s.scanner_instance.SkymapScannerK8sWrapper( - docker_tag=docker_tag, - scan_id=scan_id, - # server - scanner_server_memory_bytes=scanner_server_memory_bytes, - reco_algo=reco_algo, - nsides=nsides, - is_real_event=real_or_simulated_event in REAL_CHOICES, - predictive_scanning_threshold=predictive_scanning_threshold, - # cluster starter - starter_exc=str( # TODO - remove once tested in prod - classifiers.get("__unstable_starter_exc", "clientmanager") + # Before doing anything else, persist in DB + # -> store the event in its own collection to reduce redundancy + i3_event_id = uuid.uuid4().hex + await self.i3_event_coll.insert_one( + { + "i3_event_id": 
i3_event_id, + "json_dict": args.event_i3live_json, # this was transformed into dict + } + ) + # -> store scan_request_obj in db + scan_request_obj = dict( + docker_tag=args.docker_tag, + scanner_server_memory_bytes=args.scanner_server_memory, # already in bytes + reco_algo=args.reco_algo, + nsides=args.nsides, + real_or_simulated_event=args.real_or_simulated_event, + predictive_scanning_threshold=args.predictive_scanning_threshold, + classifiers=args.classifiers, + request_clusters=args.cluster, + worker_memory_bytes=args.worker_memory, + worker_disk_bytes=args.worker_disk, # already in bytes + max_pixel_reco_time=args.max_pixel_reco_time, + max_worker_runtime=args.max_worker_runtime, + priority=args.priority, + debug_mode=args.debug_mode, + skyscan_mq_client_timeout_wait_for_first_message=( + args.skyscan_mq_client_timeout_wait_for_first_message + if args.skyscan_mq_client_timeout_wait_for_first_message != -1 + else None ), - request_clusters=request_clusters, - worker_memory_bytes=worker_memory_bytes, - worker_disk_bytes=worker_disk_bytes, - max_pixel_reco_time=max_pixel_reco_time, - max_worker_runtime=max_worker_runtime, - priority=priority, - # universal - debug_mode=debug_mode, - # env + i3_event_id=i3_event_id, # foreign key to i3_event collection rest_address=self.request.full_url().rstrip(self.request.uri), - skyscan_mq_client_timeout_wait_for_first_message=skyscan_mq_client_timeout_wait_for_first_message, - scanner_server_env_from_user=scanner_server_env_from_user, + scanner_server_env_from_user=args.scanner_server_env, + ) + await self.scan_request_coll.insert_one( + { + "scan_id": scan_id, + "scan_request_obj_pkl": pickle.dumps(scan_request_obj), + # ^^^ can be well compressed, obj will only be decompressed for re-scans + }, ) - # put in db (do before k8s start so if k8s fail, we can debug using db's info) - manifest = await self.manifests.post( - event_i3live_json_dict, + # go! 
+ manifest = await _start_scan( + self.manifests, + self.scan_backlog, scan_id, - scanner_wrapper.scanner_server_args, - scanner_wrapper.cluster_starter_args_list, - from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), - classifiers, - priority, + scan_request_obj, + ) + self.write( + dict_projection(dc.asdict(manifest), args.manifest_projection), ) - enqueue = True - # start now? - if priority >= SCAN_MIN_PRIORITY_TO_START_NOW: - try: - resp = k8s.utils.KubeAPITools.start_job( - self.k8s_batch_api, - scanner_wrapper.job_obj, - ) - LOGGER.info(resp) - except kubernetes.client.exceptions.ApiException as e: - # job (entry) will be enqueued and tried again per priority - LOGGER.exception(e) - else: - enqueue = False +async def _start_scan( + manifests: database.interface.ManifestClient, + scan_backlog: database.interface.ScanBacklogClient, + scan_id: str, + scan_request_obj: dict, +) -> schema.Manifest: + # get the container info ready + scanner_wrapper = SkymapScannerK8sWrapper( + docker_tag=scan_request_obj["docker_tag"], + scan_id=scan_id, + # server + scanner_server_memory_bytes=scan_request_obj["scanner_server_memory_bytes"], + reco_algo=scan_request_obj["reco_algo"], + nsides=scan_request_obj["nsides"], + is_real_event=scan_request_obj["real_or_simulated_event"] in REAL_CHOICES, + predictive_scanning_threshold=scan_request_obj["predictive_scanning_threshold"], + # cluster starter + starter_exc=str( # TODO - remove once tested in prod + scan_request_obj["classifiers"].get( + "__unstable_starter_exc", "clientmanager" + ) + ), + request_clusters=scan_request_obj["request_clusters"], + worker_memory_bytes=scan_request_obj["worker_memory_bytes"], + worker_disk_bytes=scan_request_obj["worker_disk_bytes"], + max_pixel_reco_time=scan_request_obj["max_pixel_reco_time"], + max_worker_runtime=scan_request_obj["max_worker_runtime"], + priority=scan_request_obj["priority"], + # universal + debug_mode=scan_request_obj["debug_mode"], + # env + 
rest_address=scan_request_obj["rest_address"], + skyscan_mq_client_timeout_wait_for_first_message=scan_request_obj[ + "skyscan_mq_client_timeout_wait_for_first_message" + ], + scanner_server_env_from_user=scan_request_obj["scanner_server_env_from_user"], + ) - # start later? - if enqueue: - # enqueue skymap scanner instance to be started in-time - try: - LOGGER.info(f"enqueuing k8s job for {scan_id=}") - await k8s.scan_backlog.enqueue( - scan_id, - scanner_wrapper.job_obj, - self.scan_backlog, - manifest.priority, - ) - except Exception as e: - LOGGER.exception(e) - raise web.HTTPError( - 400, - log_message="Failed to enqueue Kubernetes job for Scanner instance", - ) + # put in db (do before k8s start so if k8s fail, we can debug using db's info) + manifest = await manifests.post( + scan_request_obj["i3_event_id"], + scan_id, + scanner_wrapper.scanner_server_args, + scanner_wrapper.cluster_starter_args_list, + from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), + scan_request_obj["classifiers"], + scan_request_obj["priority"], + ) + + await designate_for_startup( + scan_id, + scanner_wrapper.job_obj, + scan_backlog, + scan_request_obj["priority"], + ) + + return manifest + + +# ----------------------------------------------------------------------------- + +class ScanRescanHandler(BaseSkyDriverHandler): # pylint: disable=W0223 + """Handles actions on copying a scan's manifest and starting that.""" + + ROUTE = r"/scan/(?P\w+)/actions/rescan$" + + @service_account_auth(roles=[USER_ACCT]) # type: ignore + async def post(self, scan_id: str) -> None: + arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) + # response args + arghand.add_argument( + "manifest_projection", + default=all_dc_fields(database.schema.Manifest), + type=str, + ) + args = arghand.parse_args() + + # generate unique scan_id + new_scan_id = uuid.uuid4().hex + + # grab the original requester's 'scan_request_obj' + doc = await self.scan_request_coll.find_one_and_update( + 
{"scan_id": scan_id}, + {"$push": {"rescan_ids": new_scan_id}}, + return_document=ReturnDocument.AFTER, + ) + # -> backup plan: was this scan_id actually a rescan itself? + if not doc: + doc = await self.scan_request_coll.find_one_and_update( + {"rescan_ids": scan_id}, # one in a list + {"$push": {"rescan_ids": new_scan_id}}, + return_document=ReturnDocument.AFTER, + ) + # -> error: couldn't find it anywhere + if not doc: + raise web.HTTPError( + 404, + log_message="Could not find original scan-request information to start a rescan", + ) + scan_request_obj = pickle.loads(doc["scan_request_obj_pkl"]) + + # add to 'classifiers' so the user has provenance info + scan_request_obj["classifiers"].update( + {"rescan": True, "origin_scan_id": scan_id} + ) + + # go! + manifest = await _start_scan( + self.manifests, + self.scan_backlog, + new_scan_id, + scan_request_obj, + ) self.write( - dict_projection(dc.asdict(manifest), manifest_projection), + dict_projection(dc.asdict(manifest), args.manifest_projection), ) @@ -608,26 +716,24 @@ class ScanHandler(BaseSkyDriverHandler): # pylint: disable=W0223 @service_account_auth(roles=[USER_ACCT]) # type: ignore async def delete(self, scan_id: str) -> None: """Abort a scan and/or mark manifest & result as "deleted".""" - delete_completed_scan = self.get_argument( + arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) + arghand.add_argument( "delete_completed_scan", default=False, type=bool, ) - # response args - manifest_projection = self.get_argument( + arghand.add_argument( "manifest_projection", - default=( - all_dc_fields(database.schema.Manifest) - - DEFAULT_EXCLUDED_MANIFEST_FIELDS - ), - type=set[str], + default=all_dc_fields(database.schema.Manifest), + type=str, ) + args = arghand.parse_args() # check DB states manifest = await self.manifests.get(scan_id, True) if ( - manifest.ewms_task.complete and not delete_completed_scan + manifest.ewms_task.complete and not args.delete_completed_scan ): # workforce is 
done msg = "Attempted to delete a completed scan (must use `delete_completed_scan=True`)" raise web.HTTPError( @@ -650,7 +756,9 @@ async def delete(self, scan_id: str) -> None: self.write( { - "manifest": dict_projection(dc.asdict(manifest), manifest_projection), + "manifest": dict_projection( + dc.asdict(manifest), args.manifest_projection + ), "result": result_dict, } ) @@ -658,34 +766,24 @@ async def delete(self, scan_id: str) -> None: @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get manifest & result.""" - incl_del = self.get_argument( + arghand = ArgumentHandler(ArgumentSource.QUERY_ARGUMENTS, self) + arghand.add_argument( "include_deleted", default=False, type=bool, ) - # # response args - # manifest_projection = self.get_argument( - # "manifest_projection", - # default=( - # all_dc_fields(database.schema.Manifest) - # - DEFAULT_EXCLUDED_MANIFEST_FIELDS - # ), - # type=set[str], - # ) - manifest_projection = ( - all_dc_fields(database.schema.Manifest) - DEFAULT_EXCLUDED_MANIFEST_FIELDS - ) + args = arghand.parse_args() result, manifest = await get_result_safely( self.manifests, self.results, scan_id, - incl_del, + args.include_deleted, ) self.write( { - "manifest": dict_projection(dc.asdict(manifest), manifest_projection), + "manifest": dc.asdict(manifest), "result": dc.asdict(result) if result else {}, } ) @@ -702,28 +800,53 @@ class ScanManifestHandler(BaseSkyDriverHandler): # pylint: disable=W0223 @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get scan progress.""" - incl_del = self.get_argument( + arghand = ArgumentHandler(ArgumentSource.QUERY_ARGUMENTS, self) + arghand.add_argument( "include_deleted", default=False, type=bool, ) - # # response args - # manifest_projection = self.get_argument( - # "manifest_projection", - # default=all_dc_fields(database.schema.Manifest), - # type=set[str], - # ) + # 
response args + arghand.add_argument( + "projection", + default=all_dc_fields(database.schema.Manifest), + type=str, + ) + args = arghand.parse_args() - manifest = await self.manifests.get(scan_id, incl_del) + # get manifest from db + manifest = await self.manifests.get(scan_id, args.include_deleted) - self.write( - # dict_projection(dc.asdict(manifest), manifest_projection), - dc.asdict(manifest) - ) + # Backward Compatibility for Skymap Scanner: + # Include the whole event dict in the response like the 'old' manifest. + # This overrides the manifest's field which should be an id. + if ( + self.auth_roles[0] == SKYMAP_SCANNER_ACCT # type: ignore + and "event_i3live_json_dict" in args.projection + and manifest.i3_event_id # if no id, then event already in manifest + ): + if i3event_doc := await self.i3_event_coll.find_one( + {"i3_event_id": manifest.i3_event_id} + ): + manifest.event_i3live_json_dict = i3event_doc["json_dict"] + else: # this would mean the event was removed from the db + error_msg = ( + f"No i3 event document found with id '{manifest.i3_event_id}'" + f"--if other fields are wanted, re-request using 'projection'" + ) + raise web.HTTPError( + 404, + log_message=error_msg, + reason=error_msg, + ) + + resp = dict_projection(dc.asdict(manifest), args.projection) + self.write(resp) @service_account_auth(roles=[SKYMAP_SCANNER_ACCT]) # type: ignore async def patch(self, scan_id: str) -> None: """Update scan progress.""" + arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) T = TypeVar("T") @@ -733,35 +856,36 @@ def from_dict_wrapper_or_none(data_class: Type[T], val: Any) -> T | None: try: return from_dict(data_class, val) except DaciteError as e: - raise ValueError(str(e)) + raise argparse.ArgumentTypeError(str(e)) - progress = self.get_argument( + arghand.add_argument( "progress", type=lambda x: from_dict_wrapper_or_none(database.schema.Progress, x), default=None, ) - event_metadata = self.get_argument( + arghand.add_argument( 
"event_metadata", type=lambda x: from_dict_wrapper_or_none(database.schema.EventMetadata, x), default=None, ) - scan_metadata: database.schema.StrDict = self.get_argument( + arghand.add_argument( "scan_metadata", type=dict, default={}, ) - cluster = self.get_argument( + arghand.add_argument( "cluster", type=lambda x: from_dict_wrapper_or_none(database.schema.Cluster, x), default=None, ) + args = arghand.parse_args() manifest = await self.manifests.patch( scan_id, - progress, - event_metadata, - scan_metadata, - cluster, + args.progress, + args.event_metadata, + args.scan_metadata, + args.cluster, ) # NOTE - the following will be moved to TMS, then improved @@ -804,6 +928,48 @@ def from_dict_wrapper_or_none(data_class: Type[T], val: Any) -> T | None: # ----------------------------------------------------------------------------- +class ScanI3EventHandler(BaseSkyDriverHandler): # pylint: disable=W0223 + """Handles grabbing i3 events using scan ids.""" + + ROUTE = r"/scan/(?P\w+)/i3-event$" + + @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore + async def get(self, scan_id: str) -> None: + """Get scan's i3 event.""" + manifest = await self.manifests.get(scan_id, True) + + # look up event in collection + if manifest.i3_event_id: + doc = await self.i3_event_coll.find_one( + {"i3_event_id": manifest.i3_event_id} + ) + if doc: + i3_event = doc["json_dict"] + else: # this would mean the event was removed from the db + error_msg = ( + f"No i3 event document found with id '{manifest.i3_event_id}'" + ) + raise web.HTTPError( + 404, + log_message=error_msg, + reason=error_msg, + ) + # unless, this is an old scan -- where the whole dict was stored w/ the manifest + else: + i3_event = manifest.event_i3live_json_dict + + self.write({"i3_event": i3_event}) + + # + # NOTE - handler needs to stay user-read-only + # + # FUTURE - add delete? 
+ # + + +# ----------------------------------------------------------------------------- + + class ScanResultHandler(BaseSkyDriverHandler): # pylint: disable=W0223 """Handles actions on persisted scan results.""" @@ -812,17 +978,19 @@ class ScanResultHandler(BaseSkyDriverHandler): # pylint: disable=W0223 @service_account_auth(roles=[USER_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get a scan's persisted result.""" - incl_del = self.get_argument( + arghand = ArgumentHandler(ArgumentSource.QUERY_ARGUMENTS, self) + arghand.add_argument( "include_deleted", default=False, type=bool, ) + args = arghand.parse_args() result, _ = await get_result_safely( self.manifests, self.results, scan_id, - incl_del, + args.include_deleted, ) self.write(dc.asdict(result) if result else {}) @@ -830,21 +998,25 @@ async def get(self, scan_id: str) -> None: @service_account_auth(roles=[SKYMAP_SCANNER_ACCT]) # type: ignore async def put(self, scan_id: str) -> None: """Put (persist) a scan's result.""" - skyscan_result: dict[str, Any] = self.get_argument( + arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) + arghand.add_argument( "skyscan_result", - type=dict, - strict_type=True, + type=_arg_dict_strict, + ) + arghand.add_argument( + "is_final", + type=bool, ) - is_final = self.get_argument("is_final", type=bool) + args = arghand.parse_args() - if not skyscan_result: + if not args.skyscan_result: self.write({}) return result_dc = await self.results.put( scan_id, - skyscan_result, - is_final, + args.skyscan_result, + args.is_final, ) self.write(dc.asdict(result_dc)) @@ -853,7 +1025,7 @@ async def put(self, scan_id: str) -> None: # AFTER RESPONSE # # when we get the final result, it's time to tear down - if is_final: + if args.is_final: await asyncio.sleep( WAIT_BEFORE_TEARDOWN ) # regular time.sleep() sleeps the entire server @@ -871,21 +1043,23 @@ class ScanStatusHandler(BaseSkyDriverHandler): # pylint: disable=W0223 
@service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get a scan's status.""" - include_pod_statuses = self.get_argument( + arghand = ArgumentHandler(ArgumentSource.QUERY_ARGUMENTS, self) + arghand.add_argument( "include_pod_statuses", type=bool, default=False, ) + args = arghand.parse_args() manifest = await self.manifests.get(scan_id, incl_del=True) # get pod status pods_411: dict[str, Any] = {} - if include_pod_statuses: + if args.include_pod_statuses: try: pods_411["pod_status"] = k8s.utils.KubeAPITools.get_pod_status( self.k8s_batch_api, - k8s.scanner_instance.SkymapScannerK8sWrapper.get_job_name(scan_id), + SkymapScannerK8sWrapper.get_job_name(scan_id), ENV.K8S_NAMESPACE, ) pods_411["pod_message"] = "retrieved" @@ -906,7 +1080,7 @@ async def get(self, scan_id: str) -> None: "pods": pods_411, "clusters": [dc.asdict(c) for c in manifest.ewms_task.clusters], } - if not include_pod_statuses: + if not args.include_pod_statuses: resp.pop("pods") self.write(resp) @@ -919,7 +1093,7 @@ async def get(self, scan_id: str) -> None: class ScanLogsHandler(BaseSkyDriverHandler): # pylint: disable=W0223 - """Handles relying logs for scans.""" + """Handles relaying logs for scans.""" ROUTE = r"/scan/(?P\w+)/logs$" @@ -929,7 +1103,7 @@ async def get(self, scan_id: str) -> None: try: pod_container_logs = k8s.utils.KubeAPITools.get_container_logs( self.k8s_batch_api, - k8s.scanner_instance.SkymapScannerK8sWrapper.get_job_name(scan_id), + SkymapScannerK8sWrapper.get_job_name(scan_id), ENV.K8S_NAMESPACE, ) pod_container_logs_message = "retrieved" diff --git a/skydriver/server.py b/skydriver/server.py index c0d20366..de1eebf8 100644 --- a/skydriver/server.py +++ b/skydriver/server.py @@ -1,6 +1,5 @@ """Root python script for SkyDriver REST API server interface.""" - import logging from typing import Any @@ -43,8 +42,10 @@ async def make( rest_handlers.MainHandler, rest_handlers.ScanHandler, 
rest_handlers.ScanManifestHandler, + rest_handlers.ScanI3EventHandler, rest_handlers.ScanResultHandler, rest_handlers.ScanLauncherHandler, + rest_handlers.ScanRescanHandler, rest_handlers.ScanStatusHandler, rest_handlers.ScanLogsHandler, ]: diff --git a/tests/integration/test_backlog_runner.py b/tests/integration/test_backlog_runner.py index 7b3c2661..0a9a4b64 100644 --- a/tests/integration/test_backlog_runner.py +++ b/tests/integration/test_backlog_runner.py @@ -1,6 +1,5 @@ """Integration tests for backlog runner.""" - # pylint: disable=redefined-outer-name import asyncio @@ -9,9 +8,10 @@ from unittest import mock from unittest.mock import Mock +from rest_tools.client import RestClient + import skydriver import skydriver.images # noqa: F401 # export -from rest_tools.client import RestClient skydriver.config.config_logging() @@ -97,6 +97,10 @@ async def test_10( if i in [1, 3]: print_it(await rc.request("DELETE", f"/scan/{resp['scan_id']}")) + # NOTE: KubeAPITools.start_job() should be called: + # 1x for each scan POST and 1x for each DELETE, + # *unless* the scan is deleted before the backlog starts it (then, just 1x) + # inspect print_it(await rc.request("GET", "/scans/backlog")) for i in range(N_JOBS - 2): diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index abc7f3ae..b05ec7f1 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -1,8 +1,7 @@ """Integration tests for the REST server.""" import asyncio -import hashlib -import json +import copy import logging import os import random @@ -93,14 +92,9 @@ async def _launch_scan( scan_id=resp["scan_id"], is_deleted=False, timestamp=resp["timestamp"], # see below - event_i3live_json_dict__hash=hashlib.md5( - json.dumps( - post_scan_body["event_i3live_json"], - sort_keys=True, - ensure_ascii=True, - ).encode("utf-8") - ).hexdigest(), - event_i3live_json_dict=post_scan_body["event_i3live_json"], + event_i3live_json_dict__hash=None, # 
field has been deprecated, always 'None' + event_i3live_json_dict="use 'i3_event_id'", # field has been deprecated + i3_event_id=resp["i3_event_id"], # see below event_metadata=None, scan_metadata=None, progress=None, @@ -117,6 +111,7 @@ async def _launch_scan( # TODO: check more fields in future (hint: ctrl+F this comment) ) assert RE_UUID4HEX.fullmatch(resp["scan_id"]) + assert RE_UUID4HEX.fullmatch(resp["i3_event_id"]) assert launch_time < resp["timestamp"] < resp["last_updated"] < time.time() # check args (avoid whitespace headaches...) @@ -239,9 +234,6 @@ async def _launch_scan( # get scan_id assert resp["scan_id"] - # remove fields usually not returned - assert resp.pop("event_i3live_json_dict") # remove to match with other requests - # assert resp["ewms_task"].pop("env_vars") # remove to match with other requests return resp # type: ignore[no-any-return] @@ -270,12 +262,12 @@ async def _do_patch( now = time.time() resp = await rc.request("PATCH", f"/scan/{scan_id}/manifest", body) - assert resp.pop("event_i3live_json_dict") # remove to match with other requests - # assert resp["ewms_task"].pop("env_vars") # remove to match with other requests assert resp == dict( scan_id=scan_id, is_deleted=False, timestamp=resp["timestamp"], # see below + i3_event_id=resp["i3_event_id"], # not checking + event_i3live_json_dict=resp["event_i3live_json_dict"], # not checking event_i3live_json_dict__hash=resp[ "event_i3live_json_dict__hash" ], # not checking @@ -305,7 +297,7 @@ async def _do_patch( else resp["ewms_task"]["clusters"] # not checking ), ), - classifiers=CLASSIFIERS, + classifiers=resp["classifiers"], # not checking last_updated=resp["last_updated"], # see below priority=0, # TODO: check more fields in future (hint: ctrl+F this comment) @@ -315,8 +307,6 @@ async def _do_patch( manifest = resp # keep around # query progress resp = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert resp.pop("event_i3live_json_dict") # remove to match with other requests - 
# assert resp["ewms_task"].pop("env_vars") # remove to match with other requests assert resp == manifest return manifest # type: ignore[no-any-return] @@ -467,8 +457,6 @@ async def _send_result( # query progress resp = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert resp.pop("event_i3live_json_dict") # remove to match with other requests - # assert resp["ewms_task"].pop("env_vars") # remove to match with other requests assert resp == last_known_manifest # query result @@ -556,8 +544,6 @@ async def _delete_scan( resp = await rc.request( "GET", f"/scan/{scan_id}/manifest", {"include_deleted": True} ) - assert resp.pop("event_i3live_json_dict") # remove to match with other requests - # assert resp["ewms_task"].pop("env_vars") # remove to match with other requests assert resp == del_resp["manifest"] # RESULT: query w/ scan id (fails) @@ -627,7 +613,8 @@ async def _delete_scan( "include_deleted": True, }, ) - assert [m["scan_id"] for m in resp["manifests"]] == [scan_id] + assert scan_id in [m["scan_id"] for m in resp["manifests"]] + # ^^^ not testing that this is unique b/c the event could've been re-ran (rescan) resp = await rc.request( "POST", "/scans/find", @@ -640,7 +627,8 @@ async def _delete_scan( }, }, ) - assert [m["scan_id"] for m in resp["manifests"]] == [scan_id] + assert scan_id in [m["scan_id"] for m in resp["manifests"]] + # ^^^ not testing that this is unique b/c the event could've been re-ran (rescan) def get_tms_args( @@ -703,7 +691,7 @@ def get_tms_args( ], ], ) -async def test_00( +async def test_000( clusters: list | dict, docker_tag_input: str, docker_tag_expected: str, @@ -714,9 +702,6 @@ async def test_00( """Test normal scan creation and retrieval.""" rc = server() - # - # LAUNCH SCAN - # manifest = await _launch_scan( rc, { @@ -726,7 +711,25 @@ async def test_00( }, get_tms_args(clusters, docker_tag_expected, known_clusters), ) + + await _after_scan_start_logic( + rc, + manifest, + clusters, + known_clusters, + 
test_wait_before_teardown, + ) + + +async def _after_scan_start_logic( + rc: RestClient, + manifest: dict, + clusters: list | dict, + known_clusters: dict, + test_wait_before_teardown: float, +): scan_id = manifest["scan_id"] + # follow-up query assert await rc.request("GET", f"/scan/{scan_id}/result") == {} resp = await rc.request("GET", f"/scan/{scan_id}") @@ -784,8 +787,6 @@ async def test_00( # wait as long as the server, so it'll mark as complete await asyncio.sleep(test_wait_before_teardown + 1) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert manifest.pop("event_i3live_json_dict") # remove to match with other requests - # assert manifest["ewms_task"].pop("env_vars") # remove to match with other requests assert manifest["ewms_task"]["complete"] # workforce is done # @@ -797,7 +798,122 @@ async def test_00( POST_SCAN_BODY_FOR_TEST_01 = dict(**POST_SCAN_BODY, cluster={"foobar": 1}) -async def test_01__bad_data( +def _assert_manifests_equal_with_normalization( + manifest_beta: dict, manifest_alpha: dict +): + """ + Asserts that specific keys in two manifests are equal after normalization. + Handles dynamically generated fields such as UUIDs and scan IDs. + + Args: + manifest_beta (dict): The first manifest to compare. + manifest_alpha (dict): The second manifest to compare. + + Raises: + AssertionError: If any of the specified keys are not equal after normalization. + """ + keys_to_compare = [ + "i3_event_id", + "ewms_task", + "priority", + "scanner_server_args", + ] + + def normalize_ewms_task(ewms_task: dict) -> dict: + """ + Normalizes the `ewms_task` dictionary by redacting specific dynamic sub-keys. 
+ """ + normalized = copy.deepcopy(ewms_task) + + # Normalize `env_vars.scanner_server` + for dicto in normalized["env_vars"]["scanner_server"]: + if dicto["name"] == "SKYSCAN_SKYDRIVER_SCAN_ID": + dicto["value"] = "" + # Normalize `env_vars.scanner_server` + for listo in normalized["env_vars"]["tms_starters"]: + for dicto in listo: + if dicto["name"] == "SKYSCAN_SKYDRIVER_SCAN_ID": + dicto["value"] = "" + + # Normalize `tms_args` + normalized["tms_args"] = [ + re.sub(r"--uuid [a-f0-9\-]+", "--uuid ", arg) + for arg in normalized["tms_args"] + ] + + return normalized + + for key in keys_to_compare: + if key == "ewms_task": + normalized_beta = normalize_ewms_task(manifest_beta[key]) + normalized_alpha = normalize_ewms_task(manifest_alpha[key]) + assert normalized_beta == normalized_alpha, ( + f"Mismatch in key '{key}':\n" + f"Beta: {normalized_beta}\n" + f"Alpha: {normalized_alpha}" + ) + else: + assert manifest_beta[key] == manifest_alpha[key], ( + f"Mismatch in key '{key}':\n" + f"Beta: {manifest_beta.get(key)}\n" + f"Alpha: {manifest_alpha.get(key)}" + ) + + assert manifest_beta["timestamp"] > manifest_alpha["timestamp"] + + +async def test_010__rescan( + server: Callable[[], RestClient], + known_clusters: dict, + test_wait_before_teardown: float, +) -> None: + rc = server() + + clusters = {"foobar": 1, "a-schedd": 999, "cloud": 4568} + + # OG SCAN + manifest_alpha = await _launch_scan( + rc, + { + **POST_SCAN_BODY, + "docker_tag": "3.4.0", + "cluster": clusters, + }, + get_tms_args(clusters, "3.4.0", known_clusters), + ) + await _after_scan_start_logic( + rc, + manifest_alpha, + clusters, + known_clusters, + test_wait_before_teardown, + ) + + # RESCAN + manifest_beta = await rc.request( + "POST", + f"/scan/{manifest_alpha['scan_id']}/actions/rescan", + ) + # compare manifests + assert manifest_beta["classifiers"] == { + **manifest_alpha["classifiers"], + **{"rescan": True, "origin_scan_id": manifest_alpha["scan_id"]}, + } + 
_assert_manifests_equal_with_normalization(manifest_beta, manifest_alpha) + # continue on... + await _after_scan_start_logic( + rc, + manifest_beta, + clusters, + known_clusters, + test_wait_before_teardown, + ) + + +######################################################################################## + + +async def test_100__bad_data( server: Callable[[], RestClient], known_clusters: dict, test_wait_before_teardown: float, @@ -821,7 +937,12 @@ async def test_01__bad_data( # # empty body with pytest.raises( requests.exceptions.HTTPError, - match=rf"400 Client Error: `\w+`: \(MissingArgumentError\) required argument is missing for url: {rc.address}/scan", + match=re.escape( + f"400 Client Error: the following arguments are required: " + f"docker_tag, cluster, reco_algo, event_i3live_json, nsides, " + f"real_or_simulated_event, max_pixel_reco_time " + f"for url: {rc.address}/scan" + ), ) as e: await rc.request("POST", "/scan", {}) print(e.value) @@ -836,7 +957,7 @@ async def test_01__bad_data( print(f"{arg}: [{bad_val}]") with pytest.raises( requests.exceptions.HTTPError, - match=rf"400 Client Error: `{arg}`: \(ValueError\) .+ for url: {rc.address}/scan", + match=rf"400 Client Error: argument {arg}: .+ for url: {rc.address}/scan", ) as e: await rc.request( "POST", "/scan", {**POST_SCAN_BODY_FOR_TEST_01, arg: bad_val} @@ -852,7 +973,7 @@ async def test_01__bad_data( print(f"[{bad_val}]") with pytest.raises( requests.exceptions.HTTPError, - match=rf"400 Client Error: `cluster`: \(ValueError\) .+ for url: {rc.address}/scan", + match=rf"400 Client Error: argument cluster: .+ for url: {rc.address}/scan", ) as e: await rc.request( "POST", "/scan", {**POST_SCAN_BODY_FOR_TEST_01, "cluster": bad_val} @@ -864,7 +985,10 @@ async def test_01__bad_data( print(arg) with pytest.raises( requests.exceptions.HTTPError, - match=rf"400 Client Error: `{arg}`: \(MissingArgumentError\) required argument is missing for url: {rc.address}/scan", + match=re.escape( + f"400 Client Error: the 
following arguments are required: {arg} " + f"for url: {rc.address}/scan" + ), ) as e: # remove arg from body await rc.request( @@ -876,7 +1000,7 @@ async def test_01__bad_data( # # bad docker tag with pytest.raises( requests.exceptions.HTTPError, - match=rf"400 Client Error: `docker_tag`: \(ValueError\) .+ for url: {rc.address}/scan", + match=rf"400 Client Error: argument docker_tag: invalid type for url: {rc.address}/scan", ) as e: await rc.request( "POST", "/scan", {**POST_SCAN_BODY_FOR_TEST_01, "docker_tag": "foo"} @@ -954,7 +1078,7 @@ async def test_01__bad_data( for bad_val in ["Done", ["a", "b", "c"]]: # type: ignore[assignment] with pytest.raises( requests.exceptions.HTTPError, - match=rf"400 Client Error: `progress`: \(ValueError\) missing value for field .* for url: {rc.address}/scan/{scan_id}/manifest", + match=rf"400 Client Error: argument progress: missing value for field .* for url: {rc.address}/scan/{scan_id}/manifest", ) as e: await rc.request( "PATCH", f"/scan/{scan_id}/manifest", {"progress": bad_val} @@ -982,7 +1106,9 @@ async def test_01__bad_data( with pytest.raises( requests.exceptions.HTTPError, match=re.escape( - f"400 Client Error: `skyscan_result`: (MissingArgumentError) required argument is missing for url: {rc.address}/scan/{scan_id}/result" + f"400 Client Error: the following arguments are required: " + f"skyscan_result, is_final " + f"for url: {rc.address}/scan/{scan_id}/result" ), ) as e: await rc.request("PUT", f"/scan/{scan_id}/result", {}) @@ -998,7 +1124,8 @@ async def test_01__bad_data( with pytest.raises( requests.exceptions.HTTPError, match=re.escape( - f"400 Client Error: `skyscan_result`: (ValueError) type mismatch: 'dict' (value is '{type(bad_val)}') for url: {rc.address}/scan/{scan_id}/result" + f"400 Client Error: argument skyscan_result: arg must be a dict " + f"for url: {rc.address}/scan/{scan_id}/result" ), ) as e: await rc.request( @@ -1013,8 +1140,6 @@ async def test_01__bad_data( # wait as long as the server, so 
it'll mark as complete await asyncio.sleep(test_wait_before_teardown) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert manifest.pop("event_i3live_json_dict") # remove to match with other requests - # assert manifest["ewms_task"].pop("env_vars") # remove to match with other requests assert manifest["ewms_task"]["complete"] # workforce is done # diff --git a/tests/unit/test_sanity.py b/tests/unit/test_sanity.py deleted file mode 100644 index cf651d06..00000000 --- a/tests/unit/test_sanity.py +++ /dev/null @@ -1,35 +0,0 @@ -"""Test that everything is where we think it is.""" - -import inspect - -from rest_tools.server import RestHandler -from skydriver import rest_handlers - - -def test_00__rest_handlers() -> None: - """Dir-check all the REST handlers.""" - - known_handlers = { - rest_handlers.MainHandler: r"/$", - rest_handlers.ScansFindHandler: r"/scans/find$", - rest_handlers.ScanBacklogHandler: r"/scans/backlog$", - rest_handlers.ScanLauncherHandler: r"/scan$", - rest_handlers.ScanHandler: r"/scan/(?P\w+)$", - rest_handlers.ScanManifestHandler: r"/scan/(?P\w+)/manifest$", - rest_handlers.ScanResultHandler: r"/scan/(?P\w+)/result$", - rest_handlers.ScanStatusHandler: r"/scan/(?P\w+)/status$", - rest_handlers.ScanLogsHandler: r"/scan/(?P\w+)/logs$", - } - - # search for all known handlers - for handler, route in known_handlers.items(): - assert handler.ROUTE == route # type: ignore[attr-defined] # base type does not have ROUTE - - # find - for _, klass in inspect.getmembers( - rest_handlers, - predicate=lambda x: ( - inspect.isclass(x) and issubclass(x, RestHandler) and x != RestHandler - ), - ): - assert klass in known_handlers or klass == rest_handlers.BaseSkyDriverHandler From ecf703630a81c432f69304bac7eac27eff4879db Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 19 Dec 2024 09:56:15 -0600 Subject: [PATCH 002/327] rm `ewms_sidecar` --- ewms_sidecar/__init__.py | 16 -- ewms_sidecar/__main__.py | 13 -- ewms_sidecar/condor/__init__.py | 
3 - ewms_sidecar/condor/act.py | 87 --------- ewms_sidecar/condor/condor_tools.py | 47 ----- ewms_sidecar/condor/starter.py | 203 --------------------- ewms_sidecar/condor/watcher.py | 266 ---------------------------- ewms_sidecar/config.py | 49 ----- ewms_sidecar/ewms_sidecar.py | 168 ------------------ ewms_sidecar/py.typed | 0 ewms_sidecar/utils.py | 138 --------------- 11 files changed, 990 deletions(-) delete mode 100644 ewms_sidecar/__init__.py delete mode 100644 ewms_sidecar/__main__.py delete mode 100644 ewms_sidecar/condor/__init__.py delete mode 100644 ewms_sidecar/condor/act.py delete mode 100644 ewms_sidecar/condor/condor_tools.py delete mode 100644 ewms_sidecar/condor/starter.py delete mode 100644 ewms_sidecar/condor/watcher.py delete mode 100644 ewms_sidecar/config.py delete mode 100644 ewms_sidecar/ewms_sidecar.py delete mode 100644 ewms_sidecar/py.typed delete mode 100644 ewms_sidecar/utils.py diff --git a/ewms_sidecar/__init__.py b/ewms_sidecar/__init__.py deleted file mode 100644 index a5b4c4b9..00000000 --- a/ewms_sidecar/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Public init.""" - -# version is a human-readable version number. - -# version_info is a four-tuple for programmatic comparison. The first -# three numbers are the components of the version number. The fourth -# is zero for an official release, positive for a development branch, -# or negative for a release candidate or beta (after the base version -# number has been incremented) -__version__ = "1.1.0" -version_info = ( - int(__version__.split(".")[0]), - int(__version__.split(".")[1]), - int(__version__.split(".")[2]), - 0, -) diff --git a/ewms_sidecar/__main__.py b/ewms_sidecar/__main__.py deleted file mode 100644 index cda4d1ad..00000000 --- a/ewms_sidecar/__main__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Entry-point to start up EWMS Sidecar.""" - - -import logging - -from . 
import ewms_sidecar - -LOGGER = logging.getLogger(__name__) - - -if __name__ == "__main__": - ewms_sidecar.main() - LOGGER.info("Done.") diff --git a/ewms_sidecar/condor/__init__.py b/ewms_sidecar/condor/__init__.py deleted file mode 100644 index a526f44f..00000000 --- a/ewms_sidecar/condor/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Init.""" - -from .act import act # noqa: F401 diff --git a/ewms_sidecar/condor/act.py b/ewms_sidecar/condor/act.py deleted file mode 100644 index 9fed4419..00000000 --- a/ewms_sidecar/condor/act.py +++ /dev/null @@ -1,87 +0,0 @@ -"""The post-argparse entry point for condor actions.""" - - -import argparse -import logging - -import htcondor # type: ignore[import-untyped] - -from .. import utils -from ..config import ENV -from . import condor_tools, starter, watcher - -LOGGER = logging.getLogger(__name__) - - -def act(args: argparse.Namespace) -> None: - """Do the action.""" - htcondor.set_subsystem("TOOL") - htcondor.param["TOOL_DEBUG"] = "D_FULLDEBUG" - # htcondor.param["TOOL_LOG"] = "log.txt" - # htcondor.enable_log() - htcondor.enable_debug() - - # condor auth & go - with htcondor.SecMan() as secman: - secman.setToken(htcondor.Token(ENV.CONDOR_TOKEN)) - schedd_obj = condor_tools.get_schedd_obj(args.collector, args.schedd) - _act(args, schedd_obj) - - -def _act(args: argparse.Namespace, schedd_obj: htcondor.Schedd) -> None: - LOGGER.info( - f"Starting {args.n_workers} Skymap Scanner client workers on {args.collector} / {args.schedd}" - ) - # make connections -- do now so we don't have any surprises downstream - skydriver_rc = utils.connect_to_skydriver() - # start - submit_dict = starter.prep( - spool=args.spool, - # starter CL args -- worker - worker_memory_bytes=args.worker_memory_bytes, - worker_disk_bytes=args.worker_disk_bytes, - n_cores=args.n_cores, - max_worker_runtime=args.max_worker_runtime, - priority=args.priority, - # starter CL args -- client - client_args=args.client_args, - 
client_startup_json_s3=utils.s3ify(args.client_startup_json), - image=args.image, - ) - # final checks - if args.dryrun: - LOGGER.critical("Script Aborted: dryrun enabled") - return - if utils.skydriver_aborted_scan(skydriver_rc): - LOGGER.critical("Script Aborted: SkyDriver aborted scan") - return - # start - submit_result_obj = starter.start( - schedd_obj=schedd_obj, - n_workers=args.n_workers, - submit_dict=submit_dict, - spool=args.spool, - ) - # report to SkyDriver - skydriver_cluster_obj = dict( - orchestrator="condor", - location={ - "collector": args.collector, - "schedd": args.schedd, - }, - uuid=args.uuid, - cluster_id=submit_result_obj.cluster(), - n_workers=submit_result_obj.num_procs(), - starter_info=submit_dict, - ) - utils.update_skydriver(skydriver_rc, **skydriver_cluster_obj) - LOGGER.info("Sent cluster info to SkyDriver") - watcher.watch( - args.collector, - args.schedd, - submit_result_obj.cluster(), - schedd_obj, - submit_result_obj.num_procs(), - skydriver_rc, - skydriver_cluster_obj, - ) diff --git a/ewms_sidecar/condor/condor_tools.py b/ewms_sidecar/condor/condor_tools.py deleted file mode 100644 index 3bc4aed6..00000000 --- a/ewms_sidecar/condor/condor_tools.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Util functions wrapping common htcondor actions.""" - - -import logging - -import htcondor # type: ignore[import-untyped] - -LOGGER = logging.getLogger(__name__) - - -def get_schedd_obj(collector: str, schedd: str) -> htcondor.Schedd: - """Get object for talking with HTCondor schedd. 
- - Examples: - `collector = "foo-bar.icecube.wisc.edu"` - `schedd = "baz.icecube.wisc.edu"` - """ - schedd_ad = htcondor.Collector(collector).locate( # ~> exception - htcondor.DaemonTypes.Schedd, schedd - ) - schedd_obj = htcondor.Schedd(schedd_ad) - LOGGER.info(f"Connected to Schedd {collector=} {schedd=}") - return schedd_obj - - -IDLE = 1 -RUNNING = 2 -REMOVED = 3 -COMPLETED = 4 -HELD = 5 -TRANSFERRING_OUTPUT = 6 -SUSPENDED = 7 - -_STATUS_MAPPING = { - IDLE: "Idle", - RUNNING: "Running", - REMOVED: "Removed", - COMPLETED: "Completed", - HELD: "Held", - TRANSFERRING_OUTPUT: "Transferring Output", - SUSPENDED: "Suspended", -} - - -def job_status_to_str(status_code: int) -> str: - """Get the human-readable string for the job status int.""" - return _STATUS_MAPPING.get(status_code, f"Invalid status code: {status_code}") diff --git a/ewms_sidecar/condor/starter.py b/ewms_sidecar/condor/starter.py deleted file mode 100644 index 6ce17f20..00000000 --- a/ewms_sidecar/condor/starter.py +++ /dev/null @@ -1,203 +0,0 @@ -"""For starting Skymap Scanner clients on an HTCondor cluster.""" - - -import logging -from pathlib import Path -from typing import Any - -import htcondor # type: ignore[import-untyped] -import humanfriendly - -from ..config import ENV, FORWARDED_ENV_VARS -from ..utils import S3File - -LOGGER = logging.getLogger(__name__) - - -def make_condor_logs_dir() -> Path: - """Make the condor logs subdirectory.""" - dpath = Path("tms-cluster") - dpath.mkdir(parents=True) - LOGGER.info(f"HTCondor will write log files to {dpath}") - return dpath - - -def make_condor_job_description( - spool: bool, - # condor args - worker_memory_bytes: int, - worker_disk_bytes: int, - n_cores: int, - max_worker_runtime: int, - priority: int, - # skymap scanner args - image: str, - client_startup_json_s3: S3File, - client_args_string: str, -) -> dict[str, Any]: - """Make the condor job description (dict).""" - - # NOTE: - # In the newest version of condor we could use: - # universe = 
container - # container_image = ... - # arguments = python -m ... - # But for now, we're stuck with: - # executable = ... - # +SingularityImage = ... - # arguments = /usr/local/icetray/env-shell.sh python -m ... - # Because "this universe doesn't know how to do the - # entrypoint, and loading the icetray env file - # directly from cvmfs messes up the paths" -DS - - # Build the environment specification for condor - env_vars = ["EWMS_PILOT_HTCHIRP=True"] - # EWMS_* are inherited via condor `getenv`, but we have default in case these are not set. - if not ENV.EWMS_PILOT_QUARANTINE_TIME: - env_vars.append("EWMS_PILOT_QUARANTINE_TIME=1800") - # The container sets I3_DATA to /opt/i3-data, however `millipede_wilks` requires files (spline tables) that are not available in the image. For the time being we require CVFMS and we load I3_DATA from there. In order to override the environment variables we need to prepend APPTAINERENV_ or SINGULARITYENV_ to the variable name. There are site-dependent behaviour but these two should cover all cases. See https://github.com/icecube/skymap_scanner/issues/135#issuecomment-1449063054. 
- for prefix in ["APPTAINERENV_", "SINGULARITYENV_"]: - env_vars.append(f"{prefix}I3_DATA=/cvmfs/icecube.opensciencegrid.org/data") - environment = " ".join(env_vars) - - # write - submit_dict = { - "executable": "/bin/bash", - "arguments": f"/usr/local/icetray/env-shell.sh python -m skymap_scanner.client {client_args_string} --client-startup-json ./{client_startup_json_s3.fname}", - "+SingularityImage": f'"{image}"', # must be quoted - "Requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", - "getenv": ", ".join(FORWARDED_ENV_VARS), - "environment": f'"{environment}"', # must be quoted - "+FileSystemDomain": '"blah"', # must be quoted - # - "should_transfer_files": "YES", - "transfer_input_files": client_startup_json_s3.url, - "transfer_output_files": '""', # must be quoted for "none" - # - # Don't transfer executable (/bin/bash) in case of - # version (dependency) mismatch. - # Ex: - # "/lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.36' not found" - # Technically this is just needed for spooling -- since if - # we don't spool, the executable (/bin/bash) can't be - # transferred anyway and so a local version will be used - "transfer_executable": "false", - # - "request_cpus": str(n_cores), - "request_memory": humanfriendly.format_size( # 1073741824 -> "1 GiB" -> "1 GB" - worker_memory_bytes, binary=True - ).replace("i", ""), - "request_disk": humanfriendly.format_size( # 1073741824 -> "1 GiB" -> "1 GB" - worker_disk_bytes, binary=True - ).replace("i", ""), - "priority": int(priority), - "+WantIOProxy": "true", # for HTChirp - "+OriginalTime": max_worker_runtime, # Execution time limit -- 1 hour default on OSG - } - - # outputs - if spool: - # this is the location where the files will go when/if *returned here* - logs_dir = make_condor_logs_dir() - submit_dict.update( - { - "output": str(logs_dir / "tms-worker-$(ProcId).out"), - "error": str(logs_dir / "tms-worker-$(ProcId).err"), - "log": str(logs_dir / "tms-cluster.log"), - } - ) - # 
https://htcondor.readthedocs.io/en/latest/users-manual/file-transfer.html#specifying-if-and-when-to-transfer-files - submit_dict.update( - { - "transfer_output_files": ",".join( - [ - submit_dict["output"], # type: ignore[list-item] - submit_dict["error"], # type: ignore[list-item] - submit_dict["log"], # type: ignore[list-item] - ] - ), - "when_to_transfer_output": "ON_EXIT_OR_EVICT", - } - ) - else: - # NOTE: this needs to be removed if we ARE transferring files - submit_dict["initialdir"] = "/tmp" - - return submit_dict - - -def prep( - # starter CL args -- helper - spool: bool, - # starter CL args -- worker - worker_memory_bytes: int, - worker_disk_bytes: int, - n_cores: int, - max_worker_runtime: int, - priority: int, - # starter CL args -- client - client_args: list[tuple[str, str]], - client_startup_json_s3: S3File, - image: str, -) -> dict[str, Any]: - """Create objects needed for starting cluster.""" - - # get client args - client_args_string = "" - if client_args: - for carg, value in client_args: - client_args_string += f" --{carg} {value} " - LOGGER.info(f"Client Args: {client_args}") - if "--client-startup-json" in client_args_string: - raise RuntimeError( - "The '--client-args' arg cannot include \"--client-startup-json\". " - "This needs to be given to this script explicitly ('--client-startup-json')." 
- ) - - # make condor job description - submit_dict = make_condor_job_description( - spool, - # condor args - worker_memory_bytes, - worker_disk_bytes, - n_cores, - max_worker_runtime, - priority, - # skymap scanner args - image, - client_startup_json_s3, - client_args_string, - ) - LOGGER.info(submit_dict) - - return submit_dict - - -def start( - schedd_obj: htcondor.Schedd, - n_workers: int, - # - submit_dict: dict[str, Any], - spool: bool, -) -> htcondor.SubmitResult: - """Start cluster.""" - submit_obj = htcondor.Submit(submit_dict) - LOGGER.info(submit_obj) - - # submit - submit_result_obj = schedd_obj.submit( - submit_obj, - count=n_workers, # submit N workers - spool=spool, # for transferring logs & files - ) - LOGGER.info(submit_result_obj) - if spool: - jobs = list( - submit_obj.jobs( - count=n_workers, - clusterid=submit_result_obj.cluster(), - ) - ) - schedd_obj.spool(jobs) - - return submit_result_obj diff --git a/ewms_sidecar/condor/watcher.py b/ewms_sidecar/condor/watcher.py deleted file mode 100644 index 46d2475c..00000000 --- a/ewms_sidecar/condor/watcher.py +++ /dev/null @@ -1,266 +0,0 @@ -"""For watching Skymap Scanner clients on an HTCondor cluster.""" - - -import collections -import logging -import time -from pprint import pformat -from typing import Any, Iterator - -import htcondor # type: ignore[import-untyped] -from rest_tools.client import RestClient - -from .. import utils -from ..config import WATCHER_INTERVAL, WATCHER_MAX_RUNTIME, WATCHER_N_TOP_TASK_ERRORS -from . 
import condor_tools as ct - -LOGGER = logging.getLogger(__name__) - - -PROJECTION = [ - "ClusterId", - "JobStatus", - "EnteredCurrentStatus", - "ProcId", - # - "HoldReason", - "HoldReasonCode", - "HoldReasonSubCode", - # - "HTChirpEWMSPilotLastUpdatedTimestamp", - "HTChirpEWMSPilotStartedTimestamp", - "HTChirpEWMSPilotStatus", - # - "HTChirpEWMSPilotTasksTotal", - "HTChirpEWMSPilotTasksFailed", - "HTChirpEWMSPilotTasksSuccess", - # - "HTChirpEWMSPilotError", - "HTChirpEWMSPilotErrorTraceback", -] - - -DONE_JOB_STATUSES: list[int] = [ - ct.REMOVED, - ct.COMPLETED, - ct.HELD, -] -NON_RESPONSE_LIMIT = 10 - - -def _translate_special_attrs(job_ad: dict[str, Any]) -> None: - """Special handling for specific attrs.""" - for attr in job_ad: - if attr.startswith("HTChirp"): - # unquote - if isinstance(job_ad[attr], str): - try: - job_ad[attr] = htcondor.classad.unquote(job_ad[attr]) - except Exception: - # LOGGER.error(f"could not unquote: {job[attr]}") - # LOGGER.exception(e) - pass - try: - job_ad["JobStatus"] = int(job_ad["JobStatus"]) - except Exception as e: - LOGGER.exception(e) - - -def update_stored_job_infos( - job_infos: dict[int, dict[str, Any]], - classad: Any, - source: str, -) -> None: - """Update the job's classad attrs in `job_infos`.""" - procid = int(classad["ProcId"]) - job_infos[procid]["source"] = source - job_infos[procid].update(dict(classad)) # start with everything - _translate_special_attrs(job_infos[procid]) - - -def iter_job_classads( - schedd_obj: htcondor.Schedd, - constraint: str, - projection: list[str], -) -> Iterator[tuple[htcondor.classad.ClassAd, str]]: - """Get the job class ads, trying various sources. - - May not get all of them. 
- """ - for call in [ - schedd_obj.query, - schedd_obj.history, - schedd_obj.jobEpochHistory, - ]: - try: - for classad in call(constraint, projection): - if "ProcId" not in classad: - continue - # LOGGER.info(f"looking at job {classad['ProcId']}") - # LOGGER.debug(str(call)) - # LOGGER.debug(classad) - yield classad, call.__name__ - except Exception as e: - LOGGER.exception(e) - - -def get_aggregate_statuses( - job_infos: dict[int, dict[str, Any]], - previous: dict[str, dict[str, int]], -) -> tuple[dict[str, dict[str, int]], bool]: - """Aggregate statuses of jobs & return whether this is an new value.""" - - def transform_job_status_val(info: dict[str, Any]) -> str: - """Get job status -- transforming any as needed. - - NOTE: each transformation needs to be generic - enough to aggregate nicely with others; e.g. don't - append a timestamp, do append a standard reason str. - """ - if info["JobStatus"] == ct.HELD: - codes = ( - info.get("HoldReasonCode", None), - info.get("HoldReasonSubCode", None), - ) - return ( - f"{ct.job_status_to_str(ct.HELD)}: " - f"{codes} " - f"{info.get('HoldReason', 'unknown reason')}" - ) - else: - return ct.job_status_to_str(info["JobStatus"]) - - statuses: dict[str, dict[str, int]] = { - k: {} - for k in set(transform_job_status_val(info) for info in job_infos.values()) - } - - for job_status in statuses: - ids_for_this_job_status = [ # subset of job_infos ids - i - for i, info in job_infos.items() - if transform_job_status_val(info) == job_status - ] - # NOTE - if the pilot did not send a status (ex: Held job), it is `None` - statuses[job_status] = dict( - collections.Counter( - job_infos[i]["HTChirpEWMSPilotStatus"] for i in ids_for_this_job_status - ) - ) - - return statuses, statuses != previous - - -def get_aggregate_top_task_errors( - job_infos: dict[int, dict[str, Any]], - n_top_task_errors: int, - previous: dict[str, int], -) -> tuple[dict[str, int], bool]: - """Aggregate top X errors of jobs & return whether this is an new 
value.""" - counts = collections.Counter( - dicto.get("HTChirpEWMSPilotError") for dicto in job_infos.values() - ) - counts.pop(None, None) # remove counts of "no error" - - errors = dict(counts.most_common(n_top_task_errors)) - return errors, errors != previous # type: ignore[return-value] - - -def watch( - collector: str, - schedd: str, - cluster_id: str, - schedd_obj: htcondor.Schedd, - n_workers: int, - # - skydriver_rc: RestClient, - skydriver_cluster_obj: dict[str, Any], -) -> None: - """Main logic.""" - LOGGER.info( - f"Watching Skymap Scanner client workers on {cluster_id} / {collector} / {schedd}" - ) - - job_infos: dict[int, dict[str, Any]] = { - i: { # NOTE - it's important that attrs reported on later are `None` to start - "JobStatus": None, - "HTChirpEWMSPilotStatus": None, - } - for i in range(n_workers) - } - - start = time.time() - non_response_ct = 0 - aggregate_statuses: dict[str, dict[str, int]] = {} - aggregate_top_task_errors: dict[str, int] = {} - - def keep_watching() -> bool: - """ - NOTE - condor may be lagging, so we can't just quit when - all jobs are done, since there may be more attrs to be updated. 
- """ - if not any( # if no done jobs, then keep going always - job_infos[j]["JobStatus"] in DONE_JOB_STATUSES for j in job_infos - ): - return True - else: - # condor may occasionally slow down & prematurely return nothing - return non_response_ct < NON_RESPONSE_LIMIT # allow X non-responses - - # WATCHING LOOP - while ( - keep_watching() - and time.time() - start - < WATCHER_MAX_RUNTIME # just in case, stop if taking too long - ): - # wait -- sleeping at top guarantees this happens - time.sleep(WATCHER_INTERVAL) - LOGGER.info("(re)checking jobs...") - - # query - classads = iter_job_classads( - schedd_obj, - ( - f"ClusterId == {cluster_id} && " - # only care about "older" status jobs if they are RUNNING - f"( JobStatus == {ct.RUNNING} || EnteredCurrentStatus >= {int(time.time()) - WATCHER_INTERVAL*3} )" - ), - PROJECTION, - ) - non_response_ct += 1 # just in case - for ad, source in classads: - non_response_ct = 0 - update_stored_job_infos(job_infos, ad, source) - # NOTE - if memory becomes an issue, switch to an in-iterator design - - # aggregate - aggregate_statuses, has_new_statuses = get_aggregate_statuses( - job_infos, - aggregate_statuses, - ) - aggregate_top_task_errors, has_new_errors = get_aggregate_top_task_errors( - job_infos, - WATCHER_N_TOP_TASK_ERRORS, - aggregate_top_task_errors, - ) - - # log - LOGGER.info(f"job aggregate statuses ({n_workers=})") - LOGGER.info(f"{pformat(aggregate_statuses, indent=4)}") - LOGGER.info( - f"job aggregate top {WATCHER_N_TOP_TASK_ERRORS} task errors ({n_workers=})" - ) - LOGGER.info(f"{pformat(aggregate_top_task_errors, indent=4)}") - - # figure updates - if not has_new_statuses and not has_new_errors: - LOGGER.info("no updates") - else: - # send updates - LOGGER.info("sending updates to skydriver") - utils.update_skydriver( - skydriver_rc, - **skydriver_cluster_obj, - statuses=aggregate_statuses, - top_task_errors=aggregate_top_task_errors, - ) diff --git a/ewms_sidecar/config.py b/ewms_sidecar/config.py deleted 
file mode 100644 index 0f000dac..00000000 --- a/ewms_sidecar/config.py +++ /dev/null @@ -1,49 +0,0 @@ -"""Config settings.""" - - -import dataclasses as dc -import os - -from wipac_dev_tools import from_environment_as_dataclass - -LOCAL_K8S_HOST = "local" - -_FORWARDED_ENV_VAR_PREFIXES = ["SKYSCAN_", "EWMS_"] -_NONFORWARDED_ENV_VAR_PREFIXES = ["EWMS_TMS_"] -FORWARDED_ENV_VARS = [ - var - for var in os.environ - if not any(var.startswith(p) for p in _NONFORWARDED_ENV_VAR_PREFIXES) - and any(var.startswith(p) for p in _FORWARDED_ENV_VAR_PREFIXES) -] -SECRET_FORWARDED_ENV_VARS = ["SKYSCAN_SKYDRIVER_AUTH", "SKYSCAN_BROKER_AUTH"] - -WATCHER_INTERVAL = 60 * 3 -WATCHER_MAX_RUNTIME = 60 * 60 * 24 -WATCHER_N_TOP_TASK_ERRORS = 10 - - -@dc.dataclass(frozen=True) -class EnvConfig: - """Environment variables.""" - - # pylint:disable=invalid-name - CLIENT_STARTER_WAIT_FOR_STARTUP_JSON: int = 60 - CONDOR_TOKEN: str = "" - # - EWMS_PILOT_QUARANTINE_TIME: int = 0 - # - EWMS_TMS_S3_ACCESS_KEY_ID: str = "" - EWMS_TMS_S3_BUCKET: str = "" - EWMS_TMS_S3_EXPIRATION: int = 60 * 60 * 24 # seconds / 1 day - EWMS_TMS_S3_SECRET_KEY: str = "" - EWMS_TMS_S3_URL: str = "" - - # piggy-back scanner env vars - SKYSCAN_LOG_THIRD_PARTY: str = "WARNING" - SKYSCAN_SKYDRIVER_ADDRESS: str = "" - SKYSCAN_SKYDRIVER_AUTH: str = "" - SKYSCAN_SKYDRIVER_SCAN_ID: str = "" - - -ENV = from_environment_as_dataclass(EnvConfig) diff --git a/ewms_sidecar/ewms_sidecar.py b/ewms_sidecar/ewms_sidecar.py deleted file mode 100644 index 96a6cb33..00000000 --- a/ewms_sidecar/ewms_sidecar.py +++ /dev/null @@ -1,168 +0,0 @@ -"""The EWMS Sidecar.""" - - -import argparse -import logging -import time -from pathlib import Path - -from wipac_dev_tools import argparse_tools, logging_tools - -from . 
import condor -from .config import ENV - -LOGGER = logging.getLogger(__name__) - - -def main() -> None: - """Main.""" - parser = argparse.ArgumentParser( - description="Handle EWMS requests adjacent to a Skymap Scanner central server", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # method - parser.add_argument( # TODO - remove once EWMS is full-time - dest="method", - help="how to start up the jobs", - ) - - parser.add_argument( - "--uuid", - required=True, - help="the uuid for the cluster", - ) - - SidecarArgs.condor(parser) - SidecarArgs.starter(parser) - - # parse args & set up logging - args = parser.parse_args() - logging_tools.set_level( - "DEBUG", # os.getenv("SKYSCAN_LOG", "INFO"), # type: ignore[arg-type] - first_party_loggers=LOGGER, - third_party_level=ENV.SKYSCAN_LOG_THIRD_PARTY, # type: ignore[arg-type] - use_coloredlogs=True, # for formatting - future_third_parties=["boto3", "botocore"], - ) - logging_tools.log_argparse_args(args, logger=LOGGER, level="WARNING") - - # Go! - match args.method: - case "direct-remote-condor": - condor.act(args) - # case "ewms": - # ewms.act(args) - case other: - raise RuntimeError(f"method not supported: {other}") - - -class SidecarArgs: - @staticmethod - def condor(parser: argparse.ArgumentParser) -> None: - """Add args to parser.""" - parser.add_argument( - "--collector", - default="", - help="the full URL address of the HTCondor collector server. Ex: foo-bar.icecube.wisc.edu", - ) - parser.add_argument( - "--schedd", - default="", - help="the full DNS name of the HTCondor Schedd server. 
Ex: baz.icecube.wisc.edu", - ) - - @staticmethod - def starter(parser: argparse.ArgumentParser) -> None: - """Add args to parser.""" - - def wait_for_file(waitee: Path, wait_time: int) -> Path: - """Wait for `waitee` to exist, then return fullly-resolved path.""" - elapsed_time = 0 - sleep = 5 - while not waitee.exists(): - LOGGER.info(f"waiting for {waitee} ({sleep}s intervals)...") - time.sleep(sleep) - elapsed_time += sleep - if elapsed_time >= wait_time: - raise argparse.ArgumentTypeError( - f"FileNotFoundError: waited {wait_time}s [{waitee}]" - ) - return waitee.resolve() - - # helper args - parser.add_argument( - "--dryrun", - default=False, - action="store_true", - help="does everything except submitting the worker(s)", - ) - parser.add_argument( - "--spool", - default=False, - action="store_true", - help="whether to spool (persist) logs -- if not given, logs are not kept", - ) - - # worker args - parser.add_argument( - "--worker-memory-bytes", - required=True, - type=int, - help="amount of worker memory (bytes)", - ) - parser.add_argument( - "--worker-disk-bytes", - required=True, - type=int, - help="amount of worker disk (bytes)", - ) - parser.add_argument( - "--n-cores", - default=1, - type=int, - help="number of cores per worker", - ) - parser.add_argument( - "--n-workers", - required=True, - type=int, - help="number of worker to start", - ) - parser.add_argument( - "--max-worker-runtime", - required=True, - type=int, - help="how long each worker is allowed to run", - ) - parser.add_argument( - "--priority", - required=True, - help="relative priority of this job/jobs", - ) - - # client args - parser.add_argument( - "--client-args", - required=False, - nargs="*", - type=lambda x: argparse_tools.validate_arg( - x.split(":", maxsplit=1), - len(x.split(":", maxsplit=1)) == 2, - ValueError('must " "-delimited series of "clientarg:value"-tuples'), - ), - help="n 'key:value' pairs containing the python CL arguments to pass to skymap_scanner.client", - ) - 
parser.add_argument( - "--client-startup-json", - help="The 'startup.json' file to startup each client", - type=lambda x: wait_for_file( - Path(x), - ENV.CLIENT_STARTER_WAIT_FOR_STARTUP_JSON, - ), - ) - parser.add_argument( - "--image", - required=True, - help="a path or url to the workers' image", - ) diff --git a/ewms_sidecar/py.typed b/ewms_sidecar/py.typed deleted file mode 100644 index e69de29b..00000000 diff --git a/ewms_sidecar/utils.py b/ewms_sidecar/utils.py deleted file mode 100644 index 02f6e49a..00000000 --- a/ewms_sidecar/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -"""General Utilities.""" - - -import dataclasses as dc -import logging -from pathlib import Path -from typing import Any - -import boto3 # type: ignore[import-untyped] -import requests -from rest_tools.client import RestClient - -from .config import ENV - -LOGGER = logging.getLogger(__name__) - - -def connect_to_skydriver() -> RestClient: - """Connect to SkyDriver REST server & check scan id.""" - if not ENV.SKYSCAN_SKYDRIVER_SCAN_ID: - raise RuntimeError( - "Cannot connect to SkyDriver without `SKYSCAN_SKYDRIVER_SCAN_ID`" - ) - - skydriver_rc = RestClient( - ENV.SKYSCAN_SKYDRIVER_ADDRESS, - token=ENV.SKYSCAN_SKYDRIVER_AUTH, - ) - - LOGGER.info("Connected to SkyDriver") - return skydriver_rc - - -def skydriver_aborted_scan(skydriver_rc: RestClient) -> bool: - """Return whether the scan has been signaled for deletion.""" - ret = skydriver_rc.request_seq( - "GET", - f"/scan/{ENV.SKYSCAN_SKYDRIVER_SCAN_ID}/manifest", - ) - return ret["is_deleted"] # type: ignore[no-any-return] - - -def update_skydriver( - skydriver_rc: RestClient, - orchestrator: str, - location: dict[str, str], - uuid: str, - cluster_id: str | int, - n_workers: int, - starter_info: dict[str, Any], - # - statuses: dict[str, dict[str, int]] | None = None, - top_task_errors: dict[str, int] | None = None, -) -> None: - """Send SkyDriver updates from the `submit_result`.""" - skydriver_cluster_obj = { - "orchestrator": orchestrator, 
- "location": location, - "uuid": uuid, - "cluster_id": str(cluster_id), - "n_workers": n_workers, - "starter_info": starter_info, - } - if statuses: - skydriver_cluster_obj["statuses"] = statuses - if top_task_errors: - skydriver_cluster_obj["top_task_errors"] = top_task_errors - - skydriver_rc.request_seq( - "PATCH", - f"/scan/{ENV.SKYSCAN_SKYDRIVER_SCAN_ID}/manifest", - {"cluster": skydriver_cluster_obj}, - ) - - -@dc.dataclass -class S3File: - """Wrap an S3 file.""" - - url: str - fname: str - - -def s3ify(filepath: Path) -> S3File: - """Put the file in s3 and return info about it.""" - if not ( - ENV.EWMS_TMS_S3_URL - and ENV.EWMS_TMS_S3_ACCESS_KEY_ID - and ENV.EWMS_TMS_S3_SECRET_KEY - and ENV.EWMS_TMS_S3_BUCKET - and ENV.SKYSCAN_SKYDRIVER_SCAN_ID - ): - raise RuntimeError( - "must define all EWMS_TMS_S3_* environment variables to use S3" - ) - s3_client = boto3.client( - "s3", - "us-east-1", - endpoint_url=ENV.EWMS_TMS_S3_URL, - aws_access_key_id=ENV.EWMS_TMS_S3_ACCESS_KEY_ID, - aws_secret_access_key=ENV.EWMS_TMS_S3_SECRET_KEY, - ) - bucket = ENV.EWMS_TMS_S3_BUCKET - key = f"{ENV.SKYSCAN_SKYDRIVER_SCAN_ID}-s3-{filepath.stem}" - - # get GET url - get_url = s3_client.generate_presigned_url( - "get_object", - Params={ - "Bucket": bucket, - "Key": key, - }, - ExpiresIn=ENV.EWMS_TMS_S3_EXPIRATION, # seconds - ) - s3_file = S3File(get_url, key) - - # check if already there (via other process/container) - try: - resp = requests.get(get_url) - resp.raise_for_status() - LOGGER.debug(resp) - LOGGER.info(f"File is already in S3. Using url: {get_url}") - return s3_file - except requests.exceptions.HTTPError: - LOGGER.info("File is not in S3 yet. 
Posting...") - - # POST - upload_details = s3_client.generate_presigned_post(bucket, key) - with open(filepath, "rb") as f: - response = requests.post( - upload_details["url"], - data=upload_details["fields"], - files={"file": (filepath.name, f)}, # maps filename to obj - ) - LOGGER.info(f"Upload response: {response.status_code}") - LOGGER.info(str(response.content)) - - return s3_file From 91aecd544549327310fa054bcc8871eb0ad15799 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 19 Dec 2024 11:04:36 -0600 Subject: [PATCH 003/327] add ewms rest connection --- .github/workflows/wipac-cicd.yml | 1 - skydriver/__main__.py | 28 ++++++++++++++++++++++++++-- skydriver/config.py | 13 ++++++++++--- skydriver/database/utils.py | 2 +- skydriver/k8s/scan_backlog.py | 9 ++++++++- skydriver/rest_handlers.py | 3 +++ skydriver/server.py | 3 +++ 7 files changed, 51 insertions(+), 8 deletions(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 16b79cca..3be36066 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -3,7 +3,6 @@ name: wipac ci/cd on: [ push ] env: - CI_TEST: 'yes' THIS_IMAGE_WITH_TAG: 'ghcr.io/wipacrepo/skydriver:latest' EWMS_PILOT_TASK_TIMEOUT: 999 SCAN_BACKLOG_RUNNER_SHORT_DELAY: 1 diff --git a/skydriver/__main__.py b/skydriver/__main__.py index 75afdb14..82b3856f 100644 --- a/skydriver/__main__.py +++ b/skydriver/__main__.py @@ -3,12 +3,31 @@ import asyncio import logging +from rest_tools.client import ClientCredentialsAuth, RestClient + from . 
import database, k8s, server from .config import ENV, config_logging LOGGER = logging.getLogger(__name__) +def setup_ewms_client() -> RestClient: + """Connect to EWMS rest server.""" + if ENV.CI: + return RestClient( + ENV.EWMS_ADDRESS, + logger=LOGGER, + ) + else: + return ClientCredentialsAuth( + ENV.EWMS_ADDRESS, + ENV.EWMS_TOKEN_URL, + ENV.EWMS_CLIENT_ID, + ENV.EWMS_CLIENT_SECRET, + logger=LOGGER, + ) + + async def main() -> None: """Establish connections and start components.""" @@ -24,16 +43,21 @@ async def main() -> None: k8s_batch_api = k8s.setup_k8s_batch_api() LOGGER.info("K8s client connected.") + # EWMS rest client + LOGGER.info("Setting up EWMS client...") + ewms_rc = setup_ewms_client() + LOGGER.info("EWMS client connected.") + # Scan Backlog Runner LOGGER.info("Starting scan backlog runner...") backlog_task = asyncio.create_task( - k8s.scan_backlog.run(mongo_client, k8s_batch_api) + k8s.scan_backlog.run(mongo_client, k8s_batch_api, ewms_rc) ) await asyncio.sleep(0) # start up previous task # REST Server LOGGER.info("Setting up REST server...") - rs = await server.make(mongo_client, k8s_batch_api) + rs = await server.make(mongo_client, k8s_batch_api, ewms_rc) rs.startup(address=ENV.REST_HOST, port=ENV.REST_PORT) # type: ignore[no-untyped-call] try: await asyncio.Event().wait() diff --git a/skydriver/config.py b/skydriver/config.py index e88e1c5f..0f55a703 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -40,7 +40,13 @@ class DebugMode(enum.Enum): class EnvConfig: """Environment variables.""" - # pylint:disable=invalid-name + # EWMS connections + EWMS_ADDRESS: str + EWMS_TOKEN_URL: str = "" # needed in prod + EWMS_CLIENT_ID: str = "" # '' + EWMS_CLIENT_SECRET: str = "" # '' + + # misc AUTH_AUDIENCE: str = "skydriver" AUTH_OPENID_URL: str = "" MONGODB_AUTH_PASS: str = "" # empty means no authentication required @@ -49,7 +55,8 @@ class EnvConfig: MONGODB_PORT: int = 27017 REST_HOST: str = "localhost" REST_PORT: int = 8080 - CI_TEST: bool = 
False + + CI: bool = False # github actions sets this to 'true' LOG_LEVEL: str = "DEBUG" LOG_LEVEL_THIRD_PARTY: str = "WARNING" @@ -176,7 +183,7 @@ def is_testing() -> bool: Note: this needs to run on import. """ - return ENV.CI_TEST + return ENV.CI def config_logging() -> None: diff --git a/skydriver/database/utils.py b/skydriver/database/utils.py index 908931b7..4f1dc522 100644 --- a/skydriver/database/utils.py +++ b/skydriver/database/utils.py @@ -73,7 +73,7 @@ async def ensure_indexes(motor_client: AsyncIOMotorClient) -> None: # type: ign async def drop_collections(motor_client: AsyncIOMotorClient) -> None: # type: ignore[valid-type] """Drop the "regular" collections -- most useful for testing.""" - if not ENV.CI_TEST: + if not ENV.CI: raise RuntimeError("Cannot drop collections if not in testing mode") await motor_client[_DB_NAME][_MANIFEST_COLL_NAME].drop() # type: ignore[index] await motor_client[_DB_NAME][_RESULTS_COLL_NAME].drop() # type: ignore[index] diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 5d6e47dc..5cc562ab 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -8,6 +8,7 @@ import bson import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient +from rest_tools.client import RestClient from tornado import web from .utils import KubeAPITools @@ -73,6 +74,7 @@ async def get_next_backlog_entry( async def run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, + ewms_rc: RestClient, ) -> None: """Error-handling around the scan backlog runner loop.""" LOGGER.info("Started scan backlog runner.") @@ -80,7 +82,7 @@ async def run( while True: # let's go! 
try: - await _run(mongo_client, k8s_batch_api) + await _run(mongo_client, k8s_batch_api, ewms_rc) except Exception as e: LOGGER.exception(e) @@ -150,6 +152,7 @@ def has_interval_elapsed(self) -> bool: async def _run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, + ewms_rc: RestClient, ) -> None: """The (actual) main loop.""" manifests = database.interface.ManifestClient(mongo_client) @@ -174,6 +177,10 @@ async def _run( long_interval_timer.fastforward() continue # empty queue + # TODO: Request to SkyDriver + resp = await ewms_rc.request("POST", "/v0/workflows", {}) + # TODO: Start K8s Job + # get k8s job object try: job_obj = pickle.loads(entry.pickled_k8s_job) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 9a8987c5..7edaeff8 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -16,6 +16,7 @@ from dacite.exceptions import DaciteError from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from pymongo import ReturnDocument +from rest_tools.client import RestClient from rest_tools.server import ( ArgumentHandler, ArgumentSource, @@ -119,6 +120,7 @@ def initialize( # type: ignore # pylint: disable=W0221 self, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, + ewms_rc: RestClient, *args: Any, **kwargs: Any, ) -> None: @@ -141,6 +143,7 @@ def initialize( # type: ignore # pylint: disable=W0221 ) ) self.k8s_batch_api = k8s_batch_api + self.ewms_rc = ewms_rc # ---------------------------------------------------------------------------- diff --git a/skydriver/server.py b/skydriver/server.py index de1eebf8..df3b8223 100644 --- a/skydriver/server.py +++ b/skydriver/server.py @@ -5,6 +5,7 @@ import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient +from rest_tools.client import RestClient from rest_tools.server import RestHandlerSetup, 
RestServer from . import rest_handlers @@ -16,6 +17,7 @@ async def make( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, + ewms_rc: RestClient, ) -> RestServer: """Make a SkyDriver REST service (does not start up automatically).""" debug = is_testing() @@ -32,6 +34,7 @@ async def make( # Setup clients/apis args["mongo_client"] = mongo_client args["k8s_batch_api"] = k8s_batch_api + args["ewms_rc"] = ewms_rc # Configure REST Routes rs = RestServer(debug=debug) From ad0bfdda7ed1171a39b430658a13972958e9a342 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 19 Dec 2024 14:07:19 -0700 Subject: [PATCH 004/327] incorporate ewms requesting: schema+backlogger updates --- CHANGELOG.md | 233 ++++++++++++++++++++++++------ skydriver/database/interface.py | 6 +- skydriver/database/schema.py | 25 +++- skydriver/ewms.py | 52 +++++++ skydriver/k8s/scan_backlog.py | 39 +++-- skydriver/k8s/scanner_instance.py | 10 +- skydriver/rest_handlers.py | 14 +- tests/unit/test_scan_state.py | 24 +-- 8 files changed, 315 insertions(+), 88 deletions(-) create mode 100644 skydriver/ewms.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 416f7c71..e71a1455 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -119,8 +119,6 @@ ## v0.8.0 (2023-11-29) - - ## v0.7.13 (2023-11-28) ### Other @@ -163,8 +161,8 @@ ### Other * update dependencies*.log files(s) ([`8625ece`](https://github.com/WIPACrepo/SkyDriver/commit/8625ece5f5765321c8a65711c34b4f7532355f9e)) -* Cluster-Watching: include `HoldReason` - 2 ([`96975ed`](https://github.com/WIPACrepo/SkyDriver/commit/96975ede756ae8a06ec09926ee0c0e0c998e31a4)) -* Cluster-Watching: include `HoldReason` ([`dd447e1`](https://github.com/WIPACrepo/SkyDriver/commit/dd447e1ccc9c685d79d01929a47cc321bba6fd3e)) +* ManualCluster-Watching: include `HoldReason` - 2 ([`96975ed`](https://github.com/WIPACrepo/SkyDriver/commit/96975ede756ae8a06ec09926ee0c0e0c998e31a4)) +* ManualCluster-Watching: include `HoldReason` 
([`dd447e1`](https://github.com/WIPACrepo/SkyDriver/commit/dd447e1ccc9c685d79d01929a47cc321bba6fd3e)) ## v0.7.6 (2023-11-20) @@ -178,7 +176,7 @@ ### Other -* Cluster-Watching: only update skydriver if needed - 2 ([`fcdd669`](https://github.com/WIPACrepo/SkyDriver/commit/fcdd669bd22ee79d323a759bf9282cf6983d727d)) +* ManualCluster-Watching: only update skydriver if needed - 2 ([`fcdd669`](https://github.com/WIPACrepo/SkyDriver/commit/fcdd669bd22ee79d323a759bf9282cf6983d727d)) ## v0.7.4 (2023-11-20) @@ -190,30 +188,28 @@ ### Other -* Cluster-Watching: only update skydriver if needed ([`72654cc`](https://github.com/WIPACrepo/SkyDriver/commit/72654cc051e8234d6114f25eb2fc6c6cfe0482c4)) +* ManualCluster-Watching: only update skydriver if needed ([`72654cc`](https://github.com/WIPACrepo/SkyDriver/commit/72654cc051e8234d6114f25eb2fc6c6cfe0482c4)) ## v0.7.2 (2023-11-20) ### Other -* Cluster-Watching: CL arg fix ([`4d44629`](https://github.com/WIPACrepo/SkyDriver/commit/4d4462913612a1b7de420c75a9075e112a788a6c)) +* ManualCluster-Watching: CL arg fix ([`4d44629`](https://github.com/WIPACrepo/SkyDriver/commit/4d4462913612a1b7de420c75a9075e112a788a6c)) ## v0.7.1 (2023-11-20) ### Other -* Cluster-Watching: add `WATCHER_MAX_RUNTIME` ([`9e7171f`](https://github.com/WIPACrepo/SkyDriver/commit/9e7171fac83e35bf6ded50b9c81af8c46f57ad65)) +* ManualCluster-Watching: add `WATCHER_MAX_RUNTIME` ([`9e7171f`](https://github.com/WIPACrepo/SkyDriver/commit/9e7171fac83e35bf6ded50b9c81af8c46f57ad65)) ## v0.7.0 (2023-11-20) - - ## v0.6.20 (2023-11-16) ### Other * update dependencies*.log files(s) ([`2980045`](https://github.com/WIPACrepo/SkyDriver/commit/2980045aa51199a44f2812633fdfdfdef0165367)) -* Cluster-Watching: stop watching once we consistently get no response ([`5a50bfd`](https://github.com/WIPACrepo/SkyDriver/commit/5a50bfd03eeac3d4de0e544f82fa76ba94cbede8)) +* ManualCluster-Watching: stop watching once we consistently get no response 
([`5a50bfd`](https://github.com/WIPACrepo/SkyDriver/commit/5a50bfd03eeac3d4de0e544f82fa76ba94cbede8)) ## v0.6.19 (2023-11-16) @@ -223,15 +219,15 @@ * Use `WIPACrepo/wipac-dev-py-setup-action@v2.9` ([`950ec18`](https://github.com/WIPACrepo/SkyDriver/commit/950ec186fc32b4983b1ae3fe130af57383bad78a)) * Mypy - 2 ([`6238014`](https://github.com/WIPACrepo/SkyDriver/commit/623801411ac5354ef9d084f25f9b42d4f62be980)) * Mypy ([`0d05891`](https://github.com/WIPACrepo/SkyDriver/commit/0d058914a034911d1ebb8162b0cf84211b5b0eda)) -* Cluster-Watching: updates for new ewms-pilot chirp attrs ([`48e3693`](https://github.com/WIPACrepo/SkyDriver/commit/48e3693f061cda4341373bcaa44043fd1b9add1d)) +* ManualCluster-Watching: updates for new ewms-pilot chirp attrs ([`48e3693`](https://github.com/WIPACrepo/SkyDriver/commit/48e3693f061cda4341373bcaa44043fd1b9add1d)) ## v0.6.18 (2023-11-09) ### Other * update dependencies*.log files(s) ([`8dcb032`](https://github.com/WIPACrepo/SkyDriver/commit/8dcb03270e7028878319d3eaec454eea8e99445f)) -* Cluster-Watching: use projection ([`d19cd67`](https://github.com/WIPACrepo/SkyDriver/commit/d19cd67e509a49535e5eac082686f7786d9b54ed)) -* Cluster-Watching: only query newly updated jobs ([`bd0d5c4`](https://github.com/WIPACrepo/SkyDriver/commit/bd0d5c445eae090e60c53b67853d17ab18da35a7)) +* ManualCluster-Watching: use projection ([`d19cd67`](https://github.com/WIPACrepo/SkyDriver/commit/d19cd67e509a49535e5eac082686f7786d9b54ed)) +* ManualCluster-Watching: only query newly updated jobs ([`bd0d5c4`](https://github.com/WIPACrepo/SkyDriver/commit/bd0d5c445eae090e60c53b67853d17ab18da35a7)) ## v0.6.17 (2023-11-09) @@ -240,13 +236,13 @@ * update dependencies*.log files(s) ([`dbf120c`](https://github.com/WIPACrepo/SkyDriver/commit/dbf120c20fc3756dd8e44c6d429cbdb7d3141fad)) * Mypy Fix ([`2ba43a8`](https://github.com/WIPACrepo/SkyDriver/commit/2ba43a813101b6c7c1ccd83dc772bd311009908e)) * Condor: Set `+OriginalTime` (4 hours) 
([`95de516`](https://github.com/WIPACrepo/SkyDriver/commit/95de5169aa86ada9ff51dd3009e074ec27b6d7c5)) -* Cluster-Watching: ignore already-completed jobs ([`0b47442`](https://github.com/WIPACrepo/SkyDriver/commit/0b47442673875359bb4787cabb23106b9277c50f)) +* ManualCluster-Watching: ignore already-completed jobs ([`0b47442`](https://github.com/WIPACrepo/SkyDriver/commit/0b47442673875359bb4787cabb23106b9277c50f)) ## v0.6.16 (2023-11-08) ### Other -* Cluster-Watching: record source - 2 ([`ed75709`](https://github.com/WIPACrepo/SkyDriver/commit/ed7570938d3416750d1f8049c1a6f2303f06cf57)) +* ManualCluster-Watching: record source - 2 ([`ed75709`](https://github.com/WIPACrepo/SkyDriver/commit/ed7570938d3416750d1f8049c1a6f2303f06cf57)) ## v0.6.15 (2023-11-08) @@ -258,23 +254,23 @@ ### Other -* Cluster-Watching: use `htcondor.classad.unquote()` - 2 ([`e0b812e`](https://github.com/WIPACrepo/SkyDriver/commit/e0b812e6bbe083b899b2cb6df4e274ab83dd98f2)) +* ManualCluster-Watching: use `htcondor.classad.unquote()` - 2 ([`e0b812e`](https://github.com/WIPACrepo/SkyDriver/commit/e0b812e6bbe083b899b2cb6df4e274ab83dd98f2)) ## v0.6.13 (2023-11-08) ### Other * update dependencies*.log files(s) ([`a608f55`](https://github.com/WIPACrepo/SkyDriver/commit/a608f551d5efa5dd6ed00a64b188d25df7d952b4)) -* Cluster-Watching: filter completed jobs post-hoc ([`707247c`](https://github.com/WIPACrepo/SkyDriver/commit/707247c2b3919cb0f59f58448a2ba7748ee09a84)) -* Cluster-Watching: record source ([`cfdbcc9`](https://github.com/WIPACrepo/SkyDriver/commit/cfdbcc9ea45d4952b0e7533b765696ac778a253f)) -* Cluster-Watching: use `htcondor.classad.unquote()` ([`a595dde`](https://github.com/WIPACrepo/SkyDriver/commit/a595dde7362cc2e4ac752ed33d1c4f57890af1ac)) +* ManualCluster-Watching: filter completed jobs post-hoc ([`707247c`](https://github.com/WIPACrepo/SkyDriver/commit/707247c2b3919cb0f59f58448a2ba7748ee09a84)) +* ManualCluster-Watching: record source 
([`cfdbcc9`](https://github.com/WIPACrepo/SkyDriver/commit/cfdbcc9ea45d4952b0e7533b765696ac778a253f)) +* ManualCluster-Watching: use `htcondor.classad.unquote()` ([`a595dde`](https://github.com/WIPACrepo/SkyDriver/commit/a595dde7362cc2e4ac752ed33d1c4f57890af1ac)) ## v0.6.12 (2023-11-07) ### Other -* Cluster-Watching: limit querying further - 2 ([`751cca8`](https://github.com/WIPACrepo/SkyDriver/commit/751cca836e421a7a351d4e64ba1237d6dfc99139)) -* Cluster-Watching: limit querying further ([`a96eba4`](https://github.com/WIPACrepo/SkyDriver/commit/a96eba4a5dcd475afce42ac1d3ffbcc30388b4e4)) +* ManualCluster-Watching: limit querying further - 2 ([`751cca8`](https://github.com/WIPACrepo/SkyDriver/commit/751cca836e421a7a351d4e64ba1237d6dfc99139)) +* ManualCluster-Watching: limit querying further ([`a96eba4`](https://github.com/WIPACrepo/SkyDriver/commit/a96eba4a5dcd475afce42ac1d3ffbcc30388b4e4)) ## v0.6.11 (2023-11-07) @@ -282,61 +278,61 @@ * update dependencies*.log files(s) ([`b9e4192`](https://github.com/WIPACrepo/SkyDriver/commit/b9e4192c72aa2c5740a35eb6a4721212f5416267)) * Mypy ([`08202e5`](https://github.com/WIPACrepo/SkyDriver/commit/08202e51a5068872ebf5c15f014fc21e06fcc811)) -* Cluster-Watching: limit querying ([`b912ecb`](https://github.com/WIPACrepo/SkyDriver/commit/b912ecb158883a603627bb605467082b99e47da6)) -* Cluster-Watching: handle condor types ([`a7238dd`](https://github.com/WIPACrepo/SkyDriver/commit/a7238dde60ab2f329b24decbd162e22b6fd31e6c)) +* ManualCluster-Watching: limit querying ([`b912ecb`](https://github.com/WIPACrepo/SkyDriver/commit/b912ecb158883a603627bb605467082b99e47da6)) +* ManualCluster-Watching: handle condor types ([`a7238dd`](https://github.com/WIPACrepo/SkyDriver/commit/a7238dde60ab2f329b24decbd162e22b6fd31e6c)) ## v0.6.10 (2023-11-06) ### Other * update dependencies*.log files(s) ([`8b45eca`](https://github.com/WIPACrepo/SkyDriver/commit/8b45ecaac7f7d79857a3e14fe7f434b36300d97a)) -* Cluster-Watching: store chirps 
([`26fe1e2`](https://github.com/WIPACrepo/SkyDriver/commit/26fe1e2d10ba3ff3796154f1024b09424614f66a)) +* ManualCluster-Watching: store chirps ([`26fe1e2`](https://github.com/WIPACrepo/SkyDriver/commit/26fe1e2d10ba3ff3796154f1024b09424614f66a)) ## v0.6.9 (2023-10-31) ### Other * update dependencies*.log files(s) ([`7185334`](https://github.com/WIPACrepo/SkyDriver/commit/718533494d99042a9aa193110f29df661e31324c)) -* Cluster-Watching: add basic status aggregation and timely exit ([`934c671`](https://github.com/WIPACrepo/SkyDriver/commit/934c671e1fb4e00fb66ad07b1e51e253f3dea129)) +* ManualCluster-Watching: add basic status aggregation and timely exit ([`934c671`](https://github.com/WIPACrepo/SkyDriver/commit/934c671e1fb4e00fb66ad07b1e51e253f3dea129)) ## v0.6.8 (2023-10-31) ### Other -* Cluster-Watching Fix 5 ([`b83de7d`](https://github.com/WIPACrepo/SkyDriver/commit/b83de7d0da21468d6e17da311da99701a1e41c37)) +* ManualCluster-Watching Fix 5 ([`b83de7d`](https://github.com/WIPACrepo/SkyDriver/commit/b83de7d0da21468d6e17da311da99701a1e41c37)) ## v0.6.7 (2023-10-31) ### Other * update dependencies*.log files(s) ([`774aab8`](https://github.com/WIPACrepo/SkyDriver/commit/774aab81e9f528d2e999b6497185246a790c2cae)) -* Cluster-Watching Fix 4 ([`db1a960`](https://github.com/WIPACrepo/SkyDriver/commit/db1a96087917dcfe5719651b200ada1b78f85cf2)) +* ManualCluster-Watching Fix 4 ([`db1a960`](https://github.com/WIPACrepo/SkyDriver/commit/db1a96087917dcfe5719651b200ada1b78f85cf2)) ## v0.6.6 (2023-10-30) ### Other * update dependencies*.log files(s) ([`0dda5cc`](https://github.com/WIPACrepo/SkyDriver/commit/0dda5ccec403ef915f1b8f6cde8de80686dbc805)) -* Cluster-Watching Fix 3 ([`2f513f7`](https://github.com/WIPACrepo/SkyDriver/commit/2f513f712b12de36ff7a3c0afeef21beeecaa072)) +* ManualCluster-Watching Fix 3 ([`2f513f7`](https://github.com/WIPACrepo/SkyDriver/commit/2f513f712b12de36ff7a3c0afeef21beeecaa072)) ## v0.6.5 (2023-10-30) ### Other * Misc Test Fix 
([`a794d28`](https://github.com/WIPACrepo/SkyDriver/commit/a794d282c5c817d76993ad95016573b4d1da6adf)) -* Cluster-Watching Fix 2 ([`ce6f2a5`](https://github.com/WIPACrepo/SkyDriver/commit/ce6f2a565476f2705d07b2827cd2ab40bcf9df1e)) +* ManualCluster-Watching Fix 2 ([`ce6f2a5`](https://github.com/WIPACrepo/SkyDriver/commit/ce6f2a565476f2705d07b2827cd2ab40bcf9df1e)) ## v0.6.4 (2023-10-30) ### Other -* Cluster-Watching Fix 1 ([`7265da7`](https://github.com/WIPACrepo/SkyDriver/commit/7265da73108996c4c7dcc8b94f7e9111431f0fb5)) +* ManualCluster-Watching Fix 1 ([`7265da7`](https://github.com/WIPACrepo/SkyDriver/commit/7265da73108996c4c7dcc8b94f7e9111431f0fb5)) ## v0.6.3 (2023-10-30) ### Other -* Add Cluster-watching to Starter ([#88](https://github.com/WIPACrepo/SkyDriver/issues/88)) ([`46f6502`](https://github.com/WIPACrepo/SkyDriver/commit/46f6502f143bd39af9698934b5085486811f88c3)) +* Add ManualCluster-watching to Starter ([#88](https://github.com/WIPACrepo/SkyDriver/issues/88)) ([`46f6502`](https://github.com/WIPACrepo/SkyDriver/commit/46f6502f143bd39af9698934b5085486811f88c3)) ## v0.6.2 (2023-10-25) @@ -355,8 +351,6 @@ ## v0.6.0 (2023-10-23) - - ## v0.5.16 (2023-10-17) ### Other @@ -461,8 +455,6 @@ ## v0.5.0 (2023-09-05) - - ## v0.4.1 (2023-09-01) ### Other @@ -633,7 +625,7 @@ ### Other -* Add GKE Cluster: `gke-2306` ([`7279ca3`](https://github.com/WIPACrepo/SkyDriver/commit/7279ca3db600a6c5c6aaef74d3892752902e4de2)) +* Add GKE ManualCluster: `gke-2306` ([`7279ca3`](https://github.com/WIPACrepo/SkyDriver/commit/7279ca3db600a6c5c6aaef74d3892752902e4de2)) * Update Auth Handling Pt-6 ([`287dc6c`](https://github.com/WIPACrepo/SkyDriver/commit/287dc6cfc87d92ac0369d62677dde304b488c06b)) ## v0.3.24 (2023-06-26) @@ -825,9 +817,8 @@ ## v0.2.0 (2023-06-13) - - ## v0.1.8 (2023-05-30) + ### Other * update requirements.txt ([`2f84163`](https://github.com/WIPACrepo/SkyDriver/commit/2f84163e1a0a7fb228c080ac4bec4b9782228252)) @@ -835,7 +826,9 @@ * Bump py-versions CI release v2.1 
([#41](https://github.com/WIPACrepo/SkyDriver/issues/41)) ([`3dd6477`](https://github.com/WIPACrepo/SkyDriver/commit/3dd6477af1f69d265dd7ac4832da6e61879737a2)) ## v0.1.7 (2023-05-09) + ### Other + * Resolve Dependency Conflicts ([#37](https://github.com/WIPACrepo/SkyDriver/issues/37)) ([`4fdbcf0`](https://github.com/WIPACrepo/SkyDriver/commit/4fdbcf0a5650eac3ab38774ce17496335a389683)) * update requirements.txt ([`62b92ab`](https://github.com/WIPACrepo/SkyDriver/commit/62b92abc2cde52944d386f3c8a8157bfe5635e6a)) * update requirements-tests.txt ([`fcca0a7`](https://github.com/WIPACrepo/SkyDriver/commit/fcca0a7b522f9288ec690b84d76c5afeb42d9e58)) @@ -843,7 +836,9 @@ * TMS: set `EWMS_PILOT_HTCHIRP=True` ([`7db476a`](https://github.com/WIPACrepo/SkyDriver/commit/7db476a590ab5efc1d8c4553aae3a174aec85105)) ## v0.1.6 (2023-05-03) + ### Other + * update requirements-clientmanager.txt ([`6fc3e7a`](https://github.com/WIPACrepo/SkyDriver/commit/6fc3e7a91e84e1a9fab22bd203592ed4e2ceb395)) * [split: fix imports] ([`29b7835`](https://github.com/WIPACrepo/SkyDriver/commit/29b7835fe0b4a34ec98fc061defd4abb3fd3ad2f)) * [split: trim] ([`563cf42`](https://github.com/WIPACrepo/SkyDriver/commit/563cf422fbdd6b5c85340f2b227af962d49f079c)) @@ -855,27 +850,37 @@ * Add `k8s/` ([`29b9673`](https://github.com/WIPACrepo/SkyDriver/commit/29b9673ecbfa447e0f882709dc3981d0dc072f2f)) ## v0.1.5 (2023-05-02) + ### Other + * update requirements-clientmanager.txt ([`c22f566`](https://github.com/WIPACrepo/SkyDriver/commit/c22f566b3cbc5231f91ee1f0adf008cb54a53ed7)) * TMS: Enable HTChirp ([`bf849c2`](https://github.com/WIPACrepo/SkyDriver/commit/bf849c22e897fd5cc90203839f47ca99ed0ce626)) ## v0.1.4 (2023-05-01) + ### Other + * Increase K8s Memory for TMS Starter ([`9a261fe`](https://github.com/WIPACrepo/SkyDriver/commit/9a261fef0c3f107341dc8f641d6a16e17629a5d1)) ## v0.1.3 (2023-05-01) + ### Other + * Return Empty Dict For Pending Result ([#34](https://github.com/WIPACrepo/SkyDriver/issues/34)) 
([`e536f1b`](https://github.com/WIPACrepo/SkyDriver/commit/e536f1b97d858155ef14de1579cd24d54f673d24)) ## v0.1.2 (2023-04-26) + ### Other + * update requirements.txt ([`291b1c4`](https://github.com/WIPACrepo/SkyDriver/commit/291b1c4686b881f2805e352ff56dccbca112a724)) * update requirements-tests.txt ([`234f968`](https://github.com/WIPACrepo/SkyDriver/commit/234f9689d0d8ff6d7ee9f2e8c275736ea0388956)) * update requirements-clientmanager.txt ([`10fdea3`](https://github.com/WIPACrepo/SkyDriver/commit/10fdea365e54166f690ca0f4abbbf26a1e92613e)) * Fix K8s Memory Syntax ([`6a766c8`](https://github.com/WIPACrepo/SkyDriver/commit/6a766c85cb07535e64237502f71df8c9dce64484)) ## v0.1.1 (2023-04-25) + ### Other + * update requirements.txt ([`74e074d`](https://github.com/WIPACrepo/SkyDriver/commit/74e074d317c4031a8709bb0ca3be4bc6e05279cf)) * update requirements-tests.txt ([`00d2fb7`](https://github.com/WIPACrepo/SkyDriver/commit/00d2fb732997accfbe38628a31204b54ad14f2d2)) * update requirements-clientmanager.txt ([`c97d477`](https://github.com/WIPACrepo/SkyDriver/commit/c97d47722580c36c0ea8eee9c184b49b92c04040)) @@ -884,105 +889,150 @@ ## v0.1.0 (2023-04-17) - ## v0.0.76 (2023-04-17) + ### Other + * Pre-v0.1 Fixes ([#31](https://github.com/WIPACrepo/SkyDriver/issues/31)) ([`7353019`](https://github.com/WIPACrepo/SkyDriver/commit/73530195877c1a2ac9cd6d96b578fcad5d8a967e)) ## v0.0.75 (2023-04-14) + ### Other + * update requirements-tests.txt ([`0c30528`](https://github.com/WIPACrepo/SkyDriver/commit/0c3052878813ce8f24af56e424f1e81f853b4c2b)) * update requirements-clientmanager.txt ([`fa56532`](https://github.com/WIPACrepo/SkyDriver/commit/fa56532958bbd39be52710ff6c0197d9c0f9e5b3)) * Route API Documentation ([#28](https://github.com/WIPACrepo/SkyDriver/issues/28)) ([`a86802a`](https://github.com/WIPACrepo/SkyDriver/commit/a86802a49fa590d30ef0ebf20b561b5c6229d282)) ## v0.0.74 (2023-04-14) + ### Other + * Add `/scan/SCANID` + Route Refactors 
([#30](https://github.com/WIPACrepo/SkyDriver/issues/30)) ([`05a8a69`](https://github.com/WIPACrepo/SkyDriver/commit/05a8a6906de200defe28f9541618e040d1120c31)) ## v0.0.73 (2023-04-13) + ### Other + * Tear Down After Final Result ([#29](https://github.com/WIPACrepo/SkyDriver/issues/29)) ([`7735a38`](https://github.com/WIPACrepo/SkyDriver/commit/7735a3886cb123262112e064e52da93e51a01ec4)) ## v0.0.72 (2023-04-11) + ### Other + * update requirements-clientmanager.txt ([`48fb5ca`](https://github.com/WIPACrepo/SkyDriver/commit/48fb5ca7970ec6cd3b0736b546b4a15b8d2e2d4d)) ## v0.0.71 (2023-04-11) + ### Other + * update requirements-clientmanager.txt ([`d899a9f`](https://github.com/WIPACrepo/SkyDriver/commit/d899a9fbeb33d00613faa95f3acf6c8e6e72d5e0)) * Fix S3 path url key name ([`f56d351`](https://github.com/WIPACrepo/SkyDriver/commit/f56d3512929fbb7c3f6dbfc031937b748bd84305)) ## v0.0.70 (2023-04-11) + ### Other + * Use `condor_token_sub2` ([`8d23cbf`](https://github.com/WIPACrepo/SkyDriver/commit/8d23cbfc8ddc3e9db990a1871365c91f6376acbc)) ## v0.0.69 (2023-04-10) + ### Other + * Condor: use filename in key instead pt-2 ([`7998366`](https://github.com/WIPACrepo/SkyDriver/commit/7998366e7f9be9d015bd3179f4fa341ba7650426)) * Condor: use filename in key instead ([`323dbff`](https://github.com/WIPACrepo/SkyDriver/commit/323dbff7116cce16dbc65714a6af11cb56b41b5a)) ## v0.0.68 (2023-04-10) + ### Other + * Rename S3 File (`ResponseContentDisposition`) ([#27](https://github.com/WIPACrepo/SkyDriver/issues/27)) ([`5806bd6`](https://github.com/WIPACrepo/SkyDriver/commit/5806bd608f3eacbb7f0ce93498f8c1126560f168)) ## v0.0.67 (2023-04-10) + ### Other + * Fix S3 pre-check ([`8633592`](https://github.com/WIPACrepo/SkyDriver/commit/8633592f34a55cdeab4b4274f799a407b6a9d66b)) ## v0.0.66 (2023-04-10) + ### Other + * Fix condor data types ([`714ae7d`](https://github.com/WIPACrepo/SkyDriver/commit/714ae7dd4ecd4d3cff3d2f67266c8216f3d75ff4)) ## v0.0.65 (2023-04-10) + ### Other + * Fix condor syntax 
([`dbdfdaa`](https://github.com/WIPACrepo/SkyDriver/commit/dbdfdaa696fc83857eeeaddf8847bfbe58405287)) ## v0.0.64 (2023-04-10) + ### Other + * Fix starter/stopper arg ordering ([`22714a7`](https://github.com/WIPACrepo/SkyDriver/commit/22714a7d40c0247570427e22f69583c470dc4766)) ## v0.0.63 (2023-04-10) + ### Other + * update requirements-clientmanager.txt ([`0223ac7`](https://github.com/WIPACrepo/SkyDriver/commit/0223ac7bf3f806d18b71a68bf46e0453ee3a181c)) * Handle Multiple Condor Schedds ([#26](https://github.com/WIPACrepo/SkyDriver/issues/26)) ([`6d41e2b`](https://github.com/WIPACrepo/SkyDriver/commit/6d41e2b543b2e30c21943f5d8339e09a6b8b802e)) ## v0.0.62 (2023-04-07) + ### Other + * S3: point scanner to filename of name `key` ([`3fb88b3`](https://github.com/WIPACrepo/SkyDriver/commit/3fb88b306a8a8e74bfb230fe77b81c2acf7a54d5)) * Condor: set `initialdir = /tmp` when not spooling ([`5156cc2`](https://github.com/WIPACrepo/SkyDriver/commit/5156cc22400b6288834f1bf778d3b5987e781250)) ## v0.0.61 (2023-04-07) + ### Other + * Condor: use `skip_filechecks = True` when not spooling ([`d242250`](https://github.com/WIPACrepo/SkyDriver/commit/d24225039adc1971042b08d14189f9a34e699c02)) * Condor: spool when using `log`, etc. 
([`0bea2dc`](https://github.com/WIPACrepo/SkyDriver/commit/0bea2dc0037f02ac9f8d3a93c9be3c920c8650ea)) ## v0.0.60 (2023-04-07) + ### Other + * TMS S3 Environment Variables ([#24](https://github.com/WIPACrepo/SkyDriver/issues/24)) ([`8ff144d`](https://github.com/WIPACrepo/SkyDriver/commit/8ff144d04f0d15cceba93f5be32861caa29ca930)) ## v0.0.59 (2023-04-07) + ### Other + * update requirements-clientmanager.txt ([`fda62b5`](https://github.com/WIPACrepo/SkyDriver/commit/fda62b59f87d6005f35c34bb00f2a8dc24a11890)) * Require `boto3` (not `boto`) ([`6f58e24`](https://github.com/WIPACrepo/SkyDriver/commit/6f58e24a5681438dfa98b9ee25d12ddda2641131)) ## v0.0.58 (2023-04-07) + ### Other + * Put Statup Files in S3 ([#22](https://github.com/WIPACrepo/SkyDriver/issues/22)) ([`654439d`](https://github.com/WIPACrepo/SkyDriver/commit/654439d6349323f7dbedb3e89c2572d1d0367f39)) ## v0.0.57 (2023-04-06) + ### Other + * Don't Persist Condor Outputs When Not Needed ([#21](https://github.com/WIPACrepo/SkyDriver/issues/21)) ([`1ea84b6`](https://github.com/WIPACrepo/SkyDriver/commit/1ea84b6d38c15358efe4750859ee162cbc8332d3)) ## v0.0.56 (2023-04-06) + ### Other + * `max_reco_time` / `EWMS_PILOT_SUBPROC_TIMEOUT` ([#20](https://github.com/WIPACrepo/SkyDriver/issues/20)) ([`81aedd0`](https://github.com/WIPACrepo/SkyDriver/commit/81aedd05173abf46943a5e0d83a9dde2276d1a4f)) ## v0.0.55 (2023-03-28) + ### Other + * Add `last updated` & `predictive scanning threshold` to progress report ([`5ac524e`](https://github.com/WIPACrepo/SkyDriver/commit/5ac524e9bdca4e1504625867712201257a32691d)) ## v0.0.54 (2023-03-27) + ### Other + * Fix tests ([`8b6be59`](https://github.com/WIPACrepo/SkyDriver/commit/8b6be59cd23428f6fccbab8fb4cba9ea1889cad8)) * Fix tests ([`0143f79`](https://github.com/WIPACrepo/SkyDriver/commit/0143f7996067314cf027ffe70022e78607eb01c2)) * update requirements.txt ([`2d9a20b`](https://github.com/WIPACrepo/SkyDriver/commit/2d9a20b26e9f1a29d44931430a7b9805a6ac2141)) @@ -991,22 +1041,30 @@ * Add 
`predictive_scanning_threshold` (default: `1.0`) ([`a95b459`](https://github.com/WIPACrepo/SkyDriver/commit/a95b45907a94c91d91c8d76f0105b7064e10245a)) ## v0.0.53 (2023-03-23) + ### Other + * Starter: require `has_avx2` ([`265d219`](https://github.com/WIPACrepo/SkyDriver/commit/265d219fd006e7239898abe489d6d3e11aa94b93)) ## v0.0.52 (2023-03-23) + ### Other + * All k8s env vars need to be strings ([`9c48cbd`](https://github.com/WIPACrepo/SkyDriver/commit/9c48cbd2f7573c65ab081215b629b4e12e392028)) ## v0.0.51 (2023-03-22) + ### Other + * update requirements.txt ([`5abee33`](https://github.com/WIPACrepo/SkyDriver/commit/5abee332cddeb63520bb211a4c1c55314ff82842)) * update requirements-tests.txt ([`1214f79`](https://github.com/WIPACrepo/SkyDriver/commit/1214f794a12b982827edf6336b4eda1855c89afb)) * update requirements-clientmanager.txt ([`3e18fc1`](https://github.com/WIPACrepo/SkyDriver/commit/3e18fc199fbeb04434b12df755595daad0992697)) * Truncate log dumps for `event_i3live_json_dict.value.data` ([`7d0158d`](https://github.com/WIPACrepo/SkyDriver/commit/7d0158d9beed2070c13d1249a29d47f7854750c7)) ## v0.0.50 (2023-03-22) + ### Other + * Fix tests ([`08152fe`](https://github.com/WIPACrepo/SkyDriver/commit/08152fec8f6dc511153c226ccc550ece2ea9ac25)) * update requirements.txt ([`b801241`](https://github.com/WIPACrepo/SkyDriver/commit/b801241863e4e304b29003b6c125d5f33f18ccff)) * update requirements-tests.txt ([`11afab4`](https://github.com/WIPACrepo/SkyDriver/commit/11afab4efe34c53f41c875683a6bc27ac94b5756)) @@ -1014,27 +1072,39 @@ * Add `RABBITMQ_HEARTBEAT` default as `3600` ([`8ef911d`](https://github.com/WIPACrepo/SkyDriver/commit/8ef911d4033256abb218995cb29086f3c954179f)) ## v0.0.49 (2023-03-21) + ### Other + * Make `memory` default `6GB` ([`6a130db`](https://github.com/WIPACrepo/SkyDriver/commit/6a130db02959bdf75f50783aec78ffa29eaa6c50)) ## v0.0.48 (2023-03-21) + ### Other + * Starter: fix multi-requests 
([`50f7b7b`](https://github.com/WIPACrepo/SkyDriver/commit/50f7b7b5b4243de3ad902c9d9798a2d93bb43828)) ## v0.0.47 (2023-03-21) + ### Other + * Starter: fix type ([`cae870a`](https://github.com/WIPACrepo/SkyDriver/commit/cae870a676fe786026ce87a3dc4d18ba8e0544f3)) ## v0.0.46 (2023-03-21) + ### Other + * Starter: fix typo ([`861d04d`](https://github.com/WIPACrepo/SkyDriver/commit/861d04d075afff0e112fde89ad44647845edb5e3)) ## v0.0.45 (2023-03-21) + ### Other + * Enable Starting Multiple Condor Clusters ([#17](https://github.com/WIPACrepo/SkyDriver/issues/17)) ([`0bb4d89`](https://github.com/WIPACrepo/SkyDriver/commit/0bb4d898e49b64c2eaef4c41b5345e0f84afbe94)) ## v0.0.44 (2023-03-20) + ### Other + * Mypy ([`f4fa897`](https://github.com/WIPACrepo/SkyDriver/commit/f4fa8971768daee7f714a3e04ea4e2b8aed6efee)) * Require `flake8` and `mypy` jobs ([`51bc3e9`](https://github.com/WIPACrepo/SkyDriver/commit/51bc3e9d0d4ae9c4060b9b1ca42987366ebad145)) * Flake8 ([`3474ec2`](https://github.com/WIPACrepo/SkyDriver/commit/3474ec245c851ba05d8525e299cf2ed6d6e3a4a6)) @@ -1042,30 +1112,40 @@ * Clientmanager: fix logging pt-2 ([`ad0df97`](https://github.com/WIPACrepo/SkyDriver/commit/ad0df971b037fc7e269d85439084b98225464a03)) ## v0.0.43 (2023-03-20) + ### Other + * Always label k8s metadata with `app.kubernetes.io/instance` ([`f3a33ae`](https://github.com/WIPACrepo/SkyDriver/commit/f3a33ae51c580d9a8fc9d204a3f2e86cc3d9ff11)) * Clientmanager: fix logging ([`ece113b`](https://github.com/WIPACrepo/SkyDriver/commit/ece113b2704d3000fbbd9d820787202f9fa7595d)) ## v0.0.42 (2023-03-20) + ### Other + * Require `collector` & `schedd` ([`5fa5498`](https://github.com/WIPACrepo/SkyDriver/commit/5fa5498078499873b0795118178342c43ca311a5)) * Fix tests ([`8f92ff8`](https://github.com/WIPACrepo/SkyDriver/commit/8f92ff8a42c43c165f2626cac99774f21e88db5e)) * Merge remote-tracking branch 'origin/main' ([`2e6be6d`](https://github.com/WIPACrepo/SkyDriver/commit/2e6be6d4fc366b8f2e44f6b75e07bbc3abe46819)) * Fix CL arg 
ordering ([`2eff2c8`](https://github.com/WIPACrepo/SkyDriver/commit/2eff2c8926eaa436417b6c9acf576ff4a49a80f9)) ## v0.0.41 (2023-03-20) + ### Other + * Htcondor: syntax fix ([`7fb9f9f`](https://github.com/WIPACrepo/SkyDriver/commit/7fb9f9f7258eabbe2b909b4eaa56245bc0ee4867)) ## v0.0.40 (2023-03-20) + ### Other + * update requirements.txt ([`7db2064`](https://github.com/WIPACrepo/SkyDriver/commit/7db2064f1a6c9c3cf50e777be88528be0fd71064)) * update requirements-tests.txt ([`205866d`](https://github.com/WIPACrepo/SkyDriver/commit/205866d0ab80a4d3ec677c90f902b559fdd29799)) * update requirements-clientmanager.txt ([`cc22fa0`](https://github.com/WIPACrepo/SkyDriver/commit/cc22fa0747ce6a73eee1e3116ba139df38b86a2e)) * ClientManager Upgrades (Stop Action) ([#16](https://github.com/WIPACrepo/SkyDriver/issues/16)) ([`cfa28ac`](https://github.com/WIPACrepo/SkyDriver/commit/cfa28ac2396bd45ac394e26b77bfb6257d17b041)) ## v0.0.39 (2023-03-17) + ### Other + * update requirements.txt ([`caf59a4`](https://github.com/WIPACrepo/SkyDriver/commit/caf59a4ffd90d3dda0a41ba48ec1ce6f3e8e8398)) * update requirements-tests.txt ([`46e3557`](https://github.com/WIPACrepo/SkyDriver/commit/46e35575e5d86d02ca11e94c6bec2f4573aa47de)) * update requirements-clientmanager.txt ([`747dee6`](https://github.com/WIPACrepo/SkyDriver/commit/747dee6676b080dd55c727ea0fe424f51d04d4a7)) @@ -1076,19 +1156,27 @@ * Updated Docker Tag Logic ([#15](https://github.com/WIPACrepo/SkyDriver/issues/15)) ([`587c0ad`](https://github.com/WIPACrepo/SkyDriver/commit/587c0adaa109bf942b57d1d0b1217f927c25e802)) ## v0.0.38 (2023-03-16) + ### Other + * Clientmanager: fix sending cluster info, rollback dumping logs ([`98b5f65`](https://github.com/WIPACrepo/SkyDriver/commit/98b5f659af8b10c677954e817ecd9cb31c4b0368)) ## v0.0.37 (2023-03-15) + ### Other + * Clientmanager: use `/bin/bash` ([`4d8e25c`](https://github.com/WIPACrepo/SkyDriver/commit/4d8e25c95e1de53eb4da1f97f3f38b0c5f90d2b1)) ## v0.0.36 (2023-03-15) + ### Other + * 
Clientmanager: using logging for condor logs ([`b7dac93`](https://github.com/WIPACrepo/SkyDriver/commit/b7dac938b8baec350c9ad40323a6645ab04084d5)) ## v0.0.35 (2023-03-15) + ### Other + * update requirements.txt ([`a465df5`](https://github.com/WIPACrepo/SkyDriver/commit/a465df5089e74b80aac8bb3ce300be2f8f9c753e)) * update requirements-tests.txt ([`e54b93a`](https://github.com/WIPACrepo/SkyDriver/commit/e54b93a0d2311d06a4398b5a8e21c7402f21236c)) * update requirements-clientmanager.txt ([`0e910d7`](https://github.com/WIPACrepo/SkyDriver/commit/0e910d749627737fa199665e4d61338a30231f4f)) @@ -1097,152 +1185,211 @@ * Clientmanager: dump condor files when done launching ([`4963a7e`](https://github.com/WIPACrepo/SkyDriver/commit/4963a7ec53bb389be2936ba6aeb7908ad24f973e)) ## v0.0.34 (2023-03-10) + ### Other + * Cast to list ([`923189c`](https://github.com/WIPACrepo/SkyDriver/commit/923189c2ff8ead25574a9b099ef9a25dc627304f)) ## v0.0.33 (2023-03-10) + ### Other + * Add `job_description.jobs()` ([`542e1fc`](https://github.com/WIPACrepo/SkyDriver/commit/542e1fccc734ae99b662826da37395a0c51b5e15)) ## v0.0.32 (2023-03-10) + ### Other + * Add `spool()` call ([`1a61cfd`](https://github.com/WIPACrepo/SkyDriver/commit/1a61cfda2e41cd5c5d75762ad92dabecb4447de3)) * Use `spool=True` ([`20265cf`](https://github.com/WIPACrepo/SkyDriver/commit/20265cfd93e182e4631ec0a60deb13431bc8c7b4)) ## v0.0.31 (2023-03-10) + ### Other + * Add quotes to condor submit ([`bc0b538`](https://github.com/WIPACrepo/SkyDriver/commit/bc0b538356ac104a89bbb79446e848892f5881b9)) ## v0.0.30 (2023-03-10) + ### Other + * Create `~/.condor/tokens.d/` if needed ([`de17c11`](https://github.com/WIPACrepo/SkyDriver/commit/de17c1196e15e49f64996f9f8f82d0737903c0d6)) ## v0.0.29 (2023-03-10) + ### Other + * Update tests ([`84fdef4`](https://github.com/WIPACrepo/SkyDriver/commit/84fdef423c3b5d89c0537d3c1dd6cbf9276b8ca3)) * Forward `condor_token`/`CONDOR_TOKEN` to client starter 
([`ebd6e47`](https://github.com/WIPACrepo/SkyDriver/commit/ebd6e474a5b1057c25505439abe474b9a9aa9d4b)) ## v0.0.28 (2023-03-10) + ### Other + * Fix test; mypy ([`17d358b`](https://github.com/WIPACrepo/SkyDriver/commit/17d358b888bcff7ebca4023e7d4b91eabbdd8b7b)) * If requestor attempts empty result, just ignore it ([`e8c8d2d`](https://github.com/WIPACrepo/SkyDriver/commit/e8c8d2d263012001aaa25ca5cf517e0091a2552c)) ## v0.0.27 (2023-03-10) + ### Other + * Clientmanager: fix scanid retrieval ([`0891275`](https://github.com/WIPACrepo/SkyDriver/commit/08912759330a83f46039fc08e938839de370e68d)) * Fix tests ([`4811bf9`](https://github.com/WIPACrepo/SkyDriver/commit/4811bf976f0c0ce56ad58615f26a9f5038481576)) * Re-key as `skyscan_result` ([`3e6b360`](https://github.com/WIPACrepo/SkyDriver/commit/3e6b360848ecbf259447025550874b7c0ec60c0d)) ## v0.0.26 (2023-03-09) + ### Other + * Fix tests ([`cbca980`](https://github.com/WIPACrepo/SkyDriver/commit/cbca980b97ff2813ac5d991d2f7cd52e75cbf0c5)) * Point to `ghcr.io/wipacrepo/skydriver:latest` for clientmanager ([`9580f83`](https://github.com/WIPACrepo/SkyDriver/commit/9580f836681ea59a94a6e80b314a666201842608)) * ClientManager / `client_starter.py` ([#14](https://github.com/WIPACrepo/SkyDriver/issues/14)) ([`bbb2cb1`](https://github.com/WIPACrepo/SkyDriver/commit/bbb2cb13a1de699bb8ac340ffd353d2bf5585c9e)) ## v0.0.25 (2023-03-09) + ### Other + * Add `spec.template.metadata.labels.app = scanner-instance` ([`8b84f1e`](https://github.com/WIPACrepo/SkyDriver/commit/8b84f1e9eadf46d985e2dd6e00e7fb2380105232)) ## v0.0.24 (2023-03-09) + ### Other + * Make `processing_stats.rate` optional ([`f7b3c24`](https://github.com/WIPACrepo/SkyDriver/commit/f7b3c244b9b2e1be577c348c424525b5f81bfc69)) ## v0.0.23 (2023-03-09) + ### Other + * Allow scanner account to GET @ `/scan/manifest/` ([`17d0bd3`](https://github.com/WIPACrepo/SkyDriver/commit/17d0bd37fd8c3008b18938fd9596ad5169ec8eca)) ## v0.0.22 (2023-03-09) + ### Other + * Fix tests 
([`98d60fc`](https://github.com/WIPACrepo/SkyDriver/commit/98d60fc8fff31a45c30eed38c13887109eefe4f7)) * Fix volume mount (`/common-space`) ([`e3306cb`](https://github.com/WIPACrepo/SkyDriver/commit/e3306cb03b7549db8aa7d1d16ec1e7ae23348131)) ## v0.0.21 (2023-03-09) + ### Other + * Fix tests ([`08074f6`](https://github.com/WIPACrepo/SkyDriver/commit/08074f698a27a18c53a28c19b02189905fd78a36)) * Use one directory for the shared volume ([`a54a20c`](https://github.com/WIPACrepo/SkyDriver/commit/a54a20cad3f806cf4f05f7281292d581bb4d93df)) ## v0.0.20 (2023-03-08) + ### Other + * Set backoff limit to 1 to stop pod restarts ([`f18d771`](https://github.com/WIPACrepo/SkyDriver/commit/f18d7716ab804ae24c0e861dfb9d170c7da78ccd)) * Name each job "skyscan-{scan_id}" ([`28f93fe`](https://github.com/WIPACrepo/SkyDriver/commit/28f93feb17e6e225f8235ddd74422d38022b700e)) ## v0.0.19 (2023-03-08) + ### Other + * Don't let ArgoCD prune dynamic jobs ([`8989bd1`](https://github.com/WIPACrepo/SkyDriver/commit/8989bd1f4506963654a076f55210729f2cb25b22)) ## v0.0.18 (2023-03-08) + ### Other + * Add labels for k8s resource tracking ([`8596c7a`](https://github.com/WIPACrepo/SkyDriver/commit/8596c7a58a2a4b3cc37d67bd931ddc1a89a9eefe)) ## v0.0.17 (2023-03-08) + ### Other + * Make k8s job containers' names unique ([`4b86551`](https://github.com/WIPACrepo/SkyDriver/commit/4b865512db62aa5c411e0a044b7ea16ce89d0f7d)) ## v0.0.16 (2023-03-08) + ### Other + * Send 500 error if scanner k8s job fails to launch ([`4ab57ad`](https://github.com/WIPACrepo/SkyDriver/commit/4ab57ad4249860a1b2ee733abea75c0c3df747b2)) * Make index `event_run_index` non-unique ([`354da27`](https://github.com/WIPACrepo/SkyDriver/commit/354da2773665cc3acc9fc8ec46620f46f63becde)) ## v0.0.15 (2023-03-08) + ### Other + * Update Requestor Auth ([`6c208b2`](https://github.com/WIPACrepo/SkyDriver/commit/6c208b2d1102e628149d23baa3418123c3ed470e)) * Merge remote-tracking branch 'origin/main' 
([`51f911b`](https://github.com/WIPACrepo/SkyDriver/commit/51f911b37f7ba09ccd5cbe19bf2374dfe3ed1c78)) * Appease mypy ([`929d164`](https://github.com/WIPACrepo/SkyDriver/commit/929d164769114224c2d4ee0523846a0127944919)) ## v0.0.14 (2023-03-07) + ### Other + * Update type hinting ([`9b4f318`](https://github.com/WIPACrepo/SkyDriver/commit/9b4f3188517b7f81ec69979ca585271a4ab9f5fe)) * Remove manual env var logging ([`89e9270`](https://github.com/WIPACrepo/SkyDriver/commit/89e927013f67fc675dae9dad02e6320f9ec256c9)) * CI `concurrency`: don't cancel on main/master/default ([`438a5d5`](https://github.com/WIPACrepo/SkyDriver/commit/438a5d557e15c03f7fd51f0fd67b1e2d5c202680)) ## v0.0.13 (2023-03-07) + ### Other + * Kube API Non-Default Configuration ([`26b7d96`](https://github.com/WIPACrepo/SkyDriver/commit/26b7d9605084bf5059b005dd79d97c3f33fb6c92)) ## v0.0.12 (2023-03-07) + ### Other + * Merge remote-tracking branch 'origin/main' ([`001aed3`](https://github.com/WIPACrepo/SkyDriver/commit/001aed3ce553ada10ae9adb86a642c4a75ec9434)) * Add More Logging ([`aad73c5`](https://github.com/WIPACrepo/SkyDriver/commit/aad73c53dc809e86487979aa82aa8c859b9c7020)) ## v0.0.11 (2023-03-07) + ### Other + * Merge remote-tracking branch 'origin/main' ([`0ca8014`](https://github.com/WIPACrepo/SkyDriver/commit/0ca8014b8cbfcd0238081d101d1979b3f0970e25)) * Kube API Default Configuration ([`5230e62`](https://github.com/WIPACrepo/SkyDriver/commit/5230e624e2634f164faa6e6b7d12ff40cce9d307)) * Mypy ([`4a2703d`](https://github.com/WIPACrepo/SkyDriver/commit/4a2703d8c09d6db0a6ea732f0edc0c8a5b3b722e)) ## v0.0.10 (2023-03-07) + ### Other + * Kube API Quick Fix ([`8fbb662`](https://github.com/WIPACrepo/SkyDriver/commit/8fbb662c8996eb9b1e3cfcfc647aeb193636dbac)) ## v0.0.9 (2023-03-07) + ### Other + * Env Var Quick Fix ([`daa2e1b`](https://github.com/WIPACrepo/SkyDriver/commit/daa2e1ba45b0b7ac3d2af74501b30993cf1f1f98)) ## v0.0.8 (2023-03-06) + ### Other + * Use `PERSONAL_ACCESS_TOKEN` for bot pt-2 
([`1c5fc65`](https://github.com/WIPACrepo/SkyDriver/commit/1c5fc65c4598613defe263b74c1d5942f1950cd7)) ## v0.0.7 (2023-03-06) + ### Other + * Use `PERSONAL_ACCESS_TOKEN` for bot ([`26822fa`](https://github.com/WIPACrepo/SkyDriver/commit/26822fa4ee60a706bbfb26e887e1c92c00a06e76)) ## v0.0.6 (2023-03-06) + ### Other + * Add Dockerfile & Publishing ([#11](https://github.com/WIPACrepo/SkyDriver/issues/11)) ([`eb6a122`](https://github.com/WIPACrepo/SkyDriver/commit/eb6a122a6ba24beae28b080c9a6784b379feb079)) ## v0.0.5 (2023-03-01) + ### Other + * Auth Part 2 ([#9](https://github.com/WIPACrepo/SkyDriver/issues/9)) ([`d8cde67`](https://github.com/WIPACrepo/SkyDriver/commit/d8cde67f295209651b10bc62fb00e9de7dc880e0)) ## v0.0.4 (2023-02-15) + ### Other + * Implement Auth ([#7](https://github.com/WIPACrepo/SkyDriver/issues/7)) ([`d92db06`](https://github.com/WIPACrepo/SkyDriver/commit/d92db0663f75c801528610a60ab8542ac52543d9)) ## v0.0.3 (2023-02-06) - ## v0.0.2 (2023-01-27) - ## v0.0.1 (2022-12-14) - diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index 655d0063..d36c3906 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -74,7 +74,7 @@ async def post( is_deleted=False, i3_event_id=i3_event_id, scanner_server_args=scanner_server_args, - ewms_task=schema.EWMSTaskDirective( + ewms_task=schema.ManualStarterInfo( tms_args=tms_args_list, env_vars=env_vars, ), @@ -144,7 +144,7 @@ def _put_once_scan_metadata( def _put_ewms_task( in_db: schema.Manifest, upserting: dict, - cluster: schema.Cluster | None, + cluster: schema.ManualCluster | None, complete: bool | None, ): if not cluster and not complete: @@ -180,7 +180,7 @@ async def patch( progress: schema.Progress | None = None, event_metadata: schema.EventMetadata | None = None, scan_metadata: schema.StrDict | None = None, - cluster: schema.Cluster | None = None, + cluster: schema.ManualCluster | None = None, complete: bool | None = None, # workforce is done ) -> 
schema.Manifest: """Update `progress` at doc matching `scan_id`.""" diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 985b18fe..f3c7c0f5 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -133,7 +133,7 @@ class KubernetesLocation: @typechecked @dc.dataclass -class Cluster: +class ManualCluster: """Stores information for a worker cluster.""" orchestrator: Literal["condor", "k8s"] @@ -219,12 +219,13 @@ def obfuscate_cl_args(args: str) -> str: @typechecked @dc.dataclass -class EWMSTaskDirective: - """Encapsulates the directive of a unique EWMS task entity.""" +class ManualStarterInfo: + """Encapsulates what info is/was used for starting the scanner, within SkyDriver.""" tms_args: list[str] # TODO - move to TMS env_vars: EnvVars # TODO - move to TMS - clusters: list[Cluster] = dc.field(default_factory=list) + + clusters: list[ManualCluster] = dc.field(default_factory=list) # signifies k8s workers and condor cluster(s) AKA workforce is done complete: bool = False # TODO - move to TMS @@ -234,6 +235,20 @@ def __post_init__(self) -> None: # NOTE - self.env_vars done in EnvVars +@typechecked +@dc.dataclass +class EWMSRequestInfo: + """Some of the info sent to EWMS in the workflow request.""" + + cluster_locations: list[str] # TODO: does this need to be dict with n_workers? 
+ n_workers: int + + workflow_id: str = "" # set once the request has been sent to EWMS + + # TODO: add more fields that are needed for EWMS but not already in manifest + # NOTE: besides 'workflow_id', this object is immutable + + DEPRECATED_EVENT_I3LIVE_JSON_DICT = "use 'i3_event_id'" @@ -245,7 +260,7 @@ class Manifest(ScanIDDataclass): timestamp: float is_deleted: bool - ewms_task: EWMSTaskDirective + ewms_task: ManualStarterInfo | EWMSRequestInfo # yes, this was a poor naming choice # args placed in k8s job obj scanner_server_args: str diff --git a/skydriver/ewms.py b/skydriver/ewms.py new file mode 100644 index 00000000..35eeb948 --- /dev/null +++ b/skydriver/ewms.py @@ -0,0 +1,52 @@ +"""Tools for interfacing with EMWS.""" + +from rest_tools.client import RestClient + +from . import database + + +async def request_workflow_on_ewms( + ewms_rc: RestClient, + ewms_request_info: database.schema.EWMSRequestInfo, +) -> str: + """Request a workflow in EWMS.""" + body = { + "public_queue_aliases": ["to-client-queue", "from-client-queue"], + "tasks": [ + { + "cluster_locations": ewms_request_info.cluster_locations, + "input_queue_aliases": ["to-client-queue"], + "output_queue_aliases": ["from-client-queue"], + "task_image": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:$SKYSCAN_TAG", + "task_args": "python -m skymap_scanner.client --infile {{INFILE}} --outfile {{OUTFILE}} --client-startup-json {{DATA_HUB}}/startup.json", + "init_image": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:$SKYSCAN_TAG", + "init_args": "bash -c \"curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json '$S3_OBJECT_URL'\" ", + "n_workers": ewms_request_info.n_workers, + "pilot_config": { + "tag": "${PILOT_TAG:-'latest'}", + "environment": { + "EWMS_PILOT_INIT_TIMEOUT": 1 * 60, + "EWMS_PILOT_TASK_TIMEOUT": 1 * 60 * 60, + "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": 10 * 60, + "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": 5 * 60, + 
"EWMS_PILOT_CONTAINER_DEBUG": "True", + "EWMS_PILOT_INFILE_EXT": ".json", + "EWMS_PILOT_OUTFILE_EXT": ".json", + }, + "input_files": [], + }, + "worker_config": { + "do_transfer_worker_stdouterr": True, + "max_worker_runtime": 2 * 60 * 60, + "n_cores": 1, + "priority": 99, + "worker_disk": "512M", + "worker_memory": "8G", + "condor_requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", + }, + } + ], + } + """Request a new workflow on EWMS.""" + resp = await ewms_rc.request("POST", "/v0/workflows", body) + return resp["workflow"]["workflow_id"] diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 5cc562ab..65003499 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -12,7 +12,7 @@ from tornado import web from .utils import KubeAPITools -from .. import database +from .. import database, ewms from ..config import ENV LOGGER = logging.getLogger(__name__) @@ -46,7 +46,7 @@ async def get_next_backlog_entry( scan_backlog: database.interface.ScanBacklogClient, manifests: database.interface.ManifestClient, include_low_priority_scans: bool, -) -> database.schema.ScanBacklogEntry: +) -> tuple[database.schema.ScanBacklogEntry, database.schema.Manifest]: """Get the next entry & remove any that have been cancelled.""" while True: # get next up -- raises DocumentNotFoundException if none @@ -68,7 +68,7 @@ async def get_next_backlog_entry( continue # all good! 
- return entry # ready to start job + return entry, manifest # ready to start job async def run( @@ -155,8 +155,8 @@ async def _run( ewms_rc: RestClient, ) -> None: """The (actual) main loop.""" - manifests = database.interface.ManifestClient(mongo_client) - scan_backlog = database.interface.ScanBacklogClient(mongo_client) + manifest_client = database.interface.ManifestClient(mongo_client) + backlog_client = database.interface.ScanBacklogClient(mongo_client) last_log_heartbeat = 0.0 # log every so often, not on every iteration long_interval_timer = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) @@ -167,9 +167,9 @@ async def _run( # get next entry try: - entry = await get_next_backlog_entry( - scan_backlog, - manifests, + entry, manifest = await get_next_backlog_entry( + backlog_client, + manifest_client, # include low priority scans only when enough time has passed include_low_priority_scans=long_interval_timer.has_interval_elapsed(), ) @@ -177,8 +177,21 @@ async def _run( long_interval_timer.fastforward() continue # empty queue - # TODO: Request to SkyDriver - resp = await ewms_rc.request("POST", "/v0/workflows", {}) + # request a workflow on EWMS + if isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): + try: + workflow_id = await ewms.request_workflow_on_ewms( + ewms_rc, manifest.ewms_task + ) + except Exception as e: + LOGGER.exception(e) + long_interval_timer.fastforward() # nothing was started, so don't wait long + continue + await manifest_client.collection.find_one_and_update( + {"scan_id": manifest.scan_id}, + {"$set": {"ewms_task.workflow_id": workflow_id}}, + ) + # TODO: Start K8s Job # get k8s job object @@ -194,15 +207,15 @@ async def _run( ) # NOTE: the job_obj is enormous, so don't log it - # start job + # start k8s job try: resp = KubeAPITools.start_job(k8s_batch_api, job_obj) LOGGER.info(resp) except kubernetes.client.exceptions.ApiException as e: - # job (entry) will be revived & restarted in future iteration + # k8s job 
(backlog entry) will be revived & restarted in future iteration LOGGER.exception(e) long_interval_timer.fastforward() # nothing was started, so don't wait long continue # remove from backlog now that startup succeeded - await scan_backlog.remove(entry) + await backlog_client.remove(entry) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index e1d1e95b..4354898b 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -25,7 +25,7 @@ def get_cluster_auth_v1envvars( - cluster: schema.Cluster, + cluster: schema.ManualCluster, ) -> list[kubernetes.client.V1EnvVar]: """Get the `V1EnvVar`s for workers' auth.""" LOGGER.debug(f"getting auth secret env vars for {cluster=}") @@ -75,7 +75,7 @@ def __init__( starter_exc: str, # TODO - remove once tested in prod worker_memory_bytes: int, worker_disk_bytes: int, - request_clusters: list[schema.Cluster], + request_clusters: list[schema.ManualCluster], max_pixel_reco_time: int, max_worker_runtime: int, priority: int, @@ -193,7 +193,7 @@ def get_cluster_starter_args( docker_tag: str, worker_memory_bytes: int, worker_disk_bytes: int, - request_cluster: schema.Cluster, + request_cluster: schema.ManualCluster, debug_mode: list[DebugMode], max_worker_runtime: int, priority: int, @@ -353,7 +353,7 @@ def make_skyscan_server_v1envvars( def make_cluster_starter_v1envvars( rest_address: str, scan_id: str, - cluster: schema.Cluster, + cluster: schema.ManualCluster, max_pixel_reco_time: int, debug_mode: list[DebugMode], ) -> list[kubernetes.client.V1EnvVar]: @@ -445,7 +445,7 @@ def __init__( self, k8s_batch_api: kubernetes.client.BatchV1Api, scan_id: str, - clusters: list[schema.Cluster], + clusters: list[schema.ManualCluster], ): self.k8s_batch_api = k8s_batch_api self.scan_id = scan_id diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 7edaeff8..0e2c91e0 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -228,17 +228,17 @@ 
async def get(self) -> None: # ----------------------------------------------------------------------------- -def _cluster_lookup(name: str, n_workers: int) -> database.schema.Cluster: - """Grab the Cluster object known using `name`.""" +def _cluster_lookup(name: str, n_workers: int) -> database.schema.ManualCluster: + """Grab the ManualCluster object known using `name`.""" if cluster := KNOWN_CLUSTERS.get(name): if cluster["orchestrator"] == "condor": - return database.schema.Cluster( + return database.schema.ManualCluster( orchestrator=cluster["orchestrator"], location=database.schema.HTCondorLocation(**cluster["location"]), n_workers=n_workers, ) elif cluster["orchestrator"] == "k8s": - return database.schema.Cluster( + return database.schema.ManualCluster( orchestrator=cluster["orchestrator"], location=database.schema.KubernetesLocation(**cluster["location"]), n_workers=n_workers, @@ -273,7 +273,7 @@ def _json_to_dict(val: Any) -> dict: def _dict_or_list_to_request_clusters( val: dict | list, -) -> list[database.schema.Cluster]: +) -> list[database.schema.ManualCluster]: _error = argparse.ArgumentTypeError( "must be a dict of cluster location and number of workers, Ex: {'sub-2': 1500, ...}" " (to request a cluster location more than once, provide a list of 2-lists instead)" @@ -461,7 +461,7 @@ async def post(self) -> None: raise web.HTTPError( 400, log_message=( - f"Too many workers: Cluster '{cname}' can only have " + f"Too many workers: ManualCluster '{cname}' can only have " f"{cinfo.get('max_n_clients_during_debug_mode')} " f"workers when 'debug_mode' " f"includes '{DebugMode.CLIENT_LOGS.value}'" @@ -878,7 +878,7 @@ def from_dict_wrapper_or_none(data_class: Type[T], val: Any) -> T | None: ) arghand.add_argument( "cluster", - type=lambda x: from_dict_wrapper_or_none(database.schema.Cluster, x), + type=lambda x: from_dict_wrapper_or_none(database.schema.ManualCluster, x), default=None, ) args = arghand.parse_args() diff --git a/tests/unit/test_scan_state.py 
b/tests/unit/test_scan_state.py index be9be076..74e6a1c7 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -1,9 +1,9 @@ """Test dynamically generating the scan state.""" - import time import pytest + from skydriver.database import schema @@ -15,7 +15,7 @@ def test_00__scan_finished_successfully() -> None: is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.EWMSTaskDirective( + ewms_task=schema.ManualStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=True, @@ -57,12 +57,12 @@ def test_10__partial_result_generated( is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.EWMSTaskDirective( + ewms_task=schema.ManualStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, clusters=[ - schema.Cluster( + schema.ManualCluster( orchestrator="condor", location=schema.HTCondorLocation( collector="foo", @@ -111,12 +111,12 @@ def test_20__waiting_on_first_pixel_reco( is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.EWMSTaskDirective( + ewms_task=schema.ManualStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, clusters=[ - schema.Cluster( + schema.ManualCluster( orchestrator="condor", location=schema.HTCondorLocation( collector="foo", @@ -165,12 +165,12 @@ def test_30__waiting_on_cluster_startup( is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.EWMSTaskDirective( + ewms_task=schema.ManualStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, # clusters=[ - # schema.Cluster( + # schema.ManualCluster( # orchestrator="condor", # location=schema.HTCondorLocation( # collector="foo", @@ -219,12 +219,12 @@ def test_40__waiting_on_scanner_server_startup( 
is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.EWMSTaskDirective( + ewms_task=schema.ManualStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, clusters=[ - schema.Cluster( + schema.ManualCluster( orchestrator="condor", location=schema.HTCondorLocation( collector="foo", @@ -271,12 +271,12 @@ def test_50__prestartup(is_complete: bool, state: schema.ScanState) -> None: is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.EWMSTaskDirective( + ewms_task=schema.ManualStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, # clusters=[ - # schema.Cluster( + # schema.ManualCluster( # orchestrator="condor", # location=schema.HTCondorLocation( # collector="foo", From de21edc3f063caf99472728ee7d57d073ea5a1d0 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 19 Dec 2024 14:10:53 -0700 Subject: [PATCH 005/327] simplify manifest.post/put --- skydriver/database/interface.py | 30 ------------------------------ skydriver/rest_handlers.py | 23 +++++++++++++++-------- 2 files changed, 15 insertions(+), 38 deletions(-) diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index d36c3906..b50e7c88 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -54,36 +54,6 @@ async def get(self, scan_id: str, incl_del: bool) -> schema.Manifest: ) from e return manifest - async def post( - self, - i3_event_id: str, - scan_id: str, - scanner_server_args: str, - tms_args_list: list[str], - env_vars: schema.EnvVars, - classifiers: dict[str, str | bool | float | int], - priority: int, - ) -> schema.Manifest: - """Create `schema.Manifest` doc.""" - LOGGER.debug("creating new manifest") - - # validate - manifest = schema.Manifest( - scan_id=scan_id, - timestamp=time.time(), - is_deleted=False, - i3_event_id=i3_event_id, - 
scanner_server_args=scanner_server_args, - ewms_task=schema.ManualStarterInfo( - tms_args=tms_args_list, - env_vars=env_vars, - ), - classifiers=classifiers, - priority=priority, - ) - - return await self.put(manifest) - async def put(self, manifest: schema.Manifest) -> schema.Manifest: """Put into db.""" try: diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 0e2c91e0..51d7a8f6 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -7,6 +7,7 @@ import logging import pickle import re +import time import uuid from typing import Any, Type, TypeVar @@ -564,15 +565,21 @@ async def _start_scan( ) # put in db (do before k8s start so if k8s fail, we can debug using db's info) - manifest = await manifests.post( - scan_request_obj["i3_event_id"], - scan_id, - scanner_wrapper.scanner_server_args, - scanner_wrapper.cluster_starter_args_list, - from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), - scan_request_obj["classifiers"], - scan_request_obj["priority"], + LOGGER.debug("creating new manifest") + manifest = schema.Manifest( + scan_id=scan_id, + timestamp=time.time(), + is_deleted=False, + i3_event_id=scan_request_obj["i3_event_id"], + scanner_server_args=scanner_wrapper.scanner_server_args, + ewms_task=schema.ManualStarterInfo( + tms_args=scanner_wrapper.cluster_starter_args_list, + env_vars=from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), + ), + classifiers=scan_request_obj["classifiers"], + priority=scan_request_obj["priority"], ) + await manifests.put(manifest) await designate_for_startup( scan_id, From f7da7778a78452c98bda35605991c12729df1e6b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 19 Dec 2024 14:35:47 -0700 Subject: [PATCH 006/327] notes --- skydriver/database/schema.py | 1 + skydriver/ewms.py | 10 ++++++---- skydriver/k8s/scan_backlog.py | 4 +--- skydriver/rest_handlers.py | 5 ++++- 4 files changed, 12 insertions(+), 8 deletions(-) diff --git a/skydriver/database/schema.py 
b/skydriver/database/schema.py index f3c7c0f5..7297d723 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -246,6 +246,7 @@ class EWMSRequestInfo: workflow_id: str = "" # set once the request has been sent to EWMS # TODO: add more fields that are needed for EWMS but not already in manifest + # OR the backlogger could also pull info from 'scan_request_obj' # NOTE: besides 'workflow_id', this object is immutable diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 35eeb948..e8fa6575 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -7,21 +7,23 @@ async def request_workflow_on_ewms( ewms_rc: RestClient, - ewms_request_info: database.schema.EWMSRequestInfo, + manifest: database.schema.Manifest, ) -> str: """Request a workflow in EWMS.""" + if not isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): + raise TypeError("Manifest is not designated for EWMS") body = { "public_queue_aliases": ["to-client-queue", "from-client-queue"], "tasks": [ { - "cluster_locations": ewms_request_info.cluster_locations, + "cluster_locations": manifest.ewms_task.cluster_locations, "input_queue_aliases": ["to-client-queue"], "output_queue_aliases": ["from-client-queue"], "task_image": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:$SKYSCAN_TAG", "task_args": "python -m skymap_scanner.client --infile {{INFILE}} --outfile {{OUTFILE}} --client-startup-json {{DATA_HUB}}/startup.json", "init_image": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:$SKYSCAN_TAG", "init_args": "bash -c \"curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json '$S3_OBJECT_URL'\" ", - "n_workers": ewms_request_info.n_workers, + "n_workers": manifest.ewms_task.n_workers, "pilot_config": { "tag": "${PILOT_TAG:-'latest'}", "environment": { @@ -39,7 +41,7 @@ async def request_workflow_on_ewms( "do_transfer_worker_stdouterr": True, "max_worker_runtime": 2 * 60 * 60, "n_cores": 1, - "priority": 99, + "priority": 
manifest.priority, "worker_disk": "512M", "worker_memory": "8G", "condor_requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 65003499..6b585414 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -180,9 +180,7 @@ async def _run( # request a workflow on EWMS if isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): try: - workflow_id = await ewms.request_workflow_on_ewms( - ewms_rc, manifest.ewms_task - ) + workflow_id = await ewms.request_workflow_on_ewms(ewms_rc, manifest) except Exception as e: LOGGER.exception(e) long_interval_timer.fastforward() # nothing was started, so don't wait long diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 51d7a8f6..f1d23594 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -543,7 +543,9 @@ async def _start_scan( is_real_event=scan_request_obj["real_or_simulated_event"] in REAL_CHOICES, predictive_scanning_threshold=scan_request_obj["predictive_scanning_threshold"], # cluster starter - starter_exc=str( # TODO - remove once tested in prod + # TODO: this arg could be good to control whether to use ewms or manual + # but not determined using 'classifiers'. 
May need to keep the attr bc pkl + starter_exc=str( scan_request_obj["classifiers"].get( "__unstable_starter_exc", "clientmanager" ) @@ -572,6 +574,7 @@ async def _start_scan( is_deleted=False, i3_event_id=scan_request_obj["i3_event_id"], scanner_server_args=scanner_wrapper.scanner_server_args, + # TODO: detect whether 'schema.EWMSRequestInfo' should be used (see 'starter_exc') above ewms_task=schema.ManualStarterInfo( tms_args=scanner_wrapper.cluster_starter_args_list, env_vars=from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), From 2da5adf5519926967033f4cad40020b25ca28856 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 19 Dec 2024 14:39:40 -0700 Subject: [PATCH 007/327] fix CHANGELOG.md --- CHANGELOG.md | 58 ++++++++++++++++++++++++++-------------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e71a1455..1913e794 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -161,8 +161,8 @@ ### Other * update dependencies*.log files(s) ([`8625ece`](https://github.com/WIPACrepo/SkyDriver/commit/8625ece5f5765321c8a65711c34b4f7532355f9e)) -* ManualCluster-Watching: include `HoldReason` - 2 ([`96975ed`](https://github.com/WIPACrepo/SkyDriver/commit/96975ede756ae8a06ec09926ee0c0e0c998e31a4)) -* ManualCluster-Watching: include `HoldReason` ([`dd447e1`](https://github.com/WIPACrepo/SkyDriver/commit/dd447e1ccc9c685d79d01929a47cc321bba6fd3e)) +* Cluster-Watching: include `HoldReason` - 2 ([`96975ed`](https://github.com/WIPACrepo/SkyDriver/commit/96975ede756ae8a06ec09926ee0c0e0c998e31a4)) +* Cluster-Watching: include `HoldReason` ([`dd447e1`](https://github.com/WIPACrepo/SkyDriver/commit/dd447e1ccc9c685d79d01929a47cc321bba6fd3e)) ## v0.7.6 (2023-11-20) @@ -176,7 +176,7 @@ ### Other -* ManualCluster-Watching: only update skydriver if needed - 2 ([`fcdd669`](https://github.com/WIPACrepo/SkyDriver/commit/fcdd669bd22ee79d323a759bf9282cf6983d727d)) +* Cluster-Watching: only update skydriver if needed - 2 
([`fcdd669`](https://github.com/WIPACrepo/SkyDriver/commit/fcdd669bd22ee79d323a759bf9282cf6983d727d)) ## v0.7.4 (2023-11-20) @@ -188,19 +188,19 @@ ### Other -* ManualCluster-Watching: only update skydriver if needed ([`72654cc`](https://github.com/WIPACrepo/SkyDriver/commit/72654cc051e8234d6114f25eb2fc6c6cfe0482c4)) +* Cluster-Watching: only update skydriver if needed ([`72654cc`](https://github.com/WIPACrepo/SkyDriver/commit/72654cc051e8234d6114f25eb2fc6c6cfe0482c4)) ## v0.7.2 (2023-11-20) ### Other -* ManualCluster-Watching: CL arg fix ([`4d44629`](https://github.com/WIPACrepo/SkyDriver/commit/4d4462913612a1b7de420c75a9075e112a788a6c)) +* Cluster-Watching: CL arg fix ([`4d44629`](https://github.com/WIPACrepo/SkyDriver/commit/4d4462913612a1b7de420c75a9075e112a788a6c)) ## v0.7.1 (2023-11-20) ### Other -* ManualCluster-Watching: add `WATCHER_MAX_RUNTIME` ([`9e7171f`](https://github.com/WIPACrepo/SkyDriver/commit/9e7171fac83e35bf6ded50b9c81af8c46f57ad65)) +* Cluster-Watching: add `WATCHER_MAX_RUNTIME` ([`9e7171f`](https://github.com/WIPACrepo/SkyDriver/commit/9e7171fac83e35bf6ded50b9c81af8c46f57ad65)) ## v0.7.0 (2023-11-20) @@ -209,7 +209,7 @@ ### Other * update dependencies*.log files(s) ([`2980045`](https://github.com/WIPACrepo/SkyDriver/commit/2980045aa51199a44f2812633fdfdfdef0165367)) -* ManualCluster-Watching: stop watching once we consistently get no response ([`5a50bfd`](https://github.com/WIPACrepo/SkyDriver/commit/5a50bfd03eeac3d4de0e544f82fa76ba94cbede8)) +* Cluster-Watching: stop watching once we consistently get no response ([`5a50bfd`](https://github.com/WIPACrepo/SkyDriver/commit/5a50bfd03eeac3d4de0e544f82fa76ba94cbede8)) ## v0.6.19 (2023-11-16) @@ -219,15 +219,15 @@ * Use `WIPACrepo/wipac-dev-py-setup-action@v2.9` ([`950ec18`](https://github.com/WIPACrepo/SkyDriver/commit/950ec186fc32b4983b1ae3fe130af57383bad78a)) * Mypy - 2 ([`6238014`](https://github.com/WIPACrepo/SkyDriver/commit/623801411ac5354ef9d084f25f9b42d4f62be980)) * Mypy 
([`0d05891`](https://github.com/WIPACrepo/SkyDriver/commit/0d058914a034911d1ebb8162b0cf84211b5b0eda)) -* ManualCluster-Watching: updates for new ewms-pilot chirp attrs ([`48e3693`](https://github.com/WIPACrepo/SkyDriver/commit/48e3693f061cda4341373bcaa44043fd1b9add1d)) +* Cluster-Watching: updates for new ewms-pilot chirp attrs ([`48e3693`](https://github.com/WIPACrepo/SkyDriver/commit/48e3693f061cda4341373bcaa44043fd1b9add1d)) ## v0.6.18 (2023-11-09) ### Other * update dependencies*.log files(s) ([`8dcb032`](https://github.com/WIPACrepo/SkyDriver/commit/8dcb03270e7028878319d3eaec454eea8e99445f)) -* ManualCluster-Watching: use projection ([`d19cd67`](https://github.com/WIPACrepo/SkyDriver/commit/d19cd67e509a49535e5eac082686f7786d9b54ed)) -* ManualCluster-Watching: only query newly updated jobs ([`bd0d5c4`](https://github.com/WIPACrepo/SkyDriver/commit/bd0d5c445eae090e60c53b67853d17ab18da35a7)) +* Cluster-Watching: use projection ([`d19cd67`](https://github.com/WIPACrepo/SkyDriver/commit/d19cd67e509a49535e5eac082686f7786d9b54ed)) +* Cluster-Watching: only query newly updated jobs ([`bd0d5c4`](https://github.com/WIPACrepo/SkyDriver/commit/bd0d5c445eae090e60c53b67853d17ab18da35a7)) ## v0.6.17 (2023-11-09) @@ -236,13 +236,13 @@ * update dependencies*.log files(s) ([`dbf120c`](https://github.com/WIPACrepo/SkyDriver/commit/dbf120c20fc3756dd8e44c6d429cbdb7d3141fad)) * Mypy Fix ([`2ba43a8`](https://github.com/WIPACrepo/SkyDriver/commit/2ba43a813101b6c7c1ccd83dc772bd311009908e)) * Condor: Set `+OriginalTime` (4 hours) ([`95de516`](https://github.com/WIPACrepo/SkyDriver/commit/95de5169aa86ada9ff51dd3009e074ec27b6d7c5)) -* ManualCluster-Watching: ignore already-completed jobs ([`0b47442`](https://github.com/WIPACrepo/SkyDriver/commit/0b47442673875359bb4787cabb23106b9277c50f)) +* Cluster-Watching: ignore already-completed jobs ([`0b47442`](https://github.com/WIPACrepo/SkyDriver/commit/0b47442673875359bb4787cabb23106b9277c50f)) ## v0.6.16 (2023-11-08) ### Other -* 
ManualCluster-Watching: record source - 2 ([`ed75709`](https://github.com/WIPACrepo/SkyDriver/commit/ed7570938d3416750d1f8049c1a6f2303f06cf57)) +* Cluster-Watching: record source - 2 ([`ed75709`](https://github.com/WIPACrepo/SkyDriver/commit/ed7570938d3416750d1f8049c1a6f2303f06cf57)) ## v0.6.15 (2023-11-08) @@ -254,23 +254,23 @@ ### Other -* ManualCluster-Watching: use `htcondor.classad.unquote()` - 2 ([`e0b812e`](https://github.com/WIPACrepo/SkyDriver/commit/e0b812e6bbe083b899b2cb6df4e274ab83dd98f2)) +* Cluster-Watching: use `htcondor.classad.unquote()` - 2 ([`e0b812e`](https://github.com/WIPACrepo/SkyDriver/commit/e0b812e6bbe083b899b2cb6df4e274ab83dd98f2)) ## v0.6.13 (2023-11-08) ### Other * update dependencies*.log files(s) ([`a608f55`](https://github.com/WIPACrepo/SkyDriver/commit/a608f551d5efa5dd6ed00a64b188d25df7d952b4)) -* ManualCluster-Watching: filter completed jobs post-hoc ([`707247c`](https://github.com/WIPACrepo/SkyDriver/commit/707247c2b3919cb0f59f58448a2ba7748ee09a84)) -* ManualCluster-Watching: record source ([`cfdbcc9`](https://github.com/WIPACrepo/SkyDriver/commit/cfdbcc9ea45d4952b0e7533b765696ac778a253f)) -* ManualCluster-Watching: use `htcondor.classad.unquote()` ([`a595dde`](https://github.com/WIPACrepo/SkyDriver/commit/a595dde7362cc2e4ac752ed33d1c4f57890af1ac)) +* Cluster-Watching: filter completed jobs post-hoc ([`707247c`](https://github.com/WIPACrepo/SkyDriver/commit/707247c2b3919cb0f59f58448a2ba7748ee09a84)) +* Cluster-Watching: record source ([`cfdbcc9`](https://github.com/WIPACrepo/SkyDriver/commit/cfdbcc9ea45d4952b0e7533b765696ac778a253f)) +* Cluster-Watching: use `htcondor.classad.unquote()` ([`a595dde`](https://github.com/WIPACrepo/SkyDriver/commit/a595dde7362cc2e4ac752ed33d1c4f57890af1ac)) ## v0.6.12 (2023-11-07) ### Other -* ManualCluster-Watching: limit querying further - 2 ([`751cca8`](https://github.com/WIPACrepo/SkyDriver/commit/751cca836e421a7a351d4e64ba1237d6dfc99139)) -* ManualCluster-Watching: limit querying further 
([`a96eba4`](https://github.com/WIPACrepo/SkyDriver/commit/a96eba4a5dcd475afce42ac1d3ffbcc30388b4e4)) +* Cluster-Watching: limit querying further - 2 ([`751cca8`](https://github.com/WIPACrepo/SkyDriver/commit/751cca836e421a7a351d4e64ba1237d6dfc99139)) +* Cluster-Watching: limit querying further ([`a96eba4`](https://github.com/WIPACrepo/SkyDriver/commit/a96eba4a5dcd475afce42ac1d3ffbcc30388b4e4)) ## v0.6.11 (2023-11-07) @@ -278,61 +278,61 @@ * update dependencies*.log files(s) ([`b9e4192`](https://github.com/WIPACrepo/SkyDriver/commit/b9e4192c72aa2c5740a35eb6a4721212f5416267)) * Mypy ([`08202e5`](https://github.com/WIPACrepo/SkyDriver/commit/08202e51a5068872ebf5c15f014fc21e06fcc811)) -* ManualCluster-Watching: limit querying ([`b912ecb`](https://github.com/WIPACrepo/SkyDriver/commit/b912ecb158883a603627bb605467082b99e47da6)) -* ManualCluster-Watching: handle condor types ([`a7238dd`](https://github.com/WIPACrepo/SkyDriver/commit/a7238dde60ab2f329b24decbd162e22b6fd31e6c)) +* Cluster-Watching: limit querying ([`b912ecb`](https://github.com/WIPACrepo/SkyDriver/commit/b912ecb158883a603627bb605467082b99e47da6)) +* Cluster-Watching: handle condor types ([`a7238dd`](https://github.com/WIPACrepo/SkyDriver/commit/a7238dde60ab2f329b24decbd162e22b6fd31e6c)) ## v0.6.10 (2023-11-06) ### Other * update dependencies*.log files(s) ([`8b45eca`](https://github.com/WIPACrepo/SkyDriver/commit/8b45ecaac7f7d79857a3e14fe7f434b36300d97a)) -* ManualCluster-Watching: store chirps ([`26fe1e2`](https://github.com/WIPACrepo/SkyDriver/commit/26fe1e2d10ba3ff3796154f1024b09424614f66a)) +* Cluster-Watching: store chirps ([`26fe1e2`](https://github.com/WIPACrepo/SkyDriver/commit/26fe1e2d10ba3ff3796154f1024b09424614f66a)) ## v0.6.9 (2023-10-31) ### Other * update dependencies*.log files(s) ([`7185334`](https://github.com/WIPACrepo/SkyDriver/commit/718533494d99042a9aa193110f29df661e31324c)) -* ManualCluster-Watching: add basic status aggregation and timely exit 
([`934c671`](https://github.com/WIPACrepo/SkyDriver/commit/934c671e1fb4e00fb66ad07b1e51e253f3dea129)) +* Cluster-Watching: add basic status aggregation and timely exit ([`934c671`](https://github.com/WIPACrepo/SkyDriver/commit/934c671e1fb4e00fb66ad07b1e51e253f3dea129)) ## v0.6.8 (2023-10-31) ### Other -* ManualCluster-Watching Fix 5 ([`b83de7d`](https://github.com/WIPACrepo/SkyDriver/commit/b83de7d0da21468d6e17da311da99701a1e41c37)) +* Cluster-Watching Fix 5 ([`b83de7d`](https://github.com/WIPACrepo/SkyDriver/commit/b83de7d0da21468d6e17da311da99701a1e41c37)) ## v0.6.7 (2023-10-31) ### Other * update dependencies*.log files(s) ([`774aab8`](https://github.com/WIPACrepo/SkyDriver/commit/774aab81e9f528d2e999b6497185246a790c2cae)) -* ManualCluster-Watching Fix 4 ([`db1a960`](https://github.com/WIPACrepo/SkyDriver/commit/db1a96087917dcfe5719651b200ada1b78f85cf2)) +* Cluster-Watching Fix 4 ([`db1a960`](https://github.com/WIPACrepo/SkyDriver/commit/db1a96087917dcfe5719651b200ada1b78f85cf2)) ## v0.6.6 (2023-10-30) ### Other * update dependencies*.log files(s) ([`0dda5cc`](https://github.com/WIPACrepo/SkyDriver/commit/0dda5ccec403ef915f1b8f6cde8de80686dbc805)) -* ManualCluster-Watching Fix 3 ([`2f513f7`](https://github.com/WIPACrepo/SkyDriver/commit/2f513f712b12de36ff7a3c0afeef21beeecaa072)) +* Cluster-Watching Fix 3 ([`2f513f7`](https://github.com/WIPACrepo/SkyDriver/commit/2f513f712b12de36ff7a3c0afeef21beeecaa072)) ## v0.6.5 (2023-10-30) ### Other * Misc Test Fix ([`a794d28`](https://github.com/WIPACrepo/SkyDriver/commit/a794d282c5c817d76993ad95016573b4d1da6adf)) -* ManualCluster-Watching Fix 2 ([`ce6f2a5`](https://github.com/WIPACrepo/SkyDriver/commit/ce6f2a565476f2705d07b2827cd2ab40bcf9df1e)) +* Cluster-Watching Fix 2 ([`ce6f2a5`](https://github.com/WIPACrepo/SkyDriver/commit/ce6f2a565476f2705d07b2827cd2ab40bcf9df1e)) ## v0.6.4 (2023-10-30) ### Other -* ManualCluster-Watching Fix 1 
([`7265da7`](https://github.com/WIPACrepo/SkyDriver/commit/7265da73108996c4c7dcc8b94f7e9111431f0fb5)) +* Cluster-Watching Fix 1 ([`7265da7`](https://github.com/WIPACrepo/SkyDriver/commit/7265da73108996c4c7dcc8b94f7e9111431f0fb5)) ## v0.6.3 (2023-10-30) ### Other -* Add ManualCluster-watching to Starter ([#88](https://github.com/WIPACrepo/SkyDriver/issues/88)) ([`46f6502`](https://github.com/WIPACrepo/SkyDriver/commit/46f6502f143bd39af9698934b5085486811f88c3)) +* Add Cluster-watching to Starter ([#88](https://github.com/WIPACrepo/SkyDriver/issues/88)) ([`46f6502`](https://github.com/WIPACrepo/SkyDriver/commit/46f6502f143bd39af9698934b5085486811f88c3)) ## v0.6.2 (2023-10-25) @@ -625,7 +625,7 @@ ### Other -* Add GKE ManualCluster: `gke-2306` ([`7279ca3`](https://github.com/WIPACrepo/SkyDriver/commit/7279ca3db600a6c5c6aaef74d3892752902e4de2)) +* Add GKE Cluster: `gke-2306` ([`7279ca3`](https://github.com/WIPACrepo/SkyDriver/commit/7279ca3db600a6c5c6aaef74d3892752902e4de2)) * Update Auth Handling Pt-6 ([`287dc6c`](https://github.com/WIPACrepo/SkyDriver/commit/287dc6cfc87d92ac0369d62677dde304b488c06b)) ## v0.3.24 (2023-06-26) From aec82222b4f2c7a71dc0bc451cb889d82ca2ea4e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 14:33:22 -0600 Subject: [PATCH 008/327] add s3 url-generation to backlogger --- skydriver/config.py | 12 +++++++++--- skydriver/ewms.py | 5 +++-- skydriver/k8s/scan_backlog.py | 27 ++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 0f55a703..c08d0e44 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -42,9 +42,15 @@ class EnvConfig: # EWMS connections EWMS_ADDRESS: str - EWMS_TOKEN_URL: str = "" # needed in prod - EWMS_CLIENT_ID: str = "" # '' - EWMS_CLIENT_SECRET: str = "" # '' + EWMS_TOKEN_URL: str + EWMS_CLIENT_ID: str + EWMS_CLIENT_SECRET: str + + # s3 + S3_URL: str + S3_ACCESS_KEY_ID: str + S3_SECRET_KEY: str + S3_BUCKET: str # misc 
AUTH_AUDIENCE: str = "skydriver" diff --git a/skydriver/ewms.py b/skydriver/ewms.py index e8fa6575..6e612f17 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -12,6 +12,7 @@ async def request_workflow_on_ewms( """Request a workflow in EWMS.""" if not isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): raise TypeError("Manifest is not designated for EWMS") + body = { "public_queue_aliases": ["to-client-queue", "from-client-queue"], "tasks": [ @@ -25,7 +26,7 @@ async def request_workflow_on_ewms( "init_args": "bash -c \"curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json '$S3_OBJECT_URL'\" ", "n_workers": manifest.ewms_task.n_workers, "pilot_config": { - "tag": "${PILOT_TAG:-'latest'}", + "tag": "latest", "environment": { "EWMS_PILOT_INIT_TIMEOUT": 1 * 60, "EWMS_PILOT_TASK_TIMEOUT": 1 * 60 * 60, @@ -49,6 +50,6 @@ async def request_workflow_on_ewms( } ], } - """Request a new workflow on EWMS.""" + resp = await ewms_rc.request("POST", "/v0/workflows", body) return resp["workflow"]["workflow_id"] diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 6b585414..a5f713a9 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -5,6 +5,7 @@ import pickle import time +import boto3 import bson import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient @@ -149,6 +150,28 @@ def has_interval_elapsed(self) -> bool: return False +def generate_s3_url(scan_id: str) -> str: + """Generate a pre-signed S3 url for putting shared files.""" + s3_client = boto3.client( + "s3", + "us-east-1", + endpoint_url=ENV.S3_URL, + aws_access_key_id=ENV.S3_ACCESS_KEY_ID, + aws_secret_access_key=ENV.S3_SECRET_KEY, + ) + + # get GET url + get_url = s3_client.generate_presigned_url( + "get_object", + Params={ + "Bucket": ENV.S3_BUCKET, + "Key": f"{scan_id}-s3-object", + }, + ExpiresIn=24 * 60 * 60, # seconds + ) + return get_url + + async def _run( mongo_client: 
AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, @@ -175,7 +198,9 @@ async def _run( ) except database.mongodc.DocumentNotFoundException: long_interval_timer.fastforward() - continue # empty queue + continue # empty queue- + + # generate pre-signed S3 url # request a workflow on EWMS if isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): From 4c000ee1b34858528e73cfc9bc31079ff3718d84 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 15:15:44 -0600 Subject: [PATCH 009/327] wip - s3, etc. --- skydriver/ewms.py | 25 ++++++++++++++++++++----- skydriver/k8s/scan_backlog.py | 29 ++++++++++++++++++++++++----- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 6e612f17..b8bfaae2 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -2,17 +2,22 @@ from rest_tools.client import RestClient -from . import database +from . import database, images async def request_workflow_on_ewms( ewms_rc: RestClient, manifest: database.schema.Manifest, + s3_obj_url: str, + scan_request_obj: dict, ) -> str: """Request a workflow in EWMS.""" if not isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): raise TypeError("Manifest is not designated for EWMS") + image = images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) + # TODO: grab other values from scan request object; eventually, cut down k8s wrapper class + body = { "public_queue_aliases": ["to-client-queue", "from-client-queue"], "tasks": [ @@ -20,10 +25,20 @@ async def request_workflow_on_ewms( "cluster_locations": manifest.ewms_task.cluster_locations, "input_queue_aliases": ["to-client-queue"], "output_queue_aliases": ["from-client-queue"], - "task_image": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:$SKYSCAN_TAG", - "task_args": "python -m skymap_scanner.client --infile {{INFILE}} --outfile {{OUTFILE}} --client-startup-json {{DATA_HUB}}/startup.json", - 
"init_image": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:$SKYSCAN_TAG", - "init_args": "bash -c \"curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json '$S3_OBJECT_URL'\" ", + "task_image": image, + "task_args": ( + "python -m skymap_scanner.client " + "--infile {{INFILE}} --outfile {{OUTFILE}} " + "--client-startup-json {{DATA_HUB}}/startup.json" + ), + "init_image": image, # piggyback this image since it's already present + "init_args": ( + "bash -c " + '"' # quote for bash -c "..." + "curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json " + f"'{s3_obj_url}'" # single-quote the url + '"' # unquote for bash -c "..." + ), "n_workers": manifest.ewms_task.n_workers, "pilot_config": { "tag": "latest", diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index a5f713a9..62267899 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -8,7 +8,7 @@ import boto3 import bson import kubernetes.client # type: ignore[import-untyped] -from motor.motor_asyncio import AsyncIOMotorClient +from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient from tornado import web @@ -46,8 +46,9 @@ async def designate_for_startup( async def get_next_backlog_entry( scan_backlog: database.interface.ScanBacklogClient, manifests: database.interface.ManifestClient, + scan_request_client: AsyncIOMotorCollection, include_low_priority_scans: bool, -) -> tuple[database.schema.ScanBacklogEntry, database.schema.Manifest]: +) -> tuple[database.schema.ScanBacklogEntry, database.schema.Manifest, dict]: """Get the next entry & remove any that have been cancelled.""" while True: # get next up -- raises DocumentNotFoundException if none @@ -68,8 +69,13 @@ async def get_next_backlog_entry( await scan_backlog.remove(entry) continue + # grab the scan request object--it has other info + scan_request_obj = await scan_request_client.find_one( + {"scan_id": 
manifest.scan_id} + ) + # all good! - return entry, manifest # ready to start job + return entry, manifest, scan_request_obj # ready to start job async def run( @@ -180,6 +186,12 @@ async def _run( """The (actual) main loop.""" manifest_client = database.interface.ManifestClient(mongo_client) backlog_client = database.interface.ScanBacklogClient(mongo_client) + scan_request_client = ( + AsyncIOMotorCollection( # in contrast, this one is accessed directly + mongo_client[database.interface._DB_NAME], # type: ignore[index] + database.utils._SCAN_REQUEST_COLL_NAME, + ) + ) last_log_heartbeat = 0.0 # log every so often, not on every iteration long_interval_timer = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) @@ -190,9 +202,10 @@ async def _run( # get next entry try: - entry, manifest = await get_next_backlog_entry( + entry, manifest, scan_request_obj = await get_next_backlog_entry( backlog_client, manifest_client, + scan_request_client, # include low priority scans only when enough time has passed include_low_priority_scans=long_interval_timer.has_interval_elapsed(), ) @@ -201,11 +214,17 @@ async def _run( continue # empty queue- # generate pre-signed S3 url + s3_obj_url = generate_s3_url(manifest.scan_id) # request a workflow on EWMS if isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): try: - workflow_id = await ewms.request_workflow_on_ewms(ewms_rc, manifest) + workflow_id = await ewms.request_workflow_on_ewms( + ewms_rc, + manifest, + s3_obj_url, + scan_request_obj, + ) except Exception as e: LOGGER.exception(e) long_interval_timer.fastforward() # nothing was started, so don't wait long From 5c110f53c7e5a28d634afe4b3967b81914a3f23f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 16:16:42 -0600 Subject: [PATCH 010/327] squash merge in `replay` --- dependencies-from-Dockerfile.log | 56 +++++++------- resources/prod_tester/config.py | 1 + resources/prod_tester/test_getter.py | 12 +++ resources/prod_tester/test_runner.py | 19 ++++- 
resources/prod_tester/test_suit_prod.py | 97 ++++++++++++++++++++++--- skydriver/config.py | 3 +- skydriver/k8s/scan_backlog.py | 2 + skydriver/k8s/utils.py | 2 +- skydriver/rest_handlers.py | 41 ++++------- 9 files changed, 164 insertions(+), 69 deletions(-) diff --git a/dependencies-from-Dockerfile.log b/dependencies-from-Dockerfile.log index 4d0e63f9..4817e859 100644 --- a/dependencies-from-Dockerfile.log +++ b/dependencies-from-Dockerfile.log @@ -6,12 +6,13 @@ ######################################################################## # pip freeze ######################################################################## -boto3==1.35.84 -botocore==1.35.84 +boto3==1.35.83 +botocore==1.35.83 cachetools==5.5.0 certifi==2024.12.14 cffi==1.17.1 charset-normalizer==3.4.0 +coloredlogs==15.0.1 cryptography==44.0.0 dacite==1.8.1 dnspython==2.7.0 @@ -41,10 +42,13 @@ six==1.17.0 tornado==6.4.2 typeguard==4.4.1 typing_extensions==4.12.2 -urllib3==2.2.3 +urllib3==2.3.0 websocket-client==1.8.0 wipac-dev-tools==1.13.0 -wipac-rest-tools==1.8.4 +wipac-rest-tools==1.5.3 +wipac-telemetry==0.3.1 +wrapt==1.17.0 +zipp==3.21.0 ######################################################################## # pipdeptree ######################################################################## @@ -56,19 +60,19 @@ pipdeptree==2.24.0 └── pip [required: >=24.2, installed: 24.3.1] setuptools==65.5.1 skydriver-clientmanager-ewms-sidecar -├── boto3 [required: Any, installed: 1.35.84] -│ ├── botocore [required: >=1.35.84,<1.36.0, installed: 1.35.84] +├── boto3 [required: Any, installed: 1.35.83] +│ ├── botocore [required: >=1.35.83,<1.36.0, installed: 1.35.83] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] -│ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.2.3] +│ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: 
>=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.10.0,<0.11.0, installed: 0.10.4] -│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.84] +│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.83] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] -│ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.2.3] +│ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] ├── dacite [required: Any, installed: 1.8.1] ├── htcondor [required: Any, installed: 24.2.1] ├── humanfriendly [required: Any, installed: 10.0] @@ -87,18 +91,18 @@ skydriver-clientmanager-ewms-sidecar │ ├── PyYAML [required: >=5.4.1, installed: 6.0.2] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] -│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] +│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ │ ├── idna [required: >=2.5,<4, installed: 3.10] -│ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] +│ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] │ ├── requests-oauthlib [required: Any, installed: 2.0.0] │ │ ├── oauthlib [required: >=3.0.0, installed: 3.2.2] │ │ └── requests [required: >=2.0.0, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] -│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] +│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ │ ├── idna [required: >=2.5,<4, installed: 3.10] -│ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] +│ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] │ ├── six [required: >=1.9.0, installed: 1.17.0] -│ ├── urllib3 [required: >=1.24.2, installed: 2.2.3] +│ ├── urllib3 [required: >=1.24.2, installed: 2.3.0] │ └── websocket-client [required: >=0.32.0,!=0.42.*,!=0.41.*,!=0.40.0, installed: 1.8.0] ├── motor 
[required: ==3.3.2, installed: 3.3.2] │ └── pymongo [required: >=4.5,<5, installed: 4.6.1] @@ -107,41 +111,41 @@ skydriver-clientmanager-ewms-sidecar │ └── dnspython [required: >=1.16.0,<3.0.0, installed: 2.7.0] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] -│ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] +│ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] -│ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] +│ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── typeguard [required: Any, installed: 4.4.1] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] ├── wipac-dev-tools [required: Any, installed: 1.13.0] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] -│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] +│ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ │ ├── idna [required: >=2.5,<4, installed: 3.10] -│ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] +│ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] │ └── typing_extensions [required: Any, installed: 4.12.2] -└── wipac-rest-tools [required: Any, installed: 1.8.4] +└── wipac-rest-tools [required: <1.6.0, installed: 1.5.3] ├── cachetools [required: Any, installed: 5.5.0] ├── PyJWT [required: !=2.6.0, installed: 2.10.1] ├── qrcode [required: Any, installed: 8.0] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] - │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] + │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] - │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] + │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── requests-futures [required: Any, installed: 
1.0.2] │ └── requests [required: >=1.2.0, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] - │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] + │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] - │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] + │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] - ├── urllib3 [required: >=2.0.4, installed: 2.2.3] + ├── urllib3 [required: >=2.0.4, installed: 2.3.0] └── wipac-dev-tools [required: Any, installed: 1.13.0] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] - │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.0] + │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] - │ └── urllib3 [required: >=1.21.1,<3, installed: 2.2.3] + │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] └── typing_extensions [required: Any, installed: 4.12.2] wheel==0.45.1 diff --git a/resources/prod_tester/config.py b/resources/prod_tester/config.py index b90856e5..9c436504 100644 --- a/resources/prod_tester/config.py +++ b/resources/prod_tester/config.py @@ -3,6 +3,7 @@ from pathlib import Path SANDBOX_DIR = Path("./test-suit-sandbox") +SANDBOX_MAP_FPATH = SANDBOX_DIR / "map.json" GHA_FILE_URL = "https://raw.githubusercontent.com/icecube/skymap_scanner/main/.github/workflows/tests.yml" diff --git a/resources/prod_tester/test_getter.py b/resources/prod_tester/test_getter.py index 8bf2cd61..8a28f55d 100644 --- a/resources/prod_tester/test_getter.py +++ b/resources/prod_tester/test_getter.py @@ -40,6 +40,8 @@ class TestParamSet: test_status: TestStatus = TestStatus.UNKNOWN + rescan_origin_id: str = "" # set if the test suit is rescanning previous test-scans + @property def log_file(self) -> Path: """Based on the scan id.S""" @@ -47,6 +49,16 @@ def log_file(self) 
-> Path: raise ValueError("scan_id not set") return config.SANDBOX_DIR / f"logs/{self.scan_id}.log" + def to_json(self) -> dict: + """To a json-friendly dict.""" + return dict( + event_file=str(self.event_file), + reco_algo=self.reco_algo, + result_file=str(self.result_file), + scan_id=self.scan_id, + rescan_origin_id=self.rescan_origin_id, + ) + def download_file(url: str, dest: Path) -> Path: """Download a file from a URL.""" diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 257a477f..8a9800ff 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -40,13 +40,21 @@ def get_rest_client(skydriver_url: str) -> RestClient: ) +async def rescan_a_scan(rc: RestClient, rescan_origin_id: str) -> dict: + """Request to SkyDriver to rescan.""" + manifest = await rc.request("POST", f"/scan/{rescan_origin_id}/actions/rescan") + + print(manifest["scan_id"], flush=True) + return manifest # type: ignore[no-any-return] + + async def launch_a_scan( rc: RestClient, event_file: Path, cluster: str, n_workers: int, reco_algo: str, -) -> str: +) -> dict: """Request to SkyDriver to scan an event.""" body = { "reco_algo": reco_algo, @@ -62,11 +70,14 @@ async def launch_a_scan( "scanner_server_env": { "SKYSCAN_MINI_TEST": True, }, + "classifiers": { + "_TEST": True, + }, } - resp = await rc.request("POST", "/scan", body) + manifest = await rc.request("POST", "/scan", body) - print(resp["scan_id"], flush=True) - return resp["scan_id"] # type: ignore[no-any-return] + print(manifest["scan_id"], flush=True) + return manifest # type: ignore[no-any-return] async def monitor(rc: RestClient, scan_id: str, log_file: Path | None = None) -> dict: diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index 995002e2..1c35a7b6 100644 --- a/resources/prod_tester/test_suit_prod.py +++ b/resources/prod_tester/test_suit_prod.py @@ -7,6 +7,7 @@ import subprocess import tarfile from 
datetime import datetime +from pathlib import Path import texttable # type: ignore from rest_tools.client import RestClient @@ -118,14 +119,24 @@ async def launch_scans( ) test.test_status = test_getter.TestStatus.RUNNING try: - scan_id = await test_runner.launch_a_scan( - rc, - test.event_file, - cluster, - n_workers, - test.reco_algo, - ) - test.scan_id = scan_id + # rescan? + if test.rescan_origin_id: + manifest = await test_runner.rescan_a_scan( + rc, + test.rescan_origin_id, + ) + test.scan_id = manifest["scan_id"] + assert test.scan_id != test.rescan_origin_id + # or normal scan? + else: + manifest = await test_runner.launch_a_scan( + rc, + test.event_file, + cluster, + n_workers, + test.reco_algo, + ) + test.scan_id = manifest["scan_id"] except Exception as e: logging.error(f"Failed to launch test #{i+1}: {e}") raise @@ -156,23 +167,46 @@ def display_test_status(tests: list[test_getter.TestParamSet]): print(table.draw()) +def _match_rescans_to_tests( + rescans: list[test_getter.TestParamSet], tests: list[test_getter.TestParamSet] +) -> None: + """Match rescans to tests, in order to send the rescan id to skydriver.""" + logging.info("matching tests to rescan-tests") + logging.info(json.dumps([r.to_json() for r in rescans], indent=4)) + for t in tests: + for r in rescans: + if (t.reco_algo, t.event_file.name) == (r.reco_algo, r.event_file.name): + t.rescan_origin_id = r.scan_id + break + if not t.rescan_origin_id: + raise RuntimeError(f"could not match test to rescan-test: {t}") + + async def test_all( rc: RestClient, cluster: str, n_workers: int, -): + rescans: list[test_getter.TestParamSet] | None, +) -> None: + """Do all the tests.""" + # setup tests = list(test_getter.setup_tests()) - tests = await launch_scans( + if rescans: + _match_rescans_to_tests(rescans, tests) + + # launch! 
+ tests = await launch_scans( # adds scan ids to 'tests' tests, rc, cluster, n_workers, ) + with open(config.SANDBOX_MAP_FPATH, "w") as f: # dump to file + json.dump([t.to_json() for t in tests], f, indent=4) display_test_status(tests) - checker = ResultChecker() - # start test-waiters + checker = ResultChecker() logging.info("Starting scan watchers...") tasks = set() for test in tests: @@ -199,6 +233,7 @@ async def test_all( logging.error(f"A test failed: {repr(e)}") display_test_status(tests) + # how'd it all go? if n_failed: raise RuntimeError(f"{n_failed}/{len(tests)} tests failed.") else: @@ -226,8 +261,45 @@ async def main(): type=int, help="number of workers to request", ) + parser.add_argument( + "--rescan", + default=False, + action="store_true", + help="submit rescans for all test-scans in existing (previously ran) sandbox", + ) + parser.add_argument( + "--rescan-dir", + type=Path, + default=config.SANDBOX_DIR, + help="the existing (previously ran) sandbox to submit rescans for", + ) args = parser.parse_args() + if args.rescan: + # grab json map + if args.rescan_dir.is_dir(): + with open(args.rescan_dir / config.SANDBOX_MAP_FPATH.name) as f: + json_data = json.loads(f.read()) + else: + with tarfile.open(args.rescan_dir) as tar: + member = tar.getmember( + f"{config.SANDBOX_DIR.name}/{config.SANDBOX_MAP_FPATH.name}" + ) + with tar.extractfile(member) as f: + json_data = json.loads(f.read()) + rescans = [ + test_getter.TestParamSet( + Path(x["event_file"]), + x["reco_algo"], + Path(x["result_file"]), + x["scan_id"], + ) + for x in json_data + ] + else: + rescans = None + + # tar existing sandbox if config.SANDBOX_DIR.exists(): logging.info( f"taring the existing '{config.SANDBOX_DIR}', then overwriting the directory" @@ -259,6 +331,7 @@ async def main(): rc, args.cluster, args.n_workers, + rescans, ) diff --git a/skydriver/config.py b/skydriver/config.py index c08d0e44..a8cd2b3d 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -78,7 +78,8 @@ 
class EnvConfig: K8S_SECRET_NAME: str = "" K8S_SKYSCAN_JOBS_SERVICE_ACCOUNT: str = "" K8S_APPLICATION_NAME: str = "" - K8S_TTL_SECONDS_AFTER_FINISHED: int = 600 + K8S_TTL_SECONDS_AFTER_FINISHED: int = 10 * 60 + K8S_ACTIVE_DEADLINE_SECONDS: int = 24 * 60 * 60 # keycloak KEYCLOAK_OIDC_URL: str = "" diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 62267899..de982571 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -11,6 +11,8 @@ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient from tornado import web +from motor.motor_asyncio import AsyncIOMotorClient +from tornado import web from .utils import KubeAPITools from .. import database, ewms diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 32097ad5..2fd98e7b 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -1,6 +1,5 @@ """An interface to the Kubernetes cluster.""" - import json import logging from pathlib import Path @@ -100,6 +99,7 @@ def kube_create_job_object( ttl_seconds_after_finished=ttl_seconds_after_finished, template=template.template, backoff_limit=n_retries, + active_deadline_seconds=ENV.K8S_ACTIVE_DEADLINE_SECONDS, ) return body diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index f1d23594..a292ab5f 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -5,9 +5,7 @@ import dataclasses as dc import json import logging -import pickle import re -import time import uuid from typing import Any, Type, TypeVar @@ -15,12 +13,8 @@ import kubernetes.client # type: ignore[import-untyped] from dacite import from_dict from dacite.exceptions import DaciteError -from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection -from pymongo import ReturnDocument -from rest_tools.client import RestClient +from motor.motor_asyncio import AsyncIOMotorClient from rest_tools.server import ( - ArgumentHandler, - 
ArgumentSource, RestHandler, token_attribute_role_mapping_auth, ) @@ -37,8 +31,6 @@ KNOWN_CLUSTERS, is_testing, ) -from .database import schema -from .k8s.scan_backlog import designate_for_startup from .k8s.scanner_instance import SkymapScannerK8sWrapper LOGGER = logging.getLogger(__name__) @@ -483,6 +475,9 @@ async def post(self) -> None: ) # -> store scan_request_obj in db scan_request_obj = dict( + scan_id=scan_id, + rescan_ids=[], + # docker_tag=args.docker_tag, scanner_server_memory_bytes=args.scanner_server_memory, # already in bytes reco_algo=args.reco_algo, @@ -490,13 +485,13 @@ async def post(self) -> None: real_or_simulated_event=args.real_or_simulated_event, predictive_scanning_threshold=args.predictive_scanning_threshold, classifiers=args.classifiers, - request_clusters=args.cluster, + request_clusters=[dataclasses.asdict(c) for c in args.cluster], worker_memory_bytes=args.worker_memory, worker_disk_bytes=args.worker_disk, # already in bytes max_pixel_reco_time=args.max_pixel_reco_time, max_worker_runtime=args.max_worker_runtime, priority=args.priority, - debug_mode=args.debug_mode, + debug_mode=[d.value for d in args.debug_mode], skyscan_mq_client_timeout_wait_for_first_message=( args.skyscan_mq_client_timeout_wait_for_first_message if args.skyscan_mq_client_timeout_wait_for_first_message != -1 @@ -506,19 +501,12 @@ async def post(self) -> None: rest_address=self.request.full_url().rstrip(self.request.uri), scanner_server_env_from_user=args.scanner_server_env, ) - await self.scan_request_coll.insert_one( - { - "scan_id": scan_id, - "scan_request_obj_pkl": pickle.dumps(scan_request_obj), - # ^^^ can be well compressed, obj will only be decompressed for re-scans - }, - ) + await self.scan_request_coll.insert_one(scan_request_obj) # go! 
manifest = await _start_scan( self.manifests, self.scan_backlog, - scan_id, scan_request_obj, ) self.write( @@ -529,9 +517,11 @@ async def post(self) -> None: async def _start_scan( manifests: database.interface.ManifestClient, scan_backlog: database.interface.ScanBacklogClient, - scan_id: str, scan_request_obj: dict, + new_scan_id: str = "", # don't use scan_request_obj.scan_id--this could be a rescan ) -> schema.Manifest: + scan_id = new_scan_id or scan_request_obj["scan_id"] + # get the container info ready scanner_wrapper = SkymapScannerK8sWrapper( docker_tag=scan_request_obj["docker_tag"], @@ -543,21 +533,22 @@ async def _start_scan( is_real_event=scan_request_obj["real_or_simulated_event"] in REAL_CHOICES, predictive_scanning_threshold=scan_request_obj["predictive_scanning_threshold"], # cluster starter - # TODO: this arg could be good to control whether to use ewms or manual - # but not determined using 'classifiers'. May need to keep the attr bc pkl - starter_exc=str( + starter_exc=str( # TODO - remove once tested in prod scan_request_obj["classifiers"].get( "__unstable_starter_exc", "clientmanager" ) ), - request_clusters=scan_request_obj["request_clusters"], + request_clusters=[ + dacite.from_dict(database.schema.Cluster, c) + for c in scan_request_obj["request_clusters"] + ], worker_memory_bytes=scan_request_obj["worker_memory_bytes"], worker_disk_bytes=scan_request_obj["worker_disk_bytes"], max_pixel_reco_time=scan_request_obj["max_pixel_reco_time"], max_worker_runtime=scan_request_obj["max_worker_runtime"], priority=scan_request_obj["priority"], # universal - debug_mode=scan_request_obj["debug_mode"], + debug_mode=_debug_mode(scan_request_obj["debug_mode"]), # env rest_address=scan_request_obj["rest_address"], skyscan_mq_client_timeout_wait_for_first_message=scan_request_obj[ From 5b12406f2452ab374df2e0f7644ba64e31460881 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 16:41:51 -0600 Subject: [PATCH 011/327] only use workflow id --- 
skydriver/database/schema.py | 18 ++---------------- skydriver/ewms.py | 4 ++-- skydriver/k8s/scan_backlog.py | 14 +++++--------- skydriver/rest_handlers.py | 14 ++++++++++++-- 4 files changed, 21 insertions(+), 29 deletions(-) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 7297d723..921e9308 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -235,21 +235,6 @@ def __post_init__(self) -> None: # NOTE - self.env_vars done in EnvVars -@typechecked -@dc.dataclass -class EWMSRequestInfo: - """Some of the info sent to EWMS in the workflow request.""" - - cluster_locations: list[str] # TODO: does this need to be dict with n_workers? - n_workers: int - - workflow_id: str = "" # set once the request has been sent to EWMS - - # TODO: add more fields that are needed for EWMS but not already in manifest - # OR the backlogger could also pull info from 'scan_request_obj' - # NOTE: besides 'workflow_id', this object is immutable - - DEPRECATED_EVENT_I3LIVE_JSON_DICT = "use 'i3_event_id'" @@ -261,7 +246,8 @@ class Manifest(ScanIDDataclass): timestamp: float is_deleted: bool - ewms_task: ManualStarterInfo | EWMSRequestInfo # yes, this was a poor naming choice + ewms_task: ManualStarterInfo | str # yes, this was a poor naming choice + # ^^^ str -> EWMS workflow id (i.e. 
this id points to info in the EWMS db) # args placed in k8s job obj scanner_server_args: str diff --git a/skydriver/ewms.py b/skydriver/ewms.py index b8bfaae2..646d6187 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -12,7 +12,7 @@ async def request_workflow_on_ewms( scan_request_obj: dict, ) -> str: """Request a workflow in EWMS.""" - if not isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): + if not isinstance(manifest.ewms_task, str): raise TypeError("Manifest is not designated for EWMS") image = images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) @@ -22,7 +22,7 @@ async def request_workflow_on_ewms( "public_queue_aliases": ["to-client-queue", "from-client-queue"], "tasks": [ { - "cluster_locations": manifest.ewms_task.cluster_locations, + "cluster_locations": scan_request_obj["request_clusters"], "input_queue_aliases": ["to-client-queue"], "output_queue_aliases": ["from-client-queue"], "task_image": image, diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index de982571..663e117e 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -11,8 +11,6 @@ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient from tornado import web -from motor.motor_asyncio import AsyncIOMotorClient -from tornado import web from .utils import KubeAPITools from .. import database, ewms @@ -215,12 +213,10 @@ async def _run( long_interval_timer.fastforward() continue # empty queue- - # generate pre-signed S3 url - s3_obj_url = generate_s3_url(manifest.scan_id) - - # request a workflow on EWMS - if isinstance(manifest.ewms_task, database.schema.EWMSRequestInfo): + # request a workflow on EWMS? 
+ if not isinstance(manifest.ewms_task, database.schema.ManualStarterInfo): try: + s3_obj_url = generate_s3_url(manifest.scan_id) workflow_id = await ewms.request_workflow_on_ewms( ewms_rc, manifest, @@ -233,7 +229,7 @@ async def _run( continue await manifest_client.collection.find_one_and_update( {"scan_id": manifest.scan_id}, - {"$set": {"ewms_task.workflow_id": workflow_id}}, + {"$set": {"ewms_task": workflow_id}}, ) # TODO: Start K8s Job @@ -251,7 +247,7 @@ async def _run( ) # NOTE: the job_obj is enormous, so don't log it - # start k8s job + # start k8s job -- this could be any k8s job (pre- or post-ewms switchover) try: resp = KubeAPITools.start_job(k8s_batch_api, job_obj) LOGGER.info(resp) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index a292ab5f..853b943c 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -2,19 +2,27 @@ import argparse import asyncio +import dataclasses import dataclasses as dc import json import logging +import pickle import re +import time import uuid from typing import Any, Type, TypeVar +import dacite import humanfriendly import kubernetes.client # type: ignore[import-untyped] from dacite import from_dict from dacite.exceptions import DaciteError -from motor.motor_asyncio import AsyncIOMotorClient +from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection +from pymongo import ReturnDocument +from rest_tools.client import RestClient from rest_tools.server import ( + ArgumentHandler, + ArgumentSource, RestHandler, token_attribute_role_mapping_auth, ) @@ -31,6 +39,8 @@ KNOWN_CLUSTERS, is_testing, ) +from .database import schema +from .k8s.scan_backlog import designate_for_startup from .k8s.scanner_instance import SkymapScannerK8sWrapper LOGGER = logging.getLogger(__name__) @@ -565,7 +575,7 @@ async def _start_scan( is_deleted=False, i3_event_id=scan_request_obj["i3_event_id"], scanner_server_args=scanner_wrapper.scanner_server_args, - # TODO: detect whether 
'schema.EWMSRequestInfo' should be used (see 'starter_exc') above + # TODO: switch over to ewms design ewms_task=schema.ManualStarterInfo( tms_args=scanner_wrapper.cluster_starter_args_list, env_vars=from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), From a8f28436b3c06f02b131122ffb097e63c41d61ae Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 17:16:16 -0600 Subject: [PATCH 012/327] use simplified cluster design --- skydriver/ewms.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 646d6187..7d340fe4 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -22,7 +22,9 @@ async def request_workflow_on_ewms( "public_queue_aliases": ["to-client-queue", "from-client-queue"], "tasks": [ { - "cluster_locations": scan_request_obj["request_clusters"], + "cluster_locations": [ + cname for cname, _ in scan_request_obj["request_clusters"] + ], "input_queue_aliases": ["to-client-queue"], "output_queue_aliases": ["from-client-queue"], "task_image": image, @@ -39,7 +41,8 @@ async def request_workflow_on_ewms( f"'{s3_obj_url}'" # single-quote the url '"' # unquote for bash -c "..." 
), - "n_workers": manifest.ewms_task.n_workers, + "n_workers": scan_request_obj["request_clusters"][0][1], + # TODO: ^^^ pass on varying # of workers per cluster "pilot_config": { "tag": "latest", "environment": { From 014956847ef0a26dccd24596f5021c6af4469af5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 17:25:03 -0600 Subject: [PATCH 013/327] squash merge in `replay` --- dependencies-from-Dockerfile.log | 20 +++---- skydriver/rest_handlers.py | 90 +++++++++++++++++++++++++------- 2 files changed, 79 insertions(+), 31 deletions(-) diff --git a/dependencies-from-Dockerfile.log b/dependencies-from-Dockerfile.log index 4817e859..e94f71e4 100644 --- a/dependencies-from-Dockerfile.log +++ b/dependencies-from-Dockerfile.log @@ -6,13 +6,12 @@ ######################################################################## # pip freeze ######################################################################## -boto3==1.35.83 -botocore==1.35.83 +boto3==1.35.92 +botocore==1.35.92 cachetools==5.5.0 certifi==2024.12.14 cffi==1.17.1 -charset-normalizer==3.4.0 -coloredlogs==15.0.1 +charset-normalizer==3.4.1 cryptography==44.0.0 dacite==1.8.1 dnspython==2.7.0 @@ -45,10 +44,7 @@ typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 wipac-dev-tools==1.13.0 -wipac-rest-tools==1.5.3 -wipac-telemetry==0.3.1 -wrapt==1.17.0 -zipp==3.21.0 +wipac-rest-tools==1.8.5 ######################################################################## # pipdeptree ######################################################################## @@ -60,15 +56,15 @@ pipdeptree==2.24.0 └── pip [required: >=24.2, installed: 24.3.1] setuptools==65.5.1 skydriver-clientmanager-ewms-sidecar -├── boto3 [required: Any, installed: 1.35.83] -│ ├── botocore [required: >=1.35.83,<1.36.0, installed: 1.35.83] +├── boto3 [required: Any, installed: 1.35.92] +│ ├── botocore [required: >=1.35.92,<1.36.0, installed: 1.35.92] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil 
[required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.10.0,<0.11.0, installed: 0.10.4] -│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.83] +│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.92] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] @@ -124,7 +120,7 @@ skydriver-clientmanager-ewms-sidecar │ │ ├── idna [required: >=2.5,<4, installed: 3.10] │ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] │ └── typing_extensions [required: Any, installed: 4.12.2] -└── wipac-rest-tools [required: <1.6.0, installed: 1.5.3] +└── wipac-rest-tools [required: Any, installed: 1.8.5] ├── cachetools [required: Any, installed: 5.5.0] ├── PyJWT [required: !=2.6.0, installed: 2.10.1] ├── qrcode [required: Any, installed: 8.0] diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 853b943c..7270230f 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -274,25 +274,34 @@ def _json_to_dict(val: Any) -> dict: raise _error -def _dict_or_list_to_request_clusters( +def _validate_request_clusters( val: dict | list, -) -> list[database.schema.ManualCluster]: +) -> list[tuple[str, int]]: _error = argparse.ArgumentTypeError( "must be a dict of cluster location and number of workers, Ex: {'sub-2': 1500, ...}" " (to request a cluster location more than once, provide a list of 2-lists instead)" - # TODO: make n_workers optional when using "TMS smart starter" + # TODO: make n_workers optional when using "EWMS smart starter" ) if isinstance(val, dict): - val = list(val.items()) # {'a': 1, 'b': 2} -> [('a', 1), ('b', 2)} - if not val: + # {'a': 1, 'b': 2} -> [('a', 1), ('b', 2)} + list_tups: 
list[tuple[str, int]] = list(val.items()) + else: + list_tups = val + del val + + # validate + if not list_tups: raise _error - if not isinstance(val, list): + if not isinstance(list_tups, list): raise _error # check all entries are 2-lists (or tuple) - if not all(isinstance(a, list | tuple) and len(a) == 2 for a in val): + if not all(isinstance(a, list | tuple) and len(a) == 2 for a in list_tups): raise _error - # - return [_cluster_lookup(name, n_workers) for name, n_workers in val] + # check that all locations are known (this validates sooner than ewms, if using ewms) + for name, n_workers in list_tups: + _cluster_lookup(name, n_workers) + + return list_tups def _classifiers_validator(val: Any) -> dict[str, str | bool | float | int]: @@ -381,7 +390,7 @@ async def post(self) -> None: ) arghand.add_argument( "cluster", - type=_dict_or_list_to_request_clusters, + type=_validate_request_clusters, ) # scanner args arghand.add_argument( @@ -456,16 +465,17 @@ async def post(self) -> None: # more arg validation if DebugMode.CLIENT_LOGS in args.debug_mode: - for cluster in args.cluster: - cname, cinfo = cluster.to_known_cluster() - if cluster.n_workers > cinfo.get( - "max_n_clients_during_debug_mode", float("inf") + for cname, cworkers in args.cluster: + if cworkers > ( + val := KNOWN_CLUSTERS[cname].get( + "max_n_clients_during_debug_mode", float("inf") + ) ): raise web.HTTPError( 400, log_message=( - f"Too many workers: ManualCluster '{cname}' can only have " - f"{cinfo.get('max_n_clients_during_debug_mode')} " + f"Too many workers: Cluster '{cname}' can only have " + f"{val} " f"workers when 'debug_mode' " f"includes '{DebugMode.CLIENT_LOGS.value}'" ), @@ -495,7 +505,7 @@ async def post(self) -> None: real_or_simulated_event=args.real_or_simulated_event, predictive_scanning_threshold=args.predictive_scanning_threshold, classifiers=args.classifiers, - request_clusters=[dataclasses.asdict(c) for c in args.cluster], + request_clusters=args.cluster, # a list 
worker_memory_bytes=args.worker_memory, worker_disk_bytes=args.worker_disk, # already in bytes max_pixel_reco_time=args.max_pixel_reco_time, @@ -549,8 +559,8 @@ async def _start_scan( ) ), request_clusters=[ - dacite.from_dict(database.schema.Cluster, c) - for c in scan_request_obj["request_clusters"] + _cluster_lookup(name, n_workers) # values were pre-validated on user input + for name, n_workers in scan_request_obj["request_clusters"] ], worker_memory_bytes=scan_request_obj["worker_memory_bytes"], worker_disk_bytes=scan_request_obj["worker_disk_bytes"], @@ -984,6 +994,48 @@ async def get(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- +class ScanI3EventHandler(BaseSkyDriverHandler): # pylint: disable=W0223 + """Handles grabbing i3 events using scan ids.""" + + ROUTE = r"/scan/(?P\w+)/i3-event$" + + @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore + async def get(self, scan_id: str) -> None: + """Get scan's i3 event.""" + manifest = await self.manifests.get(scan_id, True) + + # look up event in collection + if manifest.i3_event_id: + doc = await self.i3_event_coll.find_one( + {"i3_event_id": manifest.i3_event_id} + ) + if doc: + i3_event = doc["json_dict"] + else: # this would mean the event was removed from the db + error_msg = ( + f"No i3 event document found with id '{manifest.i3_event_id}'" + ) + raise web.HTTPError( + 404, + log_message=error_msg, + reason=error_msg, + ) + # unless, this is an old scan -- where the whole dict was stored w/ the manifest + else: + i3_event = manifest.event_i3live_json_dict + + self.write({"i3_event": i3_event}) + + # + # NOTE - handler needs to stay user-read-only + # + # FUTURE - add delete? 
+ # + + +# ----------------------------------------------------------------------------- + + class ScanResultHandler(BaseSkyDriverHandler): # pylint: disable=W0223 """Handles actions on persisted scan results.""" From 7c1d38cb71a6f97bb0aa3f896887a4a918f2ce02 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 17:26:09 -0600 Subject: [PATCH 014/327] squash merge in `replay` - 2 --- skydriver/rest_handlers.py | 44 -------------------------------------- 1 file changed, 44 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 7270230f..6afc2e30 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -2,7 +2,6 @@ import argparse import asyncio -import dataclasses import dataclasses as dc import json import logging @@ -12,7 +11,6 @@ import uuid from typing import Any, Type, TypeVar -import dacite import humanfriendly import kubernetes.client # type: ignore[import-untyped] from dacite import from_dict @@ -994,48 +992,6 @@ async def get(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- -class ScanI3EventHandler(BaseSkyDriverHandler): # pylint: disable=W0223 - """Handles grabbing i3 events using scan ids.""" - - ROUTE = r"/scan/(?P\w+)/i3-event$" - - @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore - async def get(self, scan_id: str) -> None: - """Get scan's i3 event.""" - manifest = await self.manifests.get(scan_id, True) - - # look up event in collection - if manifest.i3_event_id: - doc = await self.i3_event_coll.find_one( - {"i3_event_id": manifest.i3_event_id} - ) - if doc: - i3_event = doc["json_dict"] - else: # this would mean the event was removed from the db - error_msg = ( - f"No i3 event document found with id '{manifest.i3_event_id}'" - ) - raise web.HTTPError( - 404, - log_message=error_msg, - reason=error_msg, - ) - # unless, this is an old scan -- where the whole dict was stored w/ the manifest - else: - 
i3_event = manifest.event_i3live_json_dict - - self.write({"i3_event": i3_event}) - - # - # NOTE - handler needs to stay user-read-only - # - # FUTURE - add delete? - # - - -# ----------------------------------------------------------------------------- - - class ScanResultHandler(BaseSkyDriverHandler): # pylint: disable=W0223 """Handles actions on persisted scan results.""" From b2337320b2e49556ae53e73f949077e6e6cedc2e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 17:27:40 -0600 Subject: [PATCH 015/327] squash merge in `replay` - 3 --- skydriver/rest_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 6afc2e30..658a0a4a 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -655,8 +655,8 @@ async def post(self, scan_id: str) -> None: manifest = await _start_scan( self.manifests, self.scan_backlog, - new_scan_id, scan_request_obj, + new_scan_id=new_scan_id, ) self.write( dict_projection(dc.asdict(manifest), args.manifest_projection), From 0be0e539f2c6227041202fb07b8db367665321a3 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 3 Jan 2025 17:42:12 -0600 Subject: [PATCH 016/327] use request object --- skydriver/database/interface.py | 4 ++-- skydriver/database/schema.py | 14 +++++++------- skydriver/ewms.py | 25 ++++++++++++++----------- skydriver/k8s/scan_backlog.py | 2 +- skydriver/k8s/scanner_instance.py | 10 +++++----- skydriver/rest_handlers.py | 19 ++++++++++++++----- tests/unit/test_scan_state.py | 18 +++++++++--------- 7 files changed, 52 insertions(+), 40 deletions(-) diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index b50e7c88..79a0b4fd 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -114,7 +114,7 @@ def _put_once_scan_metadata( def _put_ewms_task( in_db: schema.Manifest, upserting: dict, - cluster: schema.ManualCluster | None, + cluster: 
schema.InHouseClusterInfo | None, complete: bool | None, ): if not cluster and not complete: @@ -150,7 +150,7 @@ async def patch( progress: schema.Progress | None = None, event_metadata: schema.EventMetadata | None = None, scan_metadata: schema.StrDict | None = None, - cluster: schema.ManualCluster | None = None, + cluster: schema.InHouseClusterInfo | None = None, complete: bool | None = None, # workforce is done ) -> schema.Manifest: """Update `progress` at doc matching `scan_id`.""" diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 921e9308..a5a00f8d 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -133,7 +133,7 @@ class KubernetesLocation: @typechecked @dc.dataclass -class ManualCluster: +class InHouseClusterInfo: """Stores information for a worker cluster.""" orchestrator: Literal["condor", "k8s"] @@ -219,16 +219,16 @@ def obfuscate_cl_args(args: str) -> str: @typechecked @dc.dataclass -class ManualStarterInfo: +class InHouseStarterInfo: """Encapsulates what info is/was used for starting the scanner, within SkyDriver.""" - tms_args: list[str] # TODO - move to TMS - env_vars: EnvVars # TODO - move to TMS + tms_args: list[str] + env_vars: EnvVars - clusters: list[ManualCluster] = dc.field(default_factory=list) + clusters: list[InHouseClusterInfo] = dc.field(default_factory=list) # signifies k8s workers and condor cluster(s) AKA workforce is done - complete: bool = False # TODO - move to TMS + complete: bool = False def __post_init__(self) -> None: self.tms_args = [obfuscate_cl_args(a) for a in self.tms_args] @@ -246,7 +246,7 @@ class Manifest(ScanIDDataclass): timestamp: float is_deleted: bool - ewms_task: ManualStarterInfo | str # yes, this was a poor naming choice + ewms_task: InHouseStarterInfo | str # yes, this was a poor naming choice # ^^^ str -> EWMS workflow id (i.e. 
this id points to info in the EWMS db) # args placed in k8s job obj diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 7d340fe4..08908b9e 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -12,11 +12,10 @@ async def request_workflow_on_ewms( scan_request_obj: dict, ) -> str: """Request a workflow in EWMS.""" - if not isinstance(manifest.ewms_task, str): + if isinstance(manifest.ewms_task, database.schema.InHouseStarterInfo): raise TypeError("Manifest is not designated for EWMS") image = images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) - # TODO: grab other values from scan request object; eventually, cut down k8s wrapper class body = { "public_queue_aliases": ["to-client-queue", "from-client-queue"], @@ -46,23 +45,27 @@ async def request_workflow_on_ewms( "pilot_config": { "tag": "latest", "environment": { - "EWMS_PILOT_INIT_TIMEOUT": 1 * 60, - "EWMS_PILOT_TASK_TIMEOUT": 1 * 60 * 60, - "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": 10 * 60, + "EWMS_PILOT_INIT_TIMEOUT": 61, # 1 sec more than 'curl' timeout + "EWMS_PILOT_TASK_TIMEOUT": scan_request_obj[ + "max_pixel_reco_time" + ], + "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": scan_request_obj[ + "skyscan_mq_client_timeout_wait_for_first_message" + ], "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": 5 * 60, - "EWMS_PILOT_CONTAINER_DEBUG": "True", + "EWMS_PILOT_CONTAINER_DEBUG": "True", # toggle? "EWMS_PILOT_INFILE_EXT": ".json", "EWMS_PILOT_OUTFILE_EXT": ".json", }, "input_files": [], }, "worker_config": { - "do_transfer_worker_stdouterr": True, - "max_worker_runtime": 2 * 60 * 60, + "do_transfer_worker_stdouterr": True, # toggle? 
+ "max_worker_runtime": 6 * 60 * 60, # 6 hours "n_cores": 1, - "priority": manifest.priority, - "worker_disk": "512M", - "worker_memory": "8G", + "priority": scan_request_obj["priority"], + "worker_disk": scan_request_obj["worker_disk_bytes"], + "worker_memory": scan_request_obj["worker_memory_bytes"], "condor_requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", }, } diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 663e117e..bfdbcd8f 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -214,7 +214,7 @@ async def _run( continue # empty queue- # request a workflow on EWMS? - if not isinstance(manifest.ewms_task, database.schema.ManualStarterInfo): + if not isinstance(manifest.ewms_task, database.schema.InHouseStarterInfo): try: s3_obj_url = generate_s3_url(manifest.scan_id) workflow_id = await ewms.request_workflow_on_ewms( diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 4354898b..7753036b 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -25,7 +25,7 @@ def get_cluster_auth_v1envvars( - cluster: schema.ManualCluster, + cluster: schema.InHouseClusterInfo, ) -> list[kubernetes.client.V1EnvVar]: """Get the `V1EnvVar`s for workers' auth.""" LOGGER.debug(f"getting auth secret env vars for {cluster=}") @@ -75,7 +75,7 @@ def __init__( starter_exc: str, # TODO - remove once tested in prod worker_memory_bytes: int, worker_disk_bytes: int, - request_clusters: list[schema.ManualCluster], + request_clusters: list[schema.InHouseClusterInfo], max_pixel_reco_time: int, max_worker_runtime: int, priority: int, @@ -193,7 +193,7 @@ def get_cluster_starter_args( docker_tag: str, worker_memory_bytes: int, worker_disk_bytes: int, - request_cluster: schema.ManualCluster, + request_cluster: schema.InHouseClusterInfo, debug_mode: list[DebugMode], max_worker_runtime: int, priority: int, @@ -353,7 +353,7 @@ def 
make_skyscan_server_v1envvars( def make_cluster_starter_v1envvars( rest_address: str, scan_id: str, - cluster: schema.ManualCluster, + cluster: schema.InHouseClusterInfo, max_pixel_reco_time: int, debug_mode: list[DebugMode], ) -> list[kubernetes.client.V1EnvVar]: @@ -445,7 +445,7 @@ def __init__( self, k8s_batch_api: kubernetes.client.BatchV1Api, scan_id: str, - clusters: list[schema.ManualCluster], + clusters: list[schema.InHouseClusterInfo], ): self.k8s_batch_api = k8s_batch_api self.scan_id = scan_id diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 658a0a4a..7955520b 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -229,17 +229,17 @@ async def get(self) -> None: # ----------------------------------------------------------------------------- -def _cluster_lookup(name: str, n_workers: int) -> database.schema.ManualCluster: +def _cluster_lookup(name: str, n_workers: int) -> database.schema.InHouseClusterInfo: """Grab the ManualCluster object known using `name`.""" if cluster := KNOWN_CLUSTERS.get(name): if cluster["orchestrator"] == "condor": - return database.schema.ManualCluster( + return database.schema.InHouseClusterInfo( orchestrator=cluster["orchestrator"], location=database.schema.HTCondorLocation(**cluster["location"]), n_workers=n_workers, ) elif cluster["orchestrator"] == "k8s": - return database.schema.ManualCluster( + return database.schema.InHouseClusterInfo( orchestrator=cluster["orchestrator"], location=database.schema.KubernetesLocation(**cluster["location"]), n_workers=n_workers, @@ -497,12 +497,17 @@ async def post(self) -> None: rescan_ids=[], # docker_tag=args.docker_tag, + # + # skyscan server config scanner_server_memory_bytes=args.scanner_server_memory, # already in bytes reco_algo=args.reco_algo, nsides=args.nsides, real_or_simulated_event=args.real_or_simulated_event, predictive_scanning_threshold=args.predictive_scanning_threshold, + # classifiers=args.classifiers, + # + # cluster 
(condor) config request_clusters=args.cluster, # a list worker_memory_bytes=args.worker_memory, worker_disk_bytes=args.worker_disk, # already in bytes @@ -510,6 +515,8 @@ async def post(self) -> None: max_worker_runtime=args.max_worker_runtime, priority=args.priority, debug_mode=[d.value for d in args.debug_mode], + # + # misc skyscan_mq_client_timeout_wait_for_first_message=( args.skyscan_mq_client_timeout_wait_for_first_message if args.skyscan_mq_client_timeout_wait_for_first_message != -1 @@ -584,7 +591,7 @@ async def _start_scan( i3_event_id=scan_request_obj["i3_event_id"], scanner_server_args=scanner_wrapper.scanner_server_args, # TODO: switch over to ewms design - ewms_task=schema.ManualStarterInfo( + ewms_task=schema.InHouseStarterInfo( tms_args=scanner_wrapper.cluster_starter_args_list, env_vars=from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), ), @@ -897,7 +904,9 @@ def from_dict_wrapper_or_none(data_class: Type[T], val: Any) -> T | None: ) arghand.add_argument( "cluster", - type=lambda x: from_dict_wrapper_or_none(database.schema.ManualCluster, x), + type=lambda x: from_dict_wrapper_or_none( + database.schema.InHouseClusterInfo, x + ), default=None, ) args = arghand.parse_args() diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 74e6a1c7..ff1888b2 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -15,7 +15,7 @@ def test_00__scan_finished_successfully() -> None: is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.ManualStarterInfo( + ewms_task=schema.InHouseStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=True, @@ -57,12 +57,12 @@ def test_10__partial_result_generated( is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.ManualStarterInfo( + ewms_task=schema.InHouseStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], 
tms_starters=[]), complete=is_complete, clusters=[ - schema.ManualCluster( + schema.InHouseClusterInfo( orchestrator="condor", location=schema.HTCondorLocation( collector="foo", @@ -111,12 +111,12 @@ def test_20__waiting_on_first_pixel_reco( is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.ManualStarterInfo( + ewms_task=schema.InHouseStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, clusters=[ - schema.ManualCluster( + schema.InHouseClusterInfo( orchestrator="condor", location=schema.HTCondorLocation( collector="foo", @@ -165,7 +165,7 @@ def test_30__waiting_on_cluster_startup( is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.ManualStarterInfo( + ewms_task=schema.InHouseStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, @@ -219,12 +219,12 @@ def test_40__waiting_on_scanner_server_startup( is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.ManualStarterInfo( + ewms_task=schema.InHouseStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, clusters=[ - schema.ManualCluster( + schema.InHouseClusterInfo( orchestrator="condor", location=schema.HTCondorLocation( collector="foo", @@ -271,7 +271,7 @@ def test_50__prestartup(is_complete: bool, state: schema.ScanState) -> None: is_deleted=False, event_i3live_json_dict={"abc": 123}, scanner_server_args="", - ewms_task=schema.ManualStarterInfo( + ewms_task=schema.InHouseStarterInfo( tms_args=[], env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), complete=is_complete, From 5ff4562b73d43a5005c3151d7d1cc82030ecbcfe Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 6 Jan 2025 15:45:59 -0600 Subject: [PATCH 017/327] remove 1.x local k8s metadata from schema (bc: dict) --- skydriver/database/schema.py | 111 
++--------------------------------- 1 file changed, 4 insertions(+), 107 deletions(-) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index a5a00f8d..af84f781 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -2,13 +2,11 @@ import dataclasses as dc import enum -from typing import Any, Iterator, Literal +from typing import Any import wipac_dev_tools as wdt from typeguard import typechecked -from .. import config - StrDict = dict[str, Any] @@ -113,90 +111,6 @@ class EventMetadata: is_real_event: bool # as opposed to simulation -@typechecked -@dc.dataclass -class HTCondorLocation: - """Stores location metadata for a HTCondor cluster.""" - - collector: str - schedd: str - - -@typechecked -@dc.dataclass -class KubernetesLocation: - """Stores location metadata for a Kubernetes cluster.""" - - host: str - namespace: str - - -@typechecked -@dc.dataclass -class InHouseClusterInfo: - """Stores information for a worker cluster.""" - - orchestrator: Literal["condor", "k8s"] - location: HTCondorLocation | KubernetesLocation - n_workers: int - - uuid: str = "" # "" is a non-started cluster -- universally unique - cluster_id: str = "" # "" is a non-started cluster -- quasi-unique to location - - starter_info: StrDict = dc.field(default_factory=dict) - - statuses: dict[str, dict[str, int]] = dc.field(default_factory=dict) - top_task_errors: dict[str, int] = dc.field(default_factory=dict) - - def __post_init__(self) -> None: - match self.orchestrator: - case "condor": - if not isinstance(self.location, HTCondorLocation): - raise TypeError( - "condor orchestrator must use condor sub-fields for 'location'" - ) - case "k8s": - if not isinstance(self.location, KubernetesLocation): - raise TypeError( - "k8s orchestrator must use k8s sub-fields for 'location'" - ) - case other: - raise ValueError(f"Unknown cluster orchestrator: {other}") - - def to_known_cluster(self) -> tuple[str, StrDict]: - """Map to a config.KNOWN_CLUSTERS 
entry.""" - return next( - (k, v) - for k, v in config.KNOWN_CLUSTERS.items() - if v["location"] == dc.asdict(self.location) - ) - - -@typechecked -@dc.dataclass -class EnvVars: - """Encapsulates env var object originating from K8s objects.""" - - scanner_server: list[StrDict] - tms_starters: list[list[StrDict]] - - def __post_init__(self) -> None: - # - # obfuscate tokens & such (sensitive values) - # - def obfuscate(env_list: list[StrDict]) -> Iterator[StrDict]: - for env_entry in env_list: - if env_entry["value"]: - safe_val = wdt.data_safety_tools.obfuscate_value_if_sensitive( - env_entry["name"], env_entry["value"] - ) - env_entry["value"] = safe_val - yield env_entry - - self.scanner_server = list(obfuscate(self.scanner_server)) - self.tms_starters = [list(obfuscate(s)) for s in self.tms_starters] - - def obfuscate_cl_args(args: str) -> str: # first, check if any sensitive strings (searches using substrings) if not wdt.data_safety_tools.is_name_sensitive(args): @@ -217,24 +131,6 @@ def obfuscate_cl_args(args: str) -> str: return " ".join(out_args) -@typechecked -@dc.dataclass -class InHouseStarterInfo: - """Encapsulates what info is/was used for starting the scanner, within SkyDriver.""" - - tms_args: list[str] - env_vars: EnvVars - - clusters: list[InHouseClusterInfo] = dc.field(default_factory=list) - - # signifies k8s workers and condor cluster(s) AKA workforce is done - complete: bool = False - - def __post_init__(self) -> None: - self.tms_args = [obfuscate_cl_args(a) for a in self.tms_args] - # NOTE - self.env_vars done in EnvVars - - DEPRECATED_EVENT_I3LIVE_JSON_DICT = "use 'i3_event_id'" @@ -246,8 +142,9 @@ class Manifest(ScanIDDataclass): timestamp: float is_deleted: bool - ewms_task: InHouseStarterInfo | str # yes, this was a poor naming choice - # ^^^ str -> EWMS workflow id (i.e. this id points to info in the EWMS db) + ewms_task: dict | str + # ^^^ str -> EWMS workflow id (i.e. 
this id points to info in the EWMS db) + # ^^^ dict -> *DEPRECATED* was used in skydriver 1.x to use local k8s starter/stopper # args placed in k8s job obj scanner_server_args: str From 6c4bc679963157d9535ea60a795c58c86f083787 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 6 Jan 2025 16:10:10 -0600 Subject: [PATCH 018/327] remove 1.x local k8s logic - 1 --- skydriver/database/interface.py | 41 +------------- skydriver/database/schema.py | 6 +++ skydriver/ewms.py | 15 ++++++ skydriver/k8s/scanner_instance.py | 88 ------------------------------- skydriver/rest_handlers.py | 65 ++++------------------- 5 files changed, 32 insertions(+), 183 deletions(-) diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index 79a0b4fd..3dd196c2 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -1,6 +1,5 @@ """Database interface for persisted scan data.""" -import copy import dataclasses as dc import logging import time @@ -110,47 +109,12 @@ def _put_once_scan_metadata( reason=msg, ) - @staticmethod - def _put_ewms_task( - in_db: schema.Manifest, - upserting: dict, - cluster: schema.InHouseClusterInfo | None, - complete: bool | None, - ): - if not cluster and not complete: - raise ValueError("cluster and complete cannot both be falsy") - - upserting["ewms_task"] = copy.deepcopy(in_db.ewms_task) - # cluster / clusters - # TODO - when TMS is up and running, it will handle cluster updating--remove then - # NOTE - there is a race condition inherent with list attributes, don't do this in TMS - if not cluster: - pass # don't put in DB - else: - try: # find by uuid -> replace - idx = next( - i - for i, c in enumerate(in_db.ewms_task.clusters) - if cluster.uuid == c.uuid - ) - upserting["ewms_task"].clusters = ( - in_db.ewms_task.clusters[:idx] - + [cluster] - + in_db.ewms_task.clusters[idx + 1 :] - ) - except StopIteration: # not found -> append - upserting["ewms_task"].clusters = in_db.ewms_task.clusters + [cluster] - # 
complete # workforce is done - if complete is not None: - upserting["ewms_task"].complete = complete # workforce is done - async def patch( self, scan_id: str, progress: schema.Progress | None = None, event_metadata: schema.EventMetadata | None = None, scan_metadata: schema.StrDict | None = None, - cluster: schema.InHouseClusterInfo | None = None, complete: bool | None = None, # workforce is done ) -> schema.Manifest: """Update `progress` at doc matching `scan_id`.""" @@ -160,7 +124,6 @@ async def patch( progress or event_metadata or scan_metadata - or cluster or complete is not None # True/False is ok # workforce is done ): LOGGER.debug(f"nothing to patch for manifest ({scan_id=})") @@ -169,6 +132,8 @@ async def patch( upserting: schema.StrDict = {} if progress: upserting["progress"] = progress + if complete is not None: + upserting["complete"] = complete # Validate, then store # NOTE: in theory there's a race condition (get+upsert) @@ -177,8 +142,6 @@ async def patch( self._put_once_event_metadata(in_db, upserting, scan_id, event_metadata) if scan_metadata: self._put_once_scan_metadata(in_db, upserting, scan_id, scan_metadata) - if cluster or complete is not None: - self._put_ewms_task(in_db, upserting, cluster, complete) # Update db if not upserting: # did we actually update anything? 
diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index af84f781..bff66017 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -171,6 +171,8 @@ class Manifest(ScanIDDataclass): last_updated: float = 0.0 + complete: bool = False + def __post_init__(self) -> None: if ( not self.i3_event_id @@ -182,6 +184,10 @@ def __post_init__(self) -> None: ) self.scanner_server_args = obfuscate_cl_args(self.scanner_server_args) + # Backward compatibility: 1.x had 'complete' in a nested field + if isinstance(self.ewms_task, dict): + self.complete = self.ewms_task.get("complete", False) + def get_state(self) -> ScanState: """Determine the state of the scan by parsing attributes.""" if ( diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 08908b9e..704a7f3a 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -74,3 +74,18 @@ async def request_workflow_on_ewms( resp = await ewms_rc.request("POST", "/v0/workflows", body) return resp["workflow"]["workflow_id"] + + +async def request_stop_on_ewms( + ewms_rc: RestClient, + workflow_id: str, +) -> int: + """Signal that an EWMS workflow is finished, and stop whatever is needed. + + Returns the number of stopped taskforces. + """ + resp = await ewms_rc.request( + "POST", + f"/v0/workflows/{workflow_id}/actions/finished", + ) + return resp["n_taskforces"] diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 7753036b..c53a4f9b 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -4,7 +4,6 @@ import logging import uuid from pathlib import Path -from typing import Any import kubernetes.client # type: ignore[import-untyped] from rest_tools.client import ClientCredentialsAuth @@ -12,12 +11,9 @@ from .utils import KubeAPITools from .. 
import images from ..config import ( - CLUSTER_STOPPER_K8S_JOB_N_RETRIES, - CLUSTER_STOPPER_K8S_TTL_SECONDS_AFTER_FINISHED, DebugMode, ENV, K8S_CONTAINER_MEMORY_CLUSTER_STARTER_BYTES, - K8S_CONTAINER_MEMORY_CLUSTER_STOPPER_BYTES, ) from ..database import schema @@ -436,87 +432,3 @@ def make_cluster_starter_v1envvars( ) return env - - -class SkymapScannerWorkerStopperK8sWrapper: - """Wraps K8s logic to stop workers of a Skymap Scanner instance.""" - - def __init__( - self, - k8s_batch_api: kubernetes.client.BatchV1Api, - scan_id: str, - clusters: list[schema.InHouseClusterInfo], - ): - self.k8s_batch_api = k8s_batch_api - self.scan_id = scan_id - - # make a container per cluster - containers = [] - for i, cluster in enumerate(clusters): - args = f"python -m clientmanager --uuid {cluster.uuid}" - match cluster.orchestrator: - case "condor": - args += ( - f" condor " # type: ignore[union-attr] - f" --collector {cluster.location.collector} " - f" --schedd {cluster.location.schedd} " - ) - case "k8s": - args += ( - f" k8s " # type: ignore[union-attr] - f" --host {cluster.location.host} " - f" --namespace {cluster.location.namespace} " - ) - case other: - raise ValueError(f"Unknown cluster orchestrator: {other}") - args += f" stop --cluster-id {cluster.cluster_id} " - - containers.append( - KubeAPITools.create_container( - f"cluster-stopper-{i}-{scan_id}", - ENV.THIS_IMAGE_WITH_TAG, - cpu=0.125, - env=get_cluster_auth_v1envvars(cluster), - args=args.split(), - memory=K8S_CONTAINER_MEMORY_CLUSTER_STOPPER_BYTES, - ) - ) - - if not containers: - self.worker_stopper_job_obj = None - else: - self.worker_stopper_job_obj = KubeAPITools.kube_create_job_object( - f"cluster-stopper-{scan_id}", - containers, - ENV.K8S_NAMESPACE, - CLUSTER_STOPPER_K8S_TTL_SECONDS_AFTER_FINISHED, - n_retries=CLUSTER_STOPPER_K8S_JOB_N_RETRIES, - ) - - def go(self) -> Any: - """Stop all workers of a Skymap Scanner instance.""" - - # NOTE - we don't want to stop the first k8s job because its containers 
will stop themselves. - # plus, 'K8S_TTL_SECONDS_AFTER_FINISHED' will allow logs & pod status to be retrieved for some time - # - # stop first k8s job (server & cluster starters) -- may not be instantaneous - # LOGGER.info( - # f"requesting removal of Skymap Scanner Job (server & cluster starters) -- {self.scan_id=}..." - # ) - # resp = self.k8s_batch_api.delete_namespaced_job( - # name=SkymapScannerK8sWrapper.get_job_name(self.scan_id), - # namespace=ENV.K8S_NAMESPACE, - # body=kubernetes.client.V1DeleteOptions( - # propagation_policy="Foreground", grace_period_seconds=5 - # ), - # ) - # LOGGER.info( - # f"removed Skymap Scanner Job {self.scan_id=} -- with response {resp.status} " - # ) - - # stop workers - if self.worker_stopper_job_obj: - LOGGER.info(f"starting k8s CLUSTER-STOPPER job for {self.scan_id=}") - KubeAPITools.start_job(self.k8s_batch_api, self.worker_stopper_job_obj) - else: - LOGGER.info(f"no workers to stop for {self.scan_id=}") diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 7955520b..b81063c5 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -38,6 +38,7 @@ is_testing, ) from .database import schema +from .ewms import request_stop_on_ewms from .k8s.scan_backlog import designate_for_startup from .k8s.scanner_instance import SkymapScannerK8sWrapper @@ -676,26 +677,20 @@ async def post(self, scan_id: str) -> None: async def stop_scanner_instance( manifests: database.interface.ManifestClient, scan_id: str, - k8s_batch_api: kubernetes.client.BatchV1Api, + ewms_rc: RestClient, ) -> database.schema.Manifest: """Stop all parts of the Scanner instance (if running) and mark in DB.""" manifest = await manifests.get(scan_id, True) - if manifest.ewms_task.complete: # workforce is done + if manifest.complete: # workforce is done return manifest - stopper_wrapper = k8s.scanner_instance.SkymapScannerWorkerStopperK8sWrapper( - k8s_batch_api, - scan_id, - manifest.ewms_task.clusters, - ) - - try: - 
stopper_wrapper.go() - except kubernetes.client.exceptions.ApiException as e: - LOGGER.exception(e) + # request to ewms + if manifest.ewms_task and isinstance(manifest.ewms_task, str): + await request_stop_on_ewms(ewms_rc, manifest.ewms_task) + else: raise web.HTTPError( 400, - log_message="Failed to stop Scanner instance", + log_message="Could not stop scanner workers since this is a non-EWMS scan.", ) return await manifests.patch(scan_id, complete=True) # workforce is done @@ -774,7 +769,7 @@ async def delete(self, scan_id: str) -> None: # mark as deleted -> also stops backlog from starting manifest = await self.manifests.mark_as_deleted(scan_id) # abort - await stop_scanner_instance(self.manifests, scan_id, self.k8s_batch_api) + await stop_scanner_instance(self.manifests, scan_id, self.ewms_rc) try: result_dict = dc.asdict(await self.results.get(scan_id)) @@ -902,13 +897,6 @@ def from_dict_wrapper_or_none(data_class: Type[T], val: Any) -> T | None: type=dict, default={}, ) - arghand.add_argument( - "cluster", - type=lambda x: from_dict_wrapper_or_none( - database.schema.InHouseClusterInfo, x - ), - default=None, - ) args = arghand.parse_args() manifest = await self.manifests.patch( @@ -916,43 +904,8 @@ def from_dict_wrapper_or_none(data_class: Type[T], val: Any) -> T | None: args.progress, args.event_metadata, args.scan_metadata, - args.cluster, ) - # NOTE - the following will be moved to TMS, then improved - # check cluster statuses & stop scan if workers are all failing - for db_cluster in manifest.ewms_task.clusters: - # Job-Status -> "Held:*" & Pilot-Status -> ANY - # -- sum the total counts of all job-statuses prefixed with "Held:" - n_held = sum( - sum( # pilot-status counts - cts for cts in db_cluster.statuses[job_status].values() - ) - for job_status in db_cluster.statuses.keys() - if job_status.startswith("Held:") - ) - - # Job-Status -> ANY & Pilot-Status -> "FatalError" - n_fatal_error = sum( - db_cluster.statuses[job_status].get("FatalError", 0) # 
int - for job_status in db_cluster.statuses.keys() - ) - - # overlap - n_held_and_fatal_error = sum( - db_cluster.statuses[job_status].get("FatalError", 0) # int - for job_status in db_cluster.statuses.keys() - if job_status.startswith("Held:") - ) - - if n_held + n_fatal_error - n_held_and_fatal_error >= db_cluster.n_workers: - manifest = await stop_scanner_instance( - self.manifests, - scan_id, - self.k8s_batch_api, - ) - break - self.write(dc.asdict(manifest)) # don't use a projection From c02a009f504cde6e2fca959ecf5cc0b8ccabd9ee Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 6 Jan 2025 16:12:27 -0600 Subject: [PATCH 019/327] remove 1.x local k8s logic - 2 --- skydriver/rest_handlers.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index b81063c5..d2cb4a4f 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -674,7 +674,7 @@ async def post(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- -async def stop_scanner_instance( +async def stop_skyscan_workers( manifests: database.interface.ManifestClient, scan_id: str, ewms_rc: RestClient, @@ -769,7 +769,7 @@ async def delete(self, scan_id: str) -> None: # mark as deleted -> also stops backlog from starting manifest = await self.manifests.mark_as_deleted(scan_id) # abort - await stop_scanner_instance(self.manifests, scan_id, self.ewms_rc) + await stop_skyscan_workers(self.manifests, scan_id, self.ewms_rc) try: result_dict = dc.asdict(await self.results.get(scan_id)) @@ -1013,7 +1013,7 @@ async def put(self, scan_id: str) -> None: await asyncio.sleep( WAIT_BEFORE_TEARDOWN ) # regular time.sleep() sleeps the entire server - await stop_scanner_instance(self.manifests, scan_id, self.k8s_batch_api) + await stop_skyscan_workers(self.manifests, scan_id, self.k8s_batch_api) # ----------------------------------------------------------------------------- From 
c36f3619e5a9c416bcac300a83d5dd482bf2d8fc Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 6 Jan 2025 16:14:43 -0600 Subject: [PATCH 020/327] remove 1.x local k8s logic - 3 --- skydriver/database/interface.py | 8 ++++---- skydriver/database/schema.py | 4 ++-- skydriver/rest_handlers.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index 3dd196c2..5c1853cb 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -115,7 +115,7 @@ async def patch( progress: schema.Progress | None = None, event_metadata: schema.EventMetadata | None = None, scan_metadata: schema.StrDict | None = None, - complete: bool | None = None, # workforce is done + ewms_finished: bool | None = None, # workforce is done ) -> schema.Manifest: """Update `progress` at doc matching `scan_id`.""" LOGGER.debug(f"patching manifest for {scan_id=}") @@ -124,7 +124,7 @@ async def patch( progress or event_metadata or scan_metadata - or complete is not None # True/False is ok # workforce is done + or ewms_finished is not None # True/False is ok # workforce is done ): LOGGER.debug(f"nothing to patch for manifest ({scan_id=})") return await self.get(scan_id, incl_del=True) @@ -132,8 +132,8 @@ async def patch( upserting: schema.StrDict = {} if progress: upserting["progress"] = progress - if complete is not None: - upserting["complete"] = complete + if ewms_finished is not None: + upserting["ewms_finished"] = ewms_finished # Validate, then store # NOTE: in theory there's a race condition (get+upsert) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index bff66017..9f84ce90 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -171,7 +171,7 @@ class Manifest(ScanIDDataclass): last_updated: float = 0.0 - complete: bool = False + ewms_finished: bool = False # a cache so we don't have to call to ewms each time def __post_init__(self) -> None: if ( 
@@ -186,7 +186,7 @@ def __post_init__(self) -> None: # Backward compatibility: 1.x had 'complete' in a nested field if isinstance(self.ewms_task, dict): - self.complete = self.ewms_task.get("complete", False) + self.ewms_finished = self.ewms_task.get("complete", False) def get_state(self) -> ScanState: """Determine the state of the scan by parsing attributes.""" diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index d2cb4a4f..8550ae84 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -681,7 +681,7 @@ async def stop_skyscan_workers( ) -> database.schema.Manifest: """Stop all parts of the Scanner instance (if running) and mark in DB.""" manifest = await manifests.get(scan_id, True) - if manifest.complete: # workforce is done + if manifest.ewms_finished: # workforce is done return manifest # request to ewms @@ -693,7 +693,7 @@ async def stop_skyscan_workers( log_message="Could not stop scanner workers since this is a non-EWMS scan.", ) - return await manifests.patch(scan_id, complete=True) # workforce is done + return await manifests.patch(scan_id, ewms_finished=True) # workforce is done # ----------------------------------------------------------------------------- From dba6906ae08425be93f33cd5cb26b361fd22a933 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 6 Jan 2025 17:02:44 -0600 Subject: [PATCH 021/327] remove 1.x local k8s logic - 4 --- skydriver/database/schema.py | 4 +- skydriver/k8s/scanner_instance.py | 204 ++---------------------------- skydriver/rest_handlers.py | 21 +-- 3 files changed, 15 insertions(+), 214 deletions(-) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 9f84ce90..b46331e9 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -142,8 +142,8 @@ class Manifest(ScanIDDataclass): timestamp: float is_deleted: bool - ewms_task: dict | str - # ^^^ str -> EWMS workflow id (i.e. 
this id points to info in the EWMS db) + ewms_task: dict | str # `""` -> workflow request has not (yet) been sent to EWMS + # ^^^ str -> EWMS workflow id (i.e. this id points to info in EWMS) # ^^^ dict -> *DEPRECATED* was used in skydriver 1.x to use local k8s starter/stopper # args placed in k8s job obj diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index c53a4f9b..a63e8ec0 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -2,9 +2,9 @@ instances.""" import logging -import uuid from pathlib import Path +import humanfriendly import kubernetes.client # type: ignore[import-untyped] from rest_tools.client import ClientCredentialsAuth @@ -13,7 +13,6 @@ from ..config import ( DebugMode, ENV, - K8S_CONTAINER_MEMORY_CLUSTER_STARTER_BYTES, ) from ..database import schema @@ -67,14 +66,6 @@ def __init__( nsides: dict[int, int], is_real_event: bool, predictive_scanning_threshold: float, - # cluster starter - starter_exc: str, # TODO - remove once tested in prod - worker_memory_bytes: int, - worker_disk_bytes: int, - request_clusters: list[schema.InHouseClusterInfo], - max_pixel_reco_time: int, - max_worker_runtime: int, - priority: int, # universal debug_mode: list[DebugMode], # env @@ -112,45 +103,21 @@ def __init__( ) self.env_dict["scanner_server"] = [e.to_dict() for e in scanner_server.env] - # CONTAINER(S): Cluster Starter(s) - tms_starters = [] - for i, cluster in enumerate(request_clusters): - tms_starters.append( - KubeAPITools.create_container( - f"{starter_exc.replace('_','-')}-{i}-{scan_id}", # TODO - replace once tested in prod - ENV.THIS_IMAGE_WITH_TAG, - env=self.make_cluster_starter_v1envvars( - rest_address=rest_address, - scan_id=scan_id, - cluster=cluster, - max_pixel_reco_time=max_pixel_reco_time, - debug_mode=debug_mode, - ), - args=self.get_cluster_starter_args( - starter_exc=starter_exc, # TODO - remove once tested in prod - common_space_volume_path=common_space_volume_path, - 
docker_tag=docker_tag, - worker_memory_bytes=worker_memory_bytes, - worker_disk_bytes=worker_disk_bytes, - request_cluster=cluster, - debug_mode=debug_mode, - max_worker_runtime=max_worker_runtime, - priority=priority, - ), - cpu=0.125, - volumes={common_space_volume_path.name: common_space_volume_path}, - memory=K8S_CONTAINER_MEMORY_CLUSTER_STARTER_BYTES, - ) - ) - self.cluster_starter_args_list = [" ".join(c.args) for c in tms_starters] - self.env_dict["tms_starters"] = [ - [e.to_dict() for e in c.env] for c in tms_starters - ] + # s3 uploader + s3_uploader = KubeAPITools.create_container( + f"s3-uploader-{scan_id}", + images.get_skyscan_docker_image(docker_tag), + [], + "".split(), + cpu=0.25, + volumes={common_space_volume_path.name: common_space_volume_path}, + memory=humanfriendly.parse_size("0.25 G"), + ) # job self.job_obj = KubeAPITools.kube_create_job_object( self.get_job_name(scan_id), - [scanner_server] + tms_starters, + [scanner_server, s3_uploader], ENV.K8S_NAMESPACE, ENV.K8S_TTL_SECONDS_AFTER_FINISHED, volumes=[common_space_volume_path.name], @@ -182,65 +149,6 @@ def get_scanner_server_args( ) return args - @staticmethod - def get_cluster_starter_args( - starter_exc: str, # TODO - remove once tested in prod - common_space_volume_path: Path, - docker_tag: str, - worker_memory_bytes: int, - worker_disk_bytes: int, - request_cluster: schema.InHouseClusterInfo, - debug_mode: list[DebugMode], - max_worker_runtime: int, - priority: int, - ) -> list[str]: - """Make the starter container args.""" - args = f"python -m clientmanager --uuid {str(uuid.uuid4().hex)}" - - match request_cluster.orchestrator: - case "condor": - args += ( - f" condor " # type: ignore[union-attr] - f" --collector {request_cluster.location.collector} " - f" --schedd {request_cluster.location.schedd} " - ) - worker_image = images.get_skyscan_cvmfs_singularity_image(docker_tag) - case "k8s": - args += ( - f" k8s " # type: ignore[union-attr] - f" --host {request_cluster.location.host} " - 
f" --namespace {request_cluster.location.namespace} " - ) - worker_image = images.get_skyscan_docker_image(docker_tag) - case other: - raise ValueError(f"Unknown cluster orchestrator: {other}") - - args += ( - f" start " - f" --n-workers {request_cluster.n_workers} " - # f" --dryrun" - # f" --spool " # see below - f" --worker-memory-bytes {worker_memory_bytes} " - f" --worker-disk-bytes {worker_disk_bytes} " - f" --image {worker_image} " - f" --client-startup-json {common_space_volume_path/'startup.json'} " - # f" --client-args {client_args} " # only potentially relevant arg is --debug-directory - f" --max-worker-runtime {max_worker_runtime}" - f" --priority {priority}" - ) - - if DebugMode.CLIENT_LOGS in debug_mode: - args += " --spool " - - # ADAPT args for EWMS Sidecar - # TODO - remove once tested in prod - if starter_exc == "ewms_sidecar" and request_cluster.orchestrator == "condor": - args = args.replace("clientmanager", "ewms_sidecar direct-remote-condor") - args = args.replace(" condor ", " ") - args = args.replace(" start ", " ") - - return args.split() - @staticmethod def _get_token_from_keycloak( token_url: str, @@ -344,91 +252,3 @@ def make_skyscan_server_v1envvars( ) return env - - @staticmethod - def make_cluster_starter_v1envvars( - rest_address: str, - scan_id: str, - cluster: schema.InHouseClusterInfo, - max_pixel_reco_time: int, - debug_mode: list[DebugMode], - ) -> list[kubernetes.client.V1EnvVar]: - """Get the environment variables provided to all containers. - - Also, get the secrets' keys & their values. - """ - LOGGER.debug(f"making cluster starter env vars for {scan_id=}") - env = [] - - # 1. start w/ secrets - # NOTE: the values come from an existing secret in the current namespace - env.extend(get_cluster_auth_v1envvars(cluster)) - env.extend(get_cluster_starter_s3_v1envvars()) - - # 2. 
add required env vars - required = { - # broker/mq vars - "SKYSCAN_BROKER_ADDRESS": ENV.SKYSCAN_BROKER_ADDRESS, - # skydriver vars - "SKYSCAN_SKYDRIVER_ADDRESS": rest_address, - "SKYSCAN_SKYDRIVER_SCAN_ID": scan_id, - # - "EWMS_TMS_S3_BUCKET": ENV.EWMS_TMS_S3_BUCKET, - "EWMS_TMS_S3_URL": ENV.EWMS_TMS_S3_URL, - # - "EWMS_PILOT_TASK_TIMEOUT": max_pixel_reco_time, - # - "WORKER_K8S_LOCAL_APPLICATION_NAME": ENV.K8S_APPLICATION_NAME, - } - env.extend( - [ - kubernetes.client.V1EnvVar(name=k, value=str(v)) - for k, v in required.items() - ] - ) - - # 3. add extra env vars, then filter out if 'None' - prefiltered = { - "SKYSCAN_MQ_TIMEOUT_TO_CLIENTS": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, - "SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS": ENV.SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS, - # - "SKYSCAN_LOG": ENV.SKYSCAN_LOG, - "SKYSCAN_LOG_THIRD_PARTY": ENV.SKYSCAN_LOG_THIRD_PARTY, - # - "SKYSCAN_EWMS_PILOT_LOG": "WARNING", # default is too low - "SKYSCAN_MQ_CLIENT_LOG": "WARNING", # default is too low - # - "EWMS_PILOT_QUARANTINE_TIME": ENV.EWMS_PILOT_QUARANTINE_TIME, - "EWMS_PILOT_DUMP_TASK_OUTPUT": ( - True if DebugMode.CLIENT_LOGS in debug_mode else None - ), - } - env.extend( - [ - kubernetes.client.V1EnvVar(name=k, value=str(v)) - for k, v in prefiltered.items() - if v is not None - ] - ) - - # 4. 
generate & add auth tokens - tokens = { - "SKYSCAN_BROKER_AUTH": SkymapScannerK8sWrapper._get_token_from_keycloak( - ENV.KEYCLOAK_OIDC_URL, - ENV.KEYCLOAK_CLIENT_ID_BROKER, - ENV.KEYCLOAK_CLIENT_SECRET_BROKER, - ), - "SKYSCAN_SKYDRIVER_AUTH": SkymapScannerK8sWrapper._get_token_from_keycloak( - ENV.KEYCLOAK_OIDC_URL, - ENV.KEYCLOAK_CLIENT_ID_SKYDRIVER_REST, - ENV.KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST, - ), - } - env.extend( - [ - kubernetes.client.V1EnvVar(name=k, value=str(v)) - for k, v in tokens.items() - ] - ) - - return env diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 8550ae84..c3ad3775 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -558,21 +558,6 @@ async def _start_scan( nsides=scan_request_obj["nsides"], is_real_event=scan_request_obj["real_or_simulated_event"] in REAL_CHOICES, predictive_scanning_threshold=scan_request_obj["predictive_scanning_threshold"], - # cluster starter - starter_exc=str( # TODO - remove once tested in prod - scan_request_obj["classifiers"].get( - "__unstable_starter_exc", "clientmanager" - ) - ), - request_clusters=[ - _cluster_lookup(name, n_workers) # values were pre-validated on user input - for name, n_workers in scan_request_obj["request_clusters"] - ], - worker_memory_bytes=scan_request_obj["worker_memory_bytes"], - worker_disk_bytes=scan_request_obj["worker_disk_bytes"], - max_pixel_reco_time=scan_request_obj["max_pixel_reco_time"], - max_worker_runtime=scan_request_obj["max_worker_runtime"], - priority=scan_request_obj["priority"], # universal debug_mode=_debug_mode(scan_request_obj["debug_mode"]), # env @@ -591,11 +576,7 @@ async def _start_scan( is_deleted=False, i3_event_id=scan_request_obj["i3_event_id"], scanner_server_args=scanner_wrapper.scanner_server_args, - # TODO: switch over to ewms design - ewms_task=schema.InHouseStarterInfo( - tms_args=scanner_wrapper.cluster_starter_args_list, - env_vars=from_dict(database.schema.EnvVars, scanner_wrapper.env_dict), - 
), + ewms_task="", # set once the workflow request has been sent to EWMS (see backlogger) classifiers=scan_request_obj["classifiers"], priority=scan_request_obj["priority"], ) From b333ae0b8e4dd78a648d351ef9d90640bbcc4d03 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 6 Jan 2025 17:46:36 -0600 Subject: [PATCH 022/327] remove 1.x local k8s logic - 5 --- skydriver/database/schema.py | 3 +- skydriver/ewms.py | 3 +- skydriver/k8s/scan_backlog.py | 25 ------------- skydriver/k8s/scanner_instance.py | 4 ++- skydriver/rest_handlers.py | 5 ++- skydriver/s3.py | 59 +++++++++++++++++++++++++++++++ 6 files changed, 69 insertions(+), 30 deletions(-) create mode 100644 skydriver/s3.py diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index b46331e9..098b84c1 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -146,8 +146,9 @@ class Manifest(ScanIDDataclass): # ^^^ str -> EWMS workflow id (i.e. this id points to info in EWMS) # ^^^ dict -> *DEPRECATED* was used in skydriver 1.x to use local k8s starter/stopper - # args placed in k8s job obj + # attrs placed in k8s job obj scanner_server_args: str + s3_obj_url: str = "" # in 2.x scans, this is always set priority: int = ( 0 # same as https://htcondor.readthedocs.io/en/latest/users-manual/priorities-and-preemption.html#job-priority diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 704a7f3a..71ae5feb 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -8,7 +8,6 @@ async def request_workflow_on_ewms( ewms_rc: RestClient, manifest: database.schema.Manifest, - s3_obj_url: str, scan_request_obj: dict, ) -> str: """Request a workflow in EWMS.""" @@ -37,7 +36,7 @@ async def request_workflow_on_ewms( "bash -c " '"' # quote for bash -c "..." "curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json " - f"'{s3_obj_url}'" # single-quote the url + f"'{manifest.s3_obj_url}'" # single-quote the url '"' # unquote for bash -c "..." 
), "n_workers": scan_request_obj["request_clusters"][0][1], diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index bfdbcd8f..d57d2103 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -5,7 +5,6 @@ import pickle import time -import boto3 import bson import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection @@ -156,28 +155,6 @@ def has_interval_elapsed(self) -> bool: return False -def generate_s3_url(scan_id: str) -> str: - """Generate a pre-signed S3 url for putting shared files.""" - s3_client = boto3.client( - "s3", - "us-east-1", - endpoint_url=ENV.S3_URL, - aws_access_key_id=ENV.S3_ACCESS_KEY_ID, - aws_secret_access_key=ENV.S3_SECRET_KEY, - ) - - # get GET url - get_url = s3_client.generate_presigned_url( - "get_object", - Params={ - "Bucket": ENV.S3_BUCKET, - "Key": f"{scan_id}-s3-object", - }, - ExpiresIn=24 * 60 * 60, # seconds - ) - return get_url - - async def _run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, @@ -216,11 +193,9 @@ async def _run( # request a workflow on EWMS? 
if not isinstance(manifest.ewms_task, database.schema.InHouseStarterInfo): try: - s3_obj_url = generate_s3_url(manifest.scan_id) workflow_id = await ewms.request_workflow_on_ewms( ewms_rc, manifest, - s3_obj_url, scan_request_obj, ) except Exception as e: diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index a63e8ec0..1aa70534 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -66,6 +66,8 @@ def __init__( nsides: dict[int, int], is_real_event: bool, predictive_scanning_threshold: float, + # s3 uploader + s3_obj_url: str, # universal debug_mode: list[DebugMode], # env @@ -108,7 +110,7 @@ def __init__( f"s3-uploader-{scan_id}", images.get_skyscan_docker_image(docker_tag), [], - "".split(), + "python s3_uploader.py".split(), # TODO: write an impromptu script and put in job cpu=0.25, volumes={common_space_volume_path.name: common_space_volume_path}, memory=humanfriendly.parse_size("0.25 G"), diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index c3ad3775..018466f5 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -27,7 +27,7 @@ from tornado import web from wipac_dev_tools import argparse_tools -from . import database, images, k8s +from . 
import database, images, k8s, s3 from .config import ( DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, DEFAULT_WORKER_DISK_BYTES, @@ -547,6 +547,7 @@ async def _start_scan( new_scan_id: str = "", # don't use scan_request_obj.scan_id--this could be a rescan ) -> schema.Manifest: scan_id = new_scan_id or scan_request_obj["scan_id"] + s3_obj_url = s3.generate_s3_url(scan_id) # get the container info ready scanner_wrapper = SkymapScannerK8sWrapper( @@ -558,6 +559,8 @@ async def _start_scan( nsides=scan_request_obj["nsides"], is_real_event=scan_request_obj["real_or_simulated_event"] in REAL_CHOICES, predictive_scanning_threshold=scan_request_obj["predictive_scanning_threshold"], + # s3 uploader + s3_obj_url=s3_obj_url, # universal debug_mode=_debug_mode(scan_request_obj["debug_mode"]), # env diff --git a/skydriver/s3.py b/skydriver/s3.py new file mode 100644 index 00000000..09d4b72c --- /dev/null +++ b/skydriver/s3.py @@ -0,0 +1,59 @@ +"""Utilities for interacting with S3 buckets.""" + +import logging +import pathlib + +import boto3 +import requests + +from .config import ENV + +LOGGER = logging.getLogger(__name__) + + +def _get_client(): + LOGGER.info("Connecting to S3...") + return boto3.client( + "s3", + "us-east-1", + endpoint_url=ENV.S3_URL, + aws_access_key_id=ENV.S3_ACCESS_KEY_ID, + aws_secret_access_key=ENV.S3_SECRET_KEY, + ) + + +def generate_s3_url(scan_id: str) -> str: + """Generate a pre-signed S3 url for putting shared files.""" + s3_client = _get_client() + + # get GET url + get_url = s3_client.generate_presigned_url( + "get_object", + Params={ + "Bucket": ENV.S3_BUCKET, + "Key": f"{scan_id}-s3-object", + }, + ExpiresIn=24 * 60 * 60, # seconds + ) + return get_url + + +def upload_to_s3(fpath: pathlib.Path) -> str: + """Upload a file to S3.""" + s3_client = _get_client() + + # POST + upload_details = s3_client.generate_presigned_post( + ENV.S3_BUCKET, ENV.S3_OBJECT_DEST_FILE + ) + + LOGGER.info("uploading S3...") + with open(fpath, "rb") as f: + response 
= requests.post( + upload_details["url"], + data=upload_details["fields"], + files={"file": (fpath.name, f)}, # maps filename to obj + ) + + print(f"Upload response: {response.status_code}") + print(str(response.content)) From c53eef4bfc6f6d1840d6033f58ed4555e4fb0d97 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 15:38:19 -0600 Subject: [PATCH 023/327] use yaml to make k8s job; put in own mongo collection --- setup.cfg | 76 ++++++------- skydriver/database/schema.py | 4 +- skydriver/database/utils.py | 8 ++ skydriver/k8s/scan_backlog.py | 3 - skydriver/k8s/scanner_instance.py | 170 ++++++++++++++++++++---------- skydriver/rest_handlers.py | 27 +++-- 6 files changed, 181 insertions(+), 107 deletions(-) diff --git a/setup.cfg b/setup.cfg index 33bb48ba..6d33fc62 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.10 python_max = 3.11 patch_without_tag = False package_dirs = - skydriver - clientmanager - ewms_sidecar + skydriver + clientmanager + ewms_sidecar [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-clientmanager-ewms-sidecar [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,49 +26,49 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - boto3 - dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools + boto3 + dacite + htcondor + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools + pyyaml python_requires = >=3.10, <3.12 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio + pytest + pytest-asyncio + pytest-mock + nest-asyncio mypy = - %(tests)s - texttable + %(tests)s + texttable 
[options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - clientmanager - ewms_sidecar - skydriver.* - clientmanager.* - ewms_sidecar.* + skydriver + clientmanager + ewms_sidecar + skydriver.* + clientmanager.* + ewms_sidecar.* exclude = - test - tests - doc - docs - resource - resources - example - examples - + test + tests + doc + docs + resource + resources + example + examples diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 098b84c1..ef609163 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -42,7 +42,7 @@ class ScanBacklogEntry(ScanIDDataclass): """An entry for the scan backlog used for rate-limiting.""" timestamp: float - pickled_k8s_job: bytes + pickled_k8s_job: bytes | None = None # **DEPRECATED** replaced SkyScanK8sJob in db priority: int = 0 pending_timestamp: float = 0.0 next_attempt: int = 0 @@ -144,7 +144,7 @@ class Manifest(ScanIDDataclass): ewms_task: dict | str # `""` -> workflow request has not (yet) been sent to EWMS # ^^^ str -> EWMS workflow id (i.e. 
this id points to info in EWMS) - # ^^^ dict -> *DEPRECATED* was used in skydriver 1.x to use local k8s starter/stopper + # ^^^ dict -> **DEPRECATED** was used in skydriver 1.x to use local k8s starter/stopper # attrs placed in k8s job obj scanner_server_args: str diff --git a/skydriver/database/utils.py b/skydriver/database/utils.py index 4f1dc522..d538fef3 100644 --- a/skydriver/database/utils.py +++ b/skydriver/database/utils.py @@ -11,6 +11,7 @@ _SCAN_BACKLOG_COLL_NAME = "ScanBacklog" _SCAN_REQUEST_COLL_NAME = "ScanRequests" _I3_EVENT_COLL_NAME = "I3Events" +_SKYSCAN_K8S_JOB_COLL_NAME = "SkyScanK8sJobs" async def ensure_indexes(motor_client: AsyncIOMotorClient) -> None: # type: ignore[valid-type] @@ -32,6 +33,13 @@ async def ensure_indexes(motor_client: AsyncIOMotorClient) -> None: # type: ign unique=True, ) + # SKYSCAN K8S JOB COLL + await motor_client[_DB_NAME][_SKYSCAN_K8S_JOB_COLL_NAME].create_index( # type: ignore[index] + "scan_id", + name="scan_id_index", + unique=True, + ) + # MANIFEST COLL await motor_client[_DB_NAME][_MANIFEST_COLL_NAME].create_index( # type: ignore[index] "scan_id", diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index d57d2103..27fba555 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -5,7 +5,6 @@ import pickle import time -import bson import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient @@ -20,7 +19,6 @@ async def designate_for_startup( scan_id: str, - job_obj: kubernetes.client.V1Job, scan_backlog: database.interface.ScanBacklogClient, priority: int, ) -> None: @@ -30,7 +28,6 @@ async def designate_for_startup( entry = database.schema.ScanBacklogEntry( scan_id=scan_id, timestamp=time.time(), - pickled_k8s_job=bson.Binary(pickle.dumps(job_obj)), priority=priority, ) await scan_backlog.insert(entry) diff --git a/skydriver/k8s/scanner_instance.py 
b/skydriver/k8s/scanner_instance.py index 1aa70534..2ae11469 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -2,30 +2,23 @@ instances.""" import logging +import textwrap from pathlib import Path +from typing import Any -import humanfriendly import kubernetes.client # type: ignore[import-untyped] +import yaml from rest_tools.client import ClientCredentialsAuth -from .utils import KubeAPITools from .. import images from ..config import ( DebugMode, ENV, ) -from ..database import schema LOGGER = logging.getLogger(__name__) - -def get_cluster_auth_v1envvars( - cluster: schema.InHouseClusterInfo, -) -> list[kubernetes.client.V1EnvVar]: - """Get the `V1EnvVar`s for workers' auth.""" - LOGGER.debug(f"getting auth secret env vars for {cluster=}") - _, info = cluster.to_known_cluster() - return info["v1envvars"] # type: ignore[no-any-return] +sdict = dict[str, Any] def get_cluster_starter_s3_v1envvars() -> list[kubernetes.client.V1EnvVar]: @@ -52,11 +45,13 @@ def get_cluster_starter_s3_v1envvars() -> list[kubernetes.client.V1EnvVar]: ] -class SkymapScannerK8sWrapper: - """Wraps a Skymap Scanner Kubernetes job with tools to start and manage.""" +class SkyScanK8sJobFactory: + """Makes Skymap Scanner Kubernetes jobs, plus misc tools.""" + + COMMON_SPACE_VOLUME_PATH = Path("/common-space") - def __init__( - self, + @staticmethod + def make( # docker_tag: str, scan_id: str, @@ -74,57 +69,117 @@ def __init__( rest_address: str, skyscan_mq_client_timeout_wait_for_first_message: int | None, scanner_server_env_from_user: dict, - ): - LOGGER.info(f"making k8s job for {scan_id=}") - self.scan_id = scan_id - self.env_dict = {} + ) -> tuple[sdict, str]: + """Make the K8s job dict. - common_space_volume_path = Path("/common-space") + Also, returns the server's args (so the user can see this later). 
+ """ + LOGGER.info(f"making k8s job for {scan_id=}") - # CONTAINER: SkyScan Server - self.scanner_server_args = self.get_scanner_server_args( - common_space_volume_path=common_space_volume_path, + # pre-create some job components + scanner_server_args = SkyScanK8sJobFactory.get_scanner_server_args( reco_algo=reco_algo, nsides=nsides, is_real_event=is_real_event, predictive_scanning_threshold=predictive_scanning_threshold, ) - scanner_server = KubeAPITools.create_container( - f"skyscan-server-{scan_id}", - images.get_skyscan_docker_image(docker_tag), - self.make_skyscan_server_v1envvars( - rest_address=rest_address, - scan_id=scan_id, - skyscan_mq_client_timeout_wait_for_first_message=skyscan_mq_client_timeout_wait_for_first_message, - scanner_server_env_from_user=scanner_server_env_from_user, - ), - self.scanner_server_args.split(), - cpu=1, - volumes={common_space_volume_path.name: common_space_volume_path}, - memory=scanner_server_memory_bytes, + scanner_server_envvars = SkyScanK8sJobFactory.make_skyscan_server_v1envvars( + rest_address=rest_address, + scan_id=scan_id, + skyscan_mq_client_timeout_wait_for_first_message=skyscan_mq_client_timeout_wait_for_first_message, + scanner_server_env_from_user=scanner_server_env_from_user, ) - self.env_dict["scanner_server"] = [e.to_dict() for e in scanner_server.env] - # s3 uploader - s3_uploader = KubeAPITools.create_container( - f"s3-uploader-{scan_id}", - images.get_skyscan_docker_image(docker_tag), - [], - "python s3_uploader.py".split(), # TODO: write an impromptu script and put in job - cpu=0.25, - volumes={common_space_volume_path.name: common_space_volume_path}, - memory=humanfriendly.parse_size("0.25 G"), + # assemble the job + job_dict = SkyScanK8sJobFactory._make_job( + scan_id, + docker_tag, + s3_obj_url, + scanner_server_memory_bytes, + scanner_server_args, + scanner_server_envvars, ) - # job - self.job_obj = KubeAPITools.kube_create_job_object( - self.get_job_name(scan_id), - [scanner_server, s3_uploader], - 
ENV.K8S_NAMESPACE, - ENV.K8S_TTL_SECONDS_AFTER_FINISHED, - volumes=[common_space_volume_path.name], + return job_dict, scanner_server_args + + @staticmethod + def _make_job( + scan_id: str, + docker_tag: str, + s3_obj_url: str, + scanner_server_memory_bytes: int, + scanner_server_args: str, + scanner_server_envvars: list[kubernetes.client.V1EnvVar], + ) -> sdict: + """Create the K8s job manifest. + + NOTE: Let's keep definitions as straightforward as possible. + """ + job_yaml = textwrap.dedent( # fixes """-indentation + f""" + apiVersion: batch/v1 + kind: Job + metadata: + namespace: {ENV.K8S_NAMESPACE} + name: {SkyScanK8sJobFactory.get_job_name(scan_id)} + labels: + app.kubernetes.io/instance: {ENV.K8S_APPLICATION_NAME} + annotations: + argocd.argoproj.io/sync-options: "Prune=false" + spec: + ttlSecondsAfterFinished: {ENV.K8S_TTL_SECONDS_AFTER_FINISHED} + backoffLimit: 0 + activeDeadlineSeconds: {ENV.K8S_ACTIVE_DEADLINE_SECONDS} + template: + metadata: + labels: + app: scanner-instance + spec: + serviceAccountName: {ENV.K8S_SKYSCAN_JOBS_SERVICE_ACCOUNT} + restartPolicy: Never + containers: + - name: skyscan-server-{scan_id} + image: {images.get_skyscan_docker_image(docker_tag)} + command: [] + args: {scanner_server_args.split()} + env: + resources: + limits: + memory: "{scanner_server_memory_bytes}" + cpu: "1" + requests: + memory: "{scanner_server_memory_bytes}" + cpu: "1" + ephemeral-storage: "1M" + volumeMounts: + - name: common-space-volume + mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" + - name: s3-uploader-{scan_id} + restartPolicy: OnFailure + image: {images.get_skyscan_docker_image(docker_tag)} + command: [] + args: ["echo", "{s3_obj_url}"] + resources: + limits: + memory: "256Mi" + cpu: "0.25" + requests: + memory: "256Mi" + cpu: "0.25" + ephemeral-storage: "1M" + volumeMounts: + - name: common-space-volume + mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" + volumes: + - name: common-space-volume + emptyDir: {{}} + """ ) + # 
Parse the YAML string into a Python dictionary + job_dict = yaml.safe_load(job_yaml) + return job_dict + @staticmethod def get_job_name(scan_id: str) -> str: """Get the name of the K8s job (deterministic).""" @@ -132,7 +187,6 @@ def get_job_name(scan_id: str) -> str: @staticmethod def get_scanner_server_args( - common_space_volume_path: Path, reco_algo: str, nsides: dict[int, int], is_real_event: bool, @@ -142,9 +196,9 @@ def get_scanner_server_args( args = ( f"python -m skymap_scanner.server " f" --reco-algo {reco_algo}" - f" --cache-dir {common_space_volume_path} " + f" --cache-dir {SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH} " # f" --output-dir {common_space_volume_path} " # output is sent to skydriver - f" --client-startup-json {common_space_volume_path/'startup.json'} " + f" --client-startup-json {SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'} " f" --nsides {' '.join(f'{n}:{x}' for n,x in nsides.items())} " # k1:v1 k2:v2 f" {'--real-event' if is_real_event else '--simulated-event'} " f" --predictive-scanning-threshold {predictive_scanning_threshold} " @@ -227,12 +281,12 @@ def make_skyscan_server_v1envvars( # 4. 
generate & add auth tokens tokens = { - "SKYSCAN_BROKER_AUTH": SkymapScannerK8sWrapper._get_token_from_keycloak( + "SKYSCAN_BROKER_AUTH": SkyScanK8sJobFactory._get_token_from_keycloak( ENV.KEYCLOAK_OIDC_URL, ENV.KEYCLOAK_CLIENT_ID_BROKER, ENV.KEYCLOAK_CLIENT_SECRET_BROKER, ), - "SKYSCAN_SKYDRIVER_AUTH": SkymapScannerK8sWrapper._get_token_from_keycloak( + "SKYSCAN_SKYDRIVER_AUTH": SkyScanK8sJobFactory._get_token_from_keycloak( ENV.KEYCLOAK_OIDC_URL, ENV.KEYCLOAK_CLIENT_ID_SKYDRIVER_REST, ENV.KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST, diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 018466f5..2a3196b7 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -40,7 +40,7 @@ from .database import schema from .ewms import request_stop_on_ewms from .k8s.scan_backlog import designate_for_startup -from .k8s.scanner_instance import SkymapScannerK8sWrapper +from .k8s.scanner_instance import SkyScanK8sJobFactory LOGGER = logging.getLogger(__name__) @@ -144,6 +144,12 @@ def initialize( # type: ignore # pylint: disable=W0221 database.utils._I3_EVENT_COLL_NAME, ) ) + self.skyscan_k8s_job_coll = ( + AsyncIOMotorCollection( # in contrast, this one is accessed directly + mongo_client[database.interface._DB_NAME], # type: ignore[index] + database.utils._SKYSCAN_K8S_JOB_COLL_NAME, + ) + ) self.k8s_batch_api = k8s_batch_api self.ewms_rc = ewms_rc @@ -533,6 +539,7 @@ async def post(self) -> None: manifest = await _start_scan( self.manifests, self.scan_backlog, + self.skyscan_k8s_job_coll, scan_request_obj, ) self.write( @@ -543,6 +550,7 @@ async def post(self) -> None: async def _start_scan( manifests: database.interface.ManifestClient, scan_backlog: database.interface.ScanBacklogClient, + skyscan_k8s_job_coll: AsyncIOMotorCollection, scan_request_obj: dict, new_scan_id: str = "", # don't use scan_request_obj.scan_id--this could be a rescan ) -> schema.Manifest: @@ -550,7 +558,7 @@ async def _start_scan( s3_obj_url = s3.generate_s3_url(scan_id) # 
get the container info ready - scanner_wrapper = SkymapScannerK8sWrapper( + skyscan_k8s_job_dict, scanner_server_args = SkyScanK8sJobFactory.make( docker_tag=scan_request_obj["docker_tag"], scan_id=scan_id, # server @@ -578,16 +586,22 @@ async def _start_scan( timestamp=time.time(), is_deleted=False, i3_event_id=scan_request_obj["i3_event_id"], - scanner_server_args=scanner_wrapper.scanner_server_args, + scanner_server_args=scanner_server_args, ewms_task="", # set once the workflow request has been sent to EWMS (see backlogger) classifiers=scan_request_obj["classifiers"], priority=scan_request_obj["priority"], ) await manifests.put(manifest) + await skyscan_k8s_job_coll.insert_one( + { + "scan_id": scan_id, + "k8_job": skyscan_k8s_job_dict, + } + ) + await designate_for_startup( scan_id, - scanner_wrapper.job_obj, scan_backlog, scan_request_obj["priority"], ) @@ -647,6 +661,7 @@ async def post(self, scan_id: str) -> None: manifest = await _start_scan( self.manifests, self.scan_backlog, + self.skyscan_k8s_job_coll, scan_request_obj, new_scan_id=new_scan_id, ) @@ -1027,7 +1042,7 @@ async def get(self, scan_id: str) -> None: try: pods_411["pod_status"] = k8s.utils.KubeAPITools.get_pod_status( self.k8s_batch_api, - SkymapScannerK8sWrapper.get_job_name(scan_id), + SkyScanK8sJobFactory.get_job_name(scan_id), ENV.K8S_NAMESPACE, ) pods_411["pod_message"] = "retrieved" @@ -1071,7 +1086,7 @@ async def get(self, scan_id: str) -> None: try: pod_container_logs = k8s.utils.KubeAPITools.get_container_logs( self.k8s_batch_api, - SkymapScannerK8sWrapper.get_job_name(scan_id), + SkyScanK8sJobFactory.get_job_name(scan_id), ENV.K8S_NAMESPACE, ) pod_container_logs_message = "retrieved" From 133d8d3d5cd5937a77fb96ed9e897555c2484283 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 15:46:31 -0600 Subject: [PATCH 024/327] use yaml to make k8s job - 2 (wip) --- skydriver/k8s/scanner_instance.py | 58 ++++--------------------------- 1 file changed, 6 insertions(+), 52 
deletions(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 2ae11469..4ffc544f 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -6,7 +6,6 @@ from pathlib import Path from typing import Any -import kubernetes.client # type: ignore[import-untyped] import yaml from rest_tools.client import ClientCredentialsAuth @@ -21,30 +20,6 @@ sdict = dict[str, Any] -def get_cluster_starter_s3_v1envvars() -> list[kubernetes.client.V1EnvVar]: - """Get the `V1EnvVar`s for TMS's S3 auth.""" - return [ - kubernetes.client.V1EnvVar( - name="EWMS_TMS_S3_ACCESS_KEY_ID", - value_from=kubernetes.client.V1EnvVarSource( - secret_key_ref=kubernetes.client.V1SecretKeySelector( - name=ENV.K8S_SECRET_NAME, - key="ewms_tms_s3_access_key_id", - ) - ), - ), - kubernetes.client.V1EnvVar( - name="EWMS_TMS_S3_SECRET_KEY", - value_from=kubernetes.client.V1EnvVarSource( - secret_key_ref=kubernetes.client.V1SecretKeySelector( - name=ENV.K8S_SECRET_NAME, - key="ewms_tms_s3_secret_key", - ) - ), - ), - ] - - class SkyScanK8sJobFactory: """Makes Skymap Scanner Kubernetes jobs, plus misc tools.""" @@ -109,7 +84,7 @@ def _make_job( s3_obj_url: str, scanner_server_memory_bytes: int, scanner_server_args: str, - scanner_server_envvars: list[kubernetes.client.V1EnvVar], + scanner_server_envvars: list[tuple[str, str]], ) -> sdict: """Create the K8s job manifest. @@ -228,7 +203,7 @@ def make_skyscan_server_v1envvars( scan_id: str, skyscan_mq_client_timeout_wait_for_first_message: int | None, scanner_server_env_from_user: dict, - ) -> list[kubernetes.client.V1EnvVar]: + ) -> list[tuple[str, str]]: """Get the environment variables provided to the skyscan server. Also, get the secrets' keys & their values. 
@@ -248,12 +223,7 @@ def make_skyscan_server_v1envvars( "SKYSCAN_SKYDRIVER_ADDRESS": rest_address, "SKYSCAN_SKYDRIVER_SCAN_ID": scan_id, } - env.extend( - [ - kubernetes.client.V1EnvVar(name=k, value=str(v)) - for k, v in required.items() - ] - ) + env.extend([(k, str(v)) for k, v in required.items()]) # 3. add extra env vars, then filter out if 'None' prefiltered = { @@ -271,13 +241,7 @@ def make_skyscan_server_v1envvars( # "SKYSCAN_MQ_CLIENT_TIMEOUT_WAIT_FOR_FIRST_MESSAGE": skyscan_mq_client_timeout_wait_for_first_message, } - env.extend( - [ - kubernetes.client.V1EnvVar(name=k, value=str(v)) - for k, v in prefiltered.items() - if v is not None - ] - ) + env.extend([(k, str(v)) for k, v in prefiltered.items() if v is not None]) # 4. generate & add auth tokens tokens = { @@ -292,19 +256,9 @@ def make_skyscan_server_v1envvars( ENV.KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST, ), } - env.extend( - [ - kubernetes.client.V1EnvVar(name=k, value=str(v)) - for k, v in tokens.items() - ] - ) + env.extend([(k, str(v)) for k, v in tokens.items()]) # 5. 
Add user's env - env.extend( - [ - kubernetes.client.V1EnvVar(name=k, value=str(v)) - for k, v in scanner_server_env_from_user.items() - ] - ) + env.extend([(k, str(v)) for k, v in scanner_server_env_from_user.items()]) return env From 8d0a1b3300bbab9d2ca9d3d895703e2e545874ad Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 15:58:06 -0600 Subject: [PATCH 025/327] use yaml to make k8s job - 3 --- skydriver/k8s/scanner_instance.py | 52 +++++++++++++++++-------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 4ffc544f..f1186db0 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -58,7 +58,7 @@ def make( is_real_event=is_real_event, predictive_scanning_threshold=predictive_scanning_threshold, ) - scanner_server_envvars = SkyScanK8sJobFactory.make_skyscan_server_v1envvars( + scanner_server_envvars = SkyScanK8sJobFactory.make_skyscan_server_envvars( rest_address=rest_address, scan_id=scan_id, skyscan_mq_client_timeout_wait_for_first_message=skyscan_mq_client_timeout_wait_for_first_message, @@ -84,12 +84,25 @@ def _make_job( s3_obj_url: str, scanner_server_memory_bytes: int, scanner_server_args: str, - scanner_server_envvars: list[tuple[str, str]], + scanner_server_envvars: sdict, ) -> sdict: """Create the K8s job manifest. NOTE: Let's keep definitions as straightforward as possible. 
""" + + # first, convert obj-based attrs to yaml-syntax + # -> inline, compact formatting, no indenting needed + scanner_env_yaml = yaml.safe_dump( + [{"name": k, "value": v} for k, v in scanner_server_envvars.items()], + default_flow_style=True, + ) + scanner_args_yaml = yaml.safe_dump( + scanner_server_args.split(), + default_flow_style=True, + ) + + # now, assemble job_yaml = textwrap.dedent( # fixes """-indentation f""" apiVersion: batch/v1 @@ -116,8 +129,8 @@ def _make_job( - name: skyscan-server-{scan_id} image: {images.get_skyscan_docker_image(docker_tag)} command: [] - args: {scanner_server_args.split()} - env: + args: {scanner_args_yaml} + env: {scanner_env_yaml} resources: limits: memory: "{scanner_server_memory_bytes}" @@ -198,24 +211,17 @@ def _get_token_from_keycloak( return token @staticmethod - def make_skyscan_server_v1envvars( + def make_skyscan_server_envvars( rest_address: str, scan_id: str, skyscan_mq_client_timeout_wait_for_first_message: int | None, scanner_server_env_from_user: dict, - ) -> list[tuple[str, str]]: - """Get the environment variables provided to the skyscan server. - - Also, get the secrets' keys & their values. - """ + ) -> sdict: + """Get the environment variables provided to the skyscan server.""" LOGGER.debug(f"making scanner server env vars for {scan_id=}") - env = [] - - # 1. start w/ secrets - # NOTE: the values come from an existing secret in the current namespace - # *none* + env = {} - # 2. add required env vars + # 1. add required env vars required = { # broker/mq vars "SKYSCAN_BROKER_ADDRESS": ENV.SKYSCAN_BROKER_ADDRESS, @@ -223,9 +229,9 @@ def make_skyscan_server_v1envvars( "SKYSCAN_SKYDRIVER_ADDRESS": rest_address, "SKYSCAN_SKYDRIVER_SCAN_ID": scan_id, } - env.extend([(k, str(v)) for k, v in required.items()]) + env.update(required) - # 3. add extra env vars, then filter out if 'None' + # 2. 
add extra env vars, then filter out if 'None' prefiltered = { "SKYSCAN_PROGRESS_INTERVAL_SEC": ENV.SKYSCAN_PROGRESS_INTERVAL_SEC, "SKYSCAN_RESULT_INTERVAL_SEC": ENV.SKYSCAN_RESULT_INTERVAL_SEC, @@ -241,9 +247,9 @@ def make_skyscan_server_v1envvars( # "SKYSCAN_MQ_CLIENT_TIMEOUT_WAIT_FOR_FIRST_MESSAGE": skyscan_mq_client_timeout_wait_for_first_message, } - env.extend([(k, str(v)) for k, v in prefiltered.items() if v is not None]) + env.update({k: str(v) for k, v in prefiltered.items() if v is not None}) - # 4. generate & add auth tokens + # 3. generate & add auth tokens tokens = { "SKYSCAN_BROKER_AUTH": SkyScanK8sJobFactory._get_token_from_keycloak( ENV.KEYCLOAK_OIDC_URL, @@ -256,9 +262,9 @@ def make_skyscan_server_v1envvars( ENV.KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST, ), } - env.extend([(k, str(v)) for k, v in tokens.items()]) + env.update(tokens) - # 5. Add user's env - env.extend([(k, str(v)) for k, v in scanner_server_env_from_user.items()]) + # 4. Add user's env + env.update(scanner_server_env_from_user) return env From f9d828aaf9d15faa59aa29894870707e34c4cd54 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 16:43:27 -0600 Subject: [PATCH 026/327] use yaml to make k8s job - 4 --- skydriver/config.py | 2 + skydriver/k8s/scan_backlog.py | 57 +++++++------ skydriver/k8s/scanner_instance.py | 4 +- skydriver/k8s/utils.py | 136 ++---------------------------- skydriver/rest_handlers.py | 2 +- 5 files changed, 40 insertions(+), 161 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index a8cd2b3d..5c542b29 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -9,6 +9,8 @@ import kubernetes.client # type: ignore[import-untyped] from wipac_dev_tools import from_environment_as_dataclass, logging_tools +sdict = dict[str, Any] + # -------------------------------------------------------------------------------------- # Constants diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 27fba555..6dcea627 100644 --- 
a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -2,10 +2,9 @@ import asyncio import logging -import pickle import time -import kubernetes.client # type: ignore[import-untyped] +import kubernetes.client.V1Job # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient from tornado import web @@ -39,12 +38,13 @@ async def designate_for_startup( ) -async def get_next_backlog_entry( +async def get_next( scan_backlog: database.interface.ScanBacklogClient, manifests: database.interface.ManifestClient, scan_request_client: AsyncIOMotorCollection, + skyscan_k8s_job_client: AsyncIOMotorClient, include_low_priority_scans: bool, -) -> tuple[database.schema.ScanBacklogEntry, database.schema.Manifest, dict]: +) -> tuple[database.schema.ScanBacklogEntry, database.schema.Manifest, dict, dict]: """Get the next entry & remove any that have been cancelled.""" while True: # get next up -- raises DocumentNotFoundException if none @@ -70,8 +70,12 @@ async def get_next_backlog_entry( {"scan_id": manifest.scan_id} ) + # grab the k8s + doc = await skyscan_k8s_job_client.find_one({"scan_id": manifest.scan_id}) + skyscan_k8s_job = doc["skyscan_k8s_job_dict"] + # all good! 
- return entry, manifest, scan_request_obj # ready to start job + return entry, manifest, scan_request_obj, skyscan_k8s_job async def run( @@ -166,6 +170,12 @@ async def _run( database.utils._SCAN_REQUEST_COLL_NAME, ) ) + skyscan_k8s_job_client = ( + AsyncIOMotorCollection( # in contrast, this one is accessed directly + mongo_client[database.interface._DB_NAME], # type: ignore[index] + database.utils._SKYSCAN_K8S_JOB_COLL_NAME, + ) + ) last_log_heartbeat = 0.0 # log every so often, not on every iteration long_interval_timer = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) @@ -176,10 +186,11 @@ async def _run( # get next entry try: - entry, manifest, scan_request_obj = await get_next_backlog_entry( + entry, manifest, scan_request_obj, skyscan_k8s_job = await get_next( backlog_client, manifest_client, scan_request_client, + skyscan_k8s_job_client, # include low priority scans only when enough time has passed include_low_priority_scans=long_interval_timer.has_interval_elapsed(), ) @@ -187,32 +198,21 @@ async def _run( long_interval_timer.fastforward() continue # empty queue- - # request a workflow on EWMS? 
- if not isinstance(manifest.ewms_task, database.schema.InHouseStarterInfo): - try: - workflow_id = await ewms.request_workflow_on_ewms( - ewms_rc, - manifest, - scan_request_obj, - ) - except Exception as e: - LOGGER.exception(e) - long_interval_timer.fastforward() # nothing was started, so don't wait long - continue - await manifest_client.collection.find_one_and_update( - {"scan_id": manifest.scan_id}, - {"$set": {"ewms_task": workflow_id}}, - ) - - # TODO: Start K8s Job - - # get k8s job object + # request a workflow on EWMS try: - job_obj = pickle.loads(entry.pickled_k8s_job) + workflow_id = await ewms.request_workflow_on_ewms( + ewms_rc, + manifest, + scan_request_obj, + ) except Exception as e: LOGGER.exception(e) long_interval_timer.fastforward() # nothing was started, so don't wait long continue + await manifest_client.collection.find_one_and_update( + {"scan_id": manifest.scan_id}, + {"$set": {"ewms_task": workflow_id}}, + ) LOGGER.info( f"Starting Scanner Instance: ({entry.scan_id=}) ({entry.timestamp})" @@ -221,7 +221,7 @@ async def _run( # start k8s job -- this could be any k8s job (pre- or post-ewms switchover) try: - resp = KubeAPITools.start_job(k8s_batch_api, job_obj) + resp = KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) LOGGER.info(resp) except kubernetes.client.exceptions.ApiException as e: # k8s job (backlog entry) will be revived & restarted in future iteration @@ -231,3 +231,4 @@ async def _run( # remove from backlog now that startup succeeded await backlog_client.remove(entry) + # TODO: remove k8s job doc? 
diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index f1186db0..fb1ed0b6 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -4,7 +4,6 @@ import logging import textwrap from pathlib import Path -from typing import Any import yaml from rest_tools.client import ClientCredentialsAuth @@ -13,12 +12,11 @@ from ..config import ( DebugMode, ENV, + sdict, ) LOGGER = logging.getLogger(__name__) -sdict = dict[str, Any] - class SkyScanK8sJobFactory: """Makes Skymap Scanner Kubernetes jobs, plus misc tools.""" diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 2fd98e7b..854ad3a6 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -2,13 +2,12 @@ import json import logging -from pathlib import Path from typing import Any, Iterator import kubernetes.client # type: ignore[import-untyped] from kubernetes.client.rest import ApiException # type: ignore[import-untyped] -from ..config import ENV, K8S_CONTAINER_MEMORY_DEFAULT_BYTES +from ..config import ENV, sdict LOGGER = logging.getLogger(__name__) @@ -16,143 +15,22 @@ class KubeAPITools: """A convenience wrapper around `kubernetes.client`.""" - @staticmethod - def kube_create_job_object( - name: str, - containers: list[kubernetes.client.V1Container], - namespace: str, - ttl_seconds_after_finished: int, - volumes: list[str] | None = None, # volume names - n_retries: int = 0, - ) -> kubernetes.client.V1Job: - """Create a k8 Job Object Minimum definition of a job object. - - Based on https://blog.pythian.com/how-to-create-kubernetes-jobs-with-python/ - - {'api_version': None, - Str - 'kind': None, - Str - 'metadata': None, - Metada Object - 'spec': None, -V1JobSpec - 'status': None} - V1Job Status - Docs: https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Job.md - Docs2: https://kubernetes.io/docs/concepts/workloads/controllers/jobs-run-to-completion/#writing-a-job-spec - - Also docs are pretty pretty bad. 
Best way is to ´pip install kubernetes´ and go via the autogenerated code - And figure out the chain of objects that you need to hold a final valid object So for a job object you need: - V1Job -> V1ObjectMeta - -> V1JobStatus - -> V1JobSpec -> V1PodTemplate -> V1PodTemplateSpec -> V1Container - - Now the tricky part, is that V1Job.spec needs a .template, but not a PodTemplateSpec, as such - you need to build a PodTemplate, add a template field (template.template) and make sure - template.template.spec is now the PodSpec. - Then, the V1Job.spec needs to be a JobSpec which has a template the template.template field of the PodTemplate. - Failure to do so will trigger an API error. - - Also Containers must be a list! - - Docs3: https://github.com/kubernetes-client/python/issues/589 - """ - if not volumes: - volumes = [] - - # Body is the object Body - body = kubernetes.client.V1Job(api_version="batch/v1", kind="Job") - # Body needs Metadata - # Attention: Each JOB must have a different name! - body.metadata = kubernetes.client.V1ObjectMeta( - namespace=namespace, - name=name, - labels={ - # https://argo-cd.readthedocs.io/en/stable/user-guide/resource_tracking/ - "app.kubernetes.io/instance": ENV.K8S_APPLICATION_NAME, - }, - annotations={ - "argocd.argoproj.io/sync-options": "Prune=false" # don't want argocd to prune this job - }, - ) - # And a Status - body.status = kubernetes.client.V1JobStatus() - # Now we start with the Template... 
- template = kubernetes.client.V1PodTemplate() - template.template = kubernetes.client.V1PodTemplateSpec( - metadata=kubernetes.client.V1ObjectMeta( - labels={ - "app": "scanner-instance", - }, - ), - ) - # Make Pod Spec - template.template.spec = kubernetes.client.V1PodSpec( - service_account_name=ENV.K8S_SKYSCAN_JOBS_SERVICE_ACCOUNT, - containers=containers, - restart_policy="Never", - volumes=[ - kubernetes.client.V1Volume( - name=n, empty_dir=kubernetes.client.V1EmptyDirVolumeSource() - ) - for n in volumes - ], - ) - # And finaly we can create our V1JobSpec! - body.spec = kubernetes.client.V1JobSpec( - ttl_seconds_after_finished=ttl_seconds_after_finished, - template=template.template, - backoff_limit=n_retries, - active_deadline_seconds=ENV.K8S_ACTIVE_DEADLINE_SECONDS, - ) - return body - - @staticmethod - def create_container( - name: str, - image: str, - env: list[kubernetes.client.V1EnvVar], - args: list[str], - cpu: float, - volumes: dict[str, Path] | None = None, - memory: int = K8S_CONTAINER_MEMORY_DEFAULT_BYTES, - ) -> kubernetes.client.V1Container: - """Make a Container instance.""" - if not volumes: - volumes = {} - - return kubernetes.client.V1Container( - name=name, - image=image, - env=env, - args=args, - volume_mounts=[ - kubernetes.client.V1VolumeMount(name=vol, mount_path=str(mnt)) - for vol, mnt in volumes.items() - ], - resources=kubernetes.client.V1ResourceRequirements( - limits={ - "memory": memory, - "cpu": str(cpu), - }, - requests={ - "memory": memory, - "cpu": str(cpu), - "ephemeral-storage": "1M", - }, - ), - ) - @staticmethod def start_job( k8s_batch_api: kubernetes.client.BatchV1Api, - job_obj: kubernetes.client.V1Job, + job_dict: sdict, ) -> Any: """Start the k8s job. Returns REST response. 
""" - if not job_obj: + if not job_dict: raise ValueError("Job object not created") try: - api_response = k8s_batch_api.create_namespaced_job( - ENV.K8S_NAMESPACE, job_obj + api_response = kubernetes.utils.create_from_dict( + k8s_batch_api, + job_dict, + namespace=ENV.K8S_NAMESPACE, ) LOGGER.info(api_response) except ApiException as e: diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 2a3196b7..12c29c56 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -596,7 +596,7 @@ async def _start_scan( await skyscan_k8s_job_coll.insert_one( { "scan_id": scan_id, - "k8_job": skyscan_k8s_job_dict, + "skyscan_k8s_job_dict": skyscan_k8s_job_dict, } ) From 678875801c6321b7b90d546d21ae4ffa5e2de546 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 16:45:03 -0600 Subject: [PATCH 027/327] use yaml to make k8s job - 5 --- skydriver/rest_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 12c29c56..4dfebec3 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -592,7 +592,6 @@ async def _start_scan( priority=scan_request_obj["priority"], ) await manifests.put(manifest) - await skyscan_k8s_job_coll.insert_one( { "scan_id": scan_id, @@ -600,6 +599,7 @@ async def _start_scan( } ) + # place on backlog await designate_for_startup( scan_id, scan_backlog, From 9278b82aed36265f504c830f7cbe51c0f64ed035 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 16:46:32 -0600 Subject: [PATCH 028/327] misc --- skydriver/rest_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 4dfebec3..31e20ade 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -756,7 +756,7 @@ async def delete(self, scan_id: str) -> None: # check DB states manifest = await self.manifests.get(scan_id, True) if ( - manifest.ewms_task.complete and 
not args.delete_completed_scan + manifest.ewms_finished and not args.delete_completed_scan ): # workforce is done msg = "Attempted to delete a completed scan (must use `delete_completed_scan=True`)" raise web.HTTPError( From 0c3e1991b521f6dec96845b2df6712987f05122d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 17:55:09 -0600 Subject: [PATCH 029/327] attempt 1: use configmap for s3 uploader --- skydriver/config.py | 6 ++++-- skydriver/k8s/scanner_instance.py | 31 +++++++++++++++++++++++++++++-- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 5c542b29..55385562 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -50,8 +50,10 @@ class EnvConfig: # s3 S3_URL: str - S3_ACCESS_KEY_ID: str - S3_SECRET_KEY: str + S3_ACCESS_KEY_ID: str # the actual value + S3_ACCESS_KEY_ID__K8S_SECRET_KEY: str # the key used in the k8s secrets.yml + S3_SECRET_KEY: str # the actual value + S3_SECRET_KEY__K8S_SECRET_KEY: str # the key used in the k8s secrets.yml S3_BUCKET: str # misc diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index fb1ed0b6..e2542e34 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -143,8 +143,25 @@ def _make_job( - name: s3-uploader-{scan_id} restartPolicy: OnFailure image: {images.get_skyscan_docker_image(docker_tag)} - command: [] - args: ["echo", "{s3_obj_url}"] + command: ["/scripts/s3_upload.sh"] + args: ["{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'}"] + env: + - name: S3_URL + value: "{ENV.S3_URL}" + - name: S3_ACCESS_KEY_ID + valueFrom: + secretKeyRef: + name: {ENV.K8S_SECRET_NAME} + key: {ENV.S3_ACCESS_KEY_ID__K8S_SECRET_KEY} + - name: S3_SECRET_KEY + valueFrom: + secretKeyRef: + name: {ENV.K8S_SECRET_NAME} + key: {ENV.S3_SECRET_KEY__K8S_SECRET_KEY} + - name: S3_BUCKET + value: "{ENV.S3_BUCKET}" + - name: S3_OBJECT_DEST_FILE + value: "path/to/object" # Replace with destination path 
resources: limits: memory: "256Mi" @@ -156,9 +173,19 @@ def _make_job( volumeMounts: - name: common-space-volume mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" + - name: scripts + mountPath: /scripts + readOnly: true volumes: - name: common-space-volume emptyDir: {{}} + - name: scripts + configMap: + name: skyscan-sidecar-scripts + defaultMode: 0777 + items: + - key: "s3_upload.sh" + path: "s3_upload.sh" """ ) From f451dcec1815ad991fa4248753b5217f86fc5958 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 7 Jan 2025 18:19:24 -0600 Subject: [PATCH 030/327] use configmap for s3 uploader - 2 (wip) --- skydriver/database/schema.py | 3 +-- skydriver/ewms.py | 5 +++-- skydriver/k8s/scanner_instance.py | 10 +++------ skydriver/rest_handlers.py | 5 +---- skydriver/s3.py | 34 ++++++++----------------------- 5 files changed, 16 insertions(+), 41 deletions(-) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index ef609163..f52d9749 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -146,9 +146,8 @@ class Manifest(ScanIDDataclass): # ^^^ str -> EWMS workflow id (i.e. this id points to info in EWMS) # ^^^ dict -> **DEPRECATED** was used in skydriver 1.x to use local k8s starter/stopper - # attrs placed in k8s job obj + # args placed in k8s job obj scanner_server_args: str - s3_obj_url: str = "" # in 2.x scans, this is always set priority: int = ( 0 # same as https://htcondor.readthedocs.io/en/latest/users-manual/priorities-and-preemption.html#job-priority diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 71ae5feb..755ee8e1 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -2,7 +2,7 @@ from rest_tools.client import RestClient -from . import database, images +from . 
import database, images, s3 async def request_workflow_on_ewms( @@ -14,6 +14,7 @@ async def request_workflow_on_ewms( if isinstance(manifest.ewms_task, database.schema.InHouseStarterInfo): raise TypeError("Manifest is not designated for EWMS") + s3_url_get = s3.generate_s3_get_url(manifest.scan_id) image = images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) body = { @@ -36,7 +37,7 @@ async def request_workflow_on_ewms( "bash -c " '"' # quote for bash -c "..." "curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json " - f"'{manifest.s3_obj_url}'" # single-quote the url + f"'{s3_url_get}'" # single-quote the url '"' # unquote for bash -c "..." ), "n_workers": scan_request_obj["request_clusters"][0][1], diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index e2542e34..5b58a24f 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -8,7 +8,7 @@ import yaml from rest_tools.client import ClientCredentialsAuth -from .. import images +from .. 
import images, s3 from ..config import ( DebugMode, ENV, @@ -34,8 +34,6 @@ def make( nsides: dict[int, int], is_real_event: bool, predictive_scanning_threshold: float, - # s3 uploader - s3_obj_url: str, # universal debug_mode: list[DebugMode], # env @@ -67,7 +65,6 @@ def make( job_dict = SkyScanK8sJobFactory._make_job( scan_id, docker_tag, - s3_obj_url, scanner_server_memory_bytes, scanner_server_args, scanner_server_envvars, @@ -79,7 +76,6 @@ def make( def _make_job( scan_id: str, docker_tag: str, - s3_obj_url: str, scanner_server_memory_bytes: int, scanner_server_args: str, scanner_server_envvars: sdict, @@ -160,8 +156,8 @@ def _make_job( key: {ENV.S3_SECRET_KEY__K8S_SECRET_KEY} - name: S3_BUCKET value: "{ENV.S3_BUCKET}" - - name: S3_OBJECT_DEST_FILE - value: "path/to/object" # Replace with destination path + - name: S3_OBJECT_KEY + value: "{s3.make_object_key(scan_id)}" resources: limits: memory: "256Mi" diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 31e20ade..a8fb1f17 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -27,7 +27,7 @@ from tornado import web from wipac_dev_tools import argparse_tools -from . import database, images, k8s, s3 +from . 
import database, images, k8s from .config import ( DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, DEFAULT_WORKER_DISK_BYTES, @@ -555,7 +555,6 @@ async def _start_scan( new_scan_id: str = "", # don't use scan_request_obj.scan_id--this could be a rescan ) -> schema.Manifest: scan_id = new_scan_id or scan_request_obj["scan_id"] - s3_obj_url = s3.generate_s3_url(scan_id) # get the container info ready skyscan_k8s_job_dict, scanner_server_args = SkyScanK8sJobFactory.make( @@ -567,8 +566,6 @@ async def _start_scan( nsides=scan_request_obj["nsides"], is_real_event=scan_request_obj["real_or_simulated_event"] in REAL_CHOICES, predictive_scanning_threshold=scan_request_obj["predictive_scanning_threshold"], - # s3 uploader - s3_obj_url=s3_obj_url, # universal debug_mode=_debug_mode(scan_request_obj["debug_mode"]), # env diff --git a/skydriver/s3.py b/skydriver/s3.py index 09d4b72c..34eadf3b 100644 --- a/skydriver/s3.py +++ b/skydriver/s3.py @@ -1,10 +1,8 @@ """Utilities for interacting with S3 buckets.""" import logging -import pathlib import boto3 -import requests from .config import ENV @@ -22,8 +20,13 @@ def _get_client(): ) -def generate_s3_url(scan_id: str) -> str: - """Generate a pre-signed S3 url for putting shared files.""" +def make_object_key(scan_id: str) -> str: + """Construct the object key from the scan_id (deterministic).""" + return f"{scan_id}-s3-object" + + +def generate_s3_get_url(object_key: str) -> str: + """Generate a pre-signed S3 url for retrieving shared files.""" s3_client = _get_client() # get GET url @@ -31,29 +34,8 @@ def generate_s3_url(scan_id: str) -> str: "get_object", Params={ "Bucket": ENV.S3_BUCKET, - "Key": f"{scan_id}-s3-object", + "Key": object_key, }, ExpiresIn=24 * 60 * 60, # seconds ) return get_url - - -def upload_to_s3(fpath: pathlib.Path) -> str: - """Upload a file to S3.""" - s3_client = _get_client() - - # POST - upload_details = s3_client.generate_presigned_post( - ENV.S3_BUCKET, ENV.S3_OBJECT_DEST_FILE - ) - - 
LOGGER.info("uploading S3...") - with open(fpath, "rb") as f: - response = requests.post( - upload_details["url"], - data=upload_details["fields"], - files={"file": (fpath.name, f)}, # maps filename to obj - ) - - print(f"Upload response: {response.status_code}") - print(str(response.content)) From b9a14b3f6ceaca6e725c0b71d93cfd7fd091eee6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 8 Jan 2025 15:38:31 -0600 Subject: [PATCH 031/327] piggyback skydriver image to post to s3 --- s3_sidecar/__init__.py | 16 +++++++ s3_sidecar/post.py | 71 +++++++++++++++++++++++++++++++ setup.cfg | 2 +- skydriver/ewms.py | 2 +- skydriver/k8s/scanner_instance.py | 20 +++------ 5 files changed, 94 insertions(+), 17 deletions(-) create mode 100644 s3_sidecar/__init__.py create mode 100644 s3_sidecar/post.py diff --git a/s3_sidecar/__init__.py b/s3_sidecar/__init__.py new file mode 100644 index 00000000..a5b4c4b9 --- /dev/null +++ b/s3_sidecar/__init__.py @@ -0,0 +1,16 @@ +"""Public init.""" + +# version is a human-readable version number. + +# version_info is a four-tuple for programmatic comparison. The first +# three numbers are the components of the version number. 
The fourth +# is zero for an official release, positive for a development branch, +# or negative for a release candidate or beta (after the base version +# number has been incremented) +__version__ = "1.1.0" +version_info = ( + int(__version__.split(".")[0]), + int(__version__.split(".")[1]), + int(__version__.split(".")[2]), + 0, +) diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py new file mode 100644 index 00000000..bf5e7cff --- /dev/null +++ b/s3_sidecar/post.py @@ -0,0 +1,71 @@ +"""Utilities for posting to an S3 bucket.""" + +import argparse +import os +import time +from pathlib import Path + +import boto3 +import requests + + +def post(fpath: Path) -> None: + """Post the file to the S3 bucket.""" + if not fpath.exists(): + raise FileNotFoundError(str(fpath)) + time.sleep(5) # in case the file is currently being written (good enough logic?) + + s3_client = boto3.client( + "s3", + "us-east-1", + endpoint_url=os.environ["S3_URL"], + aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"], + aws_secret_access_key=os.environ["S3_SECRET_KEY"], + ) + + # POST + upload_details = s3_client.generate_presigned_post( + os.environ["S3_BUCKET"], + os.environ["S3_OBJECT_KEY"], + ) + with open(fpath, "rb") as f: + response = requests.post( + upload_details["url"], + data=upload_details["fields"], + files={"file": (fpath.name, f)}, # maps filename to obj + ) + + print(f"Upload response: {response.status_code}") + print(str(response.content)) + + +def main() -> None: + """Main.""" + parser = argparse.ArgumentParser( + description="Post the file to the S3 bucket.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + parser.add_argument( + "fpath", + type=Path, + help="the file to post", + ) + parser.add_argument( + "--wait-indefinitely", + action="store_true", + default=False, + help="whether to wait indefinitely for the file to exist", + ) + + args = parser.parse_args() + + if args.wait_indefinitely: + while not args.fpath.exists(): + time.sleep(1) + + 
post(args.fpath) + + +if __name__ == "__main__": + main() diff --git a/setup.cfg b/setup.cfg index 6d33fc62..825d8675 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,7 +5,7 @@ patch_without_tag = False package_dirs = skydriver clientmanager - ewms_sidecar + s3_sidecar [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 755ee8e1..470ab22b 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -11,7 +11,7 @@ async def request_workflow_on_ewms( scan_request_obj: dict, ) -> str: """Request a workflow in EWMS.""" - if isinstance(manifest.ewms_task, database.schema.InHouseStarterInfo): + if not (isinstance(manifest.ewms_task, str) and manifest.ewms_task): raise TypeError("Manifest is not designated for EWMS") s3_url_get = s3.generate_s3_get_url(manifest.scan_id) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 5b58a24f..7c0dc2f7 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -136,14 +136,14 @@ def _make_job( volumeMounts: - name: common-space-volume mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" - - name: s3-uploader-{scan_id} + - name: s3-sidecar-{scan_id} restartPolicy: OnFailure - image: {images.get_skyscan_docker_image(docker_tag)} - command: ["/scripts/s3_upload.sh"] - args: ["{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'}"] + image: {ENV.THIS_IMAGE_WITH_TAG} + command: ["python", "-m", "s3_sidecar.post"] + args: ["{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'}" "--wait-indefinitely"] env: - name: S3_URL - value: "{ENV.S3_URL}" + value: "{ENV.S3_URL}" - name: S3_ACCESS_KEY_ID valueFrom: secretKeyRef: @@ -169,19 +169,9 @@ def _make_job( volumeMounts: - name: common-space-volume mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" - - name: scripts - mountPath: /scripts - readOnly: true volumes: - name: 
common-space-volume emptyDir: {{}} - - name: scripts - configMap: - name: skyscan-sidecar-scripts - defaultMode: 0777 - items: - - key: "s3_upload.sh" - path: "s3_upload.sh" """ ) From 49b3ef9c0dc2a098622daa4cd0ed6ed77c26a7c6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 8 Jan 2025 15:39:10 -0600 Subject: [PATCH 032/327] `rm clientmanager/` --- clientmanager/__init__.py | 16 -- clientmanager/__main__.py | 13 -- clientmanager/clientmanager.py | 222 ---------------------- clientmanager/condor/__init__.py | 3 - clientmanager/condor/act.py | 98 ---------- clientmanager/condor/condor_tools.py | 47 ----- clientmanager/condor/starter.py | 203 -------------------- clientmanager/condor/stopper.py | 32 ---- clientmanager/condor/watcher.py | 266 --------------------------- clientmanager/config.py | 56 ------ clientmanager/k8s/__init__.py | 3 - clientmanager/k8s/act.py | 140 -------------- clientmanager/k8s/k8s_tools.py | 66 ------- clientmanager/k8s/starter.py | 243 ------------------------ clientmanager/k8s/stopper.py | 45 ----- clientmanager/py.typed | 0 clientmanager/utils.py | 138 -------------- setup.cfg | 1 - 18 files changed, 1592 deletions(-) delete mode 100644 clientmanager/__init__.py delete mode 100644 clientmanager/__main__.py delete mode 100644 clientmanager/clientmanager.py delete mode 100644 clientmanager/condor/__init__.py delete mode 100644 clientmanager/condor/act.py delete mode 100644 clientmanager/condor/condor_tools.py delete mode 100644 clientmanager/condor/starter.py delete mode 100644 clientmanager/condor/stopper.py delete mode 100644 clientmanager/condor/watcher.py delete mode 100644 clientmanager/config.py delete mode 100644 clientmanager/k8s/__init__.py delete mode 100644 clientmanager/k8s/act.py delete mode 100644 clientmanager/k8s/k8s_tools.py delete mode 100644 clientmanager/k8s/starter.py delete mode 100644 clientmanager/k8s/stopper.py delete mode 100644 clientmanager/py.typed delete mode 100644 clientmanager/utils.py diff --git 
a/clientmanager/__init__.py b/clientmanager/__init__.py deleted file mode 100644 index a5b4c4b9..00000000 --- a/clientmanager/__init__.py +++ /dev/null @@ -1,16 +0,0 @@ -"""Public init.""" - -# version is a human-readable version number. - -# version_info is a four-tuple for programmatic comparison. The first -# three numbers are the components of the version number. The fourth -# is zero for an official release, positive for a development branch, -# or negative for a release candidate or beta (after the base version -# number has been incremented) -__version__ = "1.1.0" -version_info = ( - int(__version__.split(".")[0]), - int(__version__.split(".")[1]), - int(__version__.split(".")[2]), - 0, -) diff --git a/clientmanager/__main__.py b/clientmanager/__main__.py deleted file mode 100644 index 965e6308..00000000 --- a/clientmanager/__main__.py +++ /dev/null @@ -1,13 +0,0 @@ -"""Entry-point to start up clientmanager service.""" - - -import logging - -from . import clientmanager - -LOGGER = logging.getLogger(__name__) - - -if __name__ == "__main__": - clientmanager.main() - LOGGER.info("Done.") diff --git a/clientmanager/clientmanager.py b/clientmanager/clientmanager.py deleted file mode 100644 index 9892030f..00000000 --- a/clientmanager/clientmanager.py +++ /dev/null @@ -1,222 +0,0 @@ -"""The central module.""" - - -import argparse -import logging -import time -from pathlib import Path - -from wipac_dev_tools import argparse_tools, logging_tools - -from . 
import condor, k8s -from .config import ENV - -LOGGER = logging.getLogger(__name__) - - -def main() -> None: - """Main.""" - parser = argparse.ArgumentParser( - description="Manage Skymap Scanner client workers", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--uuid", - required=True, - help="the uuid for the cluster", - ) - - # orchestrator - orch_subparsers = parser.add_subparsers( - required=True, - dest="orchestrator", - help="the resource orchestration tool to use for worker scheduling", - ) - OrchestratorArgs.condor( - orch_condor_parser := orch_subparsers.add_parser( - "condor", help="orchestrate with HTCondor" - ) - ) - OrchestratorArgs.k8s( - orch_k8s_parser := orch_subparsers.add_parser( - "k8s", help="orchestrate with Kubernetes" - ) - ) - - # action -- add sub-parser to each sub-parser (can't add multiple sub-parsers) - for p in [orch_condor_parser, orch_k8s_parser]: - act_subparsers = p.add_subparsers( - required=True, - dest="action", - help="the action to perform on the worker cluster", - ) - ActionArgs.starter(act_subparsers.add_parser("start", help="start workers")) - ActionArgs.stopper(act_subparsers.add_parser("stop", help="stop workers")) - - # parse args & set up logging - args = parser.parse_args() - logging_tools.set_level( - "DEBUG", # os.getenv("SKYSCAN_LOG", "INFO"), # type: ignore[arg-type] - first_party_loggers=LOGGER, - third_party_level=ENV.SKYSCAN_LOG_THIRD_PARTY, # type: ignore[arg-type] - use_coloredlogs=True, # for formatting - future_third_parties=["boto3", "botocore"], - ) - logging_tools.log_argparse_args(args, logger=LOGGER, level="WARNING") - - # Go! 
- match args.orchestrator: - case "condor": - condor.act(args) - case "k8s": - k8s.act(args) - case other: - raise RuntimeError(f"Orchestrator not supported: {other}") - - -class OrchestratorArgs: - @staticmethod - def condor(sub_parser: argparse.ArgumentParser) -> None: - """Add args to subparser.""" - sub_parser.add_argument( - "--collector", - default="", - help="the full URL address of the HTCondor collector server. Ex: foo-bar.icecube.wisc.edu", - ) - sub_parser.add_argument( - "--schedd", - default="", - help="the full DNS name of the HTCondor Schedd server. Ex: baz.icecube.wisc.edu", - ) - - @staticmethod - def k8s(sub_parser: argparse.ArgumentParser) -> None: - """Add args to subparser.""" - sub_parser.add_argument( - "--host", - required=True, - help="the host server address to connect to for running workers", - ) - sub_parser.add_argument( - "--namespace", - required=True, - help="the k8s namespace to use for running workers", - ) - sub_parser.add_argument( - "--cpu-arch", - default="x64", - help="which CPU architecture to use for running workers", - ) - sub_parser.add_argument( - "--job-config-stub", - type=Path, - default=Path("resources/worker_k8s_job_stub.json"), - help="worker k8s job config file to dynamically complete, then run (json)", - ) - - -class ActionArgs: - @staticmethod - def starter(sub_parser: argparse.ArgumentParser) -> None: - """Add args to subparser.""" - - def wait_for_file(waitee: Path, wait_time: int) -> Path: - """Wait for `waitee` to exist, then return fullly-resolved path.""" - elapsed_time = 0 - sleep = 5 - while not waitee.exists(): - LOGGER.info(f"waiting for {waitee} ({sleep}s intervals)...") - time.sleep(sleep) - elapsed_time += sleep - if elapsed_time >= wait_time: - raise argparse.ArgumentTypeError( - f"FileNotFoundError: waited {wait_time}s [{waitee}]" - ) - return waitee.resolve() - - # helper args - sub_parser.add_argument( - "--dryrun", - default=False, - action="store_true", - help="does everything except submitting 
the worker(s)", - ) - sub_parser.add_argument( - "--spool", - default=False, - action="store_true", - help="whether to spool (persist) logs -- if not given, logs are not kept", - ) - - # worker args - sub_parser.add_argument( - "--worker-memory-bytes", - required=True, - type=int, - help="amount of worker memory (bytes)", - ) - sub_parser.add_argument( - "--worker-disk-bytes", - required=True, - type=int, - help="amount of worker disk (bytes)", - ) - sub_parser.add_argument( - "--n-cores", - default=1, - type=int, - help="number of cores per worker", - ) - sub_parser.add_argument( - "--n-workers", - required=True, - type=int, - help="number of worker to start", - ) - sub_parser.add_argument( - "--max-worker-runtime", - required=True, - type=int, - help="how long each worker is allowed to run -- condor only", # TODO - set for k8s? - ) - sub_parser.add_argument( - "--priority", - required=True, - help="relative priority of this job/jobs -- condor only", # TODO - set for k8s? - ) - - # client args - sub_parser.add_argument( - "--client-args", - required=False, - nargs="*", - type=lambda x: argparse_tools.validate_arg( - x.split(":", maxsplit=1), - len(x.split(":", maxsplit=1)) == 2, - ValueError('must " "-delimited series of "clientarg:value"-tuples'), - ), - help="n 'key:value' pairs containing the python CL arguments to pass to skymap_scanner.client", - ) - sub_parser.add_argument( - "--client-startup-json", - help="The 'startup.json' file to startup each client", - type=lambda x: wait_for_file( - Path(x), - ENV.CLIENT_STARTER_WAIT_FOR_STARTUP_JSON, - ), - ) - sub_parser.add_argument( - "--image", - required=True, - help="a path or url to the workers' image", - ) - - @staticmethod - def stopper(sub_parser: argparse.ArgumentParser) -> None: - """Add args to subparser.""" - sub_parser.add_argument( - "--cluster-id", - required=True, - help="the cluster id of the workers to be stopped/removed", - ) diff --git a/clientmanager/condor/__init__.py 
b/clientmanager/condor/__init__.py deleted file mode 100644 index a526f44f..00000000 --- a/clientmanager/condor/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Init.""" - -from .act import act # noqa: F401 diff --git a/clientmanager/condor/act.py b/clientmanager/condor/act.py deleted file mode 100644 index 1321b6ee..00000000 --- a/clientmanager/condor/act.py +++ /dev/null @@ -1,98 +0,0 @@ -"""The post-argparse entry point for condor actions.""" - - -import argparse -import logging - -import htcondor # type: ignore[import-untyped] - -from .. import utils -from ..config import ENV -from . import condor_tools, starter, stopper, watcher - -LOGGER = logging.getLogger(__name__) - - -def act(args: argparse.Namespace) -> None: - """Do the action.""" - htcondor.set_subsystem("TOOL") - htcondor.param["TOOL_DEBUG"] = "D_FULLDEBUG" - # htcondor.param["TOOL_LOG"] = "log.txt" - # htcondor.enable_log() - htcondor.enable_debug() - - # condor auth & go - with htcondor.SecMan() as secman: - secman.setToken(htcondor.Token(ENV.CONDOR_TOKEN)) - schedd_obj = condor_tools.get_schedd_obj(args.collector, args.schedd) - _act(args, schedd_obj) - - -def _act(args: argparse.Namespace, schedd_obj: htcondor.Schedd) -> None: - match args.action: - case "start": - LOGGER.info( - f"Starting {args.n_workers} Skymap Scanner client workers on {args.collector} / {args.schedd}" - ) - # make connections -- do now so we don't have any surprises downstream - skydriver_rc = utils.connect_to_skydriver() - # start - submit_dict = starter.prep( - spool=args.spool, - # starter CL args -- worker - worker_memory_bytes=args.worker_memory_bytes, - worker_disk_bytes=args.worker_disk_bytes, - n_cores=args.n_cores, - max_worker_runtime=args.max_worker_runtime, - priority=args.priority, - # starter CL args -- client - client_args=args.client_args, - client_startup_json_s3=utils.s3ify(args.client_startup_json), - image=args.image, - ) - # final checks - if args.dryrun: - LOGGER.critical("Script Aborted: dryrun enabled") - 
return - if utils.skydriver_aborted_scan(skydriver_rc): - LOGGER.critical("Script Aborted: SkyDriver aborted scan") - return - # start - submit_result_obj = starter.start( - schedd_obj=schedd_obj, - n_workers=args.n_workers, - submit_dict=submit_dict, - spool=args.spool, - ) - # report to SkyDriver - skydriver_cluster_obj = dict( - orchestrator="condor", - location={ - "collector": args.collector, - "schedd": args.schedd, - }, - uuid=args.uuid, - cluster_id=submit_result_obj.cluster(), - n_workers=submit_result_obj.num_procs(), - starter_info=submit_dict, - ) - utils.update_skydriver(skydriver_rc, **skydriver_cluster_obj) - LOGGER.info("Sent cluster info to SkyDriver") - watcher.watch( - args.collector, - args.schedd, - submit_result_obj.cluster(), - schedd_obj, - submit_result_obj.num_procs(), - skydriver_rc, - skydriver_cluster_obj, - ) - case "stop": - stopper.stop( - args.collector, - args.schedd, - args.cluster_id, - schedd_obj, - ) - case _: - raise RuntimeError(f"Unknown action: {args.action}") diff --git a/clientmanager/condor/condor_tools.py b/clientmanager/condor/condor_tools.py deleted file mode 100644 index 3bc4aed6..00000000 --- a/clientmanager/condor/condor_tools.py +++ /dev/null @@ -1,47 +0,0 @@ -"""Util functions wrapping common htcondor actions.""" - - -import logging - -import htcondor # type: ignore[import-untyped] - -LOGGER = logging.getLogger(__name__) - - -def get_schedd_obj(collector: str, schedd: str) -> htcondor.Schedd: - """Get object for talking with HTCondor schedd. 
- - Examples: - `collector = "foo-bar.icecube.wisc.edu"` - `schedd = "baz.icecube.wisc.edu"` - """ - schedd_ad = htcondor.Collector(collector).locate( # ~> exception - htcondor.DaemonTypes.Schedd, schedd - ) - schedd_obj = htcondor.Schedd(schedd_ad) - LOGGER.info(f"Connected to Schedd {collector=} {schedd=}") - return schedd_obj - - -IDLE = 1 -RUNNING = 2 -REMOVED = 3 -COMPLETED = 4 -HELD = 5 -TRANSFERRING_OUTPUT = 6 -SUSPENDED = 7 - -_STATUS_MAPPING = { - IDLE: "Idle", - RUNNING: "Running", - REMOVED: "Removed", - COMPLETED: "Completed", - HELD: "Held", - TRANSFERRING_OUTPUT: "Transferring Output", - SUSPENDED: "Suspended", -} - - -def job_status_to_str(status_code: int) -> str: - """Get the human-readable string for the job status int.""" - return _STATUS_MAPPING.get(status_code, f"Invalid status code: {status_code}") diff --git a/clientmanager/condor/starter.py b/clientmanager/condor/starter.py deleted file mode 100644 index 6ce17f20..00000000 --- a/clientmanager/condor/starter.py +++ /dev/null @@ -1,203 +0,0 @@ -"""For starting Skymap Scanner clients on an HTCondor cluster.""" - - -import logging -from pathlib import Path -from typing import Any - -import htcondor # type: ignore[import-untyped] -import humanfriendly - -from ..config import ENV, FORWARDED_ENV_VARS -from ..utils import S3File - -LOGGER = logging.getLogger(__name__) - - -def make_condor_logs_dir() -> Path: - """Make the condor logs subdirectory.""" - dpath = Path("tms-cluster") - dpath.mkdir(parents=True) - LOGGER.info(f"HTCondor will write log files to {dpath}") - return dpath - - -def make_condor_job_description( - spool: bool, - # condor args - worker_memory_bytes: int, - worker_disk_bytes: int, - n_cores: int, - max_worker_runtime: int, - priority: int, - # skymap scanner args - image: str, - client_startup_json_s3: S3File, - client_args_string: str, -) -> dict[str, Any]: - """Make the condor job description (dict).""" - - # NOTE: - # In the newest version of condor we could use: - # universe 
= container - # container_image = ... - # arguments = python -m ... - # But for now, we're stuck with: - # executable = ... - # +SingularityImage = ... - # arguments = /usr/local/icetray/env-shell.sh python -m ... - # Because "this universe doesn't know how to do the - # entrypoint, and loading the icetray env file - # directly from cvmfs messes up the paths" -DS - - # Build the environment specification for condor - env_vars = ["EWMS_PILOT_HTCHIRP=True"] - # EWMS_* are inherited via condor `getenv`, but we have default in case these are not set. - if not ENV.EWMS_PILOT_QUARANTINE_TIME: - env_vars.append("EWMS_PILOT_QUARANTINE_TIME=1800") - # The container sets I3_DATA to /opt/i3-data, however `millipede_wilks` requires files (spline tables) that are not available in the image. For the time being we require CVFMS and we load I3_DATA from there. In order to override the environment variables we need to prepend APPTAINERENV_ or SINGULARITYENV_ to the variable name. There are site-dependent behaviour but these two should cover all cases. See https://github.com/icecube/skymap_scanner/issues/135#issuecomment-1449063054. 
- for prefix in ["APPTAINERENV_", "SINGULARITYENV_"]: - env_vars.append(f"{prefix}I3_DATA=/cvmfs/icecube.opensciencegrid.org/data") - environment = " ".join(env_vars) - - # write - submit_dict = { - "executable": "/bin/bash", - "arguments": f"/usr/local/icetray/env-shell.sh python -m skymap_scanner.client {client_args_string} --client-startup-json ./{client_startup_json_s3.fname}", - "+SingularityImage": f'"{image}"', # must be quoted - "Requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", - "getenv": ", ".join(FORWARDED_ENV_VARS), - "environment": f'"{environment}"', # must be quoted - "+FileSystemDomain": '"blah"', # must be quoted - # - "should_transfer_files": "YES", - "transfer_input_files": client_startup_json_s3.url, - "transfer_output_files": '""', # must be quoted for "none" - # - # Don't transfer executable (/bin/bash) in case of - # version (dependency) mismatch. - # Ex: - # "/lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.36' not found" - # Technically this is just needed for spooling -- since if - # we don't spool, the executable (/bin/bash) can't be - # transferred anyway and so a local version will be used - "transfer_executable": "false", - # - "request_cpus": str(n_cores), - "request_memory": humanfriendly.format_size( # 1073741824 -> "1 GiB" -> "1 GB" - worker_memory_bytes, binary=True - ).replace("i", ""), - "request_disk": humanfriendly.format_size( # 1073741824 -> "1 GiB" -> "1 GB" - worker_disk_bytes, binary=True - ).replace("i", ""), - "priority": int(priority), - "+WantIOProxy": "true", # for HTChirp - "+OriginalTime": max_worker_runtime, # Execution time limit -- 1 hour default on OSG - } - - # outputs - if spool: - # this is the location where the files will go when/if *returned here* - logs_dir = make_condor_logs_dir() - submit_dict.update( - { - "output": str(logs_dir / "tms-worker-$(ProcId).out"), - "error": str(logs_dir / "tms-worker-$(ProcId).err"), - "log": str(logs_dir / "tms-cluster.log"), - } - ) - # 
https://htcondor.readthedocs.io/en/latest/users-manual/file-transfer.html#specifying-if-and-when-to-transfer-files - submit_dict.update( - { - "transfer_output_files": ",".join( - [ - submit_dict["output"], # type: ignore[list-item] - submit_dict["error"], # type: ignore[list-item] - submit_dict["log"], # type: ignore[list-item] - ] - ), - "when_to_transfer_output": "ON_EXIT_OR_EVICT", - } - ) - else: - # NOTE: this needs to be removed if we ARE transferring files - submit_dict["initialdir"] = "/tmp" - - return submit_dict - - -def prep( - # starter CL args -- helper - spool: bool, - # starter CL args -- worker - worker_memory_bytes: int, - worker_disk_bytes: int, - n_cores: int, - max_worker_runtime: int, - priority: int, - # starter CL args -- client - client_args: list[tuple[str, str]], - client_startup_json_s3: S3File, - image: str, -) -> dict[str, Any]: - """Create objects needed for starting cluster.""" - - # get client args - client_args_string = "" - if client_args: - for carg, value in client_args: - client_args_string += f" --{carg} {value} " - LOGGER.info(f"Client Args: {client_args}") - if "--client-startup-json" in client_args_string: - raise RuntimeError( - "The '--client-args' arg cannot include \"--client-startup-json\". " - "This needs to be given to this script explicitly ('--client-startup-json')." 
- ) - - # make condor job description - submit_dict = make_condor_job_description( - spool, - # condor args - worker_memory_bytes, - worker_disk_bytes, - n_cores, - max_worker_runtime, - priority, - # skymap scanner args - image, - client_startup_json_s3, - client_args_string, - ) - LOGGER.info(submit_dict) - - return submit_dict - - -def start( - schedd_obj: htcondor.Schedd, - n_workers: int, - # - submit_dict: dict[str, Any], - spool: bool, -) -> htcondor.SubmitResult: - """Start cluster.""" - submit_obj = htcondor.Submit(submit_dict) - LOGGER.info(submit_obj) - - # submit - submit_result_obj = schedd_obj.submit( - submit_obj, - count=n_workers, # submit N workers - spool=spool, # for transferring logs & files - ) - LOGGER.info(submit_result_obj) - if spool: - jobs = list( - submit_obj.jobs( - count=n_workers, - clusterid=submit_result_obj.cluster(), - ) - ) - schedd_obj.spool(jobs) - - return submit_result_obj diff --git a/clientmanager/condor/stopper.py b/clientmanager/condor/stopper.py deleted file mode 100644 index d0b6f0c1..00000000 --- a/clientmanager/condor/stopper.py +++ /dev/null @@ -1,32 +0,0 @@ -"""For stopping Skymap Scanner clients on an HTCondor cluster.""" - - -import logging - -import htcondor # type: ignore[import-untyped] - -LOGGER = logging.getLogger(__name__) - - -def stop( - collector: str, - schedd: str, - cluster_id: str, - schedd_obj: htcondor.Schedd, -) -> None: - """Main logic.""" - LOGGER.info( - f"Stopping Skymap Scanner client workers on {cluster_id} / {collector} / {schedd}" - ) - - # Remove workers -- may not be instantaneous - LOGGER.info("Requesting removal...") - act_obj = schedd_obj.act( - htcondor.JobAction.Remove, - f"ClusterId == {cluster_id}", - reason="Requested by SkyDriver", - ) - LOGGER.debug(act_obj) - LOGGER.info(f"Removed {act_obj['TotalSuccess']} workers") - - # TODO: get/forward worker logs diff --git a/clientmanager/condor/watcher.py b/clientmanager/condor/watcher.py deleted file mode 100644 index 
46d2475c..00000000 --- a/clientmanager/condor/watcher.py +++ /dev/null @@ -1,266 +0,0 @@ -"""For watching Skymap Scanner clients on an HTCondor cluster.""" - - -import collections -import logging -import time -from pprint import pformat -from typing import Any, Iterator - -import htcondor # type: ignore[import-untyped] -from rest_tools.client import RestClient - -from .. import utils -from ..config import WATCHER_INTERVAL, WATCHER_MAX_RUNTIME, WATCHER_N_TOP_TASK_ERRORS -from . import condor_tools as ct - -LOGGER = logging.getLogger(__name__) - - -PROJECTION = [ - "ClusterId", - "JobStatus", - "EnteredCurrentStatus", - "ProcId", - # - "HoldReason", - "HoldReasonCode", - "HoldReasonSubCode", - # - "HTChirpEWMSPilotLastUpdatedTimestamp", - "HTChirpEWMSPilotStartedTimestamp", - "HTChirpEWMSPilotStatus", - # - "HTChirpEWMSPilotTasksTotal", - "HTChirpEWMSPilotTasksFailed", - "HTChirpEWMSPilotTasksSuccess", - # - "HTChirpEWMSPilotError", - "HTChirpEWMSPilotErrorTraceback", -] - - -DONE_JOB_STATUSES: list[int] = [ - ct.REMOVED, - ct.COMPLETED, - ct.HELD, -] -NON_RESPONSE_LIMIT = 10 - - -def _translate_special_attrs(job_ad: dict[str, Any]) -> None: - """Special handling for specific attrs.""" - for attr in job_ad: - if attr.startswith("HTChirp"): - # unquote - if isinstance(job_ad[attr], str): - try: - job_ad[attr] = htcondor.classad.unquote(job_ad[attr]) - except Exception: - # LOGGER.error(f"could not unquote: {job[attr]}") - # LOGGER.exception(e) - pass - try: - job_ad["JobStatus"] = int(job_ad["JobStatus"]) - except Exception as e: - LOGGER.exception(e) - - -def update_stored_job_infos( - job_infos: dict[int, dict[str, Any]], - classad: Any, - source: str, -) -> None: - """Update the job's classad attrs in `job_infos`.""" - procid = int(classad["ProcId"]) - job_infos[procid]["source"] = source - job_infos[procid].update(dict(classad)) # start with everything - _translate_special_attrs(job_infos[procid]) - - -def iter_job_classads( - schedd_obj: htcondor.Schedd, - 
constraint: str, - projection: list[str], -) -> Iterator[tuple[htcondor.classad.ClassAd, str]]: - """Get the job class ads, trying various sources. - - May not get all of them. - """ - for call in [ - schedd_obj.query, - schedd_obj.history, - schedd_obj.jobEpochHistory, - ]: - try: - for classad in call(constraint, projection): - if "ProcId" not in classad: - continue - # LOGGER.info(f"looking at job {classad['ProcId']}") - # LOGGER.debug(str(call)) - # LOGGER.debug(classad) - yield classad, call.__name__ - except Exception as e: - LOGGER.exception(e) - - -def get_aggregate_statuses( - job_infos: dict[int, dict[str, Any]], - previous: dict[str, dict[str, int]], -) -> tuple[dict[str, dict[str, int]], bool]: - """Aggregate statuses of jobs & return whether this is an new value.""" - - def transform_job_status_val(info: dict[str, Any]) -> str: - """Get job status -- transforming any as needed. - - NOTE: each transformation needs to be generic - enough to aggregate nicely with others; e.g. don't - append a timestamp, do append a standard reason str. 
- """ - if info["JobStatus"] == ct.HELD: - codes = ( - info.get("HoldReasonCode", None), - info.get("HoldReasonSubCode", None), - ) - return ( - f"{ct.job_status_to_str(ct.HELD)}: " - f"{codes} " - f"{info.get('HoldReason', 'unknown reason')}" - ) - else: - return ct.job_status_to_str(info["JobStatus"]) - - statuses: dict[str, dict[str, int]] = { - k: {} - for k in set(transform_job_status_val(info) for info in job_infos.values()) - } - - for job_status in statuses: - ids_for_this_job_status = [ # subset of job_infos ids - i - for i, info in job_infos.items() - if transform_job_status_val(info) == job_status - ] - # NOTE - if the pilot did not send a status (ex: Held job), it is `None` - statuses[job_status] = dict( - collections.Counter( - job_infos[i]["HTChirpEWMSPilotStatus"] for i in ids_for_this_job_status - ) - ) - - return statuses, statuses != previous - - -def get_aggregate_top_task_errors( - job_infos: dict[int, dict[str, Any]], - n_top_task_errors: int, - previous: dict[str, int], -) -> tuple[dict[str, int], bool]: - """Aggregate top X errors of jobs & return whether this is an new value.""" - counts = collections.Counter( - dicto.get("HTChirpEWMSPilotError") for dicto in job_infos.values() - ) - counts.pop(None, None) # remove counts of "no error" - - errors = dict(counts.most_common(n_top_task_errors)) - return errors, errors != previous # type: ignore[return-value] - - -def watch( - collector: str, - schedd: str, - cluster_id: str, - schedd_obj: htcondor.Schedd, - n_workers: int, - # - skydriver_rc: RestClient, - skydriver_cluster_obj: dict[str, Any], -) -> None: - """Main logic.""" - LOGGER.info( - f"Watching Skymap Scanner client workers on {cluster_id} / {collector} / {schedd}" - ) - - job_infos: dict[int, dict[str, Any]] = { - i: { # NOTE - it's important that attrs reported on later are `None` to start - "JobStatus": None, - "HTChirpEWMSPilotStatus": None, - } - for i in range(n_workers) - } - - start = time.time() - non_response_ct = 0 - 
aggregate_statuses: dict[str, dict[str, int]] = {} - aggregate_top_task_errors: dict[str, int] = {} - - def keep_watching() -> bool: - """ - NOTE - condor may be lagging, so we can't just quit when - all jobs are done, since there may be more attrs to be updated. - """ - if not any( # if no done jobs, then keep going always - job_infos[j]["JobStatus"] in DONE_JOB_STATUSES for j in job_infos - ): - return True - else: - # condor may occasionally slow down & prematurely return nothing - return non_response_ct < NON_RESPONSE_LIMIT # allow X non-responses - - # WATCHING LOOP - while ( - keep_watching() - and time.time() - start - < WATCHER_MAX_RUNTIME # just in case, stop if taking too long - ): - # wait -- sleeping at top guarantees this happens - time.sleep(WATCHER_INTERVAL) - LOGGER.info("(re)checking jobs...") - - # query - classads = iter_job_classads( - schedd_obj, - ( - f"ClusterId == {cluster_id} && " - # only care about "older" status jobs if they are RUNNING - f"( JobStatus == {ct.RUNNING} || EnteredCurrentStatus >= {int(time.time()) - WATCHER_INTERVAL*3} )" - ), - PROJECTION, - ) - non_response_ct += 1 # just in case - for ad, source in classads: - non_response_ct = 0 - update_stored_job_infos(job_infos, ad, source) - # NOTE - if memory becomes an issue, switch to an in-iterator design - - # aggregate - aggregate_statuses, has_new_statuses = get_aggregate_statuses( - job_infos, - aggregate_statuses, - ) - aggregate_top_task_errors, has_new_errors = get_aggregate_top_task_errors( - job_infos, - WATCHER_N_TOP_TASK_ERRORS, - aggregate_top_task_errors, - ) - - # log - LOGGER.info(f"job aggregate statuses ({n_workers=})") - LOGGER.info(f"{pformat(aggregate_statuses, indent=4)}") - LOGGER.info( - f"job aggregate top {WATCHER_N_TOP_TASK_ERRORS} task errors ({n_workers=})" - ) - LOGGER.info(f"{pformat(aggregate_top_task_errors, indent=4)}") - - # figure updates - if not has_new_statuses and not has_new_errors: - LOGGER.info("no updates") - else: - # send updates - 
LOGGER.info("sending updates to skydriver") - utils.update_skydriver( - skydriver_rc, - **skydriver_cluster_obj, - statuses=aggregate_statuses, - top_task_errors=aggregate_top_task_errors, - ) diff --git a/clientmanager/config.py b/clientmanager/config.py deleted file mode 100644 index ec0921be..00000000 --- a/clientmanager/config.py +++ /dev/null @@ -1,56 +0,0 @@ -"""Config settings.""" - - -import dataclasses as dc -import os - -from wipac_dev_tools import from_environment_as_dataclass - -LOCAL_K8S_HOST = "local" - -_FORWARDED_ENV_VAR_PREFIXES = ["SKYSCAN_", "EWMS_"] -_NONFORWARDED_ENV_VAR_PREFIXES = ["EWMS_TMS_"] -FORWARDED_ENV_VARS = [ - var - for var in os.environ - if not any(var.startswith(p) for p in _NONFORWARDED_ENV_VAR_PREFIXES) - and any(var.startswith(p) for p in _FORWARDED_ENV_VAR_PREFIXES) -] -SECRET_FORWARDED_ENV_VARS = ["SKYSCAN_SKYDRIVER_AUTH", "SKYSCAN_BROKER_AUTH"] - -WATCHER_INTERVAL = 60 * 3 -WATCHER_MAX_RUNTIME = 60 * 60 * 24 -WATCHER_N_TOP_TASK_ERRORS = 10 - - -@dc.dataclass(frozen=True) -class EnvConfig: - """Environment variables.""" - - # pylint:disable=invalid-name - CLIENT_STARTER_WAIT_FOR_STARTUP_JSON: int = 60 - CONDOR_TOKEN: str = "" - # - WORKER_K8S_TOKEN: str = "" - WORKER_K8S_CACERT: str = "" - WORKER_K8S_CONFIG_FILE_BASE64: str = "" - # local k8s - WORKER_K8S_LOCAL_APPLICATION_NAME: str = "" - WORKER_K8S_LOCAL_WORKERS_MAX: int = 3 # don't want too many *local* workers - # - EWMS_PILOT_QUARANTINE_TIME: int = 0 - # - EWMS_TMS_S3_ACCESS_KEY_ID: str = "" - EWMS_TMS_S3_BUCKET: str = "" - EWMS_TMS_S3_EXPIRATION: int = 60 * 60 * 24 # seconds / 1 day - EWMS_TMS_S3_SECRET_KEY: str = "" - EWMS_TMS_S3_URL: str = "" - - # piggy-back scanner env vars - SKYSCAN_LOG_THIRD_PARTY: str = "WARNING" - SKYSCAN_SKYDRIVER_ADDRESS: str = "" - SKYSCAN_SKYDRIVER_AUTH: str = "" - SKYSCAN_SKYDRIVER_SCAN_ID: str = "" - - -ENV = from_environment_as_dataclass(EnvConfig) diff --git a/clientmanager/k8s/__init__.py b/clientmanager/k8s/__init__.py deleted file 
mode 100644 index a526f44f..00000000 --- a/clientmanager/k8s/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Init.""" - -from .act import act # noqa: F401 diff --git a/clientmanager/k8s/act.py b/clientmanager/k8s/act.py deleted file mode 100644 index 50b1cdbe..00000000 --- a/clientmanager/k8s/act.py +++ /dev/null @@ -1,140 +0,0 @@ -"""The post-argparse entry point for k8s actions.""" - - -import argparse -import base64 -import logging -import time -from tempfile import NamedTemporaryFile - -import kubernetes # type: ignore[import-untyped] - -from .. import utils -from ..config import ENV, LOCAL_K8S_HOST -from . import starter, stopper - -LOGGER = logging.getLogger(__name__) - - -def act(args: argparse.Namespace) -> None: - """Do the action.""" - k8s_client_config = kubernetes.client.Configuration() - - # Creating K8S cluster client - # Local - if args.host == LOCAL_K8S_HOST: - LOGGER.info("connecting to local k8s...") - # use *this* pod's service account - kubernetes.config.load_incluster_config(k8s_client_config) - # Using config file + token - elif ENV.WORKER_K8S_CONFIG_FILE_BASE64 and ENV.WORKER_K8S_TOKEN: - LOGGER.info("connecting to remote k8s via config file + token...") - # connect to remote host - with NamedTemporaryFile(delete=False) as tempf: - tempf.write(base64.b64decode(ENV.WORKER_K8S_CONFIG_FILE_BASE64)) - LOGGER.info("loading k8s configuration...") - kubernetes.config.load_kube_config( - config_file=tempf.name, - client_configuration=k8s_client_config, - ) - k8s_client_config.host = args.host - k8s_client_config.api_key["authorization"] = ENV.WORKER_K8S_TOKEN - # Using CA cert + token - elif ENV.WORKER_K8S_CACERT and ENV.WORKER_K8S_TOKEN: - # https://medium.com/@jankrynauw/run-a-job-on-google-kubernetes-engine-using-the-python-client-library-and-not-kubectl-4ee8bdd55b1b - LOGGER.info("connecting to remote k8s via ca cert + token...") - with NamedTemporaryFile(delete=False) as tempf: - tempf.write(base64.b64decode(ENV.WORKER_K8S_CACERT)) - 
k8s_client_config.ssl_ca_cert = tempf.name - k8s_client_config.host = args.host - k8s_client_config.verify_ssl = True - k8s_client_config.debug = True # remove? - k8s_client_config.api_key = {"authorization": "Bearer " + ENV.WORKER_K8S_TOKEN} - k8s_client_config.assert_hostname = False - kubernetes.client.Configuration.set_default(k8s_client_config) - else: - raise RuntimeError( - f"Did not provide sufficient configuration to connect to {args.host}" - ) - - # connect & go - with kubernetes.client.ApiClient(k8s_client_config) as k8s_api: - try: - LOGGER.debug("testing k8s credentials") - resp = kubernetes.client.BatchV1Api(k8s_api).get_api_resources() - LOGGER.debug(resp) - except kubernetes.client.rest.ApiException as e: - LOGGER.exception(e) - raise - _act(args, k8s_api) - - -def _act(args: argparse.Namespace, k8s_api: kubernetes.client.ApiClient) -> None: - match args.action: - case "start": - cluster_id = f"skyscan-worker-{ENV.SKYSCAN_SKYDRIVER_SCAN_ID}-{int(time.time())}" # TODO: make more unique - LOGGER.info( - f"Starting {args.n_workers} Skymap Scanner client workers on " - f"{args.host}/{args.namespace}/{cluster_id}" - ) - # make connections -- do now so we don't have any surprises downstream - skydriver_rc = utils.connect_to_skydriver() - # start - k8s_job_dict = starter.prep( - cluster_id=cluster_id, - # k8s CL args - cpu_arch=args.cpu_arch, - host=args.host, - job_config_stub=args.job_config_stub, - namespace=args.namespace, - # starter CL args -- worker - worker_memory_bytes=args.worker_memory_bytes, - worker_disk_bytes=args.worker_disk_bytes, - n_cores=args.n_cores, - n_workers=args.n_workers, - # starter CL args -- client - client_args=args.client_args if args.client_args else [], - client_startup_json_s3=utils.s3ify(args.client_startup_json), - container_image=args.image, - ) - # final checks - if args.dryrun: - LOGGER.critical("Script Aborted: dryrun enabled") - return - if utils.skydriver_aborted_scan(skydriver_rc): - LOGGER.critical("Script 
Aborted: SkyDriver aborted scan") - return - # start - k8s_job_dict = starter.start( - k8s_api, - k8s_job_dict, - cluster_id, - args.host, - args.namespace, - ) - # report to SkyDriver - utils.update_skydriver( - skydriver_rc, - "k8s", - location={ - "host": args.host, - "namespace": args.namespace, - }, - uuid=args.uuid, - cluster_id=cluster_id, - n_workers=args.n_workers, - starter_info=k8s_job_dict, - ) - LOGGER.info("Sent cluster info to SkyDriver") - case "stop": - LOGGER.info( - f"Stopping Skymap Scanner client workers on " - f"{args.host}/{args.namespace}/{args.cluster_id}" - ) - stopper.stop( - args.namespace, - args.cluster_id, - k8s_api, - ) - case _: - raise RuntimeError(f"Unknown action: {args.action}") diff --git a/clientmanager/k8s/k8s_tools.py b/clientmanager/k8s/k8s_tools.py deleted file mode 100644 index 71b9f4d3..00000000 --- a/clientmanager/k8s/k8s_tools.py +++ /dev/null @@ -1,66 +0,0 @@ -"""An interface to the Kubernetes cluster.""" - - -import logging - -import kubernetes.client # type: ignore[import-untyped] - -from ..config import ENV, LOCAL_K8S_HOST - -LOGGER = logging.getLogger(__name__) - - -def get_worker_k8s_secret_name(cluster_id: str) -> str: - return f"{cluster_id}-secret" - - -def patch_or_create_namespaced_secret( - k8s_core_api: kubernetes.client.CoreV1Api, - host: str, - namespace: str, - secret_name: str, - secret_type: str, - encoded_secret_data: dict[str, str], -) -> None: - """Patch secret and if not exist create.""" - - if host == LOCAL_K8S_HOST: - metadata = kubernetes.client.V1ObjectMeta( - name=secret_name, - labels={ - # https://argo-cd.readthedocs.io/en/stable/user-guide/resource_tracking/ - "app.kubernetes.io/instance": ENV.WORKER_K8S_LOCAL_APPLICATION_NAME, - }, - annotations={ - "argocd.argoproj.io/sync-options": "Prune=false" # don't want argocd to prune this job - }, - ) - else: - metadata = kubernetes.client.V1ObjectMeta(name=secret_name) - - # Instantiate the Secret object - body = kubernetes.client.V1Secret( - 
data=encoded_secret_data, - type=secret_type, - metadata=metadata, - ) - - # try to patch first - try: - k8s_core_api.patch_namespaced_secret(secret_name, namespace, body) - LOGGER.info(f"Secret {secret_name} in namespace {namespace} has been patched") - except kubernetes.client.rest.ApiException as e: - # a (None or 404) means we can create secret instead, see below - if e.status and e.status != 404: - LOGGER.exception(e) - raise - - # create if patch failed - try: - k8s_core_api.create_namespaced_secret(namespace=namespace, body=body) - LOGGER.info( - f"Created secret {secret_name} of type {secret_type} in namespace {namespace}" - ) - except kubernetes.client.rest.ApiException as e: - LOGGER.exception(e) - raise diff --git a/clientmanager/k8s/starter.py b/clientmanager/k8s/starter.py deleted file mode 100644 index 760178ac..00000000 --- a/clientmanager/k8s/starter.py +++ /dev/null @@ -1,243 +0,0 @@ -"""For starting Skymap Scanner clients on an K8s cluster.""" - - -import base64 -import json -import logging -import os -import pprint -from pathlib import Path -from typing import Any - -import kubernetes # type: ignore[import-untyped] - -from ..config import ENV, FORWARDED_ENV_VARS, LOCAL_K8S_HOST, SECRET_FORWARDED_ENV_VARS -from ..utils import S3File -from . import k8s_tools - -LOGGER = logging.getLogger(__name__) - - -def make_k8s_job_desc( - job_config_stub: Path, - # k8s args - host: str, - namespace: str, - cluster_id: str, - worker_memory_bytes: int, - worker_disk_bytes: int, - n_workers: int, - n_cores: int, - # skymap scanner args - container_image: str, - client_startup_json_s3: S3File, - add_client_args: list[tuple[str, str]], - # special args for the cloud - cpu_arch: str, - # env vars for secrets - secret_env_vars: list[str], -) -> dict[str, Any]: - """Make the k8s job description (submit object).""" - with open(job_config_stub, "r") as f: - k8s_job_dict = json.load(f) - - # multiple different variations add to these... 
- for meta_field in ["labels", "annotations"]: - if meta_field not in k8s_job_dict["metadata"]: - k8s_job_dict["metadata"][meta_field] = {} - - # ARM-specific fields - # TODO: cleanup these ifs - if cpu_arch == "arm": - cpu_arch = "arm64" - else: - # elif cpu_arch == "x86": - cpu_arch = "amd64" - # labels - k8s_job_dict["metadata"]["labels"].update({"kubernetes.io/arch": cpu_arch}) - # affinity - k8s_job_dict["spec"]["template"]["spec"]["affinity"] = { - "nodeAffinity": { - "requiredDuringSchedulingIgnoredDuringExecution": { - "nodeSelectorTerms": [ - { - "matchExpressions": [ - { - "key": "kubernetes.io/arch", - "operator": "In", - "values": [cpu_arch], - } - ] - } - ] - } - } - } - - # Setting metadata - k8s_job_dict["metadata"]["namespace"] = namespace - k8s_job_dict["metadata"]["name"] = cluster_id - if host == LOCAL_K8S_HOST: - k8s_job_dict["metadata"]["labels"].update( - { - # https://argo-cd.readthedocs.io/en/stable/user-guide/resource_tracking/ - "app.kubernetes.io/instance": ENV.WORKER_K8S_LOCAL_APPLICATION_NAME, - } - ) - k8s_job_dict["metadata"]["annotations"].update( - { - "argocd.argoproj.io/sync-options": "Prune=false" # don't want argocd to prune this job - } - ) - - # Setting parallelism - k8s_job_dict["spec"]["completions"] = n_workers - k8s_job_dict["spec"]["parallelism"] = n_workers - - # set memory & # cores - k8s_job_dict["spec"]["template"]["spec"]["containers"][0]["resources"] = { - "limits": { - "cpu": str(n_cores), - # TODO: give a bit more just in case? 
- "memory": str(worker_memory_bytes), - "ephemeral-storage": str(worker_disk_bytes), - }, - "requests": { - "cpu": str(n_cores), - "memory": str(worker_memory_bytes), - "ephemeral-storage": str(worker_disk_bytes), - }, - } - - # Setting JSON input file url - k8s_job_dict["spec"]["template"]["spec"]["initContainers"][0]["env"][0][ - "value" - ] = client_startup_json_s3.url - - def add_override_env(new_env_dicts: list[dict[str, Any]]) -> None: - k8s_job_dict["spec"]["template"]["spec"]["containers"][0]["env"] = [ - x - for x in k8s_job_dict["spec"]["template"]["spec"]["containers"][0]["env"] - if x["name"] not in new_env_dicts - ] + new_env_dicts - - # Forward all env vars: ex. SKYSCAN_* & EWMS_* - add_override_env( - [{"name": var, "value": os.environ[var]} for var in FORWARDED_ENV_VARS] - ) - # now add/override any env vars that need to be in a secret - add_override_env( - [ - { - "name": v, # "SKYDRIVER_TOKEN" - "valueFrom": { - "secretKeyRef": { - "name": k8s_tools.get_worker_k8s_secret_name(cluster_id), - "key": v.lower(), # "skydriver_token" - } - }, - } - for v in secret_env_vars - ] - ) - - # Container image - k8s_job_dict["spec"]["template"]["spec"]["containers"][0]["image"] = container_image - - # Adding more args to client - client_args = k8s_job_dict["spec"]["template"]["spec"]["containers"][0]["args"] - for carg, value in add_client_args: - client_args.append(f"--{carg}") - client_args.append(f"{value}") - k8s_job_dict["spec"]["template"]["spec"]["containers"][0]["args"] = client_args - - return k8s_job_dict # type: ignore[no-any-return] - - -def prep( - cluster_id: str, - # k8s CL args - job_config_stub: Path, - host: str, - namespace: str, - cpu_arch: str, - # starter CL args -- worker - worker_memory_bytes: int, - worker_disk_bytes: int, - n_workers: int, - n_cores: int, - # starter CL args -- client - client_args: list[tuple[str, str]], - client_startup_json_s3: S3File, - container_image: str, -) -> dict[str, Any]: - """Create objects needed for 
starting cluster.""" - if host == LOCAL_K8S_HOST and n_workers > ENV.WORKER_K8S_LOCAL_WORKERS_MAX: - LOGGER.warning( - f"Requested more workers ({n_workers}) than the max allowed {ENV.WORKER_K8S_LOCAL_WORKERS_MAX}. Using the maximum instead." - ) - n_workers = ENV.WORKER_K8S_LOCAL_WORKERS_MAX - - # make k8s job description - k8s_job_dict = make_k8s_job_desc( - job_config_stub, - host, - namespace, - cluster_id, - # condor args - worker_memory_bytes, - worker_disk_bytes, - n_workers, - n_cores, - # skymap scanner args - container_image, - client_startup_json_s3, - client_args, - cpu_arch, - # env vars for secrets - SECRET_FORWARDED_ENV_VARS, - ) - try: - # must be natively json-encodable - LOGGER.info(json.dumps(k8s_job_dict, indent=4)) - except json.decoder.JSONDecodeError: - LOGGER.info(pprint.pformat(k8s_job_dict, indent=4)) - raise - - return k8s_job_dict - - -def start( - k8s_api: kubernetes.client.ApiClient, - k8s_job_dict: dict[str, Any], - cluster_id: str, - # k8s CL args - host: str, - namespace: str, -) -> dict[str, Any]: - """Start cluster.""" - - # create namespace - # kubernetes.client.CoreV1Api(k8s_api).create_namespace( - # kubernetes.client.V1Namespace( - # metadata=kubernetes.client.V1ObjectMeta(name=namespace) - # ) - # ) - - # create secret - k8s_tools.patch_or_create_namespaced_secret( - kubernetes.client.CoreV1Api(k8s_api), - host, - namespace, - k8s_tools.get_worker_k8s_secret_name(cluster_id), - "opaque", - { - v.lower(): base64.b64encode(os.environ[v].encode("ascii")).decode("utf-8") - for v in SECRET_FORWARDED_ENV_VARS - }, - ) - - # submit jobs - kubernetes.utils.create_from_dict(k8s_api, k8s_job_dict, namespace=namespace) - - return k8s_job_dict diff --git a/clientmanager/k8s/stopper.py b/clientmanager/k8s/stopper.py deleted file mode 100644 index e5e53de6..00000000 --- a/clientmanager/k8s/stopper.py +++ /dev/null @@ -1,45 +0,0 @@ -"""For stopping Skymap Scanner clients on a K8s cluster.""" - - -import logging - -import kubernetes # type: 
ignore[import-untyped] - -from . import k8s_tools - -LOGGER = logging.getLogger(__name__) - - -def stop( - namespace: str, - cluster_id: str, - k8s_api: kubernetes.client.ApiClient, -) -> None: - """Main logic.""" - - # Remove workers -- may not be instantaneous - LOGGER.info("Requesting removal...") - resp = kubernetes.client.BatchV1Api(k8s_api).delete_namespaced_job( - name=cluster_id, - namespace=namespace, - body=kubernetes.client.V1DeleteOptions( - propagation_policy="Foreground", grace_period_seconds=5 - ), - ) - LOGGER.info( - f"Removed workers: {cluster_id} in namespace {namespace} with response {resp.status} " - ) - - # Remove secret -- may not be instantaneous - resp = kubernetes.client.CoreV1Api(k8s_api).delete_namespaced_secret( - name=k8s_tools.get_worker_k8s_secret_name(cluster_id), - namespace=namespace, - body=kubernetes.client.V1DeleteOptions( - propagation_policy="Foreground", grace_period_seconds=5 - ), - ) - LOGGER.info( - f"Removed secret: {k8s_tools.get_worker_k8s_secret_name(cluster_id)} in namespace {namespace} with response {resp.status} " - ) - - # TODO: get/forward job logs diff --git a/clientmanager/py.typed b/clientmanager/py.typed deleted file mode 100644 index e69de29b..00000000 diff --git a/clientmanager/utils.py b/clientmanager/utils.py deleted file mode 100644 index 02f6e49a..00000000 --- a/clientmanager/utils.py +++ /dev/null @@ -1,138 +0,0 @@ -"""General Utilities.""" - - -import dataclasses as dc -import logging -from pathlib import Path -from typing import Any - -import boto3 # type: ignore[import-untyped] -import requests -from rest_tools.client import RestClient - -from .config import ENV - -LOGGER = logging.getLogger(__name__) - - -def connect_to_skydriver() -> RestClient: - """Connect to SkyDriver REST server & check scan id.""" - if not ENV.SKYSCAN_SKYDRIVER_SCAN_ID: - raise RuntimeError( - "Cannot connect to SkyDriver without `SKYSCAN_SKYDRIVER_SCAN_ID`" - ) - - skydriver_rc = RestClient( - ENV.SKYSCAN_SKYDRIVER_ADDRESS, 
- token=ENV.SKYSCAN_SKYDRIVER_AUTH, - ) - - LOGGER.info("Connected to SkyDriver") - return skydriver_rc - - -def skydriver_aborted_scan(skydriver_rc: RestClient) -> bool: - """Return whether the scan has been signaled for deletion.""" - ret = skydriver_rc.request_seq( - "GET", - f"/scan/{ENV.SKYSCAN_SKYDRIVER_SCAN_ID}/manifest", - ) - return ret["is_deleted"] # type: ignore[no-any-return] - - -def update_skydriver( - skydriver_rc: RestClient, - orchestrator: str, - location: dict[str, str], - uuid: str, - cluster_id: str | int, - n_workers: int, - starter_info: dict[str, Any], - # - statuses: dict[str, dict[str, int]] | None = None, - top_task_errors: dict[str, int] | None = None, -) -> None: - """Send SkyDriver updates from the `submit_result`.""" - skydriver_cluster_obj = { - "orchestrator": orchestrator, - "location": location, - "uuid": uuid, - "cluster_id": str(cluster_id), - "n_workers": n_workers, - "starter_info": starter_info, - } - if statuses: - skydriver_cluster_obj["statuses"] = statuses - if top_task_errors: - skydriver_cluster_obj["top_task_errors"] = top_task_errors - - skydriver_rc.request_seq( - "PATCH", - f"/scan/{ENV.SKYSCAN_SKYDRIVER_SCAN_ID}/manifest", - {"cluster": skydriver_cluster_obj}, - ) - - -@dc.dataclass -class S3File: - """Wrap an S3 file.""" - - url: str - fname: str - - -def s3ify(filepath: Path) -> S3File: - """Put the file in s3 and return info about it.""" - if not ( - ENV.EWMS_TMS_S3_URL - and ENV.EWMS_TMS_S3_ACCESS_KEY_ID - and ENV.EWMS_TMS_S3_SECRET_KEY - and ENV.EWMS_TMS_S3_BUCKET - and ENV.SKYSCAN_SKYDRIVER_SCAN_ID - ): - raise RuntimeError( - "must define all EWMS_TMS_S3_* environment variables to use S3" - ) - s3_client = boto3.client( - "s3", - "us-east-1", - endpoint_url=ENV.EWMS_TMS_S3_URL, - aws_access_key_id=ENV.EWMS_TMS_S3_ACCESS_KEY_ID, - aws_secret_access_key=ENV.EWMS_TMS_S3_SECRET_KEY, - ) - bucket = ENV.EWMS_TMS_S3_BUCKET - key = f"{ENV.SKYSCAN_SKYDRIVER_SCAN_ID}-s3-{filepath.stem}" - - # get GET url - get_url = 
s3_client.generate_presigned_url( - "get_object", - Params={ - "Bucket": bucket, - "Key": key, - }, - ExpiresIn=ENV.EWMS_TMS_S3_EXPIRATION, # seconds - ) - s3_file = S3File(get_url, key) - - # check if already there (via other process/container) - try: - resp = requests.get(get_url) - resp.raise_for_status() - LOGGER.debug(resp) - LOGGER.info(f"File is already in S3. Using url: {get_url}") - return s3_file - except requests.exceptions.HTTPError: - LOGGER.info("File is not in S3 yet. Posting...") - - # POST - upload_details = s3_client.generate_presigned_post(bucket, key) - with open(filepath, "rb") as f: - response = requests.post( - upload_details["url"], - data=upload_details["fields"], - files={"file": (filepath.name, f)}, # maps filename to obj - ) - LOGGER.info(f"Upload response: {response.status_code}") - LOGGER.info(str(response.content)) - - return s3_file diff --git a/setup.cfg b/setup.cfg index 825d8675..082e7d27 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,7 +4,6 @@ python_max = 3.11 patch_without_tag = False package_dirs = skydriver - clientmanager s3_sidecar [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords From fbb9d3eb85664f76643ffb0deb4d25c499fda613 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 8 Jan 2025 17:55:21 -0600 Subject: [PATCH 033/327] add `MIN_SKYMAP_SCANNER_TAG` - 1 --- skydriver/config.py | 1 + skydriver/images.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/skydriver/config.py b/skydriver/config.py index 55385562..d824b71b 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -76,6 +76,7 @@ class EnvConfig: SCAN_BACKLOG_PENDING_ENTRY_TTL_REVIVE: int = 5 * 60 # entry is revived after N secs THIS_IMAGE_WITH_TAG: str = "" + MIN_SKYMAP_SCANNER_TAG: str = "v4.0.0" # k8s K8S_NAMESPACE: str = "" diff --git a/skydriver/images.py b/skydriver/images.py index c40d65f5..cde24dfd 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -6,6 +6,9 @@ import cachetools.func 
import requests +from dateutil import parser as dateutil_parser + +from skydriver.config import ENV LOGGER = logging.getLogger(__name__) @@ -70,6 +73,17 @@ def _match_sha_to_majminpatch(sha: str) -> str | None: return None +@cachetools.func.lru_cache() +def _get_image_ts(docker_tag: str) -> float: + """Get the timestamp for when the image was created.""" + try: + dtime = requests.get(f"{DOCKERHUB_API_URL}/{docker_tag}").json()["last_updated"] + return dateutil_parser.parse(dtime).timestamp() + except Exception as e: + LOGGER.exception(e) + raise e + + @cachetools.func.ttl_cache(ttl=5 * 60) def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: """Get the '#.#.#' tag on Docker Hub w/ `docker_tag`'s SHA if possible. @@ -94,14 +108,22 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: if VERSION_REGEX_MAJMINPATCH.fullmatch(docker_tag): return docker_tag + # check that the image is not too old + if _get_image_ts(docker_tag) < _get_image_ts(ENV.MIN_SKYMAP_SCANNER_TAG): + raise ValueError( + "Image tag is too old to be supported--contact admins for more info" + ) + _error = ValueError("Image tag could not resolve to a full version") + # get sha try: sha = requests.get(f"{DOCKERHUB_API_URL}/{docker_tag}").json()["digest"] except Exception as e: LOGGER.exception(e) raise _error + # match sha to vX.Y.Z try: if majminpatch := _match_sha_to_majminpatch(sha): return majminpatch @@ -112,6 +134,7 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: raise _error +@cachetools.func.ttl_cache(ttl=5 * 60) def tag_exists_on_docker_hub(docker_tag: str) -> bool: """Return whether the tag exists on Docker Hub.""" if not docker_tag or not docker_tag.strip(): From 319eb4451142534c094f9d6fdabf85aeede92937 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 8 Jan 2025 18:12:32 -0600 Subject: [PATCH 034/327] add `MIN_SKYMAP_SCANNER_TAG` - 2 --- skydriver/images.py | 53 ++++++++++++++++++++++++--------------------- 1 file changed, 28 
insertions(+), 25 deletions(-) diff --git a/skydriver/images.py b/skydriver/images.py index cde24dfd..1fed5639 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -73,18 +73,23 @@ def _match_sha_to_majminpatch(sha: str) -> str | None: return None -@cachetools.func.lru_cache() -def _get_image_ts(docker_tag: str) -> float: +def _parse_image_ts(info: dict) -> float: """Get the timestamp for when the image was created.""" try: - dtime = requests.get(f"{DOCKERHUB_API_URL}/{docker_tag}").json()["last_updated"] - return dateutil_parser.parse(dtime).timestamp() + return dateutil_parser.parse(info["last_updated"]).timestamp() except Exception as e: LOGGER.exception(e) raise e -@cachetools.func.ttl_cache(ttl=5 * 60) +@cachetools.func.lru_cache() # cache it forever +def min_skymap_scanner_tag_ts() -> float: + """Get the timestamp for when the `MIN_SKYMAP_SCANNER_TAG` image was created.""" + info = get_info_from_docker_hub(ENV.MIN_SKYMAP_SCANNER_TAG) + return _parse_image_ts(info) + + +@cachetools.func.ttl_cache(ttl=5 * 60) # don't cache too long, tags can be overwritten def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: """Get the '#.#.#' tag on Docker Hub w/ `docker_tag`'s SHA if possible. 
@@ -102,49 +107,47 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: ValueError -- if `docker_tag` doesn't exist on Docker Hub ValueError -- if there's an issue communicating w/ Docker Hub API """ - if not tag_exists_on_docker_hub(docker_tag): - raise ValueError(f"Image tag not on Docker Hub: {docker_tag}") + info = get_info_from_docker_hub(docker_tag) if VERSION_REGEX_MAJMINPATCH.fullmatch(docker_tag): return docker_tag # check that the image is not too old - if _get_image_ts(docker_tag) < _get_image_ts(ENV.MIN_SKYMAP_SCANNER_TAG): + if _parse_image_ts(info) < min_skymap_scanner_tag_ts(): raise ValueError( - "Image tag is too old to be supported--contact admins for more info" + f"Image tag is older than the minimum supported tag " + f"'{ENV.MIN_SKYMAP_SCANNER_TAG}'. Contact admins for more info" ) - _error = ValueError("Image tag could not resolve to a full version") - - # get sha - try: - sha = requests.get(f"{DOCKERHUB_API_URL}/{docker_tag}").json()["digest"] - except Exception as e: - LOGGER.exception(e) - raise _error - # match sha to vX.Y.Z try: - if majminpatch := _match_sha_to_majminpatch(sha): + if majminpatch := _match_sha_to_majminpatch(info["digest"]): return majminpatch else: # no match return docker_tag except Exception as e: LOGGER.exception(e) - raise _error + raise ValueError("Image tag could not resolve to a full version") + +def get_info_from_docker_hub(docker_tag: str) -> dict: + """Get the json dict from GET @ Docker Hub.""" + _error = ValueError(f"Image tag not on Docker Hub: {docker_tag}") -@cachetools.func.ttl_cache(ttl=5 * 60) -def tag_exists_on_docker_hub(docker_tag: str) -> bool: - """Return whether the tag exists on Docker Hub.""" if not docker_tag or not docker_tag.strip(): - return False + raise _error + try: - return requests.get(f"{DOCKERHUB_API_URL}/{docker_tag}").ok + resp = requests.get(f"{DOCKERHUB_API_URL}/{docker_tag}") except Exception as e: LOGGER.exception(e) raise ValueError("Image tag verification 
failed") + if not resp.ok: + raise _error + + return resp.json() + def resolve_docker_tag(docker_tag: str) -> str: """Check if the docker tag exists, then resolve 'latest' if needed. From 3dff4586daa39f4888abe4ca8b587f5707615d10 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 9 Jan 2025 13:29:42 -0600 Subject: [PATCH 035/327] add `ewms_init_container` --- ewms_init_container/__init__.py | 16 +++++++++ ewms_init_container/__main__.py | 56 +++++++++++++++++++++++++++++++ s3_sidecar/post.py | 14 ++++++-- setup.cfg | 1 + skydriver/k8s/scanner_instance.py | 6 +++- 5 files changed, 90 insertions(+), 3 deletions(-) create mode 100644 ewms_init_container/__init__.py create mode 100644 ewms_init_container/__main__.py diff --git a/ewms_init_container/__init__.py b/ewms_init_container/__init__.py new file mode 100644 index 00000000..a5b4c4b9 --- /dev/null +++ b/ewms_init_container/__init__.py @@ -0,0 +1,16 @@ +"""Public init.""" + +# version is a human-readable version number. + +# version_info is a four-tuple for programmatic comparison. The first +# three numbers are the components of the version number. 
The fourth +# is zero for an official release, positive for a development branch, +# or negative for a release candidate or beta (after the base version +# number has been incremented) +__version__ = "1.1.0" +version_info = ( + int(__version__.split(".")[0]), + int(__version__.split(".")[1]), + int(__version__.split(".")[2]), + 0, +) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py new file mode 100644 index 00000000..b9a7894d --- /dev/null +++ b/ewms_init_container/__main__.py @@ -0,0 +1,56 @@ +"""Run the EWMS Init Container logic.""" + +import argparse +import json +import logging +from pathlib import Path + +LOGGER = logging.getLogger(__package__) + + +def get_workflow_id(scan_id: str) -> str: + """Retrieve the workflow id for the scan (w/ `scan_id`).""" + LOGGER.info(f"getting workflow id for scan {scan_id}...") + + +def get_ewms_attrs(workflow_id: str) -> dict[str, str]: + """Retrieve the EWMS attributes for the workflow.""" + LOGGER.info(f"getting EWMS attributes for workflow {workflow_id}...") + + +def _assure_json(val: str) -> Path: + fpath = Path(val) + if fpath.suffix != ".json": + raise ValueError(f"File {fpath} is not a JSON file.") + return fpath + + +def main() -> None: + """Main.""" + parser = argparse.ArgumentParser( + description="Retrieve EWMS attributes for use by a Skymap Scanner instance.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "scan_id", + type=str, + help="the scan id", + ) + parser.add_argument( + "--json-out", + type=_assure_json, + help="the json file to write the map of EWMS attributes to", + ) + args = parser.parse_args() + + workflow_id = get_workflow_id(args.scan_id) + ewms_dict = get_ewms_attrs(workflow_id) + + LOGGER.info(f"dumping EWMS attributes to '{args.json_out}'...") + with open(args.json_out, "w") as f: + json.dump(ewms_dict, f) + + +if __name__ == "__main__": + main() + LOGGER.info("Done.") diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index 
bf5e7cff..a045d513 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -1,6 +1,7 @@ """Utilities for posting to an S3 bucket.""" import argparse +import logging import os import time from pathlib import Path @@ -8,13 +9,18 @@ import boto3 import requests +LOGGER = logging.getLogger(__package__) + def post(fpath: Path) -> None: """Post the file to the S3 bucket.""" if not fpath.exists(): raise FileNotFoundError(str(fpath)) + + LOGGER.info("file exists, waiting a bit longer just in case") time.sleep(5) # in case the file is currently being written (good enough logic?) + LOGGER.info("connecting to s3...") s3_client = boto3.client( "s3", "us-east-1", @@ -24,10 +30,12 @@ def post(fpath: Path) -> None: ) # POST + LOGGER.info("generating presigned post-url...") upload_details = s3_client.generate_presigned_post( os.environ["S3_BUCKET"], os.environ["S3_OBJECT_KEY"], ) + LOGGER.info("posting file to s3...") with open(fpath, "rb") as f: response = requests.post( upload_details["url"], @@ -35,8 +43,8 @@ def post(fpath: Path) -> None: files={"file": (fpath.name, f)}, # maps filename to obj ) - print(f"Upload response: {response.status_code}") - print(str(response.content)) + LOGGER.info(f"Upload response: {response.status_code}") + LOGGER.info(str(response.content)) def main() -> None: @@ -61,6 +69,7 @@ def main() -> None: args = parser.parse_args() if args.wait_indefinitely: + LOGGER.info("Waiting for file to exist...") while not args.fpath.exists(): time.sleep(1) @@ -69,3 +78,4 @@ def main() -> None: if __name__ == "__main__": main() + LOGGER.info("Done.") diff --git a/setup.cfg b/setup.cfg index 082e7d27..f2f36c5e 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,6 +5,7 @@ patch_without_tag = False package_dirs = skydriver s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 
7c0dc2f7..5b49652c 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -119,6 +119,10 @@ def _make_job( spec: serviceAccountName: {ENV.K8S_SKYSCAN_JOBS_SERVICE_ACCOUNT} restartPolicy: Never + initContainers: + - name: init-ewms-{scan_id} + image: {ENV.THIS_IMAGE_WITH_TAG} + command: ["python", "-m", "ewms_init_container"] containers: - name: skyscan-server-{scan_id} image: {images.get_skyscan_docker_image(docker_tag)} @@ -136,7 +140,7 @@ def _make_job( volumeMounts: - name: common-space-volume mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" - - name: s3-sidecar-{scan_id} + - name: sidecar-s3-{scan_id} restartPolicy: OnFailure image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "s3_sidecar.post"] From 0098793216462fbcea9cae71546c0256edab7832 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 9 Jan 2025 13:55:27 -0600 Subject: [PATCH 036/327] add `ewms_init_container` - 2 --- ewms_init_container/__main__.py | 29 ++++++++++++++++++++++++----- s3_sidecar/post.py | 1 + 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index b9a7894d..95e2c9c3 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -1,22 +1,41 @@ """Run the EWMS Init Container logic.""" import argparse +import asyncio import json import logging +import os from pathlib import Path +from rest_tools.client import ClientCredentialsAuth + LOGGER = logging.getLogger(__package__) -def get_workflow_id(scan_id: str) -> str: +async def get_workflow_id(scan_id: str) -> str: """Retrieve the workflow id for the scan (w/ `scan_id`).""" LOGGER.info(f"getting workflow id for scan {scan_id}...") + skydriver_rc = ClientCredentialsAuth( + os.environ["EWMS_ADDRESS"], + os.environ["EWMS_TOKEN_URL"], + os.environ["EWMS_CLIENT_ID"], + os.environ["EWMS_CLIENT_SECRET"], + logger=LOGGER, + ) + resp = skydriver_rc.request("GET", f"/scan/{scan_id}/manifest") + 
workflow_id = resp["ewms_task"] # this is the workflow_id + + LOGGER.info(f"workflow id: {workflow_id}") + return workflow_id + def get_ewms_attrs(workflow_id: str) -> dict[str, str]: """Retrieve the EWMS attributes for the workflow.""" LOGGER.info(f"getting EWMS attributes for workflow {workflow_id}...") + # TODO + def _assure_json(val: str) -> Path: fpath = Path(val) @@ -25,7 +44,7 @@ def _assure_json(val: str) -> Path: return fpath -def main() -> None: +async def main() -> None: """Main.""" parser = argparse.ArgumentParser( description="Retrieve EWMS attributes for use by a Skymap Scanner instance.", @@ -43,8 +62,8 @@ def main() -> None: ) args = parser.parse_args() - workflow_id = get_workflow_id(args.scan_id) - ewms_dict = get_ewms_attrs(workflow_id) + workflow_id = await get_workflow_id(args.scan_id) + ewms_dict = await get_ewms_attrs(workflow_id) LOGGER.info(f"dumping EWMS attributes to '{args.json_out}'...") with open(args.json_out, "w") as f: @@ -52,5 +71,5 @@ def main() -> None: if __name__ == "__main__": - main() + asyncio.run(main()) LOGGER.info("Done.") diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index a045d513..c91f0cc6 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -71,6 +71,7 @@ def main() -> None: if args.wait_indefinitely: LOGGER.info("Waiting for file to exist...") while not args.fpath.exists(): + # TODO: use wipac_dev_tools.timing_tools.IntervalTimer to log every X sec time.sleep(1) post(args.fpath) From 89714193ee5d017891c19e62daba0e2562032d4d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 9 Jan 2025 14:16:00 -0600 Subject: [PATCH 037/327] use `ewms_workflow_id` instead of overloading `ewms_task` --- skydriver/database/schema.py | 27 ++++++++++++++++++++------- skydriver/ewms.py | 2 +- skydriver/k8s/scan_backlog.py | 2 +- skydriver/rest_handlers.py | 12 +++++++++--- 4 files changed, 31 insertions(+), 12 deletions(-) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index f52d9749..74cb6214 
100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -131,7 +131,10 @@ def obfuscate_cl_args(args: str) -> str: return " ".join(out_args) +PENDING_EWMS_WORKFLOW = "pending ewms" + DEPRECATED_EVENT_I3LIVE_JSON_DICT = "use 'i3_event_id'" +DEPRECATED_EWMS_TASK = "use 'ewms_workflow_id'" @typechecked @@ -142,13 +145,16 @@ class Manifest(ScanIDDataclass): timestamp: float is_deleted: bool - ewms_task: dict | str # `""` -> workflow request has not (yet) been sent to EWMS - # ^^^ str -> EWMS workflow id (i.e. this id points to info in EWMS) - # ^^^ dict -> **DEPRECATED** was used in skydriver 1.x to use local k8s starter/stopper - # args placed in k8s job obj scanner_server_args: str + # EWMS interface + ewms_workflow_id: str | None = None # id points to info in EWMS + ewms_finished: bool = False # a cache so we don't have to call to ewms each time + # -> deprecated fields -- see __post_init__ for backward compatibility logic + ewms_task: dict | str = DEPRECATED_EWMS_TASK # **DEPRECATED** + # ^^^ was used in skydriver 1.x to use local k8s starter/stopper + priority: int = ( 0 # same as https://htcondor.readthedocs.io/en/latest/users-manual/priorities-and-preemption.html#job-priority ) @@ -171,9 +177,8 @@ class Manifest(ScanIDDataclass): last_updated: float = 0.0 - ewms_finished: bool = False # a cache so we don't have to call to ewms each time - def __post_init__(self) -> None: + # Backward compatibility: 'i3_event_id' replaced 'event_i3live_json_dict' if ( not self.i3_event_id and self.event_i3live_json_dict == DEPRECATED_EVENT_I3LIVE_JSON_DICT @@ -182,12 +187,20 @@ def __post_init__(self) -> None: "Manifest must define 'i3_event_id' " "(old manifests may define 'event_i3live_json_dict' instead)" ) - self.scanner_server_args = obfuscate_cl_args(self.scanner_server_args) + # Backward compatibility: 'ewms_workflow_id' replaced 'ewms_task' + if not self.ewms_workflow_id and self.ewms_task == DEPRECATED_EWMS_TASK: + raise ValueError( + "Manifest 
must define 'ewms_workflow_id' " + "(old manifests may define 'ewms_task' instead)" + ) # Backward compatibility: 1.x had 'complete' in a nested field if isinstance(self.ewms_task, dict): self.ewms_finished = self.ewms_task.get("complete", False) + # don't show sensitive data to user + self.scanner_server_args = obfuscate_cl_args(self.scanner_server_args) + def get_state(self) -> ScanState: """Determine the state of the scan by parsing attributes.""" if ( diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 470ab22b..75fdc1c2 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -11,7 +11,7 @@ async def request_workflow_on_ewms( scan_request_obj: dict, ) -> str: """Request a workflow in EWMS.""" - if not (isinstance(manifest.ewms_task, str) and manifest.ewms_task): + if manifest.ewms_workflow_id != database.schema.PENDING_EWMS_WORKFLOW: raise TypeError("Manifest is not designated for EWMS") s3_url_get = s3.generate_s3_get_url(manifest.scan_id) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 6dcea627..5df70476 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -211,7 +211,7 @@ async def _run( continue await manifest_client.collection.find_one_and_update( {"scan_id": manifest.scan_id}, - {"$set": {"ewms_task": workflow_id}}, + {"$set": {"ewms_workflow_id": workflow_id}}, ) LOGGER.info( diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index a8fb1f17..3900b8bd 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -584,7 +584,8 @@ async def _start_scan( is_deleted=False, i3_event_id=scan_request_obj["i3_event_id"], scanner_server_args=scanner_server_args, - ewms_task="", # set once the workflow request has been sent to EWMS (see backlogger) + ewms_workflow_id=schema.PENDING_EWMS_WORKFLOW, + # ^^^ set once the workflow request has been sent to EWMS (see backlogger) classifiers=scan_request_obj["classifiers"], priority=scan_request_obj["priority"], ) @@ -681,8 
+682,13 @@ async def stop_skyscan_workers( return manifest # request to ewms - if manifest.ewms_task and isinstance(manifest.ewms_task, str): - await request_stop_on_ewms(ewms_rc, manifest.ewms_task) + if manifest.ewms_workflow_id: + if manifest.ewms_workflow_id == schema.PENDING_EWMS_WORKFLOW: + LOGGER.info( + "OK: attempted to stop skyscan workers but scan has not been sent to EWMS" + ) + else: + await request_stop_on_ewms(ewms_rc, manifest.ewms_workflow_id) else: raise web.HTTPError( 400, From 5cb0af6446bf98a169cf216442532244501f5357 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 9 Jan 2025 15:11:59 -0600 Subject: [PATCH 038/327] add `ewms_init_container` - 3 --- ewms_init_container/__main__.py | 63 ++++++++++++++++++++++++++----- skydriver/k8s/scanner_instance.py | 3 +- 2 files changed, 55 insertions(+), 11 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 95e2c9c3..e0d7f98f 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -5,9 +5,10 @@ import json import logging import os +import time from pathlib import Path -from rest_tools.client import ClientCredentialsAuth +from rest_tools.client import ClientCredentialsAuth, RestClient LOGGER = logging.getLogger(__package__) @@ -16,25 +17,67 @@ async def get_workflow_id(scan_id: str) -> str: """Retrieve the workflow id for the scan (w/ `scan_id`).""" LOGGER.info(f"getting workflow id for scan {scan_id}...") - skydriver_rc = ClientCredentialsAuth( - os.environ["EWMS_ADDRESS"], - os.environ["EWMS_TOKEN_URL"], - os.environ["EWMS_CLIENT_ID"], - os.environ["EWMS_CLIENT_SECRET"], + skyd_rc = RestClient( + os.environ["SKYSCAN_SKYDRIVER_ADDRESS"], + os.environ["SKYSCAN_SKYDRIVER_AUTH"], logger=LOGGER, ) - resp = skydriver_rc.request("GET", f"/scan/{scan_id}/manifest") - workflow_id = resp["ewms_task"] # this is the workflow_id + resp = await skyd_rc.request("GET", f"/scan/{scan_id}/manifest") + workflow_id = resp["ewms_workflow_id"] 
LOGGER.info(f"workflow id: {workflow_id}") return workflow_id -def get_ewms_attrs(workflow_id: str) -> dict[str, str]: +async def get_ewms_attrs(workflow_id: str) -> dict[str, str]: """Retrieve the EWMS attributes for the workflow.""" LOGGER.info(f"getting EWMS attributes for workflow {workflow_id}...") - # TODO + ewms_rc = ClientCredentialsAuth( + os.environ["EWMS_ADDRESS"], + os.environ["EWMS_TOKEN_URL"], + os.environ["EWMS_CLIENT_ID"], + os.environ["EWMS_CLIENT_SECRET"], + logger=LOGGER, + ) + + # loop until mqprofiles is not empty and all "is_activated" fields are true + while True: + LOGGER.info("requesting EWMS mqprofiles...") + mqprofiles = ( + await ewms_rc.request( + "GET", + f"/v0/mqs/workflows/{workflow_id}/mq-profiles/public", + ) + )["mqprofiles"] + if mqprofiles and all(m["is_activated"] for m in mqprofiles): + break + else: + LOGGER.info("mqprofiles are not all activated, retrying soon...") + time.sleep(10) + + LOGGER.info(f"mqprofiles: {mqprofiles}") + + # convert mqprofiles to dicts based on the queue aliases + toclient = next( + p for p in mqprofiles if p["mqid"] == os.environ["QUEUE_ALIAS_TOCLIENT"] + ) + fromclient = next( + p for p in mqprofiles if p["mqid"] == os.environ["QUEUE_ALIAS_FROMCLIENT"] + ) + + return { + # to-client + "SKYSCAN_MQ_TOCLIENT": toclient["mqid"], + "SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN": toclient["auth_token"], + "SKYSCAN_MQ_TOCLIENT_BROKER_TYPE": toclient["broker_type"], + "SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS": toclient["broker_address"], + # from-client + "SKYSCAN_MQ_FROMCLIENT": fromclient["mqid"], + "SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN": fromclient["auth_token"], + "SKYSCAN_MQ_FROMCLIENT_BROKER_TYPE": fromclient["broker_type"], + "SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS": fromclient["broker_address"], + } def _assure_json(val: str) -> Path: diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 5b49652c..b38b3675 100644 --- a/skydriver/k8s/scanner_instance.py +++ 
b/skydriver/k8s/scanner_instance.py @@ -123,6 +123,7 @@ def _make_job( - name: init-ewms-{scan_id} image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "ewms_init_container"] + args: ["{scan_id}", "--json-out", "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'ewms_env.json'}"] containers: - name: skyscan-server-{scan_id} image: {images.get_skyscan_docker_image(docker_tag)} @@ -144,7 +145,7 @@ def _make_job( restartPolicy: OnFailure image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "s3_sidecar.post"] - args: ["{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'}" "--wait-indefinitely"] + args: ["{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'}", "--wait-indefinitely"] env: - name: S3_URL value: "{ENV.S3_URL}" From df02daf8f185edef7640048d0874be7419f6f1f6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 9 Jan 2025 16:23:50 -0600 Subject: [PATCH 039/327] add `ewms_init_container` - 4 --- ewms_init_container/__main__.py | 24 +++++++++++++----------- skydriver/config.py | 1 - skydriver/k8s/scanner_instance.py | 10 ++++++---- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index e0d7f98f..414b9050 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -29,7 +29,7 @@ async def get_workflow_id(scan_id: str) -> str: return workflow_id -async def get_ewms_attrs(workflow_id: str) -> dict[str, str]: +async def get_ewms_attrs(workflow_id: str) -> dict[str, dict[str, str]]: """Retrieve the EWMS attributes for the workflow.""" LOGGER.info(f"getting EWMS attributes for workflow {workflow_id}...") @@ -67,16 +67,18 @@ async def get_ewms_attrs(workflow_id: str) -> dict[str, str]: ) return { - # to-client - "SKYSCAN_MQ_TOCLIENT": toclient["mqid"], - "SKYSCAN_MQ_TOCLIENT_AUTH_TOKEN": toclient["auth_token"], - "SKYSCAN_MQ_TOCLIENT_BROKER_TYPE": toclient["broker_type"], - "SKYSCAN_MQ_TOCLIENT_BROKER_ADDRESS": 
toclient["broker_address"], - # from-client - "SKYSCAN_MQ_FROMCLIENT": fromclient["mqid"], - "SKYSCAN_MQ_FROMCLIENT_AUTH_TOKEN": fromclient["auth_token"], - "SKYSCAN_MQ_FROMCLIENT_BROKER_TYPE": fromclient["broker_type"], - "SKYSCAN_MQ_FROMCLIENT_BROKER_ADDRESS": fromclient["broker_address"], + "toclient": { + "name": toclient["mqid"], + "auth_token": toclient["auth_token"], + "broker_type": toclient["broker_type"], + "broker_address": toclient["broker_address"], + }, + "fromclient": { + "name": fromclient["mqid"], + "auth_token": fromclient["auth_token"], + "broker_type": fromclient["broker_type"], + "broker_address": fromclient["broker_address"], + }, } diff --git a/skydriver/config.py b/skydriver/config.py index d824b71b..0890473a 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -94,7 +94,6 @@ class EnvConfig: KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST: str = "" # skyscan (forwarded) - SKYSCAN_BROKER_ADDRESS: str = "localhost" # TODO: see https://github.com/WIPACrepo/wipac-dev-tools/pull/69 SKYSCAN_PROGRESS_INTERVAL_SEC: Optional[int] = None SKYSCAN_RESULT_INTERVAL_SEC: Optional[int] = None diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index b38b3675..c69e71f3 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -22,6 +22,8 @@ class SkyScanK8sJobFactory: """Makes Skymap Scanner Kubernetes jobs, plus misc tools.""" COMMON_SPACE_VOLUME_PATH = Path("/common-space") + _STARTUP_JSON_FPATH = COMMON_SPACE_VOLUME_PATH / "startup.json" + _EWMS_JSON_FPATH = COMMON_SPACE_VOLUME_PATH / "ewms.json" @staticmethod def make( @@ -123,7 +125,7 @@ def _make_job( - name: init-ewms-{scan_id} image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "ewms_init_container"] - args: ["{scan_id}", "--json-out", "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'ewms_env.json'}"] + args: ["{scan_id}", "--json-out", "{SkyScanK8sJobFactory._EWMS_JSON_FPATH}"] containers: - name: skyscan-server-{scan_id} image: 
{images.get_skyscan_docker_image(docker_tag)} @@ -145,7 +147,7 @@ def _make_job( restartPolicy: OnFailure image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "s3_sidecar.post"] - args: ["{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'}", "--wait-indefinitely"] + args: ["{SkyScanK8sJobFactory._STARTUP_JSON_FPATH}", "--wait-indefinitely"] env: - name: S3_URL value: "{ENV.S3_URL}" @@ -202,7 +204,7 @@ def get_scanner_server_args( f" --reco-algo {reco_algo}" f" --cache-dir {SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH} " # f" --output-dir {common_space_volume_path} " # output is sent to skydriver - f" --client-startup-json {SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH/'startup.json'} " + f" --client-startup-json {SkyScanK8sJobFactory._STARTUP_JSON_FPATH} " f" --nsides {' '.join(f'{n}:{x}' for n,x in nsides.items())} " # k1:v1 k2:v2 f" {'--real-event' if is_real_event else '--simulated-event'} " f" --predictive-scanning-threshold {predictive_scanning_threshold} " @@ -240,7 +242,7 @@ def make_skyscan_server_envvars( # 1. 
add required env vars required = { # broker/mq vars - "SKYSCAN_BROKER_ADDRESS": ENV.SKYSCAN_BROKER_ADDRESS, + "SKYSCAN_EWMS_JSON": str(SkyScanK8sJobFactory._EWMS_JSON_FPATH), # skydriver vars "SKYSCAN_SKYDRIVER_ADDRESS": rest_address, "SKYSCAN_SKYDRIVER_SCAN_ID": scan_id, From 1ba8d584b683bf00fd844b07a2ac308b60b8486c Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 9 Jan 2025 16:50:59 -0600 Subject: [PATCH 040/327] add `ewms_init_container` - 5 (env vars) --- skydriver/config.py | 3 ++ skydriver/ewms.py | 7 +++-- skydriver/k8s/scanner_instance.py | 46 +++++++++++++++++++++++-------- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 0890473a..8332c97c 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -30,6 +30,9 @@ SCAN_MIN_PRIORITY_TO_START_ASAP = 100 +QUEUE_ALIAS_TOCLIENT = "to-client-queue" # this *needs* to stay constant, stored in db +QUEUE_ALIAS_FROMCLIENT = "from-client-queue" # '' + @enum.unique class DebugMode(enum.Enum): diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 75fdc1c2..ad027707 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -3,6 +3,7 @@ from rest_tools.client import RestClient from . 
import database, images, s3 +from .config import QUEUE_ALIAS_FROMCLIENT, QUEUE_ALIAS_TOCLIENT async def request_workflow_on_ewms( @@ -18,14 +19,14 @@ async def request_workflow_on_ewms( image = images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) body = { - "public_queue_aliases": ["to-client-queue", "from-client-queue"], + "public_queue_aliases": [QUEUE_ALIAS_TOCLIENT, QUEUE_ALIAS_FROMCLIENT], "tasks": [ { "cluster_locations": [ cname for cname, _ in scan_request_obj["request_clusters"] ], - "input_queue_aliases": ["to-client-queue"], - "output_queue_aliases": ["from-client-queue"], + "input_queue_aliases": [QUEUE_ALIAS_TOCLIENT], + "output_queue_aliases": [QUEUE_ALIAS_FROMCLIENT], "task_image": image, "task_args": ( "python -m skymap_scanner.client " diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index c69e71f3..e8c57282 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -12,12 +12,31 @@ from ..config import ( DebugMode, ENV, + QUEUE_ALIAS_FROMCLIENT, + QUEUE_ALIAS_TOCLIENT, sdict, ) LOGGER = logging.getLogger(__name__) +def _to_inline_yaml(obj: list[str] | sdict) -> str: + """Convert obj-based attrs to yaml-syntax""" + # -> inline, compact formatting, no indenting needed + if isinstance(obj, dict): + return yaml.safe_dump( + [{"name": k, "value": v} for k, v in obj.items()], + default_flow_style=True, + ) + elif isinstance(obj, list): + yaml.safe_dump( + obj, + default_flow_style=True, + ) + else: + raise TypeError(f"unsupported type {type(obj)}") + + class SkyScanK8sJobFactory: """Makes Skymap Scanner Kubernetes jobs, plus misc tools.""" @@ -86,16 +105,18 @@ def _make_job( NOTE: Let's keep definitions as straightforward as possible. 
""" - - # first, convert obj-based attrs to yaml-syntax - # -> inline, compact formatting, no indenting needed - scanner_env_yaml = yaml.safe_dump( - [{"name": k, "value": v} for k, v in scanner_server_envvars.items()], - default_flow_style=True, - ) - scanner_args_yaml = yaml.safe_dump( - scanner_server_args.split(), - default_flow_style=True, + init_ewms_envvars = {} + for k in ["SKYSCAN_SKYDRIVER_ADDRESS", "SKYSCAN_SKYDRIVER_AUTH"]: + init_ewms_envvars[k] = scanner_server_envvars[k] + init_ewms_envvars.update( + { + "EWMS_ADDRESS": ENV.EWMS_ADDRESS, + "EWMS_TOKEN_URL": ENV.EWMS_TOKEN_URL, + "EWMS_CLIENT_ID": ENV.EWMS_CLIENT_ID, + "EWMS_CLIENT_SECRET": ENV.EWMS_CLIENT_SECRET, + "QUEUE_ALIAS_TOCLIENT": QUEUE_ALIAS_TOCLIENT, + "QUEUE_ALIAS_FROMCLIENT": QUEUE_ALIAS_FROMCLIENT, + } ) # now, assemble @@ -126,12 +147,13 @@ def _make_job( image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "ewms_init_container"] args: ["{scan_id}", "--json-out", "{SkyScanK8sJobFactory._EWMS_JSON_FPATH}"] + env: {_to_inline_yaml(init_ewms_envvars)} containers: - name: skyscan-server-{scan_id} image: {images.get_skyscan_docker_image(docker_tag)} command: [] - args: {scanner_args_yaml} - env: {scanner_env_yaml} + args: {_to_inline_yaml(scanner_server_args.split())} + env: {_to_inline_yaml(scanner_server_envvars)} resources: limits: memory: "{scanner_server_memory_bytes}" From 5bee893a330bfaef3c7ffbeb45b5acd4cf730ef5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 10 Jan 2025 11:24:37 -0600 Subject: [PATCH 041/327] abort logic --- skydriver/rest_handlers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 3900b8bd..b5965523 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -687,16 +687,16 @@ async def stop_skyscan_workers( LOGGER.info( "OK: attempted to stop skyscan workers but scan has not been sent to EWMS" ) + return manifest else: await request_stop_on_ewms(ewms_rc, 
manifest.ewms_workflow_id) + return await manifests.patch(scan_id, ewms_finished=True) else: raise web.HTTPError( 400, log_message="Could not stop scanner workers since this is a non-EWMS scan.", ) - return await manifests.patch(scan_id, ewms_finished=True) # workforce is done - # ----------------------------------------------------------------------------- From 5621cf2bc5d3f974b57d2d692c8b010add2a52b9 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 10 Jan 2025 12:09:54 -0600 Subject: [PATCH 042/327] abort logic - 2 --- skydriver/ewms.py | 15 +++++++++++---- skydriver/rest_handlers.py | 9 ++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index ad027707..31762e52 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -80,13 +80,20 @@ async def request_workflow_on_ewms( async def request_stop_on_ewms( ewms_rc: RestClient, workflow_id: str, + abort: bool, ) -> int: """Signal that an EWMS workflow is finished, and stop whatever is needed. Returns the number of stopped taskforces. 
""" - resp = await ewms_rc.request( - "POST", - f"/v0/workflows/{workflow_id}/actions/finished", - ) + if abort: + resp = await ewms_rc.request( + "POST", + f"/v0/workflows/{workflow_id}/actions/abort", + ) + else: + resp = await ewms_rc.request( + "POST", + f"/v0/workflows/{workflow_id}/actions/finished", + ) return resp["n_taskforces"] diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index b5965523..0597445a 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -675,6 +675,7 @@ async def stop_skyscan_workers( manifests: database.interface.ManifestClient, scan_id: str, ewms_rc: RestClient, + abort: bool, ) -> database.schema.Manifest: """Stop all parts of the Scanner instance (if running) and mark in DB.""" manifest = await manifests.get(scan_id, True) @@ -689,7 +690,7 @@ async def stop_skyscan_workers( ) return manifest else: - await request_stop_on_ewms(ewms_rc, manifest.ewms_workflow_id) + await request_stop_on_ewms(ewms_rc, manifest.ewms_workflow_id, abort=abort) return await manifests.patch(scan_id, ewms_finished=True) else: raise web.HTTPError( @@ -771,7 +772,7 @@ async def delete(self, scan_id: str) -> None: # mark as deleted -> also stops backlog from starting manifest = await self.manifests.mark_as_deleted(scan_id) # abort - await stop_skyscan_workers(self.manifests, scan_id, self.ewms_rc) + await stop_skyscan_workers(self.manifests, scan_id, self.ewms_rc, abort=True) try: result_dict = dc.asdict(await self.results.get(scan_id)) @@ -1015,7 +1016,9 @@ async def put(self, scan_id: str) -> None: await asyncio.sleep( WAIT_BEFORE_TEARDOWN ) # regular time.sleep() sleeps the entire server - await stop_skyscan_workers(self.manifests, scan_id, self.k8s_batch_api) + await stop_skyscan_workers( + self.manifests, scan_id, self.k8s_batch_api, abort=False + ) # ----------------------------------------------------------------------------- From b563731a516c63fb2df9664df8f96ec62d85f530 Mon Sep 17 00:00:00 2001 From: ric-evans 
Date: Fri, 10 Jan 2025 13:35:29 -0600 Subject: [PATCH 043/327] don't cache ewms state in db --- skydriver/database/interface.py | 10 +--------- skydriver/database/schema.py | 4 ---- skydriver/ewms.py | 32 ++++++++++++++++++++++---------- skydriver/rest_handlers.py | 12 ++++++------ 4 files changed, 29 insertions(+), 29 deletions(-) diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index 5c1853cb..bda75082 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -115,25 +115,17 @@ async def patch( progress: schema.Progress | None = None, event_metadata: schema.EventMetadata | None = None, scan_metadata: schema.StrDict | None = None, - ewms_finished: bool | None = None, # workforce is done ) -> schema.Manifest: """Update `progress` at doc matching `scan_id`.""" LOGGER.debug(f"patching manifest for {scan_id=}") - if not ( - progress - or event_metadata - or scan_metadata - or ewms_finished is not None # True/False is ok # workforce is done - ): + if not (progress or event_metadata or scan_metadata): LOGGER.debug(f"nothing to patch for manifest ({scan_id=})") return await self.get(scan_id, incl_del=True) upserting: schema.StrDict = {} if progress: upserting["progress"] = progress - if ewms_finished is not None: - upserting["ewms_finished"] = ewms_finished # Validate, then store # NOTE: in theory there's a race condition (get+upsert) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 74cb6214..d1fe9285 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -150,7 +150,6 @@ class Manifest(ScanIDDataclass): # EWMS interface ewms_workflow_id: str | None = None # id points to info in EWMS - ewms_finished: bool = False # a cache so we don't have to call to ewms each time # -> deprecated fields -- see __post_init__ for backward compatibility logic ewms_task: dict | str = DEPRECATED_EWMS_TASK # **DEPRECATED** # ^^^ was used in skydriver 1.x to use local k8s 
starter/stopper @@ -194,9 +193,6 @@ def __post_init__(self) -> None: "Manifest must define 'ewms_workflow_id' " "(old manifests may define 'ewms_task' instead)" ) - # Backward compatibility: 1.x had 'complete' in a nested field - if isinstance(self.ewms_task, dict): - self.ewms_finished = self.ewms_task.get("complete", False) # don't show sensitive data to user self.scanner_server_args = obfuscate_cl_args(self.scanner_server_args) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 31762e52..144733d3 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -1,10 +1,15 @@ """Tools for interfacing with EMWS.""" +import logging + +import requests from rest_tools.client import RestClient from . import database, images, s3 from .config import QUEUE_ALIAS_FROMCLIENT, QUEUE_ALIAS_TOCLIENT +LOGGER = logging.Logger(__name__) + async def request_workflow_on_ewms( ewms_rc: RestClient, @@ -85,15 +90,22 @@ async def request_stop_on_ewms( """Signal that an EWMS workflow is finished, and stop whatever is needed. Returns the number of stopped taskforces. + + Suppresses any HTTP errors. 
""" - if abort: - resp = await ewms_rc.request( - "POST", - f"/v0/workflows/{workflow_id}/actions/abort", - ) + try: + if abort: + resp = await ewms_rc.request( + "POST", + f"/v0/workflows/{workflow_id}/actions/abort", + ) + else: + resp = await ewms_rc.request( + "POST", + f"/v0/workflows/{workflow_id}/actions/finished", + ) + except requests.exceptions.HTTPError as e: + LOGGER.warning(repr(e)) + return 0 else: - resp = await ewms_rc.request( - "POST", - f"/v0/workflows/{workflow_id}/actions/finished", - ) - return resp["n_taskforces"] + return resp["n_taskforces"] diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 0597445a..baf05a31 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -679,8 +679,6 @@ async def stop_skyscan_workers( ) -> database.schema.Manifest: """Stop all parts of the Scanner instance (if running) and mark in DB.""" manifest = await manifests.get(scan_id, True) - if manifest.ewms_finished: # workforce is done - return manifest # request to ewms if manifest.ewms_workflow_id: @@ -688,16 +686,16 @@ async def stop_skyscan_workers( LOGGER.info( "OK: attempted to stop skyscan workers but scan has not been sent to EWMS" ) - return manifest else: await request_stop_on_ewms(ewms_rc, manifest.ewms_workflow_id, abort=abort) - return await manifests.patch(scan_id, ewms_finished=True) else: raise web.HTTPError( 400, log_message="Could not stop scanner workers since this is a non-EWMS scan.", ) + return manifest + # ----------------------------------------------------------------------------- @@ -760,8 +758,10 @@ async def delete(self, scan_id: str) -> None: # check DB states manifest = await self.manifests.get(scan_id, True) if ( - manifest.ewms_finished and not args.delete_completed_scan - ): # workforce is done + manifest.progress + and manifest.progress.processing_stats.finished + and not args.delete_completed_scan + ): msg = "Attempted to delete a completed scan (must use `delete_completed_scan=True`)" 
raise web.HTTPError( 400, From f2e5630654e75282bde828e2cab75245983e92df Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 10 Jan 2025 15:25:32 -0600 Subject: [PATCH 044/327] scan status tracking logic --- skydriver/database/schema.py | 84 ++++++++++++++++++++--------------- skydriver/ewms.py | 31 +++++++++++++ skydriver/k8s/scan_backlog.py | 4 +- skydriver/rest_handlers.py | 12 +++-- tests/unit/test_scan_state.py | 14 +++--- 5 files changed, 97 insertions(+), 48 deletions(-) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index d1fe9285..4adf9192 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -5,8 +5,11 @@ from typing import Any import wipac_dev_tools as wdt +from rest_tools.client import RestClient from typeguard import typechecked +from skydriver import ewms + StrDict = dict[str, Any] @@ -15,12 +18,6 @@ class ScanState(enum.Enum): SCAN_FINISHED_SUCCESSFULLY = enum.auto() - STOPPED__PARTIAL_RESULT_GENERATED = enum.auto() - STOPPED__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() - STOPPED__WAITING_ON_CLUSTER_STARTUP = enum.auto() - STOPPED__WAITING_ON_SCANNER_SERVER_STARTUP = enum.auto() - STOPPED__PRESTARTUP = enum.auto() - IN_PROGRESS__PARTIAL_RESULT_GENERATED = enum.auto() IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() PENDING__WAITING_ON_CLUSTER_STARTUP = enum.auto() @@ -197,38 +194,53 @@ def __post_init__(self) -> None: # don't show sensitive data to user self.scanner_server_args = obfuscate_cl_args(self.scanner_server_args) - def get_state(self) -> ScanState: - """Determine the state of the scan by parsing attributes.""" - if ( - self.ewms_task.complete - and self.progress - and self.progress.processing_stats.finished - ): - return ScanState.SCAN_FINISHED_SUCCESSFULLY - - def get_nonfinished_state() -> ScanState: - if self.progress: # from scanner server - if self.ewms_task.clusters: - # NOTE - we only know if the workers have started up once the server has gotten pixels - if 
self.progress.processing_stats.rate: - return ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED - else: - return ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO - else: - return ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP - else: - if self.ewms_task.clusters: - return ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP - else: - return ScanState.PENDING__PRESTARTUP - - if self.ewms_task.complete: - return ScanState[f"STOPPED__{get_nonfinished_state().name.split('__')[1]}"] - else: - return get_nonfinished_state() - def __repr__(self) -> str: dicto = dc.asdict(self) dicto.pop("event_i3live_json_dict") rep = f"{self.__class__.__name__}{dicto}" return rep + + +async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: + """Determine the state of the scan by parsing attributes and talking with EWMS.""" + if manifest.progress and manifest.progress.processing_stats.finished: + return ScanState.SCAN_FINISHED_SUCCESSFULLY.name + + def _has_scanner_server_started() -> bool: + return bool(manifest.progress) # attr only updated by scanner server requests + + def _has_request_been_sent_to_ewms() -> bool: + return ( + manifest.ewms_workflow_id == PENDING_EWMS_WORKFLOW # pending ewms req. + or ( # backward compatibility... + manifest.ewms_task != DEPRECATED_EWMS_TASK + and manifest.ewms_task.get("clusters") + ) + ) + + def _get_nonfinished_state() -> ScanState: + # has the scanner server started? 
+ # -> yes + if _has_scanner_server_started(): + if _has_request_been_sent_to_ewms(): + return ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP + else: + if manifest.progress.processing_stats.rate: + return ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED + else: + return ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO + # -> no + else: + if _has_request_been_sent_to_ewms(): + # NOTE: assume that the ewms-request and scanner server startup happen in tandem + return ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP + else: + return ScanState.PENDING__PRESTARTUP + + # is EWMS still running the scan workers? + # -> yes + if dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id): + return f"{dtype.upper()}__{_get_nonfinished_state().name.split('__')[1]}" + # -> no + else: + return _get_nonfinished_state().name diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 144733d3..6257db91 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -2,6 +2,7 @@ import logging +import cachetools.func import requests from rest_tools.client import RestClient @@ -109,3 +110,33 @@ async def request_stop_on_ewms( return 0 else: return resp["n_taskforces"] + + +@cachetools.func.ttl_cache(ttl=1 * 60) # don't cache too long, but avoid spamming ewms +async def get_deactivated_type(ewms_rc: RestClient, workflow_id: str) -> str | None: + """Grab the 'deactivated' field for the workflow. 
+ + Example: 'ABORTED', 'FINISHED + """ + workflow = await ewms_rc.request( + "GET", + f"/v0/workflows/{workflow_id}", + ) + return workflow["deactivated"] + + +@cachetools.func.ttl_cache(ttl=1 * 60) # don't cache too long, but avoid spamming ewms +async def get_taskforce_phases( + ewms_rc: RestClient, + workflow_id: str, +) -> list[dict[str, str]]: + """Get all the states of all the taskforces associated with the workflow.""" + resp = await ewms_rc.request( + "POST", + f"/v0/query/taskforces", + {"workflow_id": workflow_id}, + ) + return [ + {"taskforce": tf["taskforce_uuid"], "phase": tf["phase"]} + for tf in resp["taskforces"] + ] diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 5df70476..627ef0cb 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -58,10 +58,10 @@ async def get_next( await scan_backlog.remove(entry) continue - # check if scan was aborted (cancelled) + # check if scan was 'deleted' manifest = await manifests.get(entry.scan_id, incl_del=True) if manifest.is_deleted: - LOGGER.info(f"Backlog entry was aborted ({entry.scan_id=})") + LOGGER.info(f"Backlog entry was removed ({entry.scan_id=})") await scan_backlog.remove(entry) continue diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index baf05a31..38916a72 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -27,7 +27,7 @@ from tornado import web from wipac_dev_tools import argparse_tools -from . import database, images, k8s +from . 
import database, ewms, images, k8s from .config import ( DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, DEFAULT_WORKER_DISK_BYTES, @@ -38,6 +38,7 @@ is_testing, ) from .database import schema +from .database.schema import get_scan_state from .ewms import request_stop_on_ewms from .k8s.scan_backlog import designate_for_startup from .k8s.scanner_instance import SkyScanK8sJobFactory @@ -1062,12 +1063,15 @@ async def get(self, scan_id: str) -> None: LOGGER.exception(e) # respond + scan_state = await get_scan_state(manifest, self.ewms_rc) resp = { - "scan_state": manifest.get_state().name, + "scan_state": scan_state, "is_deleted": manifest.is_deleted, - "scan_complete": manifest.ewms_task.complete, # workforce is done + "scan_complete": bool( + scan_state == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY.name + ), "pods": pods_411, - "clusters": [dc.asdict(c) for c in manifest.ewms_task.clusters], + "clusters": await ewms.get_taskforce_phases(manifest.ewms_workflow_id), } if not args.include_pod_statuses: resp.pop("pods") diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index ff1888b2..5befc330 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -37,7 +37,9 @@ def test_00__scan_finished_successfully() -> None: str(time.time()), ), ) - assert manifest.get_state() == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY + assert ( + await get_scan_state(manifest)() == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY + ) @pytest.mark.parametrize( @@ -91,7 +93,7 @@ def test_10__partial_result_generated( str(time.time()), ), ) - assert manifest.get_state() == state + assert await get_scan_state(manifest)() == state @pytest.mark.parametrize( @@ -145,7 +147,7 @@ def test_20__waiting_on_first_pixel_reco( str(time.time()), ), ) - assert manifest.get_state() == state + assert await get_scan_state(manifest)() == state @pytest.mark.parametrize( @@ -199,7 +201,7 @@ def test_30__waiting_on_cluster_startup( str(time.time()), ), ) - assert 
manifest.get_state() == state + assert await get_scan_state(manifest)() == state @pytest.mark.parametrize( @@ -253,7 +255,7 @@ def test_40__waiting_on_scanner_server_startup( # str(time.time()), # ), ) - assert manifest.get_state() == state + assert await get_scan_state(manifest)() == state @pytest.mark.parametrize( @@ -305,4 +307,4 @@ def test_50__prestartup(is_complete: bool, state: schema.ScanState) -> None: # str(time.time()), # ), ) - assert manifest.get_state() == state + assert await get_scan_state(manifest)() == state From b00035c5e21692669a38338065d9513b1e2fbb86 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 10 Jan 2025 15:31:28 -0600 Subject: [PATCH 045/327] flake8 --- skydriver/ewms.py | 2 +- skydriver/k8s/scanner_instance.py | 2 +- tests/unit/test_scan_state.py | 39 +++++++++++++++++++++---------- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 6257db91..fa086b2a 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -133,7 +133,7 @@ async def get_taskforce_phases( """Get all the states of all the taskforces associated with the workflow.""" resp = await ewms_rc.request( "POST", - f"/v0/query/taskforces", + "/v0/query/taskforces", {"workflow_id": workflow_id}, ) return [ diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index e8c57282..87bcc576 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -172,7 +172,7 @@ def _make_job( args: ["{SkyScanK8sJobFactory._STARTUP_JSON_FPATH}", "--wait-indefinitely"] env: - name: S3_URL - value: "{ENV.S3_URL}" + value: "{ENV.S3_URL}" - name: S3_ACCESS_KEY_ID valueFrom: secretKeyRef: diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 5befc330..f52f9e39 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -1,14 +1,18 @@ """Test dynamically generating the scan state.""" import time +from unittest.mock import MagicMock import 
pytest from skydriver.database import schema +from skydriver.database.schema import get_scan_state -def test_00__scan_finished_successfully() -> None: +async def test_00__scan_finished_successfully() -> None: """Test with SCAN_FINISHED_SUCCESSFULLY.""" + ewms_rc = MagicMock() + manifest = schema.Manifest( scan_id="abc123", timestamp=time.time(), @@ -38,7 +42,8 @@ def test_00__scan_finished_successfully() -> None: ), ) assert ( - await get_scan_state(manifest)() == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY + await get_scan_state(manifest, ewms_rc) + == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY ) @@ -49,10 +54,12 @@ def test_00__scan_finished_successfully() -> None: (False, schema.ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED), ], ) -def test_10__partial_result_generated( +async def test_10__partial_result_generated( is_complete: bool, state: schema.ScanState ) -> None: """Test normal and stopped variants.""" + ewms_rc = MagicMock() + manifest = schema.Manifest( scan_id="abc123", timestamp=time.time(), @@ -93,7 +100,7 @@ def test_10__partial_result_generated( str(time.time()), ), ) - assert await get_scan_state(manifest)() == state + assert await get_scan_state(manifest, ewms_rc) == state @pytest.mark.parametrize( @@ -103,10 +110,12 @@ def test_10__partial_result_generated( (False, schema.ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO), ], ) -def test_20__waiting_on_first_pixel_reco( +async def test_20__waiting_on_first_pixel_reco( is_complete: bool, state: schema.ScanState ) -> None: """Test normal and stopped variants.""" + ewms_rc = MagicMock() + manifest = schema.Manifest( scan_id="abc123", timestamp=time.time(), @@ -147,7 +156,7 @@ def test_20__waiting_on_first_pixel_reco( str(time.time()), ), ) - assert await get_scan_state(manifest)() == state + assert await get_scan_state(manifest, ewms_rc) == state @pytest.mark.parametrize( @@ -157,10 +166,12 @@ def test_20__waiting_on_first_pixel_reco( (False, schema.ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP), 
], ) -def test_30__waiting_on_cluster_startup( +async def test_30__waiting_on_cluster_startup( is_complete: bool, state: schema.ScanState ) -> None: """Test normal and stopped variants.""" + ewms_rc = MagicMock() + manifest = schema.Manifest( scan_id="abc123", timestamp=time.time(), @@ -201,7 +212,7 @@ def test_30__waiting_on_cluster_startup( str(time.time()), ), ) - assert await get_scan_state(manifest)() == state + assert await get_scan_state(manifest, ewms_rc) == state @pytest.mark.parametrize( @@ -211,10 +222,12 @@ def test_30__waiting_on_cluster_startup( (False, schema.ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP), ], ) -def test_40__waiting_on_scanner_server_startup( +async def test_40__waiting_on_scanner_server_startup( is_complete: bool, state: schema.ScanState ) -> None: """Test normal and stopped variants.""" + ewms_rc = MagicMock() + manifest = schema.Manifest( scan_id="abc123", timestamp=time.time(), @@ -255,7 +268,7 @@ def test_40__waiting_on_scanner_server_startup( # str(time.time()), # ), ) - assert await get_scan_state(manifest)() == state + assert await get_scan_state(manifest, ewms_rc) == state @pytest.mark.parametrize( @@ -265,8 +278,10 @@ def test_40__waiting_on_scanner_server_startup( (False, schema.ScanState.PENDING__PRESTARTUP), ], ) -def test_50__prestartup(is_complete: bool, state: schema.ScanState) -> None: +async def test_50__prestartup(is_complete: bool, state: schema.ScanState) -> None: """Test normal and stopped varriants.""" + ewms_rc = MagicMock() + manifest = schema.Manifest( scan_id="abc123", timestamp=time.time(), @@ -307,4 +322,4 @@ def test_50__prestartup(is_complete: bool, state: schema.ScanState) -> None: # str(time.time()), # ), ) - assert await get_scan_state(manifest)() == state + assert await get_scan_state(manifest, ewms_rc) == state From 4674f6e912e28b93f9f78f7e312afb808beec950 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 13:52:57 -0600 Subject: [PATCH 046/327] mypy - 1 --- s3_sidecar/post.py | 
2 +- skydriver/database/schema.py | 24 ++++++++++++++---------- skydriver/k8s/scanner_instance.py | 2 +- skydriver/s3.py | 2 +- 4 files changed, 17 insertions(+), 13 deletions(-) diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index c91f0cc6..dc4c897f 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -6,7 +6,7 @@ import time from pathlib import Path -import boto3 +import boto3 # type: ignore[import-untyped] import requests LOGGER = logging.getLogger(__package__) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 4adf9192..c1f0cd14 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -206,22 +206,21 @@ async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: if manifest.progress and manifest.progress.processing_stats.finished: return ScanState.SCAN_FINISHED_SUCCESSFULLY.name - def _has_scanner_server_started() -> bool: - return bool(manifest.progress) # attr only updated by scanner server requests - def _has_request_been_sent_to_ewms() -> bool: - return ( + return bool( manifest.ewms_workflow_id == PENDING_EWMS_WORKFLOW # pending ewms req. or ( # backward compatibility... manifest.ewms_task != DEPRECATED_EWMS_TASK + and isinstance(manifest.ewms_task, dict) and manifest.ewms_task.get("clusters") ) ) - def _get_nonfinished_state() -> ScanState: + def get_nonfinished_state() -> ScanState: + """Get the ScanState of the scan, only by parsing attributes.""" # has the scanner server started? # -> yes - if _has_scanner_server_started(): + if manifest.progress: # attr only updated by scanner server requests if _has_request_been_sent_to_ewms(): return ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP else: @@ -239,8 +238,13 @@ def _get_nonfinished_state() -> ScanState: # is EWMS still running the scan workers? 
# -> yes - if dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id): - return f"{dtype.upper()}__{_get_nonfinished_state().name.split('__')[1]}" - # -> no + if manifest.ewms_workflow_id and ( + dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id) + ): + return f"{dtype.upper()}__{get_nonfinished_state().name.split('__')[1]}" + # -> BACKWARD COMPATIBILITY: is this an old/pre-ewms scan? + elif manifest.ewms_task.get("complete"): + return f"STOPPED__{get_nonfinished_state().name.split('__')[1]}" + # -> no, this is a non-finished scan else: - return _get_nonfinished_state().name + return get_nonfinished_state().name diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 87bcc576..941f7f00 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -29,7 +29,7 @@ def _to_inline_yaml(obj: list[str] | sdict) -> str: default_flow_style=True, ) elif isinstance(obj, list): - yaml.safe_dump( + return yaml.safe_dump( obj, default_flow_style=True, ) diff --git a/skydriver/s3.py b/skydriver/s3.py index 34eadf3b..0d011dd7 100644 --- a/skydriver/s3.py +++ b/skydriver/s3.py @@ -2,7 +2,7 @@ import logging -import boto3 +import boto3 # type: ignore[import-untyped] from .config import ENV From d44c1a415ff3c07d742e1f8dc7de78f8c96a51ea Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 14:02:51 -0600 Subject: [PATCH 047/327] mypy - 2 --- skydriver/config.py | 52 ++------------------------------------ skydriver/rest_handlers.py | 29 +++++---------------- 2 files changed, 8 insertions(+), 73 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 8332c97c..22a38ad6 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -136,58 +136,10 @@ def __post_init__(self) -> None: # known cluster locations KNOWN_CLUSTERS: dict[str, dict[str, Any]] = { "sub-2": { - "orchestrator": "condor", - "location": { - "collector": "glidein-cm.icecube.wisc.edu", - 
"schedd": "sub-2.icecube.wisc.edu", - }, - "v1envvars": [ - kubernetes.client.V1EnvVar( - name="CONDOR_TOKEN", - value_from=kubernetes.client.V1EnvVarSource( - secret_key_ref=kubernetes.client.V1SecretKeySelector( - name=ENV.K8S_SECRET_NAME, - key="condor_token_sub2", - ) - ), - ) - ], - "max_n_clients_during_debug_mode": 10, + "max_n_clients_during_debug_mode": 100, }, LOCAL_K8S_HOST: { - "orchestrator": "k8s", - "location": { - "host": LOCAL_K8S_HOST, - "namespace": ENV.K8S_NAMESPACE, - }, - "v1envvars": [], - }, - "gke-2306": { - "orchestrator": "k8s", - "location": { - "host": "https://34.171.167.119:443", - "namespace": "icecube-skymap-scanner", - }, - "v1envvars": [ - kubernetes.client.V1EnvVar( - name="WORKER_K8S_CACERT", - value_from=kubernetes.client.V1EnvVarSource( - secret_key_ref=kubernetes.client.V1SecretKeySelector( - name=ENV.K8S_SECRET_NAME, - key="worker_k8s_cacert_gke", - ) - ), - ), - kubernetes.client.V1EnvVar( - name="WORKER_K8S_TOKEN", - value_from=kubernetes.client.V1EnvVarSource( - secret_key_ref=kubernetes.client.V1SecretKeySelector( - name=ENV.K8S_SECRET_NAME, - key="worker_k8s_token_gke", - ) - ), - ), - ], + "max_n_clients_during_debug_mode": 5, }, } diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 38916a72..5bc0f0ec 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -237,27 +237,6 @@ async def get(self) -> None: # ----------------------------------------------------------------------------- -def _cluster_lookup(name: str, n_workers: int) -> database.schema.InHouseClusterInfo: - """Grab the ManualCluster object known using `name`.""" - if cluster := KNOWN_CLUSTERS.get(name): - if cluster["orchestrator"] == "condor": - return database.schema.InHouseClusterInfo( - orchestrator=cluster["orchestrator"], - location=database.schema.HTCondorLocation(**cluster["location"]), - n_workers=n_workers, - ) - elif cluster["orchestrator"] == "k8s": - return database.schema.InHouseClusterInfo( - 
orchestrator=cluster["orchestrator"], - location=database.schema.KubernetesLocation(**cluster["location"]), - n_workers=n_workers, - ) - raise argparse.ArgumentTypeError( - f"requested unknown cluster: {name} (available:" - f" {', '.join(KNOWN_CLUSTERS.keys())})" - ) - - def _json_to_dict(val: Any) -> dict: _error = argparse.ArgumentTypeError("must be JSON-string or JSON-friendly dict") # str -> json-dict @@ -303,9 +282,13 @@ def _validate_request_clusters( # check all entries are 2-lists (or tuple) if not all(isinstance(a, list | tuple) and len(a) == 2 for a in list_tups): raise _error - # check that all locations are known (this validates sooner than ewms, if using ewms) + # check that all locations are known (this validates sooner than ewms) for name, n_workers in list_tups: - _cluster_lookup(name, n_workers) + if name not in KNOWN_CLUSTERS: + raise argparse.ArgumentTypeError( + f"requested unknown cluster: {name} (available:" + f" {', '.join(KNOWN_CLUSTERS.keys())})" + ) return list_tups From 581f9d882329880a2ea71ec25b7eb7c2c6397503 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 14:12:05 -0600 Subject: [PATCH 048/327] mypy - 3 --- skydriver/config.py | 1 - skydriver/database/schema.py | 2 +- skydriver/k8s/scan_backlog.py | 5 +++-- skydriver/rest_handlers.py | 6 ++++-- tests/integration/conftest.py | 7 ++++--- 5 files changed, 12 insertions(+), 9 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 22a38ad6..0e3f74d1 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -6,7 +6,6 @@ from typing import Any, Optional import humanfriendly -import kubernetes.client # type: ignore[import-untyped] from wipac_dev_tools import from_environment_as_dataclass, logging_tools sdict = dict[str, Any] diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index c1f0cd14..7e75048e 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -243,7 +243,7 @@ def get_nonfinished_state() -> 
ScanState: ): return f"{dtype.upper()}__{get_nonfinished_state().name.split('__')[1]}" # -> BACKWARD COMPATIBILITY: is this an old/pre-ewms scan? - elif manifest.ewms_task.get("complete"): + elif isinstance(manifest.ewms_task, dict) and manifest.ewms_task.get("complete"): return f"STOPPED__{get_nonfinished_state().name.split('__')[1]}" # -> no, this is a non-finished scan else: diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 627ef0cb..8f612202 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -41,8 +41,8 @@ async def designate_for_startup( async def get_next( scan_backlog: database.interface.ScanBacklogClient, manifests: database.interface.ManifestClient, - scan_request_client: AsyncIOMotorCollection, - skyscan_k8s_job_client: AsyncIOMotorClient, + scan_request_client: AsyncIOMotorCollection, # type: ignore[valid-type] + skyscan_k8s_job_client: AsyncIOMotorClient, # type: ignore[valid-type] include_low_priority_scans: bool, ) -> tuple[database.schema.ScanBacklogEntry, database.schema.Manifest, dict, dict]: """Get the next entry & remove any that have been cancelled.""" @@ -212,6 +212,7 @@ async def _run( await manifest_client.collection.find_one_and_update( {"scan_id": manifest.scan_id}, {"$set": {"ewms_workflow_id": workflow_id}}, + return_dclass=dict, ) LOGGER.info( diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 5bc0f0ec..7a12e200 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -534,7 +534,7 @@ async def post(self) -> None: async def _start_scan( manifests: database.interface.ManifestClient, scan_backlog: database.interface.ScanBacklogClient, - skyscan_k8s_job_coll: AsyncIOMotorCollection, + skyscan_k8s_job_coll: AsyncIOMotorCollection, # type: ignore[valid-type] scan_request_obj: dict, new_scan_id: str = "", # don't use scan_request_obj.scan_id--this could be a rescan ) -> schema.Manifest: @@ -1054,7 +1054,9 @@ async def get(self, scan_id: 
str) -> None: scan_state == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY.name ), "pods": pods_411, - "clusters": await ewms.get_taskforce_phases(manifest.ewms_workflow_id), + "clusters": await ewms.get_taskforce_phases( + self.ewms_rc, manifest.ewms_workflow_id + ), } if not args.include_pod_statuses: resp.pop("pods") diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 058f221c..3896727e 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -3,7 +3,7 @@ import asyncio import socket from typing import Any, AsyncIterator, Callable -from unittest.mock import Mock +from unittest.mock import MagicMock, Mock import kubernetes.client # type: ignore[import] import pytest @@ -131,11 +131,12 @@ async def server( mongo_client = await create_mongodb_client() k8s_batch_api = Mock() + ewms_rc = MagicMock() backlog_task = asyncio.create_task( - skydriver.k8s.scan_backlog.run(mongo_client, k8s_batch_api) + skydriver.k8s.scan_backlog.run(mongo_client, k8s_batch_api, ewms_rc) ) await asyncio.sleep(0) # start up previous task - rs = await make(mongo_client, k8s_batch_api) + rs = await make(mongo_client, k8s_batch_api, ewms_rc) rs.startup(address="localhost", port=port) # type: ignore[no-untyped-call] def client() -> RestClient: From 3dbf0562cdfb397f396b33b0af8c39136ff46161 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 14:18:30 -0600 Subject: [PATCH 049/327] mypy - 4 --- skydriver/k8s/scan_backlog.py | 6 ++++-- skydriver/rest_handlers.py | 12 ++++++++---- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 8f612202..fc91c43f 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -66,12 +66,14 @@ async def get_next( continue # grab the scan request object--it has other info - scan_request_obj = await scan_request_client.find_one( + scan_request_obj = await scan_request_client.find_one( # type: 
ignore[attr-defined] {"scan_id": manifest.scan_id} ) # grab the k8s - doc = await skyscan_k8s_job_client.find_one({"scan_id": manifest.scan_id}) + doc = await skyscan_k8s_job_client.find_one( # type: ignore[attr-defined] + {"scan_id": manifest.scan_id}, + ) skyscan_k8s_job = doc["skyscan_k8s_job_dict"] # all good! diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 7a12e200..7e1c5aa5 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -574,7 +574,7 @@ async def _start_scan( priority=scan_request_obj["priority"], ) await manifests.put(manifest) - await skyscan_k8s_job_coll.insert_one( + await skyscan_k8s_job_coll.insert_one( # type: ignore[attr-defined] { "scan_id": scan_id, "skyscan_k8s_job_dict": skyscan_k8s_job_dict, @@ -1047,6 +1047,12 @@ async def get(self, scan_id: str) -> None: # respond scan_state = await get_scan_state(manifest, self.ewms_rc) + if manifest.ewms_workflow_id: + clusters = await ewms.get_taskforce_phases( + self.ewms_rc, manifest.ewms_workflow_id + ) + else: + clusters = [] resp = { "scan_state": scan_state, "is_deleted": manifest.is_deleted, @@ -1054,9 +1060,7 @@ async def get(self, scan_id: str) -> None: scan_state == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY.name ), "pods": pods_411, - "clusters": await ewms.get_taskforce_phases( - self.ewms_rc, manifest.ewms_workflow_id - ), + "clusters": clusters, } if not args.include_pod_statuses: resp.pop("pods") From f00044d63826651a4b3fdb08fa36354b0959059d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 14:53:57 -0600 Subject: [PATCH 050/327] update test_scan_state.py - 1 --- tests/unit/test_scan_state.py | 105 ++++++++++------------------------ 1 file changed, 30 insertions(+), 75 deletions(-) diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index f52f9e39..514fb304 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -1,7 +1,7 @@ """Test dynamically generating the scan state.""" 
import time -from unittest.mock import MagicMock +from unittest.mock import MagicMock, patch import pytest @@ -14,93 +14,48 @@ async def test_00__scan_finished_successfully() -> None: ewms_rc = MagicMock() manifest = schema.Manifest( - scan_id="abc123", - timestamp=time.time(), - is_deleted=False, - event_i3live_json_dict={"abc": 123}, - scanner_server_args="", - ewms_task=schema.InHouseStarterInfo( - tms_args=[], - env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), - complete=True, - ), + scan_id=MagicMock(), + timestamp=MagicMock(), + is_deleted=MagicMock(), + event_i3live_json_dict=MagicMock(), + scanner_server_args=MagicMock(), # - progress=schema.Progress( - "summary", - "epilogue", - {}, - schema.ProgressProcessingStats( - start={}, - runtime={}, - # rate, - # end, - finished=True, - # predictions, - ), - 1.0, - str(time.time()), - ), + # now, args that actually matter: + ewms_workflow_id="ewms123", + progress=MagicMock(processing_stats=MagicMock(finished=True)), ) assert ( await get_scan_state(manifest, ewms_rc) - == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY + == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY.name ) @pytest.mark.parametrize( - "is_complete,state", + "ewms_dtype,state", [ - (True, schema.ScanState.STOPPED__PARTIAL_RESULT_GENERATED), - (False, schema.ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED), + ("ABORTED", "ABORTED__PARTIAL_RESULT_GENERATED"), + ("FINISHED", "FINISHED__PARTIAL_RESULT_GENERATED"), + (None, schema.ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED.name), ], ) -async def test_10__partial_result_generated( - is_complete: bool, state: schema.ScanState -) -> None: +async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() manifest = schema.Manifest( - scan_id="abc123", - timestamp=time.time(), - is_deleted=False, - event_i3live_json_dict={"abc": 123}, - scanner_server_args="", - ewms_task=schema.InHouseStarterInfo( - tms_args=[], - 
env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), - complete=is_complete, - clusters=[ - schema.InHouseClusterInfo( - orchestrator="condor", - location=schema.HTCondorLocation( - collector="foo", - schedd="bar", - ), - n_workers=111, - cluster_id="abc123", # "" is a non-started cluster - starter_info={"abc": 123}, - ) - ], - ), + scan_id=MagicMock(), + timestamp=MagicMock(), + is_deleted=MagicMock(), + event_i3live_json_dict=MagicMock(), + scanner_server_args=MagicMock(), # - progress=schema.Progress( - "summary", - "epilogue", - {}, - schema.ProgressProcessingStats( - start={}, - runtime={}, - rate={"abc": 123}, - # end, - # finished=True, - # predictions, - ), - 1.0, - str(time.time()), - ), + # now, args that actually matter: + ewms_workflow_id="ewms123", + progress=MagicMock(processing_stats=MagicMock(rate={"abc": 123})), ) - assert await get_scan_state(manifest, ewms_rc) == state + + with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): + assert await get_scan_state(manifest, ewms_rc) == state @pytest.mark.parametrize( @@ -111,7 +66,7 @@ async def test_10__partial_result_generated( ], ) async def test_20__waiting_on_first_pixel_reco( - is_complete: bool, state: schema.ScanState + stopped: bool, state: schema.ScanState ) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -167,7 +122,7 @@ async def test_20__waiting_on_first_pixel_reco( ], ) async def test_30__waiting_on_cluster_startup( - is_complete: bool, state: schema.ScanState + stopped: bool, state: schema.ScanState ) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -223,7 +178,7 @@ async def test_30__waiting_on_cluster_startup( ], ) async def test_40__waiting_on_scanner_server_startup( - is_complete: bool, state: schema.ScanState + stopped: bool, state: schema.ScanState ) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -278,7 +233,7 @@ async def test_40__waiting_on_scanner_server_startup( (False, 
schema.ScanState.PENDING__PRESTARTUP), ], ) -async def test_50__prestartup(is_complete: bool, state: schema.ScanState) -> None: +async def test_50__prestartup(stopped: bool, state: schema.ScanState) -> None: """Test normal and stopped varriants.""" ewms_rc = MagicMock() From 690f5a4e93db08051522867d4c15d3db0fa59f42 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 14:59:25 -0600 Subject: [PATCH 051/327] update test_scan_state.py - 2 --- .github/workflows/wipac-cicd.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 3be36066..6732214d 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -9,6 +9,18 @@ env: SCAN_BACKLOG_RUNNER_DELAY: 1 SCAN_BACKLOG_PENDING_ENTRY_TTL_REVIVE: 200 LOG_LEVEL: debug + # mandatory env vars... + EWMS_ADDRESS: fcb6c253 + EWMS_TOKEN_URL: 65f3b929 + EWMS_CLIENT_ID: b75a974d + EWMS_CLIENT_SECRET: 411b16fe + S3_URL: a4f92304 + S3_ACCESS_KEY_ID: 36c5c849 + S3_ACCESS_KEY_ID__K8S_SECRET_KEY: 230ec9dc + S3_SECRET_KEY: 8dea68a1 + S3_SECRET_KEY__K8S_SECRET_KEY: cdf7c60b + S3_BUCKET: 72017610 + jobs: @@ -140,7 +152,7 @@ jobs: --env THIS_IMAGE_WITH_TAG=$THIS_IMAGE_WITH_TAG \ $(env | grep '^SKYSCAN_' | awk '$0="--env "$0') \ $(env | grep '^EWMS_' | awk '$0="--env "$0') \ - $(env | grep '^CLIENTMANAGER_' | awk '$0="--env "$0') \ + $(env | grep '^S3_' | awk '$0="--env "$0') \ $(env | grep '^CI_' | awk '$0="--env "$0') \ $(env | grep '^SCAN_' | awk '$0="--env "$0') \ --mount type=bind,source=$(realpath $DIR),target=/local/$DIR \ From 65edef80689614caaad7650bc41105038835ee38 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:09:39 -0600 Subject: [PATCH 052/327] update test_scan_state.py - 3 (imports) --- skydriver/database/schema.py | 65 --------------------------------- skydriver/rest_handlers.py | 2 +- skydriver/utils.py | 69 +++++++++++++++++++++++++++++++++++ 
tests/unit/test_scan_state.py | 2 +- 4 files changed, 71 insertions(+), 67 deletions(-) create mode 100644 skydriver/utils.py diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 7e75048e..4b114b87 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -1,30 +1,14 @@ """Collection of dataclass-based schema for the database.""" import dataclasses as dc -import enum from typing import Any import wipac_dev_tools as wdt -from rest_tools.client import RestClient from typeguard import typechecked -from skydriver import ewms - StrDict = dict[str, Any] -class ScanState(enum.Enum): - """A non-persisted scan state.""" - - SCAN_FINISHED_SUCCESSFULLY = enum.auto() - - IN_PROGRESS__PARTIAL_RESULT_GENERATED = enum.auto() - IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() - PENDING__WAITING_ON_CLUSTER_STARTUP = enum.auto() - PENDING__WAITING_ON_SCANNER_SERVER_STARTUP = enum.auto() - PENDING__PRESTARTUP = enum.auto() - - @typechecked @dc.dataclass class ScanIDDataclass: @@ -199,52 +183,3 @@ def __repr__(self) -> str: dicto.pop("event_i3live_json_dict") rep = f"{self.__class__.__name__}{dicto}" return rep - - -async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: - """Determine the state of the scan by parsing attributes and talking with EWMS.""" - if manifest.progress and manifest.progress.processing_stats.finished: - return ScanState.SCAN_FINISHED_SUCCESSFULLY.name - - def _has_request_been_sent_to_ewms() -> bool: - return bool( - manifest.ewms_workflow_id == PENDING_EWMS_WORKFLOW # pending ewms req. - or ( # backward compatibility... - manifest.ewms_task != DEPRECATED_EWMS_TASK - and isinstance(manifest.ewms_task, dict) - and manifest.ewms_task.get("clusters") - ) - ) - - def get_nonfinished_state() -> ScanState: - """Get the ScanState of the scan, only by parsing attributes.""" - # has the scanner server started? 
- # -> yes - if manifest.progress: # attr only updated by scanner server requests - if _has_request_been_sent_to_ewms(): - return ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP - else: - if manifest.progress.processing_stats.rate: - return ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED - else: - return ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO - # -> no - else: - if _has_request_been_sent_to_ewms(): - # NOTE: assume that the ewms-request and scanner server startup happen in tandem - return ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP - else: - return ScanState.PENDING__PRESTARTUP - - # is EWMS still running the scan workers? - # -> yes - if manifest.ewms_workflow_id and ( - dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id) - ): - return f"{dtype.upper()}__{get_nonfinished_state().name.split('__')[1]}" - # -> BACKWARD COMPATIBILITY: is this an old/pre-ewms scan? - elif isinstance(manifest.ewms_task, dict) and manifest.ewms_task.get("complete"): - return f"STOPPED__{get_nonfinished_state().name.split('__')[1]}" - # -> no, this is a non-finished scan - else: - return get_nonfinished_state().name diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 7e1c5aa5..c06a83f7 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -38,10 +38,10 @@ is_testing, ) from .database import schema -from .database.schema import get_scan_state from .ewms import request_stop_on_ewms from .k8s.scan_backlog import designate_for_startup from .k8s.scanner_instance import SkyScanK8sJobFactory +from .utils import get_scan_state LOGGER = logging.getLogger(__name__) diff --git a/skydriver/utils.py b/skydriver/utils.py new file mode 100644 index 00000000..bb8bdadc --- /dev/null +++ b/skydriver/utils.py @@ -0,0 +1,69 @@ +"""Utility functions that don't fit anywhere else.""" + +import enum + +from rest_tools.client import RestClient + +from . 
import ewms +from .database.schema import DEPRECATED_EWMS_TASK, Manifest, PENDING_EWMS_WORKFLOW + + +class ScanState(enum.Enum): + """A non-persisted scan state.""" + + SCAN_FINISHED_SUCCESSFULLY = enum.auto() + + IN_PROGRESS__PARTIAL_RESULT_GENERATED = enum.auto() + IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() + PENDING__WAITING_ON_CLUSTER_STARTUP = enum.auto() + PENDING__WAITING_ON_SCANNER_SERVER_STARTUP = enum.auto() + PENDING__PRESTARTUP = enum.auto() + + +async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: + """Determine the state of the scan by parsing attributes and talking with EWMS.""" + if manifest.progress and manifest.progress.processing_stats.finished: + return ScanState.SCAN_FINISHED_SUCCESSFULLY.name + + def _has_request_been_sent_to_ewms() -> bool: + return bool( + manifest.ewms_workflow_id == PENDING_EWMS_WORKFLOW # pending ewms req. + or ( # backward compatibility... + manifest.ewms_task != DEPRECATED_EWMS_TASK + and isinstance(manifest.ewms_task, dict) + and manifest.ewms_task.get("clusters") + ) + ) + + def get_nonfinished_state() -> ScanState: + """Get the ScanState of the scan, only by parsing attributes.""" + # has the scanner server started? + # -> yes + if manifest.progress: # attr only updated by scanner server requests + if _has_request_been_sent_to_ewms(): + return ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP + else: + if manifest.progress.processing_stats.rate: + return ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED + else: + return ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO + # -> no + else: + if _has_request_been_sent_to_ewms(): + # NOTE: assume that the ewms-request and scanner server startup happen in tandem + return ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP + else: + return ScanState.PENDING__PRESTARTUP + + # is EWMS still running the scan workers? 
+ # -> yes + if manifest.ewms_workflow_id and ( + dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id) + ): + return f"{dtype.upper()}__{get_nonfinished_state().name.split('__')[1]}" + # -> BACKWARD COMPATIBILITY: is this an old/pre-ewms scan? + elif isinstance(manifest.ewms_task, dict) and manifest.ewms_task.get("complete"): + return f"STOPPED__{get_nonfinished_state().name.split('__')[1]}" + # -> no, this is a non-finished scan + else: + return get_nonfinished_state().name diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 514fb304..1251d7de 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -6,7 +6,7 @@ import pytest from skydriver.database import schema -from skydriver.database.schema import get_scan_state +from skydriver.utils import get_scan_state async def test_00__scan_finished_successfully() -> None: From cf86808516c37db1799b246254918d1d1fe6aa4a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:16:40 -0600 Subject: [PATCH 053/327] update test_scan_state.py - 4 (imports) --- skydriver/utils.py | 16 ++++++------- tests/unit/test_scan_state.py | 43 +++++++++++++++-------------------- 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index bb8bdadc..e3415a67 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -8,7 +8,7 @@ from .database.schema import DEPRECATED_EWMS_TASK, Manifest, PENDING_EWMS_WORKFLOW -class ScanState(enum.Enum): +class _ScanState(enum.Enum): """A non-persisted scan state.""" SCAN_FINISHED_SUCCESSFULLY = enum.auto() @@ -23,7 +23,7 @@ class ScanState(enum.Enum): async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: """Determine the state of the scan by parsing attributes and talking with EWMS.""" if manifest.progress and manifest.progress.processing_stats.finished: - return ScanState.SCAN_FINISHED_SUCCESSFULLY.name + return 
_ScanState.SCAN_FINISHED_SUCCESSFULLY.name def _has_request_been_sent_to_ewms() -> bool: return bool( @@ -35,25 +35,25 @@ def _has_request_been_sent_to_ewms() -> bool: ) ) - def get_nonfinished_state() -> ScanState: + def get_nonfinished_state() -> _ScanState: """Get the ScanState of the scan, only by parsing attributes.""" # has the scanner server started? # -> yes if manifest.progress: # attr only updated by scanner server requests if _has_request_been_sent_to_ewms(): - return ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP + return _ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP else: if manifest.progress.processing_stats.rate: - return ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED + return _ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED else: - return ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO + return _ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO # -> no else: if _has_request_been_sent_to_ewms(): # NOTE: assume that the ewms-request and scanner server startup happen in tandem - return ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP + return _ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP else: - return ScanState.PENDING__PRESTARTUP + return _ScanState.PENDING__PRESTARTUP # is EWMS still running the scan workers? 
# -> yes diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 1251d7de..00d3dd82 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -24,10 +24,7 @@ async def test_00__scan_finished_successfully() -> None: ewms_workflow_id="ewms123", progress=MagicMock(processing_stats=MagicMock(finished=True)), ) - assert ( - await get_scan_state(manifest, ewms_rc) - == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY.name - ) + assert await get_scan_state(manifest, ewms_rc) == "SCAN_FINISHED_SUCCESSFULLY" @pytest.mark.parametrize( @@ -35,7 +32,7 @@ async def test_00__scan_finished_successfully() -> None: [ ("ABORTED", "ABORTED__PARTIAL_RESULT_GENERATED"), ("FINISHED", "FINISHED__PARTIAL_RESULT_GENERATED"), - (None, schema.ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED.name), + (None, "IN_PROGRESS__PARTIAL_RESULT_GENERATED"), ], ) async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None: @@ -59,15 +56,13 @@ async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None @pytest.mark.parametrize( - "is_complete,state", + "ewms_dtype,state", [ - (True, schema.ScanState.STOPPED__WAITING_ON_FIRST_PIXEL_RECO), - (False, schema.ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO), + (True, "STOPPED__WAITING_ON_FIRST_PIXEL_RECO"), + (False, "IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO"), ], ) -async def test_20__waiting_on_first_pixel_reco( - stopped: bool, state: schema.ScanState -) -> None: +async def test_20__waiting_on_first_pixel_reco(ewms_dtype: str, state: str) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -115,15 +110,13 @@ async def test_20__waiting_on_first_pixel_reco( @pytest.mark.parametrize( - "is_complete,state", + "ewms_dtype,state", [ - (True, schema.ScanState.STOPPED__WAITING_ON_CLUSTER_STARTUP), - (False, schema.ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP), + (True, "STOPPED__WAITING_ON_CLUSTER_STARTUP"), + (False, 
"PENDING__WAITING_ON_CLUSTER_STARTUP"), ], ) -async def test_30__waiting_on_cluster_startup( - stopped: bool, state: schema.ScanState -) -> None: +async def test_30__waiting_on_cluster_startup(ewms_dtype: str, state: str) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -171,14 +164,14 @@ async def test_30__waiting_on_cluster_startup( @pytest.mark.parametrize( - "is_complete,state", + "ewms_dtype,state", [ - (True, schema.ScanState.STOPPED__WAITING_ON_SCANNER_SERVER_STARTUP), - (False, schema.ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP), + (True, "STOPPED__WAITING_ON_SCANNER_SERVER_STARTUP"), + (False, "PENDING__WAITING_ON_SCANNER_SERVER_STARTUP"), ], ) async def test_40__waiting_on_scanner_server_startup( - stopped: bool, state: schema.ScanState + ewms_dtype: str, state: str ) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -227,13 +220,13 @@ async def test_40__waiting_on_scanner_server_startup( @pytest.mark.parametrize( - "is_complete,state", + "ewms_dtype,state", [ - (True, schema.ScanState.STOPPED__PRESTARTUP), - (False, schema.ScanState.PENDING__PRESTARTUP), + (True, "STOPPED__PRESTARTUP"), + (False, "PENDING__PRESTARTUP"), ], ) -async def test_50__prestartup(stopped: bool, state: schema.ScanState) -> None: +async def test_50__prestartup(ewms_dtype: str, state: str) -> None: """Test normal and stopped varriants.""" ewms_rc = MagicMock() From f41780bc8a20a1e2a9068e33f7ef4a652c26975b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:19:48 -0600 Subject: [PATCH 054/327] `--exitfirst` --- .github/workflows/wipac-cicd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 6732214d..30f6a8cd 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -97,7 +97,7 @@ jobs: - name: test run: | - pytest -vvv tests/unit + pytest -vvv tests/unit --exitfirst - name: Dump logs if: 
always() From 80a568efdcd20e6ac0203b3b350675e7c11b5887 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:25:47 -0600 Subject: [PATCH 055/327] update test_scan_state.py - 5 (mock attrs) --- tests/unit/test_scan_state.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 00d3dd82..65bdcec5 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -22,7 +22,13 @@ async def test_00__scan_finished_successfully() -> None: # # now, args that actually matter: ewms_workflow_id="ewms123", - progress=MagicMock(processing_stats=MagicMock(finished=True)), + progress=MagicMock( + spec_set=["processing_stats"], + processing_stats=MagicMock( + spec_set=["finished"], + finished=True, + ), + ), ) assert await get_scan_state(manifest, ewms_rc) == "SCAN_FINISHED_SUCCESSFULLY" @@ -48,7 +54,13 @@ async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None # # now, args that actually matter: ewms_workflow_id="ewms123", - progress=MagicMock(processing_stats=MagicMock(rate={"abc": 123})), + progress=MagicMock( + spec_set=["processing_stats"], + processing_stats=MagicMock( + spec_set=["rate"], + rate={"abc": 123}, + ), + ), ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): From a7fa13480f4d68dc642f6083a6f24054551f491b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:35:26 -0600 Subject: [PATCH 056/327] update test_scan_state.py - 6 (mock attrs) --- tests/unit/test_scan_state.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 65bdcec5..f6cab826 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -23,9 +23,9 @@ async def test_00__scan_finished_successfully() -> None: # now, args that actually matter: ewms_workflow_id="ewms123", progress=MagicMock( - 
spec_set=["processing_stats"], + spec_set=[], # enforce strict attribute access (same behavior if this was a dict) processing_stats=MagicMock( - spec_set=["finished"], + spec_set=[], # enforce strict attribute access (same behavior if this was a dict) finished=True, ), ), @@ -55,9 +55,9 @@ async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None # now, args that actually matter: ewms_workflow_id="ewms123", progress=MagicMock( - spec_set=["processing_stats"], + spec_set=[], # enforce strict attribute access (same behavior if this was a dict) processing_stats=MagicMock( - spec_set=["rate"], + spec_set=[], # enforce strict attribute access (same behavior if this was a dict) rate={"abc": 123}, ), ), From 65a1a0f89f2ff01de33df08574dcad58807b6f96 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:42:01 -0600 Subject: [PATCH 057/327] update test_scan_state.py - 7 (mock attrs) --- tests/unit/test_scan_state.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index f6cab826..313d849f 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -23,9 +23,9 @@ async def test_00__scan_finished_successfully() -> None: # now, args that actually matter: ewms_workflow_id="ewms123", progress=MagicMock( - spec_set=[], # enforce strict attribute access (same behavior if this was a dict) + spec_set=["processing_stats"], # no magic strict attrs -- kind of like dict processing_stats=MagicMock( - spec_set=[], # enforce strict attribute access (same behavior if this was a dict) + spec_set=["finished"], # no magic strict attrs -- kind of like dict finished=True, ), ), @@ -55,9 +55,9 @@ async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None # now, args that actually matter: ewms_workflow_id="ewms123", progress=MagicMock( - spec_set=[], # enforce strict attribute access (same behavior if this was a dict) + 
spec_set=["processing_stats"], # no magic strict attrs -- kind of like dict processing_stats=MagicMock( - spec_set=[], # enforce strict attribute access (same behavior if this was a dict) + spec_set=["rate"], # no magic strict attrs -- kind of like dict rate={"abc": 123}, ), ), From f562aff16b649a1d60e78cc03963ac2732323f6e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:45:18 -0600 Subject: [PATCH 058/327] update test_scan_state.py - 8 (mock attrs) --- tests/unit/test_scan_state.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 313d849f..3de08fc6 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -57,7 +57,11 @@ async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None progress=MagicMock( spec_set=["processing_stats"], # no magic strict attrs -- kind of like dict processing_stats=MagicMock( - spec_set=["rate"], # no magic strict attrs -- kind of like dict + spec_set=[ # no magic strict attrs -- kind of like dict + "finished", + "rate", + ], + finished=False, rate={"abc": 123}, ), ), From 37831903543e037425505dff97b1fc700c63982d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:47:25 -0600 Subject: [PATCH 059/327] await syntax? --- skydriver/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index e3415a67..4ef79031 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -58,7 +58,7 @@ def get_nonfinished_state() -> _ScanState: # is EWMS still running the scan workers? # -> yes if manifest.ewms_workflow_id and ( - dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id) + dtype := (await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id)) ): return f"{dtype.upper()}__{get_nonfinished_state().name.split('__')[1]}" # -> BACKWARD COMPATIBILITY: is this an old/pre-ewms scan? 
From 5e737b620ac38273583d8fd29545c5bd7a0fc49d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:50:50 -0600 Subject: [PATCH 060/327] Revert "await syntax?" This reverts commit 37831903543e037425505dff97b1fc700c63982d. --- skydriver/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index 4ef79031..e3415a67 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -58,7 +58,7 @@ def get_nonfinished_state() -> _ScanState: # is EWMS still running the scan workers? # -> yes if manifest.ewms_workflow_id and ( - dtype := (await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id)) + dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id) ): return f"{dtype.upper()}__{get_nonfinished_state().name.split('__')[1]}" # -> BACKWARD COMPATIBILITY: is this an old/pre-ewms scan? From 451b4d3d280ff25482788d8d05b64f538bb74c0f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 15:51:47 -0600 Subject: [PATCH 061/327] use `aiocache` --- setup.cfg | 1 + skydriver/ewms.py | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/setup.cfg b/setup.cfg index f2f36c5e..2658ba8a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -28,6 +28,7 @@ branch = main install_requires = boto3 dacite + aiocache htcondor humanfriendly kubernetes diff --git a/skydriver/ewms.py b/skydriver/ewms.py index fa086b2a..5d7a1829 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -2,7 +2,7 @@ import logging -import cachetools.func +import aiocache import requests from rest_tools.client import RestClient @@ -112,7 +112,7 @@ async def request_stop_on_ewms( return resp["n_taskforces"] -@cachetools.func.ttl_cache(ttl=1 * 60) # don't cache too long, but avoid spamming ewms +@aiocache.cached(ttl=1 * 60) # don't cache too long, but avoid spamming ewms async def get_deactivated_type(ewms_rc: RestClient, workflow_id: str) -> str | None: """Grab the 'deactivated' field for the workflow. 
@@ -125,7 +125,7 @@ async def get_deactivated_type(ewms_rc: RestClient, workflow_id: str) -> str | N return workflow["deactivated"] -@cachetools.func.ttl_cache(ttl=1 * 60) # don't cache too long, but avoid spamming ewms +@aiocache.cached(ttl=1 * 60) # don't cache too long, but avoid spamming ewms async def get_taskforce_phases( ewms_rc: RestClient, workflow_id: str, From 251a0d8d75fba4291ab9ae0305e2c1da88c5607b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 16:21:23 -0600 Subject: [PATCH 062/327] fix scan status logic --- skydriver/utils.py | 32 ++++++------- tests/unit/test_scan_state.py | 85 +++++++++++++++-------------------- 2 files changed, 54 insertions(+), 63 deletions(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index e3415a67..b98c9ac4 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -15,9 +15,8 @@ class _ScanState(enum.Enum): IN_PROGRESS__PARTIAL_RESULT_GENERATED = enum.auto() IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() - PENDING__WAITING_ON_CLUSTER_STARTUP = enum.auto() PENDING__WAITING_ON_SCANNER_SERVER_STARTUP = enum.auto() - PENDING__PRESTARTUP = enum.auto() + PENDING__IN_BACKLOG = enum.auto() async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: @@ -27,7 +26,10 @@ async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: def _has_request_been_sent_to_ewms() -> bool: return bool( - manifest.ewms_workflow_id == PENDING_EWMS_WORKFLOW # pending ewms req. + ( # has a real workflow id + manifest.ewms_workflow_id + and manifest.ewms_workflow_id != PENDING_EWMS_WORKFLOW + ) or ( # backward compatibility... manifest.ewms_task != DEPRECATED_EWMS_TASK and isinstance(manifest.ewms_task, dict) @@ -37,23 +39,23 @@ def _has_request_been_sent_to_ewms() -> bool: def get_nonfinished_state() -> _ScanState: """Get the ScanState of the scan, only by parsing attributes.""" - # has the scanner server started? 
- # -> yes - if manifest.progress: # attr only updated by scanner server requests - if _has_request_been_sent_to_ewms(): - return _ScanState.PENDING__WAITING_ON_CLUSTER_STARTUP - else: + # has scan cleared the backlog? (aka, has been submitted EWMS?) + if _has_request_been_sent_to_ewms(): + # has the scanner server started? + if manifest.progress: + # how far along is the scanner server? + # seen some pixels -> aka clients have processed pixels if manifest.progress.processing_stats.rate: return _ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED + # 0% -> aka clients haven't finished any pixels (yet) else: return _ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO - # -> no - else: - if _has_request_been_sent_to_ewms(): - # NOTE: assume that the ewms-request and scanner server startup happen in tandem - return _ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP + # no -> hasn't started yet else: - return _ScanState.PENDING__PRESTARTUP + return _ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP + # no -> still in backlog + else: + return _ScanState.PENDING__IN_BACKLOG # is EWMS still running the scan workers? 
# -> yes diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 3de08fc6..83b21d6b 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -41,7 +41,7 @@ async def test_00__scan_finished_successfully() -> None: (None, "IN_PROGRESS__PARTIAL_RESULT_GENERATED"), ], ) -async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None: +async def test_10__partial_result_generated(ewms_dtype: str | None, state: str) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -74,52 +74,36 @@ async def test_10__partial_result_generated(ewms_dtype: str, state: str) -> None @pytest.mark.parametrize( "ewms_dtype,state", [ - (True, "STOPPED__WAITING_ON_FIRST_PIXEL_RECO"), - (False, "IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO"), + ("ABORTED", "ABORTED__WAITING_ON_FIRST_PIXEL_RECO"), + ("FINISHED", "FINISHED__WAITING_ON_FIRST_PIXEL_RECO"), + (None, "IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO"), ], ) -async def test_20__waiting_on_first_pixel_reco(ewms_dtype: str, state: str) -> None: +async def test_20__waiting_on_first_pixel_reco( + ewms_dtype: str | None, state: str +) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() manifest = schema.Manifest( - scan_id="abc123", - timestamp=time.time(), - is_deleted=False, - event_i3live_json_dict={"abc": 123}, - scanner_server_args="", - ewms_task=schema.InHouseStarterInfo( - tms_args=[], - env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), - complete=is_complete, - clusters=[ - schema.InHouseClusterInfo( - orchestrator="condor", - location=schema.HTCondorLocation( - collector="foo", - schedd="bar", - ), - n_workers=111, - cluster_id="abc123", # "" is a non-started cluster - starter_info={"abc": 123}, - ) - ], - ), + scan_id=MagicMock(), + timestamp=MagicMock(), + is_deleted=MagicMock(), + event_i3live_json_dict=MagicMock(), + scanner_server_args=MagicMock(), # - progress=schema.Progress( - "summary", - "epilogue", - {}, - 
schema.ProgressProcessingStats( - start={}, - runtime={}, - # rate={"abc": 123}, - # end, - # finished=True, - # predictions, + # now, args that actually matter: + ewms_workflow_id="pending ewms", + progress=MagicMock( + spec_set=["processing_stats"], # no magic strict attrs -- kind of like dict + processing_stats=MagicMock( + spec_set=[ # no magic strict attrs -- kind of like dict + "finished", + "rate", + ], + finished=False, + rate={"abc": 123}, ), - 1.0, - str(time.time()), ), ) assert await get_scan_state(manifest, ewms_rc) == state @@ -128,11 +112,14 @@ async def test_20__waiting_on_first_pixel_reco(ewms_dtype: str, state: str) -> N @pytest.mark.parametrize( "ewms_dtype,state", [ - (True, "STOPPED__WAITING_ON_CLUSTER_STARTUP"), - (False, "PENDING__WAITING_ON_CLUSTER_STARTUP"), + ("ABORTED", "ABORTED__WAITING_ON_CLUSTER_STARTUP"), + ("FINISHED", "FINISHED__WAITING_ON_CLUSTER_STARTUP"), + (None, "PENDING__WAITING_ON_CLUSTER_STARTUP"), ], ) -async def test_30__waiting_on_cluster_startup(ewms_dtype: str, state: str) -> None: +async def test_30__waiting_on_cluster_startup( + ewms_dtype: str | None, state: str +) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -182,12 +169,13 @@ async def test_30__waiting_on_cluster_startup(ewms_dtype: str, state: str) -> No @pytest.mark.parametrize( "ewms_dtype,state", [ - (True, "STOPPED__WAITING_ON_SCANNER_SERVER_STARTUP"), - (False, "PENDING__WAITING_ON_SCANNER_SERVER_STARTUP"), + ("ABORTED", "ABORTED__WAITING_ON_SCANNER_SERVER_STARTUP"), + ("FINISHED", "FINISHED__WAITING_ON_SCANNER_SERVER_STARTUP"), + (None, "PENDING__WAITING_ON_SCANNER_SERVER_STARTUP"), ], ) async def test_40__waiting_on_scanner_server_startup( - ewms_dtype: str, state: str + ewms_dtype: str | None, state: str ) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() @@ -238,11 +226,12 @@ async def test_40__waiting_on_scanner_server_startup( @pytest.mark.parametrize( "ewms_dtype,state", [ - (True, 
"STOPPED__PRESTARTUP"), - (False, "PENDING__PRESTARTUP"), + ("ABORTED", "ABORTED__IN_BACKLOG"), + ("FINISHED", "FINISHED__IN_BACKLOG"), + (None, "PENDING__IN_BACKLOG"), ], ) -async def test_50__prestartup(ewms_dtype: str, state: str) -> None: +async def test_50__IN_BACKLOG(ewms_dtype: str | None, state: str) -> None: """Test normal and stopped varriants.""" ewms_rc = MagicMock() From bc54e978c2c47b2bda4421dde3fef370320ff1ed Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 16:47:58 -0600 Subject: [PATCH 063/327] fix scan status logic - 2 --- skydriver/database/schema.py | 2 +- skydriver/ewms.py | 4 + skydriver/utils.py | 36 +++---- tests/unit/test_scan_state.py | 171 ++++++---------------------------- 4 files changed, 56 insertions(+), 157 deletions(-) diff --git a/skydriver/database/schema.py b/skydriver/database/schema.py index 4b114b87..7abcffee 100644 --- a/skydriver/database/schema.py +++ b/skydriver/database/schema.py @@ -112,7 +112,7 @@ def obfuscate_cl_args(args: str) -> str: return " ".join(out_args) -PENDING_EWMS_WORKFLOW = "pending ewms" +PENDING_EWMS_WORKFLOW = "pending-ewms" DEPRECATED_EVENT_I3LIVE_JSON_DICT = "use 'i3_event_id'" DEPRECATED_EWMS_TASK = "use 'ewms_workflow_id'" diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 5d7a1829..86139071 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -8,6 +8,7 @@ from . 
import database, images, s3 from .config import QUEUE_ALIAS_FROMCLIENT, QUEUE_ALIAS_TOCLIENT +from .database.schema import PENDING_EWMS_WORKFLOW LOGGER = logging.Logger(__name__) @@ -118,6 +119,9 @@ async def get_deactivated_type(ewms_rc: RestClient, workflow_id: str) -> str | N Example: 'ABORTED', 'FINISHED """ + if workflow_id == PENDING_EWMS_WORKFLOW: + return None + workflow = await ewms_rc.request( "GET", f"/v0/workflows/{workflow_id}", diff --git a/skydriver/utils.py b/skydriver/utils.py index b98c9ac4..31a74dee 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -16,7 +16,7 @@ class _ScanState(enum.Enum): IN_PROGRESS__PARTIAL_RESULT_GENERATED = enum.auto() IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() PENDING__WAITING_ON_SCANNER_SERVER_STARTUP = enum.auto() - PENDING__IN_BACKLOG = enum.auto() + PENDING__PRESTARTUP = enum.auto() async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: @@ -24,7 +24,7 @@ async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: if manifest.progress and manifest.progress.processing_stats.finished: return _ScanState.SCAN_FINISHED_SUCCESSFULLY.name - def _has_request_been_sent_to_ewms() -> bool: + def _has_cleared_backlog() -> bool: return bool( ( # has a real workflow id manifest.ewms_workflow_id @@ -39,8 +39,8 @@ def _has_request_been_sent_to_ewms() -> bool: def get_nonfinished_state() -> _ScanState: """Get the ScanState of the scan, only by parsing attributes.""" - # has scan cleared the backlog? (aka, has been submitted EWMS?) - if _has_request_been_sent_to_ewms(): + # has scan cleared the backlog? (aka, has been *submitted* EWMS?) + if _has_cleared_backlog(): # has the scanner server started? if manifest.progress: # how far along is the scanner server? 
@@ -53,19 +53,23 @@ def get_nonfinished_state() -> _ScanState: # no -> hasn't started yet else: return _ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP - # no -> still in backlog + # no -> still in backlog (or aborted while in backlog) else: - return _ScanState.PENDING__IN_BACKLOG + return _ScanState.PENDING__PRESTARTUP - # is EWMS still running the scan workers? - # -> yes - if manifest.ewms_workflow_id and ( - dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id) + state = get_nonfinished_state().name # start here, augment if needed + + # AUGMENT STATUS... + if ( # Backward Compatibility: is this an old/pre-ewms scan? + not manifest.ewms_workflow_id + and isinstance(manifest.ewms_task, dict) + and manifest.ewms_task.get("complete") ): - return f"{dtype.upper()}__{get_nonfinished_state().name.split('__')[1]}" - # -> BACKWARD COMPATIBILITY: is this an old/pre-ewms scan? - elif isinstance(manifest.ewms_task, dict) and manifest.ewms_task.get("complete"): - return f"STOPPED__{get_nonfinished_state().name.split('__')[1]}" - # -> no, this is a non-finished scan + return f"STOPPED__{state.split('__')[1]}" # we didn't have info on what kind of stop + # has EWMS ceased running the scan workers? 
+ elif dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id): + # -> yes, the ewms workflow has been deactivated + return f"{dtype.upper()}__{state.split('__')[1]}" else: - return get_nonfinished_state().name + # -> no, this is a non-finished scan + return state diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 83b21d6b..aafdfe19 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -1,11 +1,11 @@ """Test dynamically generating the scan state.""" -import time from unittest.mock import MagicMock, patch import pytest from skydriver.database import schema +from skydriver.database.schema import PENDING_EWMS_WORKFLOW from skydriver.utils import get_scan_state @@ -93,7 +93,7 @@ async def test_20__waiting_on_first_pixel_reco( scanner_server_args=MagicMock(), # # now, args that actually matter: - ewms_workflow_id="pending ewms", + ewms_workflow_id="ewms123", progress=MagicMock( spec_set=["processing_stats"], # no magic strict attrs -- kind of like dict processing_stats=MagicMock( @@ -102,68 +102,13 @@ async def test_20__waiting_on_first_pixel_reco( "rate", ], finished=False, - rate={"abc": 123}, + rate=None, ), ), ) - assert await get_scan_state(manifest, ewms_rc) == state - -@pytest.mark.parametrize( - "ewms_dtype,state", - [ - ("ABORTED", "ABORTED__WAITING_ON_CLUSTER_STARTUP"), - ("FINISHED", "FINISHED__WAITING_ON_CLUSTER_STARTUP"), - (None, "PENDING__WAITING_ON_CLUSTER_STARTUP"), - ], -) -async def test_30__waiting_on_cluster_startup( - ewms_dtype: str | None, state: str -) -> None: - """Test normal and stopped variants.""" - ewms_rc = MagicMock() - - manifest = schema.Manifest( - scan_id="abc123", - timestamp=time.time(), - is_deleted=False, - event_i3live_json_dict={"abc": 123}, - scanner_server_args="", - ewms_task=schema.InHouseStarterInfo( - tms_args=[], - env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), - complete=is_complete, - # clusters=[ - # schema.ManualCluster( - # 
orchestrator="condor", - # location=schema.HTCondorLocation( - # collector="foo", - # schedd="bar", - # ), - # n_workers=111, - # cluster_id="abc123", # "" is a non-started cluster - # starter_info={"abc": 123}, - # ) - # ], - ), - # - progress=schema.Progress( - "summary", - "epilogue", - {}, - schema.ProgressProcessingStats( - start={}, - runtime={}, - # rate={"abc": 123}, - # end, - # finished=True, - # predictions, - ), - 1.0, - str(time.time()), - ), - ) - assert await get_scan_state(manifest, ewms_rc) == state + with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): + assert await get_scan_state(manifest, ewms_rc) == state @pytest.mark.parametrize( @@ -181,98 +126,44 @@ async def test_40__waiting_on_scanner_server_startup( ewms_rc = MagicMock() manifest = schema.Manifest( - scan_id="abc123", - timestamp=time.time(), - is_deleted=False, - event_i3live_json_dict={"abc": 123}, - scanner_server_args="", - ewms_task=schema.InHouseStarterInfo( - tms_args=[], - env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), - complete=is_complete, - clusters=[ - schema.InHouseClusterInfo( - orchestrator="condor", - location=schema.HTCondorLocation( - collector="foo", - schedd="bar", - ), - n_workers=111, - cluster_id="abc123", # "" is a non-started cluster - starter_info={"abc": 123}, - ) - ], - ), + scan_id=MagicMock(), + timestamp=MagicMock(), + is_deleted=MagicMock(), + event_i3live_json_dict=MagicMock(), + scanner_server_args=MagicMock(), # - # progress=schema.Progress( - # "summary", - # "epilogue", - # {}, - # schema.ProgressProcessingStats( - # start={}, - # runtime={}, - # # rate={"abc": 123}, - # # end, - # # finished=True, - # # predictions, - # ), - # 1.0, - # str(time.time()), - # ), + # now, args that actually matter: + ewms_workflow_id="ewms123", + progress=None, ) - assert await get_scan_state(manifest, ewms_rc) == state + + with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): + assert await 
get_scan_state(manifest, ewms_rc) == state @pytest.mark.parametrize( "ewms_dtype,state", [ - ("ABORTED", "ABORTED__IN_BACKLOG"), - ("FINISHED", "FINISHED__IN_BACKLOG"), - (None, "PENDING__IN_BACKLOG"), + ("ABORTED", "ABORTED__PRESTARTUP"), + ("FINISHED", "FINISHED__PRESTARTUP"), + (None, "PENDING__PRESTARTUP"), ], ) -async def test_50__IN_BACKLOG(ewms_dtype: str | None, state: str) -> None: +async def test_50__prestartup(ewms_dtype: str | None, state: str) -> None: """Test normal and stopped varriants.""" ewms_rc = MagicMock() manifest = schema.Manifest( - scan_id="abc123", - timestamp=time.time(), - is_deleted=False, - event_i3live_json_dict={"abc": 123}, - scanner_server_args="", - ewms_task=schema.InHouseStarterInfo( - tms_args=[], - env_vars=schema.EnvVars(scanner_server=[], tms_starters=[]), - complete=is_complete, - # clusters=[ - # schema.ManualCluster( - # orchestrator="condor", - # location=schema.HTCondorLocation( - # collector="foo", - # schedd="bar", - # ), - # n_workers=111, - # cluster_id="abc123", # "" is a non-started cluster - # starter_info={"abc": 123}, - # ) - # ], - ), + scan_id=MagicMock(), + timestamp=MagicMock(), + is_deleted=MagicMock(), + event_i3live_json_dict=MagicMock(), + scanner_server_args=MagicMock(), # - # progress=schema.Progress( - # "summary", - # "epilogue", - # {}, - # schema.ProgressProcessingStats( - # start={}, - # runtime={}, - # # rate={"abc": 123}, - # # end, - # # finished=True, - # # predictions, - # ), - # 1.0, - # str(time.time()), - # ), + # now, args that actually matter: + ewms_workflow_id=PENDING_EWMS_WORKFLOW, + progress=None, ) - assert await get_scan_state(manifest, ewms_rc) == state + + with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): + assert await get_scan_state(manifest, ewms_rc) == state From 40db8885a4976d9c549155048b714b2291979ab1 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 16:51:23 -0600 Subject: [PATCH 064/327] mypy --- skydriver/ewms.py | 2 +- 
skydriver/rest_handlers.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 86139071..0aa13534 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -2,7 +2,7 @@ import logging -import aiocache +import aiocache # type: ignore[import-untyped] import requests from rest_tools.client import RestClient diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index c06a83f7..74af4f8b 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -27,7 +27,7 @@ from tornado import web from wipac_dev_tools import argparse_tools -from . import database, ewms, images, k8s +from . import database, ewms, images, k8s, utils from .config import ( DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, DEFAULT_WORKER_DISK_BYTES, @@ -1057,7 +1057,7 @@ async def get(self, scan_id: str) -> None: "scan_state": scan_state, "is_deleted": manifest.is_deleted, "scan_complete": bool( - scan_state == schema.ScanState.SCAN_FINISHED_SUCCESSFULLY.name + scan_state == utils._ScanState.SCAN_FINISHED_SUCCESSFULLY.name ), "pods": pods_411, "clusters": clusters, From f0eb1858d02dce6733dc285ab3cca4a1b6c9cb32 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 17:14:06 -0600 Subject: [PATCH 065/327] simplify Dockerfile --- Dockerfile | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6cb9e9ac..2d2e9a7e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,13 +10,4 @@ COPY --chown=app:app . . RUN pip install --no-cache-dir . 
ENV PYTHONPATH=/home/app -# clientmanager needs GCP for GKE -USER root -RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list -RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - -RUN apt-get update -y -RUN apt-get install google-cloud-cli -y -RUN apt-get install google-cloud-sdk-gke-gcloud-auth-plugin -y -USER app - CMD ["python", "-m", "skydriver"] From 574438eb9d6d4bb23b2542c3cf5dcee3dc6f107a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 13 Jan 2025 17:19:44 -0600 Subject: [PATCH 066/327] simplify Dockerfile - 2 --- Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Dockerfile b/Dockerfile index 2d2e9a7e..a8e99a2c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,4 +10,6 @@ COPY --chown=app:app . . RUN pip install --no-cache-dir . ENV PYTHONPATH=/home/app +USER app + CMD ["python", "-m", "skydriver"] From f26f4a7425baeb9ed0c44e7f3ddc8c04ba3180f4 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 20:28:14 +0000 Subject: [PATCH 067/327] update setup.cfg --- setup.cfg | 83 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 41 deletions(-) diff --git a/setup.cfg b/setup.cfg index 2658ba8a..289f8600 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,19 +3,19 @@ python_min = 3.10 python_max = 3.11 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube -name = skydriver-clientmanager-ewms-sidecar + WIPAC + IceCube +name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder -version_variable = 
skydriver/__init__.py:__version__,clientmanager/__init__.py:__version__,ewms_sidecar/__init__.py:__version__ +version_variable = skydriver/__init__.py:__version__,s3_sidecar/__init__.py:__version__,ewms_init_container/__init__.py:__version__ upload_to_pypi = False patch_without_tag = False commit_parser = semantic_release.history.emoji_parser @@ -26,50 +26,51 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - boto3 - dacite - aiocache - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools - pyyaml + aiocache + boto3 + dacite + htcondor + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools python_requires = >=3.10, <3.12 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio + pytest + pytest-asyncio + pytest-mock + nest-asyncio mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - clientmanager - ewms_sidecar - skydriver.* - clientmanager.* - ewms_sidecar.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples + test + tests + doc + docs + resource + resources + example + examples + From 638488336d0b04b3daa044b867193cfa306c6496 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 20:28:14 +0000 Subject: [PATCH 068/327] add py.typed file(s) --- ewms_init_container/py.typed | 0 s3_sidecar/py.typed | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 ewms_init_container/py.typed create mode 100644 s3_sidecar/py.typed diff --git 
a/ewms_init_container/py.typed b/ewms_init_container/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/s3_sidecar/py.typed b/s3_sidecar/py.typed new file mode 100644 index 00000000..e69de29b From c651eb4c01ec0f72ea90c7ab1e8cd1ac45fb5180 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 14 Jan 2025 20:32:12 +0000 Subject: [PATCH 069/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index e94f71e4..d3fb96a2 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -1,13 +1,14 @@ # -# This file was autogenerated by WIPACrepo/wipac-dev-py-setup-action -# within the container built from './Dockerfile' +# This file was autogenerated by WIPACrepo/wipac-dev-py-dependencies-action +# within a container using the user-supplied image 'skydriver' # using Python 3.11. 
# ######################################################################## # pip freeze ######################################################################## -boto3==1.35.92 -botocore==1.35.92 +aiocache==0.12.3 +boto3==1.35.99 +botocore==1.35.99 cachetools==5.5.0 certifi==2024.12.14 cffi==1.17.1 @@ -17,7 +18,7 @@ dacite==1.8.1 dnspython==2.7.0 durationpy==0.9 google-auth==2.37.0 -htcondor==24.2.1 +htcondor==24.3.0 humanfriendly==10.0 idna==3.10 jmespath==1.0.1 @@ -43,7 +44,7 @@ typeguard==4.4.1 typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 -wipac-dev-tools==1.13.0 +wipac-dev-tools==1.14.0 wipac-rest-tools==1.8.5 ######################################################################## # pipdeptree @@ -55,22 +56,23 @@ pipdeptree==2.24.0 ├── packaging [required: >=24.1, installed: 24.2] └── pip [required: >=24.2, installed: 24.3.1] setuptools==65.5.1 -skydriver-clientmanager-ewms-sidecar -├── boto3 [required: Any, installed: 1.35.92] -│ ├── botocore [required: >=1.35.92,<1.36.0, installed: 1.35.92] +skydriver-s3-sidecar-ewms-init-container +├── aiocache [required: Any, installed: 0.12.3] +├── boto3 [required: Any, installed: 1.35.99] +│ ├── botocore [required: >=1.35.99,<1.36.0, installed: 1.35.99] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.10.0,<0.11.0, installed: 0.10.4] -│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.92] +│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.99] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] ├── dacite 
[required: Any, installed: 1.8.1] -├── htcondor [required: Any, installed: 24.2.1] +├── htcondor [required: Any, installed: 24.3.0] ├── humanfriendly [required: Any, installed: 10.0] ├── kubernetes [required: Any, installed: 31.0.0] │ ├── certifi [required: >=14.05.14, installed: 2024.12.14] @@ -105,6 +107,7 @@ skydriver-clientmanager-ewms-sidecar │ └── dnspython [required: >=1.16.0,<3.0.0, installed: 2.7.0] ├── pymongo [required: ==4.6.1, installed: 4.6.1] │ └── dnspython [required: >=1.16.0,<3.0.0, installed: 2.7.0] +├── PyYAML [required: Any, installed: 6.0.2] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] @@ -113,7 +116,7 @@ skydriver-clientmanager-ewms-sidecar ├── tornado [required: Any, installed: 6.4.2] ├── typeguard [required: Any, installed: 4.4.1] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] -├── wipac-dev-tools [required: Any, installed: 1.13.0] +├── wipac-dev-tools [required: Any, installed: 1.14.0] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] @@ -137,7 +140,7 @@ skydriver-clientmanager-ewms-sidecar │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── urllib3 [required: >=2.0.4, installed: 2.3.0] - └── wipac-dev-tools [required: Any, installed: 1.13.0] + └── wipac-dev-tools [required: Any, installed: 1.14.0] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] From 342602d963b5eb59a0bd2e516b91043b93bc23cc Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 15:38:08 -0600 Subject: [PATCH 070/327] import fix --- skydriver/k8s/scan_backlog.py | 2 +- tests/integration/conftest.py | 2 +- 2 files changed, 2 
insertions(+), 2 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index fc91c43f..fa562983 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -4,7 +4,7 @@ import logging import time -import kubernetes.client.V1Job # type: ignore[import-untyped] +import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient from tornado import web diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 3896727e..1fac73ad 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -5,7 +5,7 @@ from typing import Any, AsyncIterator, Callable from unittest.mock import MagicMock, Mock -import kubernetes.client # type: ignore[import] +import kubernetes.client # type: ignore[import-untyped] import pytest import pytest_asyncio from rest_tools.client import RestClient From 4f362a084a81659d589f6f5fce4cad0bb3e80c81 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 15:39:10 -0600 Subject: [PATCH 071/327] try 3.13? 
--- setup.cfg | 81 +++++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/setup.cfg b/setup.cfg index 289f8600..a8f473d9 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,17 +1,17 @@ [wipac:cicd_setup_builder] python_min = 3.10 -python_max = 3.11 +python_max = 3.13 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,51 +26,50 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools + aiocache + boto3 + dacite + htcondor + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools python_requires = >=3.10, <3.12 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio + pytest + pytest-asyncio + pytest-mock + nest-asyncio mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples - + test + tests + doc + docs + resource + resources + example + examples 
From a7731f4ac745f500478a4974e1708185c0676c2a Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 15 Jan 2025 21:39:39 +0000 Subject: [PATCH 072/327] update setup.cfg --- setup.cfg | 81 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index a8f473d9..b6c43eef 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.10 python_max = 3.13 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,50 +26,51 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools -python_requires = >=3.10, <3.12 + aiocache + boto3 + dacite + htcondor + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools +python_requires = >=3.10, <3.14 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio + pytest + pytest-asyncio + pytest-mock + nest-asyncio mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* 
exclude = - test - tests - doc - docs - resource - resources - example - examples + test + tests + doc + docs + resource + resources + example + examples + From f2987626af67398af632c9a51f3e156f474062e7 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 15:41:35 -0600 Subject: [PATCH 073/327] ci env var --- .github/workflows/wipac-cicd.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 0dfc61ed..becbf290 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -207,6 +207,7 @@ jobs: $(env | grep '^SKYSCAN_' | awk '$0="--env "$0') \ $(env | grep '^EWMS_' | awk '$0="--env "$0') \ $(env | grep '^S3_' | awk '$0="--env "$0') \ + $(env | grep '^CI' | awk '$0="--env "$0') \ $(env | grep '^CI_' | awk '$0="--env "$0') \ $(env | grep '^SCAN_' | awk '$0="--env "$0') \ --mount type=bind,source=$(realpath $DIR),target=/local/$DIR \ From f8cddc8ed33f3acfecc6c886e8d78492e111d35d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 15:42:09 -0600 Subject: [PATCH 074/327] bump to 3.12 --- setup.cfg | 81 +++++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/setup.cfg b/setup.cfg index b6c43eef..4344abfe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,17 +1,17 @@ [wipac:cicd_setup_builder] python_min = 3.10 -python_max = 3.13 +python_max = 3.12 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,51 +26,50 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - 
dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools + aiocache + boto3 + dacite + htcondor + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools python_requires = >=3.10, <3.14 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio + pytest + pytest-asyncio + pytest-mock + nest-asyncio mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples - + test + tests + doc + docs + resource + resources + example + examples From 1332647c671d522b0f91644cce30123d1d07063d Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 15 Jan 2025 21:42:44 +0000 Subject: [PATCH 075/327] update setup.cfg --- setup.cfg | 81 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index 4344abfe..2636d791 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.10 python_max = 3.12 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,50 +26,51 @@ branch = 
main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools -python_requires = >=3.10, <3.14 + aiocache + boto3 + dacite + htcondor + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools +python_requires = >=3.10, <3.13 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio + pytest + pytest-asyncio + pytest-mock + nest-asyncio mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples + test + tests + doc + docs + resource + resources + example + examples + From 2acdce94a815a7b8c52a30f1406956009a1f495a Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 15 Jan 2025 21:46:26 +0000 Subject: [PATCH 076/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index d3fb96a2..9e865851 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.35.99 -botocore==1.35.99 +boto3==1.36.0 +botocore==1.36.0 cachetools==5.5.0 certifi==2024.12.14 
cffi==1.17.1 @@ -37,14 +37,14 @@ requests==2.32.3 requests-futures==1.0.2 requests-oauthlib==2.0.0 rsa==4.9 -s3transfer==0.10.4 +s3transfer==0.11.0 six==1.17.0 tornado==6.4.2 typeguard==4.4.1 typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 -wipac-dev-tools==1.14.0 +wipac-dev-tools==1.15.0 wipac-rest-tools==1.8.5 ######################################################################## # pipdeptree @@ -58,15 +58,15 @@ pipdeptree==2.24.0 setuptools==65.5.1 skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.35.99] -│ ├── botocore [required: >=1.35.99,<1.36.0, installed: 1.35.99] +├── boto3 [required: Any, installed: 1.36.0] +│ ├── botocore [required: >=1.36.0,<1.37.0, installed: 1.36.0] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] -│ └── s3transfer [required: >=0.10.0,<0.11.0, installed: 0.10.4] -│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.35.99] +│ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.0] +│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.36.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] @@ -116,7 +116,7 @@ skydriver-s3-sidecar-ewms-init-container ├── tornado [required: Any, installed: 6.4.2] ├── typeguard [required: Any, installed: 4.4.1] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] -├── wipac-dev-tools [required: Any, installed: 1.14.0] +├── wipac-dev-tools [required: Any, installed: 1.15.0] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] │ │ ├── 
charset-normalizer [required: >=2,<4, installed: 3.4.1] @@ -140,7 +140,7 @@ skydriver-s3-sidecar-ewms-init-container │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── urllib3 [required: >=2.0.4, installed: 2.3.0] - └── wipac-dev-tools [required: Any, installed: 1.14.0] + └── wipac-dev-tools [required: Any, installed: 1.15.0] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] From fdabb368f12c475cad1197e9f9124c4f8412f34f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:00:47 -0600 Subject: [PATCH 077/327] (debug) --- skydriver/images.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/skydriver/images.py b/skydriver/images.py index 1fed5639..e371f74e 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -155,6 +155,8 @@ def resolve_docker_tag(docker_tag: str) -> str: NOTE: Assumes tag exists (or will soon) on CVMFS. 
Condor will back off & retry until the image exists """ + LOGGER.info(f"checking docker tag: {docker_tag}") + if docker_tag == "latest": # 'latest' doesn't exist in CVMFS return _try_resolve_to_majminpatch_docker_hub("latest") From 28f8025a6e8f640ecdbac4d0ea5aadfc35a6fac6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:06:26 -0600 Subject: [PATCH 078/327] (debug-2) --- skydriver/images.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/skydriver/images.py b/skydriver/images.py index e371f74e..b640ab6a 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -157,11 +157,17 @@ def resolve_docker_tag(docker_tag: str) -> str: """ LOGGER.info(f"checking docker tag: {docker_tag}") - if docker_tag == "latest": # 'latest' doesn't exist in CVMFS - return _try_resolve_to_majminpatch_docker_hub("latest") + try: + + if docker_tag == "latest": # 'latest' doesn't exist in CVMFS + return _try_resolve_to_majminpatch_docker_hub("latest") + + if VERSION_REGEX_PREFIX_V.fullmatch(docker_tag): + # v4 -> 4; v5.1 -> 5.1; v3.6.9 -> 3.6.9 + docker_tag = docker_tag.lstrip("v") - if VERSION_REGEX_PREFIX_V.fullmatch(docker_tag): - # v4 -> 4; v5.1 -> 5.1; v3.6.9 -> 3.6.9 - docker_tag = docker_tag.lstrip("v") + return _try_resolve_to_majminpatch_docker_hub(docker_tag) - return _try_resolve_to_majminpatch_docker_hub(docker_tag) + except Exception as e: + LOGGER.exception(e) + raise e From f88c6eb6d94dec5bba54ffd17c16791f81d42db4 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:11:46 -0600 Subject: [PATCH 079/327] ci env var - 2 --- .github/workflows/wipac-cicd.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index becbf290..384bc0ad 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -20,7 +20,7 @@ env: S3_SECRET_KEY: 8dea68a1 S3_SECRET_KEY__K8S_SECRET_KEY: cdf7c60b S3_BUCKET: 72017610 - + 
MIN_SKYMAP_SCANNER_TAG: "v3.21.2" # TODO: remove once skyscan v4 is out (that's the real min) jobs: @@ -210,6 +210,7 @@ jobs: $(env | grep '^CI' | awk '$0="--env "$0') \ $(env | grep '^CI_' | awk '$0="--env "$0') \ $(env | grep '^SCAN_' | awk '$0="--env "$0') \ + $(env | grep '^MIN_SKYMAP_SCANNER_TAG' | awk '$0="--env "$0') \ --mount type=bind,source=$(realpath $DIR),target=/local/$DIR \ wipac/skydriver:local \ /local/$DIR/test-script.sh From d64d388c642538a979813e0778026d08eaedb16e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:16:42 -0600 Subject: [PATCH 080/327] tests: drop entire db --- skydriver/database/utils.py | 10 ++++------ tests/integration/conftest.py | 6 +++--- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/skydriver/database/utils.py b/skydriver/database/utils.py index d538fef3..692a2e7a 100644 --- a/skydriver/database/utils.py +++ b/skydriver/database/utils.py @@ -79,10 +79,8 @@ async def ensure_indexes(motor_client: AsyncIOMotorClient) -> None: # type: ign ) -async def drop_collections(motor_client: AsyncIOMotorClient) -> None: # type: ignore[valid-type] - """Drop the "regular" collections -- most useful for testing.""" +async def drop_database(motor_client: AsyncIOMotorClient) -> None: # type: ignore[valid-type] + """Drop the database -- only useful during CI testing.""" if not ENV.CI: - raise RuntimeError("Cannot drop collections if not in testing mode") - await motor_client[_DB_NAME][_MANIFEST_COLL_NAME].drop() # type: ignore[index] - await motor_client[_DB_NAME][_RESULTS_COLL_NAME].drop() # type: ignore[index] - await motor_client[_DB_NAME][_SCAN_BACKLOG_COLL_NAME].drop() # type: ignore[index] + raise RuntimeError("Cannot drop database if not in testing mode") + await motor_client.drop_database(_DB_NAME) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 1fac73ad..44a7e703 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -13,7 +13,7 @@ import 
skydriver import skydriver.images # noqa: F401 # export from skydriver.database import create_mongodb_client -from skydriver.database.utils import drop_collections +from skydriver.database.utils import drop_database from skydriver.server import make @@ -34,10 +34,10 @@ async def mongo_clear() -> Any: """Clear the MongoDB after test completes.""" motor_client = await create_mongodb_client() try: - await drop_collections(motor_client) + await drop_database(motor_client) yield finally: - await drop_collections(motor_client) + await drop_database(motor_client) ######################################################################################## From a617c3f03922789c9ce5e284fb04b79afb12b7f8 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:21:27 -0600 Subject: [PATCH 081/327] image tag logic --- skydriver/images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/images.py b/skydriver/images.py index b640ab6a..96f61b6e 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -85,7 +85,7 @@ def _parse_image_ts(info: dict) -> float: @cachetools.func.lru_cache() # cache it forever def min_skymap_scanner_tag_ts() -> float: """Get the timestamp for when the `MIN_SKYMAP_SCANNER_TAG` image was created.""" - info = get_info_from_docker_hub(ENV.MIN_SKYMAP_SCANNER_TAG) + info = get_info_from_docker_hub(ENV.MIN_SKYMAP_SCANNER_TAG.lstrip("v")) return _parse_image_ts(info) From 6a5d3df0ee9a58de7d9a776433c56c3d990a10d9 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:22:22 -0600 Subject: [PATCH 082/327] mypy --- skydriver/database/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/database/utils.py b/skydriver/database/utils.py index 692a2e7a..ec6234f2 100644 --- a/skydriver/database/utils.py +++ b/skydriver/database/utils.py @@ -83,4 +83,4 @@ async def drop_database(motor_client: AsyncIOMotorClient) -> None: # type: igno """Drop the database -- only useful during CI testing.""" if 
not ENV.CI: raise RuntimeError("Cannot drop database if not in testing mode") - await motor_client.drop_database(_DB_NAME) + await motor_client.drop_database(_DB_NAME) # type: ignore[attr-defined] From 0dc04ecec511ff2eda131a60827a18c1939f1183 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:29:56 -0600 Subject: [PATCH 083/327] use `wipac_dev_tools.timing_tools` --- skydriver/k8s/scan_backlog.py | 50 +---------------------------------- 1 file changed, 1 insertion(+), 49 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index fa562983..fc935757 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -8,6 +8,7 @@ from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient from tornado import web +from wipac_dev_tools.timing_tools import IntervalTimer from .utils import KubeAPITools from .. import database, ewms @@ -109,55 +110,6 @@ def _logging_heartbeat(last_log_time: float) -> float: return last_log_time -class IntervalTimer: - """A utility class to track time intervals. - - This class allows tracking of elapsed time between actions and provides - mechanisms to wait until a specified time interval has passed. - - TODO: Move this to dev-tools (copied from TMS). - """ - - def __init__(self, seconds: float, logger: logging.Logger) -> None: - self.seconds = seconds - self._last_time = time.time() - self.logger = logger - - def fastforward(self): - """Reset the timer so that the next call to `has_interval_elapsed` will return True. - - This effectively skips the current interval and forces the timer to indicate - that the interval has elapsed on the next check. - """ - self._last_time = float("-inf") - - async def wait_until_interval(self) -> None: - """Wait asynchronously until the specified interval has elapsed. - - This method checks the elapsed time every second, allowing cooperative - multitasking during the wait. 
- """ - self.logger.debug( - f"Waiting until {self.seconds}s has elapsed since the last iteration..." - ) - while not self.has_interval_elapsed(): - await asyncio.sleep(1) - - def has_interval_elapsed(self) -> bool: - """Check if the specified time interval has elapsed since the last expiration. - - If the interval has elapsed, the internal timer is reset to the current time. - """ - diff = time.time() - self._last_time - if diff >= self.seconds: - self._last_time = time.time() - self.logger.debug( - f"At least {self.seconds}s have elapsed (actually {diff}s)." - ) - return True - return False - - async def _run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, From 4e0d30e22b61c40b3c558baf893221fdab1fa178 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:31:55 -0600 Subject: [PATCH 084/327] use `wipac_dev_tools.timing_tools` - 2 --- s3_sidecar/post.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index dc4c897f..7979781c 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -8,6 +8,7 @@ import boto3 # type: ignore[import-untyped] import requests +from wipac_dev_tools.timing_tools import IntervalTimer LOGGER = logging.getLogger(__package__) @@ -68,10 +69,13 @@ def main() -> None: args = parser.parse_args() + logger_timer = IntervalTimer(5, LOGGER) + if args.wait_indefinitely: LOGGER.info("Waiting for file to exist...") while not args.fpath.exists(): - # TODO: use wipac_dev_tools.timing_tools.IntervalTimer to log every X sec + if logger_timer.has_interval_elapsed(): + LOGGER.info("still waiting...") time.sleep(1) post(args.fpath) From 6f0f24401bf4a9d92840cceade3f0815ebe43ed5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:37:00 -0600 Subject: [PATCH 085/327] use `wipac_dev_tools.timing_tools` - 3 --- skydriver/k8s/scan_backlog.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 
deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index fc935757..3395468d 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -102,14 +102,6 @@ async def run( LOGGER.info("Restarted scan backlog runner.") -def _logging_heartbeat(last_log_time: float) -> float: - if time.time() - last_log_time > ENV.SCAN_BACKLOG_RUNNER_DELAY: - LOGGER.info("scan backlog runner is still alive") - return time.time() - else: - return last_log_time - - async def _run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, @@ -131,12 +123,14 @@ async def _run( ) ) - last_log_heartbeat = 0.0 # log every so often, not on every iteration - long_interval_timer = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) + timer_main_loop = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) + timer_logging = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) + # main loop while True: await asyncio.sleep(ENV.SCAN_BACKLOG_RUNNER_SHORT_DELAY) - last_log_heartbeat = _logging_heartbeat(last_log_heartbeat) + if timer_logging.has_interval_elapsed(): + LOGGER.info("scan backlog runner is still alive") # get next entry try: @@ -146,10 +140,10 @@ async def _run( scan_request_client, skyscan_k8s_job_client, # include low priority scans only when enough time has passed - include_low_priority_scans=long_interval_timer.has_interval_elapsed(), + include_low_priority_scans=timer_main_loop.has_interval_elapsed(), ) except database.mongodc.DocumentNotFoundException: - long_interval_timer.fastforward() + timer_main_loop.fastforward() continue # empty queue- # request a workflow on EWMS @@ -161,7 +155,7 @@ async def _run( ) except Exception as e: LOGGER.exception(e) - long_interval_timer.fastforward() # nothing was started, so don't wait long + timer_main_loop.fastforward() # nothing was started, so don't wait long continue await manifest_client.collection.find_one_and_update( {"scan_id": 
manifest.scan_id}, @@ -181,7 +175,7 @@ async def _run( except kubernetes.client.exceptions.ApiException as e: # k8s job (backlog entry) will be revived & restarted in future iteration LOGGER.exception(e) - long_interval_timer.fastforward() # nothing was started, so don't wait long + timer_main_loop.fastforward() # nothing was started, so don't wait long continue # remove from backlog now that startup succeeded From 5e1babcac621992c29486e0e85c7921627f3afe4 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:47:23 -0600 Subject: [PATCH 086/327] mock `generate_s3_get_url()` --- tests/integration/test_backlog_runner.py | 37 +++++++++++++++++++++--- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_backlog_runner.py b/tests/integration/test_backlog_runner.py index 0a9a4b64..0244ccaa 100644 --- a/tests/integration/test_backlog_runner.py +++ b/tests/integration/test_backlog_runner.py @@ -38,7 +38,12 @@ def print_it(obj: Any) -> None: @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -async def test_00(kapitsj_mock: Mock, server: Callable[[], RestClient]) -> None: +@mock.patch("skydriver.s3.generate_s3_get_url") +async def test_00( + kapitsj_mock: Mock, + s3gs3gurl_mock: Mock, + server: Callable[[], RestClient], +) -> None: """Test backlog job starting.""" rc = server() await rc.request("POST", "/scan", POST_SCAN_BODY) @@ -46,13 +51,21 @@ async def test_00(kapitsj_mock: Mock, server: Callable[[], RestClient]) -> None: print_it(await rc.request("GET", "/scans/backlog")) await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) + + # call counts kapitsj_mock.assert_called_once() + s3gs3gurl_mock.assert_called_once() print_it(await rc.request("GET", "/scans/backlog")) @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -async def test_01(kapitsj_mock: Mock, server: Callable[[], RestClient]) -> None: +@mock.patch("skydriver.s3.generate_s3_get_url") +async def test_01( + kapitsj_mock: Mock, + 
s3gs3gurl_mock: Mock, + server: Callable[[], RestClient], +) -> None: """Test backlog job starting with multiple.""" rc = server() @@ -66,12 +79,19 @@ async def test_01(kapitsj_mock: Mock, server: Callable[[], RestClient]) -> None: for i in range(N_JOBS): await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) + # call counts assert kapitsj_mock.call_count >= i + 1 # in case runner is faster + assert s3gs3gurl_mock.call_count >= i + 1 # in case runner is faster + # call counts assert kapitsj_mock.call_count == N_JOBS + assert s3gs3gurl_mock.call_count == N_JOBS - # any extra calls? await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) + + # any extra calls? assert kapitsj_mock.call_count == N_JOBS + assert s3gs3gurl_mock.call_count == N_JOBS + print_it(await rc.request("GET", "/scans/backlog")) @@ -80,8 +100,10 @@ async def test_01(kapitsj_mock: Mock, server: Callable[[], RestClient]) -> None: "skydriver.k8s.scanner_instance.SkymapScannerWorkerStopperK8sWrapper.go", new=Mock() ) @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") +@mock.patch("skydriver.s3.generate_s3_get_url") async def test_10( kapitsj_mock: Mock, + s3gs3gurl_mock: Mock, server: Callable[[], RestClient], ) -> None: """Test backlog job starting with multiple cancels.""" @@ -106,11 +128,18 @@ async def test_10( for i in range(N_JOBS - 2): await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) + # call counts assert kapitsj_mock.call_count >= i + 1 # in case runner is faster + assert s3gs3gurl_mock.call_count >= i + 1 # in case runner is faster + # call counts assert kapitsj_mock.call_count == N_JOBS - 2 + assert s3gs3gurl_mock.call_count == N_JOBS - 2 - # any extra calls? await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) + + # any extra calls? 
assert kapitsj_mock.call_count == N_JOBS - 2 + assert s3gs3gurl_mock.call_count == N_JOBS - 2 + print_it(await rc.request("GET", "/scans/backlog")) assert not (await rc.request("GET", "/scans/backlog"))["entries"] From d36330d546e33170abd1c2dbb19efc60aad2cb71 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 16:53:45 -0600 Subject: [PATCH 087/327] mock `generate_s3_get_url()` - 2 --- tests/integration/test_backlog_runner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_backlog_runner.py b/tests/integration/test_backlog_runner.py index 0244ccaa..8fa93861 100644 --- a/tests/integration/test_backlog_runner.py +++ b/tests/integration/test_backlog_runner.py @@ -53,8 +53,8 @@ async def test_00( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) # call counts - kapitsj_mock.assert_called_once() s3gs3gurl_mock.assert_called_once() + kapitsj_mock.assert_called_once() print_it(await rc.request("GET", "/scans/backlog")) @@ -80,17 +80,17 @@ async def test_01( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) # call counts - assert kapitsj_mock.call_count >= i + 1 # in case runner is faster assert s3gs3gurl_mock.call_count >= i + 1 # in case runner is faster + assert kapitsj_mock.call_count >= i + 1 # in case runner is faster # call counts - assert kapitsj_mock.call_count == N_JOBS assert s3gs3gurl_mock.call_count == N_JOBS + assert kapitsj_mock.call_count == N_JOBS await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) # any extra calls? 
- assert kapitsj_mock.call_count == N_JOBS assert s3gs3gurl_mock.call_count == N_JOBS + assert kapitsj_mock.call_count == N_JOBS print_it(await rc.request("GET", "/scans/backlog")) @@ -129,17 +129,17 @@ async def test_10( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) # call counts - assert kapitsj_mock.call_count >= i + 1 # in case runner is faster assert s3gs3gurl_mock.call_count >= i + 1 # in case runner is faster + assert kapitsj_mock.call_count >= i + 1 # in case runner is faster # call counts - assert kapitsj_mock.call_count == N_JOBS - 2 assert s3gs3gurl_mock.call_count == N_JOBS - 2 + assert kapitsj_mock.call_count == N_JOBS - 2 await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) # any extra calls? - assert kapitsj_mock.call_count == N_JOBS - 2 assert s3gs3gurl_mock.call_count == N_JOBS - 2 + assert kapitsj_mock.call_count == N_JOBS - 2 print_it(await rc.request("GET", "/scans/backlog")) assert not (await rc.request("GET", "/scans/backlog"))["entries"] From d01ed1f40b9f431ed0e6e45f6082d3f01ea9a80d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:02:07 -0600 Subject: [PATCH 088/327] tests: run a dummy ewms rest server --- .github/workflows/wipac-cicd.yml | 19 ++++++++++++++++--- tests/integration/conftest.py | 5 +++-- tests/integration/dummy_ewms.py | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+), 5 deletions(-) create mode 100644 tests/integration/dummy_ewms.py diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 384bc0ad..41ee0a1b 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -10,7 +10,7 @@ env: SCAN_BACKLOG_PENDING_ENTRY_TTL_REVIVE: 200 LOG_LEVEL: debug # mandatory env vars... 
- EWMS_ADDRESS: fcb6c253 + EWMS_ADDRESS: http://localhost:8081 EWMS_TOKEN_URL: 65f3b929 EWMS_CLIENT_ID: b75a974d EWMS_CLIENT_SECRET: 411b16fe @@ -80,6 +80,7 @@ jobs: # & don't run non-branch triggers (like tags) # & we don't want to trigger an update on PR's merge to main/master/default (which is a branch) run: | + set -euo pipefail if [[ \ ${{github.actor}} != 'dependabot[bot]' && \ ${{github.ref_type}} == 'branch' && \ @@ -146,16 +147,19 @@ jobs: - name: pip install run: | + set -euo pipefail pip install --upgrade pip wheel setuptools pip install .[tests] - name: test run: | + set -euo pipefail pytest -vvv tests/unit --exitfirst - name: Dump logs if: always() run: | + set -euo pipefail cat pytest.logs || true integration-tests: @@ -184,6 +188,9 @@ jobs: - name: test run: | + set -euo pipefail + python tests/integration/dummy_ewms.py &> ./dummy_ewms.out & + export LATEST_TAG=$( \ curl -I https://github.com/icecube/skymap_scanner/releases/latest \ | awk -F '/' '/^location/ {print substr($NF, 1, length($NF)-1)}' \ @@ -214,15 +221,21 @@ jobs: --mount type=bind,source=$(realpath $DIR),target=/local/$DIR \ wipac/skydriver:local \ /local/$DIR/test-script.sh - + # - name: dump test logs if: always() run: | + set -euo pipefail docker logs test || true - + - name: dump dummy-ewms logs + if: always() + run: | + set -euo pipefail + cat ./dummy_ewms.out - name: dump mongo logs if: always() run: | + set -euo pipefail docker logs "${{ job.services.mongo.id }}" || true test-build-docker: diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 44a7e703..d47a5820 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -3,7 +3,7 @@ import asyncio import socket from typing import Any, AsyncIterator, Callable -from unittest.mock import MagicMock, Mock +from unittest.mock import Mock import kubernetes.client # type: ignore[import-untyped] import pytest @@ -12,6 +12,7 @@ import skydriver import skydriver.images # noqa: F401 # export 
+from skydriver.__main__ import setup_ewms_client from skydriver.database import create_mongodb_client from skydriver.database.utils import drop_database from skydriver.server import make @@ -131,7 +132,7 @@ async def server( mongo_client = await create_mongodb_client() k8s_batch_api = Mock() - ewms_rc = MagicMock() + ewms_rc = setup_ewms_client() backlog_task = asyncio.create_task( skydriver.k8s.scan_backlog.run(mongo_client, k8s_batch_api, ewms_rc) ) diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py new file mode 100644 index 00000000..ec224630 --- /dev/null +++ b/tests/integration/dummy_ewms.py @@ -0,0 +1,32 @@ +"""A dummy EWMS server for testing.""" + +import os +from typing import Any + +from flask import Flask, jsonify + +app = Flask(__name__) + +DONT_CALL_IT_A_DB: dict[str, Any] = {} + + +@app.route("/v0/mqs/workflows//mq-group/activation", methods=["POST"]) +def dummy_mq_group_activation_post(workflow_id: str): + # in the real mqs, there's a bunch of db logic, etc. 
+ + stored = DONT_CALL_IT_A_DB[workflow_id] + for mqprofile in stored["mqprofiles"]: + mqprofile["is_activated"] = True + mqprofile["auth_token"] = "DUMMY_TOKEN" + mqprofile["broker_type"] = "DUMMY_BROKER_TYPE" + mqprofile["broker_address"] = "DUMMY_BROKER_ADDRESS" + + return jsonify(stored) + + +if __name__ == "__main__": + app.run( + debug=True, + host="0.0.0.0", + port=int(os.environ["EWMS_ADDRESS"].split(":")[-1]), + ) From b0da98b3dffa209146a11274555f8de60a311102 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:13:08 -0600 Subject: [PATCH 089/327] dummy ewms: `v0/workflows` --- skydriver/ewms.py | 5 ++++- tests/integration/dummy_ewms.py | 25 +++++++++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 0aa13534..55f63c4f 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -20,7 +20,10 @@ async def request_workflow_on_ewms( ) -> str: """Request a workflow in EWMS.""" if manifest.ewms_workflow_id != database.schema.PENDING_EWMS_WORKFLOW: - raise TypeError("Manifest is not designated for EWMS") + if manifest.ewms_workflow_id: + raise TypeError("Scan has already been sent to EWMS") + else: # None + raise TypeError("Scan is not designated for EWMS") s3_url_get = s3.generate_s3_get_url(manifest.scan_id) image = images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index ec224630..acbb6fd2 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -1,6 +1,7 @@ """A dummy EWMS server for testing.""" import os +import uuid from typing import Any from flask import Flask, jsonify @@ -10,18 +11,22 @@ DONT_CALL_IT_A_DB: dict[str, Any] = {} -@app.route("/v0/mqs/workflows//mq-group/activation", methods=["POST"]) -def dummy_mq_group_activation_post(workflow_id: str): - # in the real mqs, there's a bunch of db logic, etc. 
+@app.route("/v0/workflows", methods=["POST"]) +def dummy_workflows_post(): + # in the real ewms, there's a bunch of db logic, etc. - stored = DONT_CALL_IT_A_DB[workflow_id] - for mqprofile in stored["mqprofiles"]: - mqprofile["is_activated"] = True - mqprofile["auth_token"] = "DUMMY_TOKEN" - mqprofile["broker_type"] = "DUMMY_BROKER_TYPE" - mqprofile["broker_address"] = "DUMMY_BROKER_ADDRESS" + workflow_id = uuid.uuid4().hex + minimal_wf_doc = { + "workflow_id": workflow_id, + } - return jsonify(stored) + DONT_CALL_IT_A_DB[workflow_id] = minimal_wf_doc + + return jsonify( + { + "workflow": minimal_wf_doc, + } + ) if __name__ == "__main__": From 6b515f78a0dbd2fe7111006533704db0307fca96 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:15:56 -0600 Subject: [PATCH 090/327] tests: req flask --- setup.cfg | 80 +++++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index 2636d791..ed52d933 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.10 python_max = 3.12 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,51 +26,51 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools + aiocache + boto3 + dacite + htcondor + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools 
python_requires = >=3.10, <3.13 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio + pytest + pytest-asyncio + pytest-mock + nest-asyncio + flask mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples - + test + tests + doc + docs + resource + resources + example + examples From 466dd957bcd4cb8b254c6e3ba9e4697744fb3840 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 15 Jan 2025 23:16:25 +0000 Subject: [PATCH 091/327] update setup.cfg --- setup.cfg | 81 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index ed52d933..d4e9c0d5 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.10 python_max = 3.12 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,51 +26,52 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools + aiocache + boto3 + dacite + htcondor + 
humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools python_requires = >=3.10, <3.13 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio - flask + pytest + pytest-asyncio + pytest-mock + nest-asyncio + flask mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples + test + tests + doc + docs + resource + resources + example + examples + From f6245b40b14e4be9965400050f4f2105e9513ec1 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:23:40 -0600 Subject: [PATCH 092/327] dummy ewms - fix --- .github/workflows/wipac-cicd.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 41ee0a1b..aa95ddb3 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -189,6 +189,8 @@ jobs: - name: test run: | set -euo pipefail + + pip install .[tests] python tests/integration/dummy_ewms.py &> ./dummy_ewms.out & export LATEST_TAG=$( \ From 86bc6cd84fd0d714f83af4df86ffef83ada5b84b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:27:35 -0600 Subject: [PATCH 093/327] remove old mock --- tests/integration/test_backlog_runner.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/test_backlog_runner.py b/tests/integration/test_backlog_runner.py index 8fa93861..e482c559 100644 --- a/tests/integration/test_backlog_runner.py +++ 
b/tests/integration/test_backlog_runner.py @@ -95,10 +95,6 @@ async def test_01( print_it(await rc.request("GET", "/scans/backlog")) -# mock skydriver.k8s.scanner_instance.SkymapScannerWorkerStopperK8sWrapper.go b/c it calls start_job -@mock.patch( - "skydriver.k8s.scanner_instance.SkymapScannerWorkerStopperK8sWrapper.go", new=Mock() -) @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") @mock.patch("skydriver.s3.generate_s3_get_url") async def test_10( From e16ce681b3f422bfbcfb3eba6f6ad9a22cea8d1f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:37:07 -0600 Subject: [PATCH 094/327] dummy ewms - others --- skydriver/ewms.py | 15 ++++++--------- tests/integration/dummy_ewms.py | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 55f63c4f..2c8b416f 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -7,7 +7,7 @@ from rest_tools.client import RestClient from . import database, images, s3 -from .config import QUEUE_ALIAS_FROMCLIENT, QUEUE_ALIAS_TOCLIENT +from .config import ENV, QUEUE_ALIAS_FROMCLIENT, QUEUE_ALIAS_TOCLIENT from .database.schema import PENDING_EWMS_WORKFLOW LOGGER = logging.Logger(__name__) @@ -91,29 +91,26 @@ async def request_stop_on_ewms( ewms_rc: RestClient, workflow_id: str, abort: bool, -) -> int: +) -> None: """Signal that an EWMS workflow is finished, and stop whatever is needed. - Returns the number of stopped taskforces. - Suppresses any HTTP errors. 
""" try: if abort: - resp = await ewms_rc.request( + await ewms_rc.request( "POST", f"/v0/workflows/{workflow_id}/actions/abort", ) else: - resp = await ewms_rc.request( + await ewms_rc.request( "POST", f"/v0/workflows/{workflow_id}/actions/finished", ) except requests.exceptions.HTTPError as e: LOGGER.warning(repr(e)) - return 0 - else: - return resp["n_taskforces"] + if ENV.CI: + raise e @aiocache.cached(ttl=1 * 60) # don't cache too long, but avoid spamming ewms diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index acbb6fd2..a62a1091 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -18,6 +18,8 @@ def dummy_workflows_post(): workflow_id = uuid.uuid4().hex minimal_wf_doc = { "workflow_id": workflow_id, + "deactivated": None, + # add more fields only if needed in tests--keep things simple } DONT_CALL_IT_A_DB[workflow_id] = minimal_wf_doc @@ -29,6 +31,23 @@ def dummy_workflows_post(): ) +@app.route(f"/v0/workflows/", methods=["GET"]) +def dummy_workflows_get(workflow_id: str): + return jsonify(DONT_CALL_IT_A_DB[workflow_id]) + + +@app.route(f"/v0/workflows//actions/abort", methods=["POST"]) +def dummy_workflows_abort(workflow_id: str): + DONT_CALL_IT_A_DB[workflow_id].update({"deactivated": "abort"}) + return jsonify({}) + + +@app.route(f"/v0/workflows//actions/finished", methods=["POST"]) +def dummy_workflows_finished(workflow_id: str): + DONT_CALL_IT_A_DB[workflow_id].update({"deactivated": "finished"}) + return jsonify({}) + + if __name__ == "__main__": app.run( debug=True, From bd5185adfbf4e78fa7e0ccb2e3b56322880e9a0b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:43:30 -0600 Subject: [PATCH 095/327] update test schema - 1 --- tests/integration/test_rest_routes.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index b05ec7f1..84500319 100644 --- 
a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -16,6 +16,7 @@ from rest_tools.client import RestClient import skydriver.images # noqa: F401 # export +from skydriver.database.schema import DEPRECATED_EWMS_TASK LOGGER = logging.getLogger(__name__) @@ -99,12 +100,8 @@ async def _launch_scan( scan_metadata=None, progress=None, scanner_server_args=resp["scanner_server_args"], # see below - ewms_task=dict( - clusters=[], - tms_args=resp["ewms_task"]["tms_args"], # see below - env_vars=resp["ewms_task"]["env_vars"], # see below - complete=False, - ), + ewms_task=DEPRECATED_EWMS_TASK, + ewms_workflow_id=resp["ewms_workflow_id"], # see below classifiers=post_scan_body["classifiers"], last_updated=resp["last_updated"], # see below priority=0, @@ -112,6 +109,7 @@ async def _launch_scan( ) assert RE_UUID4HEX.fullmatch(resp["scan_id"]) assert RE_UUID4HEX.fullmatch(resp["i3_event_id"]) + assert RE_UUID4HEX.fullmatch(resp["ewms_workflow_id"]) # generated by dummy-ewms assert launch_time < resp["timestamp"] < resp["last_updated"] < time.time() # check args (avoid whitespace headaches...) 
From 427fcdb504a72861f6416bacd538eba9fe9de23a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:44:53 -0600 Subject: [PATCH 096/327] update test schema - 2 --- tests/integration/test_rest_routes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 84500319..503a5381 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -16,7 +16,6 @@ from rest_tools.client import RestClient import skydriver.images # noqa: F401 # export -from skydriver.database.schema import DEPRECATED_EWMS_TASK LOGGER = logging.getLogger(__name__) @@ -100,8 +99,8 @@ async def _launch_scan( scan_metadata=None, progress=None, scanner_server_args=resp["scanner_server_args"], # see below - ewms_task=DEPRECATED_EWMS_TASK, - ewms_workflow_id=resp["ewms_workflow_id"], # see below + ewms_task="use 'ewms_workflow_id'", + ewms_workflow_id="pending-ewms", classifiers=post_scan_body["classifiers"], last_updated=resp["last_updated"], # see below priority=0, @@ -109,7 +108,6 @@ async def _launch_scan( ) assert RE_UUID4HEX.fullmatch(resp["scan_id"]) assert RE_UUID4HEX.fullmatch(resp["i3_event_id"]) - assert RE_UUID4HEX.fullmatch(resp["ewms_workflow_id"]) # generated by dummy-ewms assert launch_time < resp["timestamp"] < resp["last_updated"] < time.time() # check args (avoid whitespace headaches...) 
From b3d16c655713f6ce19cf9132ae39573358b2573d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 17:48:23 -0600 Subject: [PATCH 097/327] flake8 --- tests/integration/dummy_ewms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index a62a1091..75b880aa 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -31,18 +31,18 @@ def dummy_workflows_post(): ) -@app.route(f"/v0/workflows/", methods=["GET"]) +@app.route("/v0/workflows/", methods=["GET"]) def dummy_workflows_get(workflow_id: str): return jsonify(DONT_CALL_IT_A_DB[workflow_id]) -@app.route(f"/v0/workflows//actions/abort", methods=["POST"]) +@app.route("/v0/workflows//actions/abort", methods=["POST"]) def dummy_workflows_abort(workflow_id: str): DONT_CALL_IT_A_DB[workflow_id].update({"deactivated": "abort"}) return jsonify({}) -@app.route(f"/v0/workflows//actions/finished", methods=["POST"]) +@app.route("/v0/workflows//actions/finished", methods=["POST"]) def dummy_workflows_finished(workflow_id: str): DONT_CALL_IT_A_DB[workflow_id].update({"deactivated": "finished"}) return jsonify({}) From 28ac8dab11056e29e5801004386d059866218c2d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 15 Jan 2025 18:06:12 -0600 Subject: [PATCH 098/327] todos --- skydriver/k8s/scan_backlog.py | 2 +- skydriver/rest_handlers.py | 4 ++-- tests/integration/dummy_ewms.py | 8 +++++++- tests/integration/test_rest_routes.py | 12 +++++++++++- 4 files changed, 21 insertions(+), 5 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 3395468d..7cdb5752 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -17,7 +17,7 @@ LOGGER = logging.getLogger(__name__) -async def designate_for_startup( +async def put_on_backlog( scan_id: str, scan_backlog: database.interface.ScanBacklogClient, priority: int, diff --git a/skydriver/rest_handlers.py 
b/skydriver/rest_handlers.py index 74af4f8b..96e77874 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -39,7 +39,7 @@ ) from .database import schema from .ewms import request_stop_on_ewms -from .k8s.scan_backlog import designate_for_startup +from .k8s.scan_backlog import put_on_backlog from .k8s.scanner_instance import SkyScanK8sJobFactory from .utils import get_scan_state @@ -582,7 +582,7 @@ async def _start_scan( ) # place on backlog - await designate_for_startup( + await put_on_backlog( scan_id, scan_backlog, scan_request_obj["priority"], diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index 75b880aa..ff74b8b8 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -1,10 +1,11 @@ """A dummy EWMS server for testing.""" import os +import pprint import uuid from typing import Any -from flask import Flask, jsonify +from flask import Flask, jsonify, request app = Flask(__name__) @@ -15,6 +16,11 @@ def dummy_workflows_post(): # in the real ewms, there's a bunch of db logic, etc. + # IRL, we'd do something with this, but this isn't real life + req_json = request.get_json() + pprint.pprint(req_json) + + # "make" a workflow workflow_id = uuid.uuid4().hex minimal_wf_doc = { "workflow_id": workflow_id, diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 503a5381..e164ae21 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -110,6 +110,13 @@ async def _launch_scan( assert RE_UUID4HEX.fullmatch(resp["i3_event_id"]) assert launch_time < resp["timestamp"] < resp["last_updated"] < time.time() + # query the SkyScanK8sJobs coll + # -> since the scanner-server metadata is no longer stored in the manifest + # TODO + + # query the ScanRequests coll + # TODO + # check args (avoid whitespace headaches...) 
assert resp["scanner_server_args"].split() == scanner_server_args.split() for got_args, exp_args in zip(resp["ewms_task"]["tms_args"], tms_args): @@ -732,11 +739,14 @@ async def _after_scan_start_logic( assert resp["manifest"] == manifest assert resp["result"] == {} + # TODO: at what point do we expect the backlogger to request to ewms? + # TODO: we need to assert what ewms is sent (store in dummy ewms, and query here; or assert the call?) + # # INITIAL UPDATES # event_metadata = await _server_reply_with_event_metadata(rc, scan_id) - manifest = await _clientmanager_reply( + manifest = await _clientmanager_reply( # TODO: remove/replace (and anywhere else) rc, scan_id, clusters[0] if isinstance(clusters, list) else list(clusters.items())[0], From 88e7e0d92ee2a63b5131167c840300374c16d911 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 11:09:57 -0600 Subject: [PATCH 099/327] tests: query the SkyScanK8sJobs coll --- tests/integration/conftest.py | 9 +++- tests/integration/test_rest_routes.py | 71 ++++++++++++++++++++++----- 2 files changed, 67 insertions(+), 13 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d47a5820..8c874c4c 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -8,6 +8,7 @@ import kubernetes.client # type: ignore[import-untyped] import pytest import pytest_asyncio +from motor.motor_asyncio import AsyncIOMotorClient from rest_tools.client import RestClient import skydriver @@ -115,10 +116,17 @@ def test_wait_before_teardown() -> float: return TEST_WAIT_BEFORE_TEARDOWN +@pytest_asyncio.fixture +async def mongo_client() -> AsyncIOMotorClient: + """A fixture to keep number of mongo connections to a minimum (aka 1).""" + return await create_mongodb_client() + + @pytest_asyncio.fixture async def server( monkeypatch: Any, port: int, + mongo_client: AsyncIOMotorClient, mongo_clear: Any, # pylint:disable=unused-argument ) -> AsyncIterator[Callable[[], RestClient]]: 
"""Startup server in this process, yield RestClient func, then clean up.""" @@ -130,7 +138,6 @@ async def server( skydriver.rest_handlers, "WAIT_BEFORE_TEARDOWN", TEST_WAIT_BEFORE_TEARDOWN ) - mongo_client = await create_mongodb_client() k8s_batch_api = Mock() ewms_rc = setup_ewms_client() backlog_task = asyncio.create_task( diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index e164ae21..644262bb 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -13,6 +13,7 @@ import humanfriendly import pytest import requests +from motor.motor_asyncio import AsyncIOMotorClient from rest_tools.client import RestClient import skydriver.images # noqa: F401 # export @@ -68,11 +69,14 @@ async def _launch_scan( - rc: RestClient, post_scan_body: dict, tms_args: list[str] + rc: RestClient, + mongo_client: AsyncIOMotorClient, + post_scan_body: dict, + tms_args: list[str], ) -> dict: # launch scan launch_time = time.time() - resp = await rc.request( + post_resp = await rc.request( "POST", "/scan", {**post_scan_body, "manifest_projection": ["*"]}, @@ -88,35 +92,76 @@ async def _launch_scan( f"--predictive-scanning-threshold 1.0 " # the default ) - assert resp == dict( - scan_id=resp["scan_id"], + assert post_resp == dict( + scan_id=post_resp["scan_id"], # see below is_deleted=False, - timestamp=resp["timestamp"], # see below + timestamp=post_resp["timestamp"], # see below event_i3live_json_dict__hash=None, # field has been deprecated, always 'None' event_i3live_json_dict="use 'i3_event_id'", # field has been deprecated - i3_event_id=resp["i3_event_id"], # see below + i3_event_id=post_resp["i3_event_id"], # see below event_metadata=None, scan_metadata=None, progress=None, - scanner_server_args=resp["scanner_server_args"], # see below + scanner_server_args=post_resp["scanner_server_args"], # see below ewms_task="use 'ewms_workflow_id'", ewms_workflow_id="pending-ewms", 
classifiers=post_scan_body["classifiers"], - last_updated=resp["last_updated"], # see below + last_updated=post_resp["last_updated"], # see below priority=0, # TODO: check more fields in future (hint: ctrl+F this comment) ) - assert RE_UUID4HEX.fullmatch(resp["scan_id"]) - assert RE_UUID4HEX.fullmatch(resp["i3_event_id"]) - assert launch_time < resp["timestamp"] < resp["last_updated"] < time.time() + assert RE_UUID4HEX.fullmatch(post_resp["scan_id"]) + assert RE_UUID4HEX.fullmatch(post_resp["i3_event_id"]) + assert ( + launch_time < post_resp["timestamp"] < post_resp["last_updated"] < time.time() + ) # query the SkyScanK8sJobs coll # -> since the scanner-server metadata is no longer stored in the manifest - # TODO + doc = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( + {"scan_id": post_resp["scan_id"]} + ) + assert doc == dict( + scan_id=post_resp["scan_id"], + rescan_ids=[], + # + docker_tag=post_scan_body["docker_tag"], + # + # skyscan server config + scanner_server_memory_bytes=post_scan_body["scanner_server_memory"], + reco_algo=post_scan_body["reco_algo"], + nsides=post_scan_body["nsides"], + real_or_simulated_event=post_scan_body["real_or_simulated_event"], + predictive_scanning_threshold=post_scan_body["predictive_scanning_threshold"], + # + classifiers=post_scan_body["classifiers"], + # + # cluster (condor) config + request_clusters=post_scan_body["cluster"], + worker_memory_bytes=post_scan_body["worker_memory"], + worker_disk_bytes=post_scan_body["worker_disk"], + max_pixel_reco_time=post_scan_body["max_pixel_reco_time"], + max_worker_runtime=post_scan_body["max_worker_runtime"], + priority=post_scan_body["priority"], + debug_mode=[post_scan_body["debug_mode"]], + # + # misc + skyscan_mq_client_timeout_wait_for_first_message=( + post_scan_body["skyscan_mq_client_timeout_wait_for_first_message"] + if post_scan_body["skyscan_mq_client_timeout_wait_for_first_message"] != -1 + else None + ), + i3_event_id=post_resp["i3_event_id"], + 
rest_address="", + scanner_server_env_from_user=post_scan_body["scanner_server_env"], + ) # query the ScanRequests coll # TODO + assert 0 + +def foo(): # check args (avoid whitespace headaches...) assert resp["scanner_server_args"].split() == scanner_server_args.split() for got_args, exp_args in zip(resp["ewms_task"]["tms_args"], tms_args): @@ -701,12 +746,14 @@ async def test_000( server: Callable[[], RestClient], known_clusters: dict, test_wait_before_teardown: float, + mongo_client: AsyncIOMotorClient, ) -> None: """Test normal scan creation and retrieval.""" rc = server() manifest = await _launch_scan( rc, + mongo_client, { **POST_SCAN_BODY, "docker_tag": docker_tag_input, From 4838a6968248605b6e0a49f462af55a99c0558a7 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 11:21:43 -0600 Subject: [PATCH 100/327] tests: query the SkyScanK8sJobs coll - 2 --- tests/integration/test_rest_routes.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 644262bb..8a77a4cc 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -4,6 +4,7 @@ import copy import logging import os +import pprint import random import re import time @@ -76,11 +77,13 @@ async def _launch_scan( ) -> dict: # launch scan launch_time = time.time() + print(f"now: {launch_time}") post_resp = await rc.request( "POST", "/scan", {**post_scan_body, "manifest_projection": ["*"]}, ) + pprint.pprint(post_resp) scanner_server_args = ( f"python -m skymap_scanner.server " @@ -112,8 +115,14 @@ async def _launch_scan( ) assert RE_UUID4HEX.fullmatch(post_resp["scan_id"]) assert RE_UUID4HEX.fullmatch(post_resp["i3_event_id"]) + # check timestamps + post_launch_ts = time.time() + print(f"now: {post_launch_ts}") assert ( - launch_time < post_resp["timestamp"] < post_resp["last_updated"] < time.time() + launch_time + < post_resp["timestamp"] + < 
post_resp["last_updated"] + < post_launch_ts ) # query the SkyScanK8sJobs coll @@ -121,6 +130,7 @@ async def _launch_scan( doc = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( {"scan_id": post_resp["scan_id"]} ) + pprint.pprint(doc) assert doc == dict( scan_id=post_resp["scan_id"], rescan_ids=[], From ca390a87e42f9ddc41e2185b6add94b1855f5f33 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 11:29:50 -0600 Subject: [PATCH 101/327] tests: query the SkyScanK8sJobs coll - 3 (ts) --- skydriver/database/mongodc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skydriver/database/mongodc.py b/skydriver/database/mongodc.py index 2b1a6cf5..437f9ffa 100644 --- a/skydriver/database/mongodc.py +++ b/skydriver/database/mongodc.py @@ -84,7 +84,9 @@ async def find_one_and_update( and dc.is_dataclass(return_dclass) and "last_updated" in [f.name for f in dc.fields(return_dclass)] ): - update["$set"].update({"last_updated": time.time()}) + now = time.time() + LOGGER.info(f"auto updating 'last_updated' field to {now}") + update["$set"].update({"last_updated": now}) doc = await super().find_one_and_update(filter, update, *args, **kwargs) if not doc: From 854a3d01116320628b4c89768383b6a636642e2d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 11:35:13 -0600 Subject: [PATCH 102/327] tests: query the SkyScanK8sJobs coll - 4 (ts) --- skydriver/database/mongodc.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skydriver/database/mongodc.py b/skydriver/database/mongodc.py index 437f9ffa..8c1e9664 100644 --- a/skydriver/database/mongodc.py +++ b/skydriver/database/mongodc.py @@ -85,7 +85,9 @@ async def find_one_and_update( and "last_updated" in [f.name for f in dc.fields(return_dclass)] ): now = time.time() - LOGGER.info(f"auto updating 'last_updated' field to {now}") + LOGGER.info( + f"auto-updating 'last_updated' field to {now} ({return_dclass.__name__})" + ) update["$set"].update({"last_updated": 
now}) doc = await super().find_one_and_update(filter, update, *args, **kwargs) From fc1677337a650d4570809db4748882c1c82aaa11 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 11:35:57 -0600 Subject: [PATCH 103/327] tests: query the SkyScanK8sJobs coll - 5 (ts) --- skydriver/rest_handlers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 96e77874..4fc00a6e 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -573,7 +573,7 @@ async def _start_scan( classifiers=scan_request_obj["classifiers"], priority=scan_request_obj["priority"], ) - await manifests.put(manifest) + manifest = await manifests.put(manifest) await skyscan_k8s_job_coll.insert_one( # type: ignore[attr-defined] { "scan_id": scan_id, From f64372a37f493cbf0b365b2f041616f38a7c069a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 11:46:27 -0600 Subject: [PATCH 104/327] tests: query the SkyScanK8sJobs coll - 6 (fields) --- tests/integration/test_rest_routes.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 8a77a4cc..45af4662 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -138,29 +138,25 @@ async def _launch_scan( docker_tag=post_scan_body["docker_tag"], # # skyscan server config - scanner_server_memory_bytes=post_scan_body["scanner_server_memory"], + scanner_server_memory_bytes=humanfriendly.parse_size("1024M"), reco_algo=post_scan_body["reco_algo"], nsides=post_scan_body["nsides"], real_or_simulated_event=post_scan_body["real_or_simulated_event"], - predictive_scanning_threshold=post_scan_body["predictive_scanning_threshold"], + predictive_scanning_threshold=1.0, # classifiers=post_scan_body["classifiers"], # # cluster (condor) config request_clusters=post_scan_body["cluster"], - 
worker_memory_bytes=post_scan_body["worker_memory"], - worker_disk_bytes=post_scan_body["worker_disk"], + worker_memory_bytes=humanfriendly.parse_size("8GB"), + worker_disk_bytes=humanfriendly.parse_size("1GB"), max_pixel_reco_time=post_scan_body["max_pixel_reco_time"], max_worker_runtime=post_scan_body["max_worker_runtime"], - priority=post_scan_body["priority"], + priority=0, debug_mode=[post_scan_body["debug_mode"]], # # misc - skyscan_mq_client_timeout_wait_for_first_message=( - post_scan_body["skyscan_mq_client_timeout_wait_for_first_message"] - if post_scan_body["skyscan_mq_client_timeout_wait_for_first_message"] != -1 - else None - ), + skyscan_mq_client_timeout_wait_for_first_message=None, i3_event_id=post_resp["i3_event_id"], rest_address="", scanner_server_env_from_user=post_scan_body["scanner_server_env"], From 81a1099f710015a4705f1a4d46109ed1b7299b92 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 13:54:29 -0600 Subject: [PATCH 105/327] prod-tester: add `random_query.py` --- resources/prod_tester/random_query.py | 124 ++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 resources/prod_tester/random_query.py diff --git a/resources/prod_tester/random_query.py b/resources/prod_tester/random_query.py new file mode 100644 index 00000000..f24873b0 --- /dev/null +++ b/resources/prod_tester/random_query.py @@ -0,0 +1,124 @@ +"""A script to randomly query SkyDriver. + +This will makes sure everything that should be accessible is accessible. 
+""" + +import argparse +import asyncio +import pprint +import random + +import test_runner + + +# Function to split list into chunks +def chunk_list(data, size): + for i in range(0, len(data), size): + yield data[i : i + size] + + +async def main(): + parser = argparse.ArgumentParser( + description="Launch and monitor a scan for an event", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--skydriver-url", + required=True, + help="the url to connect to a SkyDriver server", + ) + args = parser.parse_args() + + rc = test_runner.get_rest_client(args.skydriver_url) + + # 1: get all the scan_ids (not too large) + print("POST @ /scans/find ...") + resp = await rc.request( + "POST", + "/scans/find", + { + "filter": {}, + "include_deleted": True, + "manifest_projection": ["scan_id"], + }, + ) + scan_ids = [m["scan_id"] for m in resp["manifests"]] + print(f"{len(scan_ids)} scans") + scan_ids = random.sample(scan_ids, max(1, len(scan_ids) // 20)) # 5% sample + print(f"will run queries for {len(scan_ids)} scans (5% sample)") + + print("\n---\n") + + # 2: re-find + total = 0 + versions = {"v1.0": [], "v1.1": []} + for chunk_scan_ids in chunk_list(scan_ids, 10): + print("POST @ /scans/find ...") + resp = await rc.request( + "POST", + "/scans/find", + { + "filter": {"scan_id": {"$in": chunk_scan_ids}}, + "include_deleted": True, + }, + ) + pprint.pprint(resp) + print(f"found {len(resp['manifests'])}/{len(chunk_scan_ids)} scans (subset)") + total += len(resp["manifests"]) + for m in resp["manifests"]: + if m["i3_event_id"]: + versions["v1.1"] += m["scan_id"] + elif isinstance(m["event_i3live_json_dict"], dict): + versions["v1.0"] += m["scan_id"] + else: + versions["other"] += m["scan_id"] + pprint.pprint(versions) + print(f"confirmed {total} scans") + assert total == len(scan_ids) + assert all(v for v in versions.values()) # check that all versions are represented + + print("\n---\n") + + # 3. 
quickly query the backlog + print("GET @ /scans/backlog ...") + resp = await rc.request("GET", "/scans/backlog") + pprint.pprint(resp) + + print("\n---\n") + + # 4. query each scan + for i, scan_id in enumerate(scan_ids): + print(f"various queries for {scan_id} ({i+1}/{len(scan_ids)}) ...") + # + print(f"GET @ /scan/{scan_id} ...") + resp = await rc.request("GET", f"/scan/{scan_id}", {"include_deleted": True}) + pprint.pprint(resp) + # + print(f"GET @ /scan/{scan_id}/manifest ...") + resp = await rc.request( + "GET", f"/scan/{scan_id}/manifest", {"include_deleted": True} + ) + pprint.pprint(resp) + # + print(f"GET @ /scan/{scan_id}/i3-event ...") + resp = await rc.request( + "GET", f"/scan/{scan_id}/i3-event", {"include_deleted": True} + ) + pprint.pprint(resp) + # + print(f"GET @ /scan/{scan_id}/result ...") + resp = await rc.request( + "GET", f"/scan/{scan_id}/result", {"include_deleted": True} + ) + pprint.pprint(resp) + # + print("\n---\n") + + print("\n---\n") + pprint.pprint(versions) + + +# Run the asyncio event loop +if __name__ == "__main__": + asyncio.run(main()) + print("Done.") From 4f286ace1fb3d67a71881a978535407b96f6203e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 14:01:14 -0600 Subject: [PATCH 106/327] prod-tester: add `random_query.py` - 2 --- resources/prod_tester/random_query.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/prod_tester/random_query.py b/resources/prod_tester/random_query.py index f24873b0..8d8b302f 100644 --- a/resources/prod_tester/random_query.py +++ b/resources/prod_tester/random_query.py @@ -67,11 +67,11 @@ async def main(): total += len(resp["manifests"]) for m in resp["manifests"]: if m["i3_event_id"]: - versions["v1.1"] += m["scan_id"] + versions["v1.1"].append(m["scan_id"]) elif isinstance(m["event_i3live_json_dict"], dict): - versions["v1.0"] += m["scan_id"] + versions["v1.0"].append(m["scan_id"]) else: - versions["other"] += m["scan_id"] + 
versions["other"].append(m["scan_id"]) pprint.pprint(versions) print(f"confirmed {total} scans") assert total == len(scan_ids) From 6e3bd81fce0bd6e2382a7afedf749195357237f9 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 16 Jan 2025 14:16:57 -0600 Subject: [PATCH 107/327] prod-tester: add `random_query.py` - 3 --- resources/prod_tester/random_query.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/prod_tester/random_query.py b/resources/prod_tester/random_query.py index 8d8b302f..3cb30b35 100644 --- a/resources/prod_tester/random_query.py +++ b/resources/prod_tester/random_query.py @@ -67,9 +67,9 @@ async def main(): total += len(resp["manifests"]) for m in resp["manifests"]: if m["i3_event_id"]: - versions["v1.1"].append(m["scan_id"]) + versions["v1.2"].append(m["scan_id"]) elif isinstance(m["event_i3live_json_dict"], dict): - versions["v1.0"].append(m["scan_id"]) + versions["<=v1.1"].append(m["scan_id"]) else: versions["other"].append(m["scan_id"]) pprint.pprint(versions) From 27dfd1d8251d4b0c8d1450c2f5e1dcbb693d7032 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 10:44:36 -0600 Subject: [PATCH 108/327] tests: query the SkyScanK8sJobs coll - 7 (fields) --- skydriver/config.py | 1 + skydriver/rest_handlers.py | 2 +- tests/integration/test_rest_routes.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 0e3f74d1..1fe094b9 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -19,6 +19,7 @@ ) DEFAULT_WORKER_MEMORY_BYTES: int = humanfriendly.parse_size("8GB") DEFAULT_WORKER_DISK_BYTES: int = humanfriendly.parse_size("1GB") +DEFAULT_MAX_WORKER_RUNTIME = 4 * 60 * 60 K8S_CONTAINER_MEMORY_DEFAULT_BYTES: int = humanfriendly.parse_size("64M") K8S_CONTAINER_MEMORY_CLUSTER_STOPPER_BYTES: int = humanfriendly.parse_size("256M") diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 4fc00a6e..28b5135d 100644 --- 
a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -415,7 +415,7 @@ async def post(self) -> None: arghand.add_argument( "max_worker_runtime", type=int, - default=4 * 60 * 60, + default=DEFAULT_MAX_WORKER_RUNTIME, ) arghand.add_argument( # TODO - remove when TMS is handling workforce-scaling diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 45af4662..edd32029 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -151,7 +151,7 @@ async def _launch_scan( worker_memory_bytes=humanfriendly.parse_size("8GB"), worker_disk_bytes=humanfriendly.parse_size("1GB"), max_pixel_reco_time=post_scan_body["max_pixel_reco_time"], - max_worker_runtime=post_scan_body["max_worker_runtime"], + max_worker_runtime=4 * 60 * 60, priority=0, debug_mode=[post_scan_body["debug_mode"]], # From 4d622f499ae91ce813787199e2371c6219069721 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 10:48:07 -0600 Subject: [PATCH 109/327] tests: query the ScanRequests coll --- tests/integration/test_rest_routes.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index edd32029..41671091 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -125,9 +125,8 @@ async def _launch_scan( < post_launch_ts ) - # query the SkyScanK8sJobs coll - # -> since the scanner-server metadata is no longer stored in the manifest - doc = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( + # query the ScanRequests coll + doc = await mongo_client["SkyDriver_DB"]["ScanRequests"].find_one( {"scan_id": post_resp["scan_id"]} ) pprint.pprint(doc) @@ -162,8 +161,14 @@ async def _launch_scan( scanner_server_env_from_user=post_scan_body["scanner_server_env"], ) - # query the ScanRequests coll + # query the SkyScanK8sJobs coll + # -> since the scanner-server 
metadata is no longer stored in the manifest + doc = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( + {"scan_id": post_resp["scan_id"]} + ) + pprint.pprint(doc) # TODO + assert 0 From 4224df4b31857a8395f691ac371591fecd0cae6f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 10:48:42 -0600 Subject: [PATCH 110/327] fix import --- skydriver/rest_handlers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 28b5135d..8e33dc27 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -30,6 +30,7 @@ from . import database, ewms, images, k8s, utils from .config import ( DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, + DEFAULT_MAX_WORKER_RUNTIME, DEFAULT_WORKER_DISK_BYTES, DEFAULT_WORKER_MEMORY_BYTES, DebugMode, From a2eaf8bd74b56068850a1711a3155199be444be6 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 17 Jan 2025 16:52:41 +0000 Subject: [PATCH 111/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 9e865851..6a552e19 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.0 -botocore==1.36.0 +boto3==1.36.1 +botocore==1.36.1 cachetools==5.5.0 certifi==2024.12.14 cffi==1.17.1 @@ -37,7 +37,7 @@ requests==2.32.3 requests-futures==1.0.2 requests-oauthlib==2.0.0 rsa==4.9 -s3transfer==0.11.0 +s3transfer==0.11.1 six==1.17.0 tornado==6.4.2 typeguard==4.4.1 @@ -58,15 +58,15 @@ pipdeptree==2.24.0 setuptools==65.5.1 skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.0] -│ ├── botocore [required: >=1.36.0,<1.37.0, installed: 1.36.0] +├── boto3 
[required: Any, installed: 1.36.1] +│ ├── botocore [required: >=1.36.1,<1.37.0, installed: 1.36.1] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] -│ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.0] -│ └── botocore [required: >=1.33.2,<2.0a.0, installed: 1.36.0] +│ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.1] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.1] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From f9a0b96a28701985e2cd5dc8b796ea7fc1817cea Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 10:58:48 -0600 Subject: [PATCH 112/327] tests: query the ScanRequests coll - 2 --- tests/integration/test_rest_routes.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 41671091..2750247f 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -127,26 +127,26 @@ async def _launch_scan( # query the ScanRequests coll doc = await mongo_client["SkyDriver_DB"]["ScanRequests"].find_one( - {"scan_id": post_resp["scan_id"]} + {"scan_id": post_resp["scan_id"]}, {"_id": 0} ) pprint.pprint(doc) assert doc == dict( scan_id=post_resp["scan_id"], rescan_ids=[], # - docker_tag=post_scan_body["docker_tag"], + docker_tag=os.environ["LATEST_TAG"], # # skyscan server config scanner_server_memory_bytes=humanfriendly.parse_size("1024M"), reco_algo=post_scan_body["reco_algo"], - nsides=post_scan_body["nsides"], + nsides={str(k): v for k, v in post_scan_body["nsides"].items()}, 
real_or_simulated_event=post_scan_body["real_or_simulated_event"], predictive_scanning_threshold=1.0, # classifiers=post_scan_body["classifiers"], # # cluster (condor) config - request_clusters=post_scan_body["cluster"], + request_clusters=list(post_scan_body["cluster"].items()), worker_memory_bytes=humanfriendly.parse_size("8GB"), worker_disk_bytes=humanfriendly.parse_size("1GB"), max_pixel_reco_time=post_scan_body["max_pixel_reco_time"], @@ -157,7 +157,7 @@ async def _launch_scan( # misc skyscan_mq_client_timeout_wait_for_first_message=None, i3_event_id=post_resp["i3_event_id"], - rest_address="", + rest_address="http://localhost:41161", scanner_server_env_from_user=post_scan_body["scanner_server_env"], ) From fa6fddf5036842750d76143dd40f418cafd3c571 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:04:28 -0600 Subject: [PATCH 113/327] tests: query the ScanRequests coll - 3 --- tests/integration/test_rest_routes.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 2750247f..5714d559 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -157,9 +157,10 @@ async def _launch_scan( # misc skyscan_mq_client_timeout_wait_for_first_message=None, i3_event_id=post_resp["i3_event_id"], - rest_address="http://localhost:41161", + rest_address=doc["rest_address"], # see below scanner_server_env_from_user=post_scan_body["scanner_server_env"], ) + assert re.fullmatch(f"{re.escape('http://localhost:')}\d+", doc["rest_address"]) # query the SkyScanK8sJobs coll # -> since the scanner-server metadata is no longer stored in the manifest From 0b9ccbad495b1570ba7241815fe205482a11d3b9 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:04:48 -0600 Subject: [PATCH 114/327] tests: query the ScanRequests coll - 4 --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 5714d559..12d10e3c 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -160,7 +160,7 @@ async def _launch_scan( rest_address=doc["rest_address"], # see below scanner_server_env_from_user=post_scan_body["scanner_server_env"], ) - assert re.fullmatch(f"{re.escape('http://localhost:')}\d+", doc["rest_address"]) + assert re.fullmatch(rf"{re.escape('http://localhost:')}\d+", doc["rest_address"]) # query the SkyScanK8sJobs coll # -> since the scanner-server metadata is no longer stored in the manifest From 206b6dd84e969c88f407ca49faddaa5f98a41e53 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:09:51 -0600 Subject: [PATCH 115/327] tests: query the ScanRequests coll - 5 --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 12d10e3c..60933bfd 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -146,7 +146,7 @@ async def _launch_scan( classifiers=post_scan_body["classifiers"], # # cluster (condor) config - request_clusters=list(post_scan_body["cluster"].items()), + request_clusters=list([k, v] for k, v in post_scan_body["cluster"].items()), worker_memory_bytes=humanfriendly.parse_size("8GB"), worker_disk_bytes=humanfriendly.parse_size("1GB"), max_pixel_reco_time=post_scan_body["max_pixel_reco_time"], From 9453aca41c264a234dfce8e4cc1135910c8b908d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:29:04 -0600 Subject: [PATCH 116/327] tests: query the SkyScanK8sJobs coll (fr) - 1 --- .github/workflows/wipac-cicd.yml | 2 +- tests/integration/test_rest_routes.py | 190 ++++++++++++++++++++++++-- 2 files changed, 183 insertions(+), 9 deletions(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 
aa95ddb3..8b26a6c1 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -3,7 +3,7 @@ name: wipac ci/cd on: [ push ] env: - THIS_IMAGE_WITH_TAG: 'ghcr.io/wipacrepo/skydriver:latest' + THIS_IMAGE_WITH_TAG: 'ghcr.io/wipacrepo/skydriver:vX.Y.Z' EWMS_PILOT_TASK_TIMEOUT: 999 SCAN_BACKLOG_RUNNER_SHORT_DELAY: 1 SCAN_BACKLOG_RUNNER_DELAY: 1 diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 60933bfd..969aa7f9 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -126,11 +126,11 @@ async def _launch_scan( ) # query the ScanRequests coll - doc = await mongo_client["SkyDriver_DB"]["ScanRequests"].find_one( + doc_sr = await mongo_client["SkyDriver_DB"]["ScanRequests"].find_one( {"scan_id": post_resp["scan_id"]}, {"_id": 0} ) - pprint.pprint(doc) - assert doc == dict( + pprint.pprint(doc_sr) + assert doc_sr == dict( scan_id=post_resp["scan_id"], rescan_ids=[], # @@ -157,17 +157,191 @@ async def _launch_scan( # misc skyscan_mq_client_timeout_wait_for_first_message=None, i3_event_id=post_resp["i3_event_id"], - rest_address=doc["rest_address"], # see below + rest_address=doc_sr["rest_address"], # see below scanner_server_env_from_user=post_scan_body["scanner_server_env"], ) - assert re.fullmatch(rf"{re.escape('http://localhost:')}\d+", doc["rest_address"]) + assert re.fullmatch(rf"{re.escape('http://localhost:')}\d+", doc_sr["rest_address"]) # query the SkyScanK8sJobs coll # -> since the scanner-server metadata is no longer stored in the manifest - doc = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( - {"scan_id": post_resp["scan_id"]} + doc_k8s = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( + {"scan_id": post_resp["scan_id"]}, {"_id": 0} ) - pprint.pprint(doc) + pprint.pprint(doc_k8s) + assert doc_k8s == { + "scan_id": post_resp["scan_id"], + "skyscan_k8s_job_dict": { + "apiVersion": "batch/v1", + "kind": "Job", + 
"metadata": { + "annotations": {"argocd.argoproj.io/sync-options": "Prune=false"}, + "labels": {"app.kubernetes.io/instance": None}, + "name": f"skyscan-{post_resp['scan_id']}", + "namespace": None, + }, + "spec": { + "activeDeadlineSeconds": 86400, + "backoffLimit": 0, + "template": { + "metadata": {"labels": {"app": "scanner-instance"}}, + "spec": { + "containers": [ + { + "args": scanner_server_args.split(), + "command": [], + "env": [ + { + "name": "SKYSCAN_EWMS_JSON", + "value": "/common-space/ewms.json", + }, + { + "name": "SKYSCAN_SKYDRIVER_ADDRESS", + "value": doc_sr["rest_address"], + }, + { + "name": "SKYSCAN_SKYDRIVER_SCAN_ID", + "value": post_resp["scan_id"], + }, + { + "name": "SKYSCAN_EWMS_PILOT_LOG", + "value": "WARNING", + }, + { + "name": "SKYSCAN_MQ_CLIENT_LOG", + "value": "WARNING", + }, + {"name": "SKYSCAN_BROKER_AUTH", "value": ""}, + {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, + ] + + [ # add those from 'post_scan_body' + {"name": k, "value": v} + for k, v in post_scan_body[ + "scanner_server_env" + ].items() + ], + "image": f"icecube/skymap_scanner:{os.environ['LATEST_TAG']}", + "name": f'skyscan-server-{post_resp["scan_id"]}', + "resources": { + "limits": {"cpu": "1", "memory": "1024000000"}, + "requests": { + "cpu": "1", + "ephemeral-storage": "1M", + "memory": "1024000000", + }, + }, + "volumeMounts": [ + { + "mountPath": "/common-space", + "name": "common-space-volume", + } + ], + }, + { + "args": [ + "/common-space/startup.json", + "--wait-indefinitely", + ], + "command": ["python", "-m", "s3_sidecar.post"], + "env": [ + {"name": "S3_URL", "value": os.environ["S3_URL"]}, + { + "name": "S3_ACCESS_KEY_ID", + "valueFrom": { + "secretKeyRef": { + "key": os.environ["S3_ACCESS_KEY_ID"], + "name": None, + } + }, + }, + { + "name": "S3_SECRET_KEY", + "valueFrom": { + "secretKeyRef": { + "key": os.environ["S3_SECRET_KEY"], + "name": None, + } + }, + }, + { + "name": "S3_BUCKET", + "value": os.environ["S3_BUCKET"], + }, + { + "name": 
"S3_OBJECT_KEY", + "value": f"{post_resp['scan_id']}-s3-object", + }, + ], + "image": os.environ["THIS_IMAGE_WITH_TAG"], + "name": f"sidecar-s3-{post_resp['scan_id']}", + "resources": { + "limits": {"cpu": "0.25", "memory": "256Mi"}, + "requests": { + "cpu": "0.25", + "ephemeral-storage": "1M", + "memory": "256Mi", + }, + }, + "restartPolicy": "OnFailure", + "volumeMounts": [ + { + "mountPath": "/common-space", + "name": "common-space-volume", + } + ], + }, + ], + "initContainers": [ + { + "args": [ + post_resp["scan_id"], + "--json-out", + "/common-space/ewms.json", + ], + "command": ["python", "-m", "ewms_init_container"], + "env": [ + { + "name": "SKYSCAN_SKYDRIVER_ADDRESS", + "value": doc_sr["rest_address"], + }, + {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, + { + "name": "EWMS_ADDRESS", + "value": os.environ["EWMS_ADDRESS"], + }, + { + "name": "EWMS_TOKEN_URL", + "value": os.environ["EWMS_TOKEN_URL"], + }, + { + "name": "EWMS_CLIENT_ID", + "value": os.environ["EWMS_CLIENT_ID"], + }, + { + "name": "EWMS_CLIENT_SECRET", + "value": os.environ["EWMS_CLIENT_SECRET"], + }, + { + "name": "QUEUE_ALIAS_TOCLIENT", + "value": "to-client-queue", + }, + { + "name": "QUEUE_ALIAS_FROMCLIENT", + "value": "from-client-queue", + }, + ], + "image": os.environ["THIS_IMAGE_WITH_TAG"], + "name": f"init-ewms-{post_resp['scan_id']}", + } + ], + "restartPolicy": "Never", + "serviceAccountName": None, + "volumes": [{"emptyDir": {}, "name": "common-space-volume"}], + }, + }, + "ttlSecondsAfterFinished": 600, + }, + }, + } # TODO assert 0 From 3a4578a44735399ee3fccafa28df1e481ef981df Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:34:30 -0600 Subject: [PATCH 117/327] tests: query the SkyScanK8sJobs coll (fr) - 2 --- .github/workflows/wipac-cicd.yml | 1 + tests/integration/test_rest_routes.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 8b26a6c1..52b883dc 
100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -20,6 +20,7 @@ env: S3_SECRET_KEY: 8dea68a1 S3_SECRET_KEY__K8S_SECRET_KEY: cdf7c60b S3_BUCKET: 72017610 + K8S_SECRET_NAME: super-secrets MIN_SKYMAP_SCANNER_TAG: "v3.21.2" # TODO: remove once skyscan v4 is out (that's the real min) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 969aa7f9..82612936 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -248,8 +248,10 @@ async def _launch_scan( "name": "S3_ACCESS_KEY_ID", "valueFrom": { "secretKeyRef": { - "key": os.environ["S3_ACCESS_KEY_ID"], - "name": None, + "key": os.environ[ + "S3_ACCESS_KEY_ID__K8S_SECRET_KEY" + ], + "name": os.environ["K8S_SECRET_NAME"], } }, }, @@ -257,8 +259,10 @@ async def _launch_scan( "name": "S3_SECRET_KEY", "valueFrom": { "secretKeyRef": { - "key": os.environ["S3_SECRET_KEY"], - "name": None, + "key": os.environ[ + "S3_SECRET_KEY__K8S_SECRET_KEY" + ], + "name": os.environ["K8S_SECRET_NAME"], } }, }, From 6049ef001998a59a871e4a48b1796d5377cbf10d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:41:27 -0600 Subject: [PATCH 118/327] tests: query the SkyScanK8sJobs coll (fr) - 3 --- .github/workflows/wipac-cicd.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 52b883dc..e7125f0a 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -214,6 +214,7 @@ jobs: docker run --network="host" --rm -i --name test \ --env LATEST_TAG=$LATEST_TAG \ --env THIS_IMAGE_WITH_TAG=$THIS_IMAGE_WITH_TAG \ + --env K8S_SECRET_NAME=$K8S_SECRET_NAME \ $(env | grep '^SKYSCAN_' | awk '$0="--env "$0') \ $(env | grep '^EWMS_' | awk '$0="--env "$0') \ $(env | grep '^S3_' | awk '$0="--env "$0') \ From ad1cb8af74ccd2a85da500d668e45894c4c8a9f6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:47:11 -0600 
Subject: [PATCH 119/327] carry on... --- tests/integration/test_rest_routes.py | 125 -------------------------- 1 file changed, 125 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 82612936..2285103a 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -346,131 +346,6 @@ async def _launch_scan( }, }, } - # TODO - - assert 0 - - -def foo(): - # check args (avoid whitespace headaches...) - assert resp["scanner_server_args"].split() == scanner_server_args.split() - for got_args, exp_args in zip(resp["ewms_task"]["tms_args"], tms_args): - print(got_args, exp_args) - for got, exp in zip(got_args.split(), exp_args.split()): - print(got, exp) - if exp == CLUSTER_ID_PLACEHOLDER: - assert RE_UUID4HEX.fullmatch(got) - else: - assert got == exp - assert len(got_args.split()) == len(exp_args.split()) - assert len(resp["ewms_task"]["tms_args"]) == len(tms_args) - - # check env vars - print(resp["ewms_task"]["env_vars"]) - assert set(resp["ewms_task"]["env_vars"].keys()) == { - "scanner_server", - "tms_starters", - } - - # check env vars, more closely - # "scanner_server" - assert set( # these have `value`s - e["name"] - for e in resp["ewms_task"]["env_vars"]["scanner_server"] - if e["value"] is not None and e["value_from"] is None - ) == { - "SKYSCAN_BROKER_ADDRESS", - "SKYSCAN_BROKER_AUTH", - "SKYSCAN_SKYDRIVER_ADDRESS", - "SKYSCAN_SKYDRIVER_AUTH", - "SKYSCAN_SKYDRIVER_SCAN_ID", - "SKYSCAN_EWMS_PILOT_LOG", - "SKYSCAN_MQ_CLIENT_LOG", - *post_scan_body["scanner_server_env"].keys(), # type: ignore[attr-defined] - } - assert ( - set( # these have `value_from`s - e - for e in resp["ewms_task"]["env_vars"]["scanner_server"] - if e["value_from"] is not None and e["value"] is None - ) - == set() - ) - # "tms_starters" - for env_dicts in resp["ewms_task"]["env_vars"]["tms_starters"]: - assert set( # these have `value`s - e["name"] - for e in env_dicts - if e["value"] is not None 
and e["value_from"] is None - ) == { - "EWMS_PILOT_TASK_TIMEOUT", # set by CI runner - "EWMS_TMS_S3_BUCKET", - "EWMS_TMS_S3_URL", - "SKYSCAN_BROKER_ADDRESS", - "SKYSCAN_BROKER_AUTH", - "SKYSCAN_SKYDRIVER_ADDRESS", - "SKYSCAN_SKYDRIVER_AUTH", - "SKYSCAN_SKYDRIVER_SCAN_ID", - "SKYSCAN_EWMS_PILOT_LOG", - "SKYSCAN_MQ_CLIENT_LOG", - "WORKER_K8S_LOCAL_APPLICATION_NAME", - "EWMS_PILOT_DUMP_TASK_OUTPUT", - } - assert ( - next( - x["value"] - for x in env_dicts - if x["name"] == "EWMS_PILOT_DUMP_TASK_OUTPUT" - ) - == "True" - ) - assert set( # these have `value_from`s - e["name"] - for e in env_dicts - if e["value_from"] is not None and e["value"] is None - ) == { - "CONDOR_TOKEN", - "EWMS_TMS_S3_ACCESS_KEY_ID", - "EWMS_TMS_S3_SECRET_KEY", - } or set( # these have `value_from`s - e["name"] - for e in env_dicts - if e["value_from"] is not None and e["value"] is None - ) == { - "WORKER_K8S_CONFIG_FILE_BASE64", - "EWMS_TMS_S3_ACCESS_KEY_ID", - "EWMS_TMS_S3_SECRET_KEY", - } - - # check env vars, even MORE closely - for env_dicts in [resp["ewms_task"]["env_vars"]["scanner_server"]] + resp[ - "ewms_task" - ]["env_vars"]["tms_starters"]: - assert ( - next(x["value"] for x in env_dicts if x["name"] == "SKYSCAN_BROKER_ADDRESS") - == "localhost" - ) - assert re.match( - r"http://localhost:[0-9]+", - next( - x["value"] - for x in env_dicts - if x["name"] == "SKYSCAN_SKYDRIVER_ADDRESS" - ), - ) - assert ( - len( - next( - x["value"] - for x in env_dicts - if x["name"] == "SKYSCAN_SKYDRIVER_SCAN_ID" - ) - ) - == 32 - ) - - # get scan_id - assert resp["scan_id"] return resp # type: ignore[no-any-return] From 77d13b8e77e5b991c8c63322ce93a140b7b1f2ee Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:48:39 -0600 Subject: [PATCH 120/327] carry on... again... 
--- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 2285103a..08db80a6 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -347,7 +347,7 @@ async def _launch_scan( }, } - return resp # type: ignore[no-any-return] + return post_resp # type: ignore[no-any-return] async def _do_patch( From 98d676f4f1511985bb7f692b12609a0d9307f733 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:54:52 -0600 Subject: [PATCH 121/327] flake8 --- tests/integration/test_rest_routes.py | 37 +++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 08db80a6..6d250126 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -125,7 +125,26 @@ async def _launch_scan( < post_launch_ts ) - # query the ScanRequests coll + # check database + rest_address = await _assert_db_scanrequests_coll( + mongo_client, post_scan_body, post_resp + ) + await _assert_db_skyscank8sjobs_coll( + mongo_client, post_scan_body, post_resp, scanner_server_args, rest_address + ) + + return post_resp # type: ignore[no-any-return] + + +async def _assert_db_scanrequests_coll( + mongo_client: AsyncIOMotorClient, + post_scan_body: dict, + post_resp: dict, +) -> str: + """Query the ScanRequests coll. 
+ + Return the REST address + """ doc_sr = await mongo_client["SkyDriver_DB"]["ScanRequests"].find_one( {"scan_id": post_resp["scan_id"]}, {"_id": 0} ) @@ -162,6 +181,16 @@ async def _launch_scan( ) assert re.fullmatch(rf"{re.escape('http://localhost:')}\d+", doc_sr["rest_address"]) + return doc_sr["rest_address"] + + +async def _assert_db_skyscank8sjobs_coll( + mongo_client: AsyncIOMotorClient, + post_scan_body: dict, + post_resp: dict, + scanner_server_args: str, + rest_address: str, +): # query the SkyScanK8sJobs coll # -> since the scanner-server metadata is no longer stored in the manifest doc_k8s = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( @@ -196,7 +225,7 @@ async def _launch_scan( }, { "name": "SKYSCAN_SKYDRIVER_ADDRESS", - "value": doc_sr["rest_address"], + "value": rest_address, }, { "name": "SKYSCAN_SKYDRIVER_SCAN_ID", @@ -305,7 +334,7 @@ async def _launch_scan( "env": [ { "name": "SKYSCAN_SKYDRIVER_ADDRESS", - "value": doc_sr["rest_address"], + "value": rest_address, }, {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, { @@ -347,8 +376,6 @@ async def _launch_scan( }, } - return post_resp # type: ignore[no-any-return] - async def _do_patch( rc: RestClient, From f9392a0fcc0deb64ae23619df1d7eef2a517a8ca Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 11:56:03 -0600 Subject: [PATCH 122/327] remove htcondor dep; bump to 3.13 --- setup.cfg | 82 +++++++++++++++++++++++++++---------------------------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/setup.cfg b/setup.cfg index d4e9c0d5..8f42d6fd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,17 +1,17 @@ [wipac:cicd_setup_builder] python_min = 3.10 -python_max = 3.12 +python_max = 3.13 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + 
WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,52 +26,50 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - htcondor - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools + aiocache + boto3 + dacite + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools python_requires = >=3.10, <3.13 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio - flask + pytest + pytest-asyncio + pytest-mock + nest-asyncio + flask mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples - + test + tests + doc + docs + resource + resources + example + examples From 168a9d7bf2494350b6642a5046cb12d4070bd66a Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 17 Jan 2025 17:56:32 +0000 Subject: [PATCH 123/327] update setup.cfg --- setup.cfg | 81 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index 8f42d6fd..6174fed2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.10 python_max = 3.13 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # 
generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,50 +26,51 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools -python_requires = >=3.10, <3.13 + aiocache + boto3 + dacite + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools +python_requires = >=3.10, <3.14 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio - flask + pytest + pytest-asyncio + pytest-mock + nest-asyncio + flask mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples + test + tests + doc + docs + resource + resources + example + examples + From ca9f977b72b8fd3aa814e66f62a7e0f3cfb60eed Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 17 Jan 2025 18:01:28 +0000 Subject: [PATCH 124/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 2 -- 1 file changed, 2 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 6a552e19..ab319afb 100644 --- a/dependencies-docker-skydriver.log +++ 
b/dependencies-docker-skydriver.log @@ -18,7 +18,6 @@ dacite==1.8.1 dnspython==2.7.0 durationpy==0.9 google-auth==2.37.0 -htcondor==24.3.0 humanfriendly==10.0 idna==3.10 jmespath==1.0.1 @@ -72,7 +71,6 @@ skydriver-s3-sidecar-ewms-init-container │ │ └── six [required: >=1.5, installed: 1.17.0] │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] ├── dacite [required: Any, installed: 1.8.1] -├── htcondor [required: Any, installed: 24.3.0] ├── humanfriendly [required: Any, installed: 10.0] ├── kubernetes [required: Any, installed: 31.0.0] │ ├── certifi [required: >=14.05.14, installed: 2024.12.14] From 452fd749d967a17476af44eebb0d458a08db95a5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:15:13 -0600 Subject: [PATCH 125/327] tests: mid-scan updates --- tests/integration/test_rest_routes.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 6d250126..8ce13903 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -427,16 +427,8 @@ async def _do_patch( else resp["progress"] # not checking ), scanner_server_args=resp["scanner_server_args"], # not checking - ewms_task=dict( - tms_args=resp["ewms_task"]["tms_args"], # not checking - env_vars=resp["ewms_task"]["env_vars"], # not checking - complete=False, - clusters=( - previous_clusters + [cluster] # type: ignore[operator] # see assert ^^^^ - if cluster - else resp["ewms_task"]["clusters"] # not checking - ), - ), + ewms_task="use 'ewms_workflow_id'", + ewms_workflow_id="pending-ewms", classifiers=resp["classifiers"], # not checking last_updated=resp["last_updated"], # see below priority=0, From 7dd1894b77186e0dbcabb5cf95a9d0288fadf340 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:24:43 -0600 Subject: [PATCH 126/327] tests: remove clientmanager --- tests/integration/test_rest_routes.py | 83 ++++----------------------- 
1 file changed, 12 insertions(+), 71 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 8ce13903..a82915e6 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -5,10 +5,8 @@ import logging import os import pprint -import random import re import time -import uuid from typing import Any, Callable import humanfriendly @@ -21,16 +19,13 @@ LOGGER = logging.getLogger(__name__) - -# pylint: disable=redefined-outer-name - - skydriver.config.config_logging() StrDict = dict[str, Any] ######################################################################################## - +# CONSTANTS +######################################################################################## RE_UUID4HEX = re.compile(r"[0-9a-f]{12}4[0-9a-f]{3}[89ab][0-9a-f]{15}") @@ -66,6 +61,8 @@ ] +######################################################################################## +# UTILS ######################################################################################## @@ -377,14 +374,12 @@ async def _assert_db_skyscank8sjobs_coll( } -async def _do_patch( +async def _patch_manifest( rc: RestClient, scan_id: str, progress: StrDict | None = None, event_metadata: StrDict | None = None, scan_metadata: StrDict | None = None, - cluster: StrDict | None = None, - previous_clusters: list[StrDict] | None = None, ) -> StrDict: # do PATCH @ /scan/{scan_id}/manifest, assert response body = {} @@ -394,9 +389,6 @@ async def _do_patch( body["event_metadata"] = event_metadata if scan_metadata: body["scan_metadata"] = scan_metadata - if cluster: - body["cluster"] = cluster - assert isinstance(previous_clusters, list) # gotta include this one too assert body now = time.time() @@ -468,9 +460,9 @@ async def _patch_progress_and_scan_metadata( ) # update progress (update `scan_metadata` sometimes--not as important) if i % 2: # odd - manifest = await _do_patch(rc, scan_id, progress=progress) + manifest = await 
_patch_manifest(rc, scan_id, progress=progress) else: # even - manifest = await _do_patch( + manifest = await _patch_manifest( rc, scan_id, progress=progress, @@ -492,7 +484,7 @@ async def _server_reply_with_event_metadata(rc: RestClient, scan_id: str) -> Str is_real_event=IS_REAL_EVENT, ) - await _do_patch(rc, scan_id, event_metadata=event_metadata) + await _patch_manifest(rc, scan_id, event_metadata=event_metadata) # query by run+event id resp = await rc.request( @@ -537,34 +529,6 @@ async def _server_reply_with_event_metadata(rc: RestClient, scan_id: str) -> Str return event_metadata -async def _clientmanager_reply( - rc: RestClient, - scan_id: str, - cluster_name__n_workers: tuple[str, int], - previous_clusters: list[StrDict], - known_clusters: dict, -) -> StrDict: - # reply as the clientmanager with a new cluster - cluster = dict( - orchestrator=known_clusters[cluster_name__n_workers[0]]["orchestrator"], - location=known_clusters[cluster_name__n_workers[0]]["location"], - cluster_id=f"cluster-{random.randint(1, 10000)}", - n_workers=cluster_name__n_workers[1], - starter_info={}, - statuses={}, - top_task_errors={}, - uuid=str(uuid.uuid4().hex), - ) - - manifest = await _do_patch( - rc, - scan_id, - cluster=cluster, - previous_clusters=previous_clusters, - ) - return manifest - - async def _send_result( rc: RestClient, scan_id: str, @@ -797,6 +761,8 @@ def get_tms_args( ######################################################################################## +# TESTS +######################################################################################## @pytest.mark.parametrize( @@ -877,13 +843,6 @@ async def _after_scan_start_logic( # INITIAL UPDATES # event_metadata = await _server_reply_with_event_metadata(rc, scan_id) - manifest = await _clientmanager_reply( # TODO: remove/replace (and anywhere else) - rc, - scan_id, - clusters[0] if isinstance(clusters, list) else list(clusters.items())[0], - [], - known_clusters, - ) # follow-up query assert await 
rc.request("GET", f"/scan/{scan_id}/result") == {} resp = await rc.request("GET", f"/scan/{scan_id}") @@ -901,17 +860,6 @@ async def _after_scan_start_logic( # FIRST, clients send updates result = await _send_result(rc, scan_id, manifest, False) manifest = await _patch_progress_and_scan_metadata(rc, scan_id, 10) - # NEXT, spin up more workers in clusters - for cluster_name__n_workers in ( - clusters[1:] if isinstance(clusters, list) else list(clusters.items())[1:] - ): - manifest = await _clientmanager_reply( - rc, - scan_id, - cluster_name__n_workers, - manifest["ewms_task"]["clusters"], - known_clusters, - ) # THEN, clients send updates result = await _send_result(rc, scan_id, manifest, False) manifest = await _patch_progress_and_scan_metadata(rc, scan_id, 10) @@ -1165,13 +1113,6 @@ async def test_100__bad_data( # INITIAL UPDATES # event_metadata = await _server_reply_with_event_metadata(rc, scan_id) - manifest = await _clientmanager_reply( - rc, - scan_id, - ("foobar", random.randint(1, 10000)), - [], - known_clusters, - ) # follow-up query assert await rc.request("GET", f"/scan/{scan_id}/result") == {} resp = await rc.request("GET", f"/scan/{scan_id}") @@ -1185,7 +1126,7 @@ async def test_100__bad_data( f"400 Client Error: Cannot change an existing event_metadata for url: {rc.address}/scan/{scan_id}/manifest" ), ) as e: - await _do_patch( + await _patch_manifest( rc, scan_id, event_metadata=dict( @@ -1232,7 +1173,7 @@ async def test_100__bad_data( f"400 Client Error: Cannot change an existing scan_metadata for url: {rc.address}/scan/{scan_id}/manifest" ), ) as e: - await _do_patch(rc, scan_id, scan_metadata={"boo": "baz", "bot": "fox"}) + await _patch_manifest(rc, scan_id, scan_metadata={"boo": "baz", "bot": "fox"}) # # SEND RESULT From a89e2c9f90391713c0c8c36b35fced7d2cea3053 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:40:10 -0600 Subject: [PATCH 127/327] tests: misc logic updates --- tests/integration/test_rest_routes.py | 40 
++++++++++++++++++--------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index a82915e6..3089aac5 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -374,7 +374,7 @@ async def _assert_db_skyscank8sjobs_coll( } -async def _patch_manifest( +async def _do_patch( rc: RestClient, scan_id: str, progress: StrDict | None = None, @@ -460,9 +460,9 @@ async def _patch_progress_and_scan_metadata( ) # update progress (update `scan_metadata` sometimes--not as important) if i % 2: # odd - manifest = await _patch_manifest(rc, scan_id, progress=progress) + manifest = await _do_patch(rc, scan_id, progress=progress) else: # even - manifest = await _patch_manifest( + manifest = await _do_patch( rc, scan_id, progress=progress, @@ -484,7 +484,7 @@ async def _server_reply_with_event_metadata(rc: RestClient, scan_id: str) -> Str is_real_event=IS_REAL_EVENT, ) - await _patch_manifest(rc, scan_id, event_metadata=event_metadata) + manifest = await _do_patch(rc, scan_id, event_metadata=event_metadata) # query by run+event id resp = await rc.request( @@ -526,7 +526,7 @@ async def _server_reply_with_event_metadata(rc: RestClient, scan_id: str) -> Str ) assert [m["scan_id"] for m in resp["manifests"]] == [scan_id] - return event_metadata + return manifest async def _send_result( @@ -760,6 +760,12 @@ def get_tms_args( return tms_args +async def _is_scan_complete(rc: RestClient, scan_id: str) -> bool: + resp = await rc.request("GET", f"/scans/{scan_id}/status") + pprint.pprint(resp) + return resp["scan_complete"] + + ######################################################################################## # TESTS ######################################################################################## @@ -842,7 +848,7 @@ async def _after_scan_start_logic( # # INITIAL UPDATES # - event_metadata = await _server_reply_with_event_metadata(rc, scan_id) + manifest 
= await _server_reply_with_event_metadata(rc, scan_id) # follow-up query assert await rc.request("GET", f"/scan/{scan_id}/result") == {} resp = await rc.request("GET", f"/scan/{scan_id}") @@ -867,17 +873,25 @@ async def _after_scan_start_logic( # # SEND RESULT(s) # - assert not manifest["ewms_task"]["complete"] # workforce is not done + assert not await _is_scan_complete(rc, manifest["scan_id"]) # workforce is not done result = await _send_result(rc, scan_id, manifest, True) # wait as long as the server, so it'll mark as complete await asyncio.sleep(test_wait_before_teardown + 1) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert manifest["ewms_task"]["complete"] # workforce is done + assert await _is_scan_complete(rc, manifest["scan_id"]) # workforce is done # # DELETE SCAN # - await _delete_scan(rc, event_metadata, scan_id, manifest, result, True, True) + await _delete_scan( + rc, + manifest["event_metadata"], + scan_id, + manifest, + result, + True, + True, + ) POST_SCAN_BODY_FOR_TEST_01 = dict(**POST_SCAN_BODY, cluster={"foobar": 1}) @@ -1112,7 +1126,7 @@ async def test_100__bad_data( # # INITIAL UPDATES # - event_metadata = await _server_reply_with_event_metadata(rc, scan_id) + manifest = await _server_reply_with_event_metadata(rc, scan_id) # follow-up query assert await rc.request("GET", f"/scan/{scan_id}/result") == {} resp = await rc.request("GET", f"/scan/{scan_id}") @@ -1126,7 +1140,7 @@ async def test_100__bad_data( f"400 Client Error: Cannot change an existing event_metadata for url: {rc.address}/scan/{scan_id}/manifest" ), ) as e: - await _patch_manifest( + await _do_patch( rc, scan_id, event_metadata=dict( @@ -1173,7 +1187,7 @@ async def test_100__bad_data( f"400 Client Error: Cannot change an existing scan_metadata for url: {rc.address}/scan/{scan_id}/manifest" ), ) as e: - await _patch_manifest(rc, scan_id, scan_metadata={"boo": "baz", "bot": "fox"}) + await _do_patch(rc, scan_id, scan_metadata={"boo": "baz", "bot": "fox"}) # # 
SEND RESULT @@ -1218,7 +1232,7 @@ async def test_100__bad_data( # wait as long as the server, so it'll mark as complete await asyncio.sleep(test_wait_before_teardown) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert manifest["ewms_task"]["complete"] # workforce is done + assert await _is_scan_complete(rc, manifest["scan_id"]) # workforce is done # # DELETE SCAN From 3d95540d62a1efaa7cb14250251c0769c3c1fee2 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:42:13 -0600 Subject: [PATCH 128/327] tests: misc logic updates - 2 --- tests/integration/test_rest_routes.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 3089aac5..c03246bf 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -1144,8 +1144,8 @@ async def test_100__bad_data( rc, scan_id, event_metadata=dict( - run_id=event_metadata["run_id"], - event_id=event_metadata["event_id"], + run_id=manifest["event_metadata"]["run_id"], + event_id=manifest["event_metadata"]["event_id"], event_type="funky", mjd=23423432.3, is_real_event=IS_REAL_EVENT, @@ -1260,7 +1260,23 @@ async def test_100__bad_data( print(e.value) # OK - await _delete_scan(rc, event_metadata, scan_id, manifest, result, True, True) + await _delete_scan( + rc, + manifest["event_metadata"], + scan_id, + manifest, + result, + True, + True, + ) # also OK - await _delete_scan(rc, event_metadata, scan_id, manifest, result, True, True) + await _delete_scan( + rc, + manifest["event_metadata"], + scan_id, + manifest, + result, + True, + True, + ) From 965d630b9404a3969c11e6bee7955065194f70e8 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:42:58 -0600 Subject: [PATCH 129/327] Dockerfile: bump to `python:3.13` --- Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Dockerfile b/Dockerfile index 
a8e99a2c..ea15b6b4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM python:3.13 RUN useradd -m -U app From e9e10f377fa9f63db6a6c8a7fecd4f8c11ea63dc Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:43:46 -0600 Subject: [PATCH 130/327] tests: fix path --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index c03246bf..4749515a 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -761,7 +761,7 @@ def get_tms_args( async def _is_scan_complete(rc: RestClient, scan_id: str) -> bool: - resp = await rc.request("GET", f"/scans/{scan_id}/status") + resp = await rc.request("GET", f"/scan/{scan_id}/status") pprint.pprint(resp) return resp["scan_complete"] From 2e1f6937dbfdb0ceadd71863b8521ff02dd1d9a6 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 17 Jan 2025 19:48:14 +0000 Subject: [PATCH 131/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index ab319afb..8d40a71c 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -1,7 +1,7 @@ # # This file was autogenerated by WIPACrepo/wipac-dev-py-dependencies-action # within a container using the user-supplied image 'skydriver' -# using Python 3.11. +# using Python 3.13. 
# ######################################################################## # pip freeze @@ -54,7 +54,6 @@ cryptography==44.0.0 pipdeptree==2.24.0 ├── packaging [required: >=24.1, installed: 24.2] └── pip [required: >=24.2, installed: 24.3.1] -setuptools==65.5.1 skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] ├── boto3 [required: Any, installed: 1.36.1] @@ -145,4 +144,3 @@ skydriver-s3-sidecar-ewms-init-container │ ├── idna [required: >=2.5,<4, installed: 3.10] │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] └── typing_extensions [required: Any, installed: 4.12.2] -wheel==0.45.1 From 8bcc15260190b54ebce3bf6c602f70c55eb9856e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:51:45 -0600 Subject: [PATCH 132/327] tests: backlogger --- tests/integration/test_rest_routes.py | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 4749515a..6c032df7 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -821,8 +821,6 @@ async def test_000( await _after_scan_start_logic( rc, manifest, - clusters, - known_clusters, test_wait_before_teardown, ) @@ -830,8 +828,6 @@ async def test_000( async def _after_scan_start_logic( rc: RestClient, manifest: dict, - clusters: list | dict, - known_clusters: dict, test_wait_before_teardown: float, ): scan_id = manifest["scan_id"] @@ -842,8 +838,15 @@ async def _after_scan_start_logic( assert resp["manifest"] == manifest assert resp["result"] == {} - # TODO: at what point do we expect the backlogger to request to ewms? - # TODO: we need to assert what ewms is sent (store in dummy ewms, and query here; or assert the call?) 
+ # wait backlogger to request to ewms + await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_SHORT_DELAY"]) * 2) # extra + ewms_workflow_id = ( + await rc.request( + "GET", f"/scan/{scan_id}", {"projection": ["ewms_workflow_id"]} + ) + )["ewms_workflow_id"] + assert RE_UUID4HEX.fullmatch(ewms_workflow_id) + # TODO: assert the EWMS request is sent (store in dummy ewms, and query here; or assert the call?) # # INITIAL UPDATES @@ -983,8 +986,6 @@ async def test_010__rescan( await _after_scan_start_logic( rc, manifest_alpha, - clusters, - known_clusters, test_wait_before_teardown, ) @@ -1003,8 +1004,6 @@ async def test_010__rescan( await _after_scan_start_logic( rc, manifest_beta, - clusters, - known_clusters, test_wait_before_teardown, ) From bca60f1b147fb86eb8088b99d9f32cbf70a915e9 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 13:58:28 -0600 Subject: [PATCH 133/327] tests: status --- skydriver/rest_handlers.py | 12 ++++++++++-- tests/integration/dummy_ewms.py | 26 +++++++++++++++++++++----- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 8e33dc27..5a0a61a8 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -39,6 +39,7 @@ is_testing, ) from .database import schema +from .database.schema import PENDING_EWMS_WORKFLOW from .ewms import request_stop_on_ewms from .k8s.scan_backlog import put_on_backlog from .k8s.scanner_instance import SkyScanK8sJobFactory @@ -1046,14 +1047,21 @@ async def get(self, scan_id: str) -> None: pods_411["pod_message"] = "pod(s) not found" LOGGER.exception(e) - # respond + # scan state scan_state = await get_scan_state(manifest, self.ewms_rc) - if manifest.ewms_workflow_id: + + # ewms + if ( + manifest.ewms_workflow_id + and manifest.ewms_workflow_id != PENDING_EWMS_WORKFLOW + ): clusters = await ewms.get_taskforce_phases( self.ewms_rc, manifest.ewms_workflow_id ) else: clusters = [] + + # respond resp = { "scan_state": 
scan_state, "is_deleted": manifest.is_deleted, diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index ff74b8b8..1924135e 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -9,7 +9,7 @@ app = Flask(__name__) -DONT_CALL_IT_A_DB: dict[str, Any] = {} +DONT_CALL_IT_A_DB__WORKFLOWS: dict[str, Any] = {} @app.route("/v0/workflows", methods=["POST"]) @@ -28,7 +28,7 @@ def dummy_workflows_post(): # add more fields only if needed in tests--keep things simple } - DONT_CALL_IT_A_DB[workflow_id] = minimal_wf_doc + DONT_CALL_IT_A_DB__WORKFLOWS[workflow_id] = minimal_wf_doc return jsonify( { @@ -39,21 +39,37 @@ def dummy_workflows_post(): @app.route("/v0/workflows/", methods=["GET"]) def dummy_workflows_get(workflow_id: str): - return jsonify(DONT_CALL_IT_A_DB[workflow_id]) + return jsonify(DONT_CALL_IT_A_DB__WORKFLOWS[workflow_id]) @app.route("/v0/workflows//actions/abort", methods=["POST"]) def dummy_workflows_abort(workflow_id: str): - DONT_CALL_IT_A_DB[workflow_id].update({"deactivated": "abort"}) + DONT_CALL_IT_A_DB__WORKFLOWS[workflow_id].update({"deactivated": "abort"}) return jsonify({}) @app.route("/v0/workflows//actions/finished", methods=["POST"]) def dummy_workflows_finished(workflow_id: str): - DONT_CALL_IT_A_DB[workflow_id].update({"deactivated": "finished"}) + DONT_CALL_IT_A_DB__WORKFLOWS[workflow_id].update({"deactivated": "finished"}) return jsonify({}) +@app.route("/v0/query/taskforces", methods=["POST"]) +def dummy_query_taskforces(): + workflow_id = request.get_json("workflow_id") + + # respond with correctly-syntaxed gibberish + resp = { + "taskforces": [ + { + "taskforce": f"TF-{workflow_id}", + "phase": "the-best-phase-ever", + } + ] + } + return jsonify(resp) + + if __name__ == "__main__": app.run( debug=True, From fe5e6221094eb020714d104dc07c39b3dfe8e96c Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 14:03:50 -0600 Subject: [PATCH 134/327] tests: misc logic updates - 3 --- 
tests/integration/conftest.py | 2 +- tests/integration/test_rest_routes.py | 53 ++++----------------------- 2 files changed, 9 insertions(+), 46 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 8c874c4c..297188d6 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -126,7 +126,7 @@ async def mongo_client() -> AsyncIOMotorClient: async def server( monkeypatch: Any, port: int, - mongo_client: AsyncIOMotorClient, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] mongo_clear: Any, # pylint:disable=unused-argument ) -> AsyncIterator[Callable[[], RestClient]]: """Startup server in this process, yield RestClient func, then clean up.""" diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 6c032df7..9dbef040 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -68,9 +68,8 @@ async def _launch_scan( rc: RestClient, - mongo_client: AsyncIOMotorClient, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] post_scan_body: dict, - tms_args: list[str], ) -> dict: # launch scan launch_time = time.time() @@ -134,7 +133,7 @@ async def _launch_scan( async def _assert_db_scanrequests_coll( - mongo_client: AsyncIOMotorClient, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] post_scan_body: dict, post_resp: dict, ) -> str: @@ -182,7 +181,7 @@ async def _assert_db_scanrequests_coll( async def _assert_db_skyscank8sjobs_coll( - mongo_client: AsyncIOMotorClient, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] post_scan_body: dict, post_resp: dict, scanner_server_args: str, @@ -727,39 +726,6 @@ async def _delete_scan( # ^^^ not testing that this is unique b/c the event could've been re-ran (rescan) -def get_tms_args( - clusters: list | dict, - docker_tag_expected: str, - known_clusters: dict, -) -> list[str]: - tms_args = [] - for cluster in clusters if isinstance(clusters, list) 
else list(clusters.items()): - orchestrator = known_clusters[cluster[0]]["orchestrator"] - location = known_clusters[cluster[0]]["location"] - image = ( - f"/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:{docker_tag_expected}" - if orchestrator == "condor" - else f"icecube/skymap_scanner:{docker_tag_expected}" - ) - tms_args += [ - f"python -m clientmanager " - f" --uuid {CLUSTER_ID_PLACEHOLDER} " - f" {orchestrator} " - f" {' '.join(f'--{k} {v}' for k,v in location.items())} " - f" start " - f" --n-workers {cluster[1]} " - f" --worker-memory-bytes {humanfriendly.parse_size('8GB')} " - f" --worker-disk-bytes {humanfriendly.parse_size('1GB')} " - f" --image {image} " - f" --client-startup-json /common-space/startup.json " - f" --max-worker-runtime {4 * 60 * 60} " - f" --priority 0 " - f" --spool " - ] - - return tms_args - - async def _is_scan_complete(rc: RestClient, scan_id: str) -> bool: resp = await rc.request("GET", f"/scan/{scan_id}/status") pprint.pprint(resp) @@ -802,7 +768,7 @@ async def test_000( server: Callable[[], RestClient], known_clusters: dict, test_wait_before_teardown: float, - mongo_client: AsyncIOMotorClient, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: """Test normal scan creation and retrieval.""" rc = server() @@ -815,7 +781,6 @@ async def test_000( "docker_tag": docker_tag_input, "cluster": clusters, }, - get_tms_args(clusters, docker_tag_expected, known_clusters), ) await _after_scan_start_logic( @@ -968,6 +933,7 @@ async def test_010__rescan( server: Callable[[], RestClient], known_clusters: dict, test_wait_before_teardown: float, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: rc = server() @@ -976,12 +942,12 @@ async def test_010__rescan( # OG SCAN manifest_alpha = await _launch_scan( rc, + mongo_client, { **POST_SCAN_BODY, "docker_tag": "3.4.0", "cluster": clusters, }, - get_tms_args(clusters, "3.4.0", known_clusters), ) await _after_scan_start_logic( rc, @@ 
-1015,6 +981,7 @@ async def test_100__bad_data( server: Callable[[], RestClient], known_clusters: dict, test_wait_before_teardown: float, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: """Failure-test scan creation and retrieval.""" rc = server() @@ -1108,12 +1075,8 @@ async def test_100__bad_data( # OK manifest = await _launch_scan( rc, + mongo_client, POST_SCAN_BODY_FOR_TEST_01, - get_tms_args( - POST_SCAN_BODY_FOR_TEST_01["cluster"], # type: ignore[arg-type] - os.environ["LATEST_TAG"], - known_clusters, - ), ) scan_id = manifest["scan_id"] # follow-up query From bd73d77b7b51abde9160aa9d2276047f8d28a37b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 14:21:03 -0600 Subject: [PATCH 135/327] tests: misc logic updates - 4 --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 9dbef040..0a5abb1d 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -807,7 +807,7 @@ async def _after_scan_start_logic( await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_SHORT_DELAY"]) * 2) # extra ewms_workflow_id = ( await rc.request( - "GET", f"/scan/{scan_id}", {"projection": ["ewms_workflow_id"]} + "GET", f"/scan/{scan_id}", {"manifest_projection": ["ewms_workflow_id"]} ) )["ewms_workflow_id"] assert RE_UUID4HEX.fullmatch(ewms_workflow_id) From f91418530b1580cb5256003dc911260b00e685e1 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 14:40:05 -0600 Subject: [PATCH 136/327] tests: misc logic updates - 5 --- tests/integration/test_rest_routes.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 0a5abb1d..817ce1ce 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -805,11 +805,7 @@ async def 
_after_scan_start_logic( # wait backlogger to request to ewms await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_SHORT_DELAY"]) * 2) # extra - ewms_workflow_id = ( - await rc.request( - "GET", f"/scan/{scan_id}", {"manifest_projection": ["ewms_workflow_id"]} - ) - )["ewms_workflow_id"] + ewms_workflow_id = (await rc.request("GET", f"/scan/{scan_id}"))["ewms_workflow_id"] assert RE_UUID4HEX.fullmatch(ewms_workflow_id) # TODO: assert the EWMS request is sent (store in dummy ewms, and query here; or assert the call?) From d6746514da4a45f59deca5f8bdef637713e316f4 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 17 Jan 2025 20:45:32 +0000 Subject: [PATCH 137/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 8d40a71c..baf13f4c 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.1 -botocore==1.36.1 +boto3==1.36.2 +botocore==1.36.2 cachetools==5.5.0 certifi==2024.12.14 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.24.0 └── pip [required: >=24.2, installed: 24.3.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.1] -│ ├── botocore [required: >=1.36.1,<1.37.0, installed: 1.36.1] +├── boto3 [required: Any, installed: 1.36.2] +│ ├── botocore [required: >=1.36.2,<1.37.0, installed: 1.36.2] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, 
installed: 0.11.1] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.1] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.2] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From e1733892aaa63edc4f87ba666ea3de36075a77f8 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 14:46:45 -0600 Subject: [PATCH 138/327] tests: misc logic updates - 6 --- tests/integration/test_rest_routes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 817ce1ce..46eaefab 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -805,7 +805,9 @@ async def _after_scan_start_logic( # wait backlogger to request to ewms await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_SHORT_DELAY"]) * 2) # extra - ewms_workflow_id = (await rc.request("GET", f"/scan/{scan_id}"))["ewms_workflow_id"] + ewms_workflow_id = (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ + "ewms_workflow_id" + ] assert RE_UUID4HEX.fullmatch(ewms_workflow_id) # TODO: assert the EWMS request is sent (store in dummy ewms, and query here; or assert the call?) 
From 7dc89dc2d98239750e17c91a04d87ffbf452f2a5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 14:48:17 -0600 Subject: [PATCH 139/327] mypy --- tests/integration/conftest.py | 2 +- tests/integration/test_rest_routes.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 297188d6..49ef6147 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -117,7 +117,7 @@ def test_wait_before_teardown() -> float: @pytest_asyncio.fixture -async def mongo_client() -> AsyncIOMotorClient: +async def mongo_client() -> AsyncIOMotorClient: # type: ignore[valid-type] """A fixture to keep number of mongo connections to a minimum (aka 1).""" return await create_mongodb_client() diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 46eaefab..bfa4eb32 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -141,7 +141,7 @@ async def _assert_db_scanrequests_coll( Return the REST address """ - doc_sr = await mongo_client["SkyDriver_DB"]["ScanRequests"].find_one( + doc_sr = await mongo_client["SkyDriver_DB"]["ScanRequests"].find_one( # type: ignore[index] {"scan_id": post_resp["scan_id"]}, {"_id": 0} ) pprint.pprint(doc_sr) @@ -189,7 +189,7 @@ async def _assert_db_skyscank8sjobs_coll( ): # query the SkyScanK8sJobs coll # -> since the scanner-server metadata is no longer stored in the manifest - doc_k8s = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( + doc_k8s = await mongo_client["SkyDriver_DB"]["SkyScanK8sJobs"].find_one( # type: ignore[index] {"scan_id": post_resp["scan_id"]}, {"_id": 0} ) pprint.pprint(doc_k8s) From 8e19316898bf2e382ba8bdc8bbf0c62b846e6e95 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 17 Jan 2025 14:55:01 -0600 Subject: [PATCH 140/327] catchup with backlog --- skydriver/k8s/scan_backlog.py | 2 +- tests/integration/test_rest_routes.py | 2 +- 
2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 7cdb5752..dd3bb29b 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -144,7 +144,7 @@ async def _run( ) except database.mongodc.DocumentNotFoundException: timer_main_loop.fastforward() - continue # empty queue- + continue # there's no scan to start # request a workflow on EWMS try: diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index bfa4eb32..4a350853 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -804,7 +804,7 @@ async def _after_scan_start_logic( assert resp["result"] == {} # wait backlogger to request to ewms - await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_SHORT_DELAY"]) * 2) # extra + await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_DELAY"]) * 5) # extra ewms_workflow_id = (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ "ewms_workflow_id" ] From 18a95c0f2e2be6b8a67b3b033f053b89f68c34ac Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 13:55:51 -0600 Subject: [PATCH 141/327] misc --- .gitignore | 1 + skydriver/config.py | 1 - tests/integration/test_rest_routes.py | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2012806d..31ecf4a6 100644 --- a/.gitignore +++ b/.gitignore @@ -143,3 +143,4 @@ gke-cluster-config.yaml .idea/ test-suit-sandbox* +resources/[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]/ diff --git a/skydriver/config.py b/skydriver/config.py index 1fe094b9..aa5d52e6 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -97,7 +97,6 @@ class EnvConfig: KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST: str = "" # skyscan (forwarded) - # TODO: see https://github.com/WIPACrepo/wipac-dev-tools/pull/69 SKYSCAN_PROGRESS_INTERVAL_SEC: Optional[int] = None SKYSCAN_RESULT_INTERVAL_SEC: Optional[int] = None 
SKYSCAN_MQ_TIMEOUT_TO_CLIENTS: Optional[int] = None diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 4a350853..d2c75333 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -810,6 +810,7 @@ async def _after_scan_start_logic( ] assert RE_UUID4HEX.fullmatch(ewms_workflow_id) # TODO: assert the EWMS request is sent (store in dummy ewms, and query here; or assert the call?) + assert 0 # # INITIAL UPDATES From 0841a278e2554df16a7bae5aa098185de30bf0f6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 13:56:32 -0600 Subject: [PATCH 142/327] (test) --- tests/integration/test_rest_routes.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index d2c75333..689fa1d0 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -804,6 +804,7 @@ async def _after_scan_start_logic( assert resp["result"] == {} # wait backlogger to request to ewms + assert int(os.environ["SCAN_BACKLOG_RUNNER_DELAY"]) await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_DELAY"]) * 5) # extra ewms_workflow_id = (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ "ewms_workflow_id" From f1253ec661c27a8af9ab19973713374d0da8466b Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 28 Jan 2025 20:00:18 +0000 Subject: [PATCH 143/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 38 +++++++++++++++---------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index baf13f4c..c1bdef0d 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,9 +7,9 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.2 -botocore==1.36.2 -cachetools==5.5.0 +boto3==1.36.7 
+botocore==1.36.7 +cachetools==5.5.1 certifi==2024.12.14 cffi==1.17.1 charset-normalizer==3.4.1 @@ -17,11 +17,11 @@ cryptography==44.0.0 dacite==1.8.1 dnspython==2.7.0 durationpy==0.9 -google-auth==2.37.0 +google-auth==2.38.0 humanfriendly==10.0 idna==3.10 jmespath==1.0.1 -kubernetes==31.0.0 +kubernetes==32.0.0 motor==3.3.2 oauthlib==3.2.2 pyasn1==0.6.1 @@ -36,14 +36,14 @@ requests==2.32.3 requests-futures==1.0.2 requests-oauthlib==2.0.0 rsa==4.9 -s3transfer==0.11.1 +s3transfer==0.11.2 six==1.17.0 tornado==6.4.2 typeguard==4.4.1 typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 -wipac-dev-tools==1.15.0 +wipac-dev-tools==1.15.1 wipac-rest-tools==1.8.5 ######################################################################## # pipdeptree @@ -51,31 +51,31 @@ wipac-rest-tools==1.8.5 cryptography==44.0.0 └── cffi [required: >=1.12, installed: 1.17.1] └── pycparser [required: Any, installed: 2.22] -pipdeptree==2.24.0 +pipdeptree==2.25.0 ├── packaging [required: >=24.1, installed: 24.2] -└── pip [required: >=24.2, installed: 24.3.1] +└── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.2] -│ ├── botocore [required: >=1.36.2,<1.37.0, installed: 1.36.2] +├── boto3 [required: Any, installed: 1.36.7] +│ ├── botocore [required: >=1.36.7,<1.37.0, installed: 1.36.7] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] -│ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.1] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.2] +│ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.7] │ ├── jmespath 
[required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] ├── dacite [required: Any, installed: 1.8.1] ├── humanfriendly [required: Any, installed: 10.0] -├── kubernetes [required: Any, installed: 31.0.0] +├── kubernetes [required: Any, installed: 32.0.0] │ ├── certifi [required: >=14.05.14, installed: 2024.12.14] │ ├── durationpy [required: >=0.7, installed: 0.9] -│ ├── google-auth [required: >=1.0.1, installed: 2.37.0] -│ │ ├── cachetools [required: >=2.0.0,<6.0, installed: 5.5.0] +│ ├── google-auth [required: >=1.0.1, installed: 2.38.0] +│ │ ├── cachetools [required: >=2.0.0,<6.0, installed: 5.5.1] │ │ ├── pyasn1_modules [required: >=0.2.1, installed: 0.4.1] │ │ │ └── pyasn1 [required: >=0.4.6,<0.7.0, installed: 0.6.1] │ │ └── rsa [required: >=3.1.4,<5, installed: 4.9] @@ -113,7 +113,7 @@ skydriver-s3-sidecar-ewms-init-container ├── tornado [required: Any, installed: 6.4.2] ├── typeguard [required: Any, installed: 4.4.1] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] -├── wipac-dev-tools [required: Any, installed: 1.15.0] +├── wipac-dev-tools [required: Any, installed: 1.15.1] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] @@ -121,7 +121,7 @@ skydriver-s3-sidecar-ewms-init-container │ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] │ └── typing_extensions [required: Any, installed: 4.12.2] └── wipac-rest-tools [required: Any, installed: 1.8.5] - ├── cachetools [required: Any, installed: 5.5.0] + ├── cachetools [required: Any, installed: 5.5.1] ├── PyJWT [required: !=2.6.0, installed: 2.10.1] ├── qrcode [required: Any, installed: 8.0] ├── requests [required: Any, installed: 2.32.3] @@ -137,7 +137,7 @@ skydriver-s3-sidecar-ewms-init-container │ 
└── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── urllib3 [required: >=2.0.4, installed: 2.3.0] - └── wipac-dev-tools [required: Any, installed: 1.15.0] + └── wipac-dev-tools [required: Any, installed: 1.15.1] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] From 0d99d06fc54949b3f55cc99b365194fd2866962d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 14:13:51 -0600 Subject: [PATCH 144/327] mock out s3 connection --- skydriver/__main__.py | 20 +++++++++++++++++++- skydriver/ewms.py | 4 +++- skydriver/k8s/scan_backlog.py | 6 +++++- skydriver/s3.py | 16 ++-------------- tests/integration/conftest.py | 7 ++++++- tests/integration/test_rest_routes.py | 10 ++++++++-- 6 files changed, 43 insertions(+), 20 deletions(-) diff --git a/skydriver/__main__.py b/skydriver/__main__.py index 82b3856f..1bf9dadc 100644 --- a/skydriver/__main__.py +++ b/skydriver/__main__.py @@ -3,6 +3,8 @@ import asyncio import logging +import boto3 +import botocore.client from rest_tools.client import ClientCredentialsAuth, RestClient from . 
import database, k8s, server @@ -28,6 +30,17 @@ def setup_ewms_client() -> RestClient: ) +def setup_s3_client() -> botocore.client.BaseClient: + """Connect to S3 server.""" + return boto3.client( + "s3", + "us-east-1", + endpoint_url=ENV.S3_URL, + aws_access_key_id=ENV.S3_ACCESS_KEY_ID, + aws_secret_access_key=ENV.S3_SECRET_KEY, + ) + + async def main() -> None: """Establish connections and start components.""" @@ -48,10 +61,15 @@ async def main() -> None: ewms_rc = setup_ewms_client() LOGGER.info("EWMS client connected.") + # S3 client + LOGGER.info("Setting up s3 client...") + s3_client = setup_s3_client() + LOGGER.info("S3 client connected.") + # Scan Backlog Runner LOGGER.info("Starting scan backlog runner...") backlog_task = asyncio.create_task( - k8s.scan_backlog.run(mongo_client, k8s_batch_api, ewms_rc) + k8s.scan_backlog.run(mongo_client, k8s_batch_api, ewms_rc, s3_client) ) await asyncio.sleep(0) # start up previous task diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 2c8b416f..b5762f76 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -3,6 +3,7 @@ import logging import aiocache # type: ignore[import-untyped] +import botocore.client import requests from rest_tools.client import RestClient @@ -15,6 +16,7 @@ async def request_workflow_on_ewms( ewms_rc: RestClient, + s3_client: botocore.client.BaseClient, manifest: database.schema.Manifest, scan_request_obj: dict, ) -> str: @@ -25,7 +27,7 @@ async def request_workflow_on_ewms( else: # None raise TypeError("Scan is not designated for EWMS") - s3_url_get = s3.generate_s3_get_url(manifest.scan_id) + s3_url_get = s3.generate_s3_get_url(s3_client, manifest.scan_id) image = images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) body = { diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index dd3bb29b..6e13f8d9 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -4,6 +4,7 @@ import logging import time +import botocore.client import 
kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient @@ -85,6 +86,7 @@ async def run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, ewms_rc: RestClient, + s3_client: botocore.client.BaseClient, ) -> None: """Error-handling around the scan backlog runner loop.""" LOGGER.info("Started scan backlog runner.") @@ -92,7 +94,7 @@ async def run( while True: # let's go! try: - await _run(mongo_client, k8s_batch_api, ewms_rc) + await _run(mongo_client, k8s_batch_api, ewms_rc, s3_client) except Exception as e: LOGGER.exception(e) @@ -106,6 +108,7 @@ async def _run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, ewms_rc: RestClient, + s3_client: botocore.client.BaseClient, ) -> None: """The (actual) main loop.""" manifest_client = database.interface.ManifestClient(mongo_client) @@ -150,6 +153,7 @@ async def _run( try: workflow_id = await ewms.request_workflow_on_ewms( ewms_rc, + s3_client, manifest, scan_request_obj, ) diff --git a/skydriver/s3.py b/skydriver/s3.py index 0d011dd7..effb7f47 100644 --- a/skydriver/s3.py +++ b/skydriver/s3.py @@ -3,32 +3,20 @@ import logging import boto3 # type: ignore[import-untyped] +import botocore.client from .config import ENV LOGGER = logging.getLogger(__name__) -def _get_client(): - LOGGER.info("Connecting to S3...") - return boto3.client( - "s3", - "us-east-1", - endpoint_url=ENV.S3_URL, - aws_access_key_id=ENV.S3_ACCESS_KEY_ID, - aws_secret_access_key=ENV.S3_SECRET_KEY, - ) - - def make_object_key(scan_id: str) -> str: """Construct the object key from the scan_id (deterministic).""" return f"{scan_id}-s3-object" -def generate_s3_get_url(object_key: str) -> str: +def generate_s3_get_url(s3_client: botocore.client.BaseClient, object_key: str) -> str: """Generate a pre-signed S3 url for retrieving shared files.""" 
- s3_client = _get_client() - # get GET url get_url = s3_client.generate_presigned_url( "get_object", diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 49ef6147..be24927d 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -141,7 +141,12 @@ async def server( k8s_batch_api = Mock() ewms_rc = setup_ewms_client() backlog_task = asyncio.create_task( - skydriver.k8s.scan_backlog.run(mongo_client, k8s_batch_api, ewms_rc) + skydriver.k8s.scan_backlog.run( + mongo_client, + k8s_batch_api, + ewms_rc, + Mock(), # s3_client + ) ) await asyncio.sleep(0) # start up previous task rs = await make(mongo_client, k8s_batch_api, ewms_rc) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 689fa1d0..9fa086ac 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -123,10 +123,16 @@ async def _launch_scan( # check database rest_address = await _assert_db_scanrequests_coll( - mongo_client, post_scan_body, post_resp + mongo_client, + post_scan_body, + post_resp, ) await _assert_db_skyscank8sjobs_coll( - mongo_client, post_scan_body, post_resp, scanner_server_args, rest_address + mongo_client, + post_scan_body, + post_resp, + scanner_server_args, + rest_address, ) return post_resp # type: ignore[no-any-return] From c6120e9449279bff7149843ffc9ab54aa4d2b940 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 14:44:12 -0600 Subject: [PATCH 145/327] mock `KubeAPITools.start_job` --- tests/integration/conftest.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index be24927d..24c0931b 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -3,6 +3,7 @@ import asyncio import socket from typing import Any, AsyncIterator, Callable +from unittest import mock from unittest.mock import Mock import kubernetes.client # type: ignore[import-untyped] @@ 
-123,6 +124,7 @@ async def mongo_client() -> AsyncIOMotorClient: # type: ignore[valid-type] @pytest_asyncio.fixture +@mock.patch("skydriver.k8s.utils.KubeAPITools.start_job", return_value=None) async def server( monkeypatch: Any, port: int, From cc4d9e847df367abe1b037f1db43024a05e29970 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 28 Jan 2025 20:48:09 +0000 Subject: [PATCH 146/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index c1bdef0d..e68e5f75 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.7 -botocore==1.36.7 +boto3==1.36.8 +botocore==1.36.8 cachetools==5.5.1 certifi==2024.12.14 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.7] -│ ├── botocore [required: >=1.36.7,<1.37.0, installed: 1.36.7] +├── boto3 [required: Any, installed: 1.36.8] +│ ├── botocore [required: >=1.36.8,<1.37.0, installed: 1.36.8] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.7] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.8] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── 
six [required: >=1.5, installed: 1.17.0] From 29412b019f9fe4b5e1eeb32b574f41e8f219dcb3 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 14:55:04 -0600 Subject: [PATCH 147/327] mock `KubeAPITools.start_job` - 2 --- tests/integration/conftest.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 24c0931b..01fe905c 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -124,7 +124,6 @@ async def mongo_client() -> AsyncIOMotorClient: # type: ignore[valid-type] @pytest_asyncio.fixture -@mock.patch("skydriver.k8s.utils.KubeAPITools.start_job", return_value=None) async def server( monkeypatch: Any, port: int, @@ -133,6 +132,19 @@ async def server( ) -> AsyncIterator[Callable[[], RestClient]]: """Startup server in this process, yield RestClient func, then clean up.""" + # NOTE: cannot use @mock.patch with @pytest_asyncio.fixture + # NOTE: cannot use `yield from` on async iterator + + with mock.patch("skydriver.k8s.utils.KubeAPITools.start_job", return_value=None): + async for y in _server(monkeypatch, port, mongo_client): + yield y + + +async def _server( + monkeypatch: Any, + port: int, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] +) -> AsyncIterator[Callable[[], RestClient]]: # patch at directly named import that happens before running the test monkeypatch.setattr(skydriver.rest_handlers, "KNOWN_CLUSTERS", KNOWN_CLUSTERS) monkeypatch.setattr(skydriver.config, "KNOWN_CLUSTERS", KNOWN_CLUSTERS) From acdeb9b8d8e0a5545ea88e7b8c922aa339532ba6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:04:24 -0600 Subject: [PATCH 148/327] (next) --- tests/integration/test_rest_routes.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 9fa086ac..3aa7737f 100644 --- a/tests/integration/test_rest_routes.py +++ 
b/tests/integration/test_rest_routes.py @@ -816,8 +816,6 @@ async def _after_scan_start_logic( "ewms_workflow_id" ] assert RE_UUID4HEX.fullmatch(ewms_workflow_id) - # TODO: assert the EWMS request is sent (store in dummy ewms, and query here; or assert the call?) - assert 0 # # INITIAL UPDATES From 9f589157a836f07d572fc8a3a965eafc574b7394 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:07:37 -0600 Subject: [PATCH 149/327] flake8 + mypy --- skydriver/__main__.py | 4 ++-- skydriver/ewms.py | 2 +- skydriver/k8s/scan_backlog.py | 2 +- skydriver/s3.py | 3 +-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/skydriver/__main__.py b/skydriver/__main__.py index 1bf9dadc..ef0abd0b 100644 --- a/skydriver/__main__.py +++ b/skydriver/__main__.py @@ -3,8 +3,8 @@ import asyncio import logging -import boto3 -import botocore.client +import boto3 # type: ignore[import-untyped] +import botocore.client # type: ignore[import-untyped] from rest_tools.client import ClientCredentialsAuth, RestClient from . 
import database, k8s, server diff --git a/skydriver/ewms.py b/skydriver/ewms.py index b5762f76..51b267c9 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -3,7 +3,7 @@ import logging import aiocache # type: ignore[import-untyped] -import botocore.client +import botocore.client # type: ignore[import-untyped] import requests from rest_tools.client import RestClient diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 6e13f8d9..bfdb5a75 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -4,7 +4,7 @@ import logging import time -import botocore.client +import botocore.client # type: ignore[import-untyped] import kubernetes.client # type: ignore[import-untyped] from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorCollection from rest_tools.client import RestClient diff --git a/skydriver/s3.py b/skydriver/s3.py index effb7f47..1f84a1c5 100644 --- a/skydriver/s3.py +++ b/skydriver/s3.py @@ -2,8 +2,7 @@ import logging -import boto3 # type: ignore[import-untyped] -import botocore.client +import botocore.client # type: ignore[import-untyped] from .config import ENV From 448dcb3393a4309f352a801f8c1db1f3f842f305 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:22:02 -0600 Subject: [PATCH 150/327] check manifest is not changing unexpectedly when patched --- tests/integration/test_rest_routes.py | 89 ++++++++++++++++----------- 1 file changed, 53 insertions(+), 36 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 3aa7737f..b2042499 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -21,7 +21,7 @@ skydriver.config.config_logging() -StrDict = dict[str, Any] +sdict = dict[str, Any] ######################################################################################## # CONSTANTS @@ -382,10 +382,11 @@ async def _assert_db_skyscank8sjobs_coll( async def _do_patch( rc: RestClient, 
scan_id: str, - progress: StrDict | None = None, - event_metadata: StrDict | None = None, - scan_metadata: StrDict | None = None, -) -> StrDict: + manifest: sdict, + progress: sdict | None = None, + event_metadata: sdict | None = None, + scan_metadata: sdict | None = None, +) -> sdict: # do PATCH @ /scan/{scan_id}/manifest, assert response body = {} if progress: @@ -403,13 +404,15 @@ async def _do_patch( scan_id=scan_id, is_deleted=False, timestamp=resp["timestamp"], # see below - i3_event_id=resp["i3_event_id"], # not checking - event_i3live_json_dict=resp["event_i3live_json_dict"], # not checking - event_i3live_json_dict__hash=resp[ + i3_event_id=manifest["i3_event_id"], # should not change + event_i3live_json_dict=manifest["event_i3live_json_dict"], # should not change + event_i3live_json_dict__hash=manifest[ "event_i3live_json_dict__hash" - ], # not checking - event_metadata=event_metadata if event_metadata else resp["event_metadata"], - scan_metadata=scan_metadata if scan_metadata else resp["scan_metadata"], + ], # should not change + event_metadata=( + event_metadata if event_metadata else manifest["event_metadata"] + ), + scan_metadata=(scan_metadata if scan_metadata else manifest["scan_metadata"]), progress=( { # inject the auto-filled args **progress, @@ -421,12 +424,12 @@ async def _do_patch( }, } if progress - else resp["progress"] # not checking + else manifest["progress"] # should not change ), - scanner_server_args=resp["scanner_server_args"], # not checking + scanner_server_args=manifest["scanner_server_args"], # should not change ewms_task="use 'ewms_workflow_id'", - ewms_workflow_id="pending-ewms", - classifiers=resp["classifiers"], # not checking + ewms_workflow_id=manifest["ewms_workflow_id"], # should not change + classifiers=manifest["classifiers"], # should not change last_updated=resp["last_updated"], # see below priority=0, # TODO: check more fields in future (hint: ctrl+F this comment) @@ -443,8 +446,9 @@ async def _do_patch( async def 
_patch_progress_and_scan_metadata( rc: RestClient, scan_id: str, + manifest: sdict, n: int, -) -> StrDict: +) -> sdict: # send progress updates for i in range(n): progress = dict( @@ -465,18 +469,28 @@ async def _patch_progress_and_scan_metadata( ) # update progress (update `scan_metadata` sometimes--not as important) if i % 2: # odd - manifest = await _do_patch(rc, scan_id, progress=progress) + manifest = await _do_patch( + rc, + scan_id, + manifest, + progress=progress, + ) else: # even manifest = await _do_patch( rc, scan_id, + manifest, progress=progress, scan_metadata={"scan_id": scan_id, "foo": "bar"}, ) return manifest -async def _server_reply_with_event_metadata(rc: RestClient, scan_id: str) -> StrDict: +async def _server_reply_with_event_metadata( + rc: RestClient, + scan_id: str, + manifest: sdict, +) -> sdict: # reply as the scanner server with the newly gathered run+event ids event_id = 123 run_id = 456 @@ -489,7 +503,7 @@ async def _server_reply_with_event_metadata(rc: RestClient, scan_id: str) -> Str is_real_event=IS_REAL_EVENT, ) - manifest = await _do_patch(rc, scan_id, event_metadata=event_metadata) + manifest = await _do_patch(rc, scan_id, manifest, event_metadata=event_metadata) # query by run+event id resp = await rc.request( @@ -537,9 +551,9 @@ async def _server_reply_with_event_metadata(rc: RestClient, scan_id: str) -> Str async def _send_result( rc: RestClient, scan_id: str, - last_known_manifest: StrDict, + manifest: sdict, is_final: bool, -) -> StrDict: +) -> sdict: # send finished result result = {"alpha": (11 + 1) ** 11, "beta": -11} if is_final: @@ -558,7 +572,7 @@ async def _send_result( # query progress resp = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert resp == last_known_manifest + assert resp == manifest # query result resp = await rc.request("GET", f"/scan/{scan_id}/result") @@ -566,7 +580,7 @@ async def _send_result( # query scan resp = await rc.request("GET", f"/scan/{scan_id}") - assert resp["manifest"] == 
last_known_manifest + assert resp["manifest"] == manifest assert resp["result"] == result return result @@ -574,10 +588,10 @@ async def _send_result( async def _delete_scan( rc: RestClient, - event_metadata: StrDict, + event_metadata: sdict, scan_id: str, - last_known_manifest: StrDict, - last_known_result: StrDict, + manifest: sdict, + last_known_result: sdict, is_final: bool, delete_completed_scan: bool | None, ) -> None: @@ -595,11 +609,11 @@ async def _delete_scan( # only checking these fields: "scan_id": scan_id, "is_deleted": True, - "progress": last_known_manifest["progress"], + "progress": manifest["progress"], "ewms_task": { **resp["manifest"]["ewms_task"], # whether workforce is done - "complete": last_known_manifest["ewms_task"]["complete"], + "complete": manifest["ewms_task"]["complete"], }, "last_updated": resp["manifest"]["last_updated"], # see below # TODO: check more fields in future (hint: ctrl+F this comment) @@ -798,7 +812,7 @@ async def test_000( async def _after_scan_start_logic( rc: RestClient, - manifest: dict, + manifest: sdict, test_wait_before_teardown: float, ): scan_id = manifest["scan_id"] @@ -820,7 +834,7 @@ async def _after_scan_start_logic( # # INITIAL UPDATES # - manifest = await _server_reply_with_event_metadata(rc, scan_id) + manifest = await _server_reply_with_event_metadata(rc, scan_id, manifest) # follow-up query assert await rc.request("GET", f"/scan/{scan_id}/result") == {} resp = await rc.request("GET", f"/scan/{scan_id}") @@ -830,17 +844,17 @@ async def _after_scan_start_logic( # # ADD PROGRESS # - manifest = await _patch_progress_and_scan_metadata(rc, scan_id, 10) + manifest = await _patch_progress_and_scan_metadata(rc, scan_id, manifest, 10) # # SEND INTERMEDIATES (these can happen in any order, or even async) # # FIRST, clients send updates result = await _send_result(rc, scan_id, manifest, False) - manifest = await _patch_progress_and_scan_metadata(rc, scan_id, 10) + manifest = await 
_patch_progress_and_scan_metadata(rc, scan_id, manifest, 10) # THEN, clients send updates result = await _send_result(rc, scan_id, manifest, False) - manifest = await _patch_progress_and_scan_metadata(rc, scan_id, 10) + manifest = await _patch_progress_and_scan_metadata(rc, scan_id, manifest, 10) # # SEND RESULT(s) @@ -1092,7 +1106,7 @@ async def test_100__bad_data( # # INITIAL UPDATES # - manifest = await _server_reply_with_event_metadata(rc, scan_id) + manifest = await _server_reply_with_event_metadata(rc, scan_id, manifest) # follow-up query assert await rc.request("GET", f"/scan/{scan_id}/result") == {} resp = await rc.request("GET", f"/scan/{scan_id}") @@ -1109,6 +1123,7 @@ async def test_100__bad_data( await _do_patch( rc, scan_id, + manifest, event_metadata=dict( run_id=manifest["event_metadata"]["run_id"], event_id=manifest["event_metadata"]["event_id"], @@ -1144,7 +1159,7 @@ async def test_100__bad_data( print(e.value) # OK - manifest = await _patch_progress_and_scan_metadata(rc, scan_id, 10) + manifest = await _patch_progress_and_scan_metadata(rc, scan_id, manifest, 10) # ATTEMPT OVERWRITE with pytest.raises( @@ -1153,7 +1168,9 @@ async def test_100__bad_data( f"400 Client Error: Cannot change an existing scan_metadata for url: {rc.address}/scan/{scan_id}/manifest" ), ) as e: - await _do_patch(rc, scan_id, scan_metadata={"boo": "baz", "bot": "fox"}) + await _do_patch( + rc, scan_id, manifest, scan_metadata={"boo": "baz", "bot": "fox"} + ) # # SEND RESULT From 56f75a658c8dd97d070a8a9711b7f7b422bc6473 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:30:53 -0600 Subject: [PATCH 151/327] check manifest is not changing unexpectedly when patched - 2 --- tests/integration/test_rest_routes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index b2042499..f5c5bd30 100644 --- a/tests/integration/test_rest_routes.py +++ 
b/tests/integration/test_rest_routes.py @@ -826,10 +826,8 @@ async def _after_scan_start_logic( # wait backlogger to request to ewms assert int(os.environ["SCAN_BACKLOG_RUNNER_DELAY"]) await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_DELAY"]) * 5) # extra - ewms_workflow_id = (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ - "ewms_workflow_id" - ] - assert RE_UUID4HEX.fullmatch(ewms_workflow_id) + manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") + assert RE_UUID4HEX.fullmatch(manifest["ewms_workflow_id"]) # # INITIAL UPDATES From 4771373aed43e56750494ab4ee33a1e8cc37a765 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:40:42 -0600 Subject: [PATCH 152/327] fix dummy ewms --- tests/integration/dummy_ewms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index 1924135e..89d6983d 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -62,7 +62,7 @@ def dummy_query_taskforces(): resp = { "taskforces": [ { - "taskforce": f"TF-{workflow_id}", + "taskforce_uuid": f"TF-{workflow_id}", "phase": "the-best-phase-ever", } ] From 3cd0a19068469e352ef3e424164a040011d20269 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:43:25 -0600 Subject: [PATCH 153/327] flake8 --- tests/integration/test_rest_routes.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index f5c5bd30..1ca00af8 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -382,10 +382,10 @@ async def _assert_db_skyscank8sjobs_coll( async def _do_patch( rc: RestClient, scan_id: str, - manifest: sdict, - progress: sdict | None = None, - event_metadata: sdict | None = None, - scan_metadata: sdict | None = None, + manifest: sdict, + progress: sdict | None = None, + event_metadata: 
sdict | None = None, + scan_metadata: sdict | None = None, ) -> sdict: # do PATCH @ /scan/{scan_id}/manifest, assert response body = {} @@ -446,7 +446,7 @@ async def _do_patch( async def _patch_progress_and_scan_metadata( rc: RestClient, scan_id: str, - manifest: sdict, + manifest: sdict, n: int, ) -> sdict: # send progress updates @@ -487,9 +487,9 @@ async def _patch_progress_and_scan_metadata( async def _server_reply_with_event_metadata( - rc: RestClient, - scan_id: str, - manifest: sdict, + rc: RestClient, + scan_id: str, + manifest: sdict, ) -> sdict: # reply as the scanner server with the newly gathered run+event ids event_id = 123 @@ -551,7 +551,7 @@ async def _server_reply_with_event_metadata( async def _send_result( rc: RestClient, scan_id: str, - manifest: sdict, + manifest: sdict, is_final: bool, ) -> sdict: # send finished result @@ -588,10 +588,10 @@ async def _send_result( async def _delete_scan( rc: RestClient, - event_metadata: sdict, + event_metadata: sdict, scan_id: str, - manifest: sdict, - last_known_result: sdict, + manifest: sdict, + last_known_result: sdict, is_final: bool, delete_completed_scan: bool | None, ) -> None: @@ -812,7 +812,7 @@ async def test_000( async def _after_scan_start_logic( rc: RestClient, - manifest: sdict, + manifest: sdict, test_wait_before_teardown: float, ): scan_id = manifest["scan_id"] From d8a61a945e2e37b9b2aff4e43e2337844cdf245f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:47:40 -0600 Subject: [PATCH 154/327] fix scan end game --- skydriver/rest_handlers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 5a0a61a8..18360535 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -1003,7 +1003,10 @@ async def put(self, scan_id: str) -> None: WAIT_BEFORE_TEARDOWN ) # regular time.sleep() sleeps the entire server await stop_skyscan_workers( - self.manifests, scan_id, self.k8s_batch_api, 
abort=False + self.manifests, + scan_id, + self.ewms_rc, + abort=False, ) From 296ffe911ba8b4c27b73e975abd1baf7dbe15c41 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 15:48:57 -0600 Subject: [PATCH 155/327] flake8 --- skydriver/ewms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 51b267c9..b741087e 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -16,7 +16,7 @@ async def request_workflow_on_ewms( ewms_rc: RestClient, - s3_client: botocore.client.BaseClient, + s3_client: botocore.client.BaseClient, manifest: database.schema.Manifest, scan_request_obj: dict, ) -> str: From 2dcf30c0b886a4955f170a772c6d256d06185ac4 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 16:18:58 -0600 Subject: [PATCH 156/327] change `SCAN_FINISHED_SUCCESSFULLY` to be "got final result" --- skydriver/rest_handlers.py | 2 +- skydriver/utils.py | 14 +++++++++++--- tests/integration/dummy_ewms.py | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 18360535..a93b3fb3 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -1051,7 +1051,7 @@ async def get(self, scan_id: str) -> None: LOGGER.exception(e) # scan state - scan_state = await get_scan_state(manifest, self.ewms_rc) + scan_state = await get_scan_state(manifest, self.ewms_rc, self.results) # ewms if ( diff --git a/skydriver/utils.py b/skydriver/utils.py index 31a74dee..7ba5a083 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -4,7 +4,7 @@ from rest_tools.client import RestClient -from . import ewms +from . import database, ewms from .database.schema import DEPRECATED_EWMS_TASK, Manifest, PENDING_EWMS_WORKFLOW @@ -12,6 +12,9 @@ class _ScanState(enum.Enum): """A non-persisted scan state.""" SCAN_FINISHED_SUCCESSFULLY = enum.auto() + # ^^^ indicates the scanner sent finished results. 
in reality, the scanner or ewms + # could've crashed immediately after BUT the user only cares about the RESULTS--so, + # this would still be considered a SUCCESS in *this* context IN_PROGRESS__PARTIAL_RESULT_GENERATED = enum.auto() IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() @@ -19,9 +22,14 @@ class _ScanState(enum.Enum): PENDING__PRESTARTUP = enum.auto() -async def get_scan_state(manifest: Manifest, ewms_rc: RestClient) -> str: +async def get_scan_state( + manifest: Manifest, + ewms_rc: RestClient, + results: database.interface.ResultClient, +) -> str: """Determine the state of the scan by parsing attributes and talking with EWMS.""" - if manifest.progress and manifest.progress.processing_stats.finished: + if (await results.get(manifest.scan_id)).is_final: + # NOTE: see note on 'SCAN_FINISHED_SUCCESSFULLY' above return _ScanState.SCAN_FINISHED_SUCCESSFULLY.name def _has_cleared_backlog() -> bool: diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index 89d6983d..cf5a8a6b 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -62,7 +62,7 @@ def dummy_query_taskforces(): resp = { "taskforces": [ { - "taskforce_uuid": f"TF-{workflow_id}", + "taskforce_uuid": f"TF-{workflow_id['workflow_id']}", "phase": "the-best-phase-ever", } ] From 713a94709953b9102457df49973bdca480a00233 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 16:23:13 -0600 Subject: [PATCH 157/327] remove old test code --- tests/integration/test_rest_routes.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 1ca00af8..01baf1e5 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -610,11 +610,6 @@ async def _delete_scan( "scan_id": scan_id, "is_deleted": True, "progress": manifest["progress"], - "ewms_task": { - **resp["manifest"]["ewms_task"], - # whether workforce is done - "complete": 
manifest["ewms_task"]["complete"], - }, "last_updated": resp["manifest"]["last_updated"], # see below # TODO: check more fields in future (hint: ctrl+F this comment) }, From 709846d750063ae09c1571d537fb197b0d37b710 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 16:32:24 -0600 Subject: [PATCH 158/327] fix unit tests (see above change) --- tests/unit/test_scan_state.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index aafdfe19..2c5b05a3 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -1,6 +1,6 @@ """Test dynamically generating the scan state.""" -from unittest.mock import MagicMock, patch +from unittest.mock import AsyncMock, MagicMock, patch import pytest @@ -9,9 +9,19 @@ from skydriver.utils import get_scan_state -async def test_00__scan_finished_successfully() -> None: - """Test with SCAN_FINISHED_SUCCESSFULLY.""" +@pytest.mark.parametrize( + "processing_stats_is_finished", + [True, False], +) +async def test_00__scan_finished_successfully( + processing_stats_is_finished: bool, +) -> None: + """Test with SCAN_FINISHED_SUCCESSFULLY. 
+ + `processing_stats.is_finished` does not affect "SCAN_FINISHED_SUCCESSFULLY" + """ ewms_rc = MagicMock() + results = MagicMock(get=AsyncMock(return_value=MagicMock(is_final=True))) manifest = schema.Manifest( scan_id=MagicMock(), @@ -26,11 +36,14 @@ async def test_00__scan_finished_successfully() -> None: spec_set=["processing_stats"], # no magic strict attrs -- kind of like dict processing_stats=MagicMock( spec_set=["finished"], # no magic strict attrs -- kind of like dict - finished=True, + finished=processing_stats_is_finished, ), ), ) - assert await get_scan_state(manifest, ewms_rc) == "SCAN_FINISHED_SUCCESSFULLY" + + assert ( + await get_scan_state(manifest, ewms_rc, results) == "SCAN_FINISHED_SUCCESSFULLY" + ) @pytest.mark.parametrize( @@ -44,6 +57,7 @@ async def test_00__scan_finished_successfully() -> None: async def test_10__partial_result_generated(ewms_dtype: str | None, state: str) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() + results = MagicMock(get=AsyncMock(return_value=MagicMock(is_final=False))) manifest = schema.Manifest( scan_id=MagicMock(), @@ -68,7 +82,7 @@ async def test_10__partial_result_generated(ewms_dtype: str | None, state: str) ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc) == state + assert await get_scan_state(manifest, ewms_rc, results) == state @pytest.mark.parametrize( @@ -84,6 +98,7 @@ async def test_20__waiting_on_first_pixel_reco( ) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() + results = MagicMock(get=AsyncMock(return_value=MagicMock(is_final=False))) manifest = schema.Manifest( scan_id=MagicMock(), @@ -108,7 +123,7 @@ async def test_20__waiting_on_first_pixel_reco( ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc) == state + assert await get_scan_state(manifest, ewms_rc, results) == state @pytest.mark.parametrize( @@ 
-124,6 +139,7 @@ async def test_40__waiting_on_scanner_server_startup( ) -> None: """Test normal and stopped variants.""" ewms_rc = MagicMock() + results = MagicMock(get=AsyncMock(return_value=MagicMock(is_final=False))) manifest = schema.Manifest( scan_id=MagicMock(), @@ -138,7 +154,7 @@ async def test_40__waiting_on_scanner_server_startup( ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc) == state + assert await get_scan_state(manifest, ewms_rc, results) == state @pytest.mark.parametrize( @@ -152,6 +168,7 @@ async def test_40__waiting_on_scanner_server_startup( async def test_50__prestartup(ewms_dtype: str | None, state: str) -> None: """Test normal and stopped varriants.""" ewms_rc = MagicMock() + results = MagicMock(get=AsyncMock(return_value=MagicMock(is_final=False))) manifest = schema.Manifest( scan_id=MagicMock(), @@ -166,4 +183,4 @@ async def test_50__prestartup(ewms_dtype: str | None, state: str) -> None: ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc) == state + assert await get_scan_state(manifest, ewms_rc, results) == state From 7e95352726e70168da4bb1ee755085471aeb9b47 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 16:48:29 -0600 Subject: [PATCH 159/327] fix docker tag finagling test --- tests/integration/test_rest_routes.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 01baf1e5..b8a7595d 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -70,6 +70,7 @@ async def _launch_scan( rc: RestClient, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] post_scan_body: dict, + docker_tag_expected: str, ) -> dict: # launch scan launch_time = time.time() @@ -126,6 +127,7 @@ async def _launch_scan( mongo_client, post_scan_body, 
post_resp, + docker_tag_expected, ) await _assert_db_skyscank8sjobs_coll( mongo_client, @@ -142,6 +144,7 @@ async def _assert_db_scanrequests_coll( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] post_scan_body: dict, post_resp: dict, + docker_tag_expected: str, ) -> str: """Query the ScanRequests coll. @@ -155,7 +158,7 @@ async def _assert_db_scanrequests_coll( scan_id=post_resp["scan_id"], rescan_ids=[], # - docker_tag=os.environ["LATEST_TAG"], + docker_tag=docker_tag_expected, # # skyscan server config scanner_server_memory_bytes=humanfriendly.parse_size("1024M"), @@ -796,6 +799,7 @@ async def test_000( "docker_tag": docker_tag_input, "cluster": clusters, }, + docker_tag_expected, ) await _after_scan_start_logic( @@ -959,6 +963,7 @@ async def test_010__rescan( "docker_tag": "3.4.0", "cluster": clusters, }, + "3.4.0", ) await _after_scan_start_logic( rc, @@ -1088,6 +1093,7 @@ async def test_100__bad_data( rc, mongo_client, POST_SCAN_BODY_FOR_TEST_01, + os.environ["LATEST_TAG"], ) scan_id = manifest["scan_id"] # follow-up query From e537b70aa45baa67e2303132fb8a4b1b5d736a74 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 16:57:17 -0600 Subject: [PATCH 160/327] fix docker tag finagling test - 2 --- tests/integration/test_rest_routes.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index b8a7595d..51f88798 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -135,6 +135,7 @@ async def _launch_scan( post_resp, scanner_server_args, rest_address, + docker_tag_expected, ) return post_resp # type: ignore[no-any-return] @@ -195,6 +196,7 @@ async def _assert_db_skyscank8sjobs_coll( post_resp: dict, scanner_server_args: str, rest_address: str, + docker_tag_expected: str, ): # query the SkyScanK8sJobs coll # -> since the scanner-server metadata is no longer stored in the manifest @@ -253,7 +255,7 @@ 
async def _assert_db_skyscank8sjobs_coll( "scanner_server_env" ].items() ], - "image": f"icecube/skymap_scanner:{os.environ['LATEST_TAG']}", + "image": f"icecube/skymap_scanner:{docker_tag_expected}", "name": f'skyscan-server-{post_resp["scan_id"]}', "resources": { "limits": {"cpu": "1", "memory": "1024000000"}, From 4f95fa6bd9792f6978e03b7084cff0cfb30dabc3 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 17:15:09 -0600 Subject: [PATCH 161/327] fix docker tag finagling test - 3 --- .github/workflows/wipac-cicd.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index e7125f0a..9b5192e0 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -21,7 +21,7 @@ env: S3_SECRET_KEY__K8S_SECRET_KEY: cdf7c60b S3_BUCKET: 72017610 K8S_SECRET_NAME: super-secrets - MIN_SKYMAP_SCANNER_TAG: "v3.21.2" # TODO: remove once skyscan v4 is out (that's the real min) + MIN_SKYMAP_SCANNER_TAG: "v3.1.2" # TODO: remove once skyscan v4 is out (that's the real min) jobs: From 7866d5bae76e54c75ba71e49327660fc5856c10b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 17:21:35 -0600 Subject: [PATCH 162/327] tests: `cluster` typing --- tests/integration/test_rest_routes.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 51f88798..13a651d5 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -171,7 +171,11 @@ async def _assert_db_scanrequests_coll( classifiers=post_scan_body["classifiers"], # # cluster (condor) config - request_clusters=list([k, v] for k, v in post_scan_body["cluster"].items()), + request_clusters=( + list([k, v] for k, v in post_scan_body["cluster"].items()) + if isinstance(post_scan_body["cluster"], dict) + else post_scan_body["cluster"] + ), 
worker_memory_bytes=humanfriendly.parse_size("8GB"), worker_disk_bytes=humanfriendly.parse_size("1GB"), max_pixel_reco_time=post_scan_body["max_pixel_reco_time"], From 823279c9256b277946fc8b9b084f1c37d6b79d72 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 17:34:36 -0600 Subject: [PATCH 163/327] fix rescan (merge fix) --- skydriver/rest_handlers.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index a93b3fb3..87c2093d 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -5,7 +5,6 @@ import dataclasses as dc import json import logging -import pickle import re import time import uuid @@ -616,25 +615,24 @@ async def post(self, scan_id: str) -> None: new_scan_id = uuid.uuid4().hex # grab the original requester's 'scan_request_obj' - doc = await self.scan_request_coll.find_one_and_update( + scan_request_obj = await self.scan_request_coll.find_one_and_update( {"scan_id": scan_id}, {"$push": {"rescan_ids": new_scan_id}}, return_document=ReturnDocument.AFTER, ) # -> backup plan: was this scan_id actually a rescan itself? 
- if not doc: - doc = await self.scan_request_coll.find_one_and_update( + if not scan_request_obj: + scan_request_obj = await self.scan_request_coll.find_one_and_update( {"rescan_ids": scan_id}, # one in a list {"$push": {"rescan_ids": new_scan_id}}, return_document=ReturnDocument.AFTER, ) # -> error: couldn't find it anywhere - if not doc: + if not scan_request_obj: raise web.HTTPError( 404, log_message="Could not find original scan-request information to start a rescan", ) - scan_request_obj = pickle.loads(doc["scan_request_obj_pkl"]) # add to 'classifiers' so the user has provenance info scan_request_obj["classifiers"].update( From 9ae3734407fc6d31b55676a73d1a71b6685f9e16 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 28 Jan 2025 17:42:15 -0600 Subject: [PATCH 164/327] test: rescan test fix (wip) --- tests/integration/test_rest_routes.py | 67 +-------------------------- 1 file changed, 1 insertion(+), 66 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 13a651d5..7cb0b717 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -1,7 +1,6 @@ """Integration tests for the REST server.""" import asyncio -import copy import logging import os import pprint @@ -886,70 +885,6 @@ async def _after_scan_start_logic( POST_SCAN_BODY_FOR_TEST_01 = dict(**POST_SCAN_BODY, cluster={"foobar": 1}) -def _assert_manifests_equal_with_normalization( - manifest_beta: dict, manifest_alpha: dict -): - """ - Asserts that specific keys in two manifests are equal after normalization. - Handles dynamically generated fields such as UUIDs and scan IDs. - - Args: - manifest_beta (dict): The first manifest to compare. - manifest_alpha (dict): The second manifest to compare. - - Raises: - AssertionError: If any of the specified keys are not equal after normalization. 
- """ - keys_to_compare = [ - "i3_event_id", - "ewms_task", - "priority", - "scanner_server_args", - ] - - def normalize_ewms_task(ewms_task: dict) -> dict: - """ - Normalizes the `ewms_task` dictionary by redacting specific dynamic sub-keys. - """ - normalized = copy.deepcopy(ewms_task) - - # Normalize `env_vars.scanner_server` - for dicto in normalized["env_vars"]["scanner_server"]: - if dicto["name"] == "SKYSCAN_SKYDRIVER_SCAN_ID": - dicto["value"] = "" - # Normalize `env_vars.scanner_server` - for listo in normalized["env_vars"]["tms_starters"]: - for dicto in listo: - if dicto["name"] == "SKYSCAN_SKYDRIVER_SCAN_ID": - dicto["value"] = "" - - # Normalize `tms_args` - normalized["tms_args"] = [ - re.sub(r"--uuid [a-f0-9\-]+", "--uuid ", arg) - for arg in normalized["tms_args"] - ] - - return normalized - - for key in keys_to_compare: - if key == "ewms_task": - normalized_beta = normalize_ewms_task(manifest_beta[key]) - normalized_alpha = normalize_ewms_task(manifest_alpha[key]) - assert normalized_beta == normalized_alpha, ( - f"Mismatch in key '{key}':\n" - f"Beta: {normalized_beta}\n" - f"Alpha: {normalized_alpha}" - ) - else: - assert manifest_beta[key] == manifest_alpha[key], ( - f"Mismatch in key '{key}':\n" - f"Beta: {manifest_beta.get(key)}\n" - f"Alpha: {manifest_alpha.get(key)}" - ) - - assert manifest_beta["timestamp"] > manifest_alpha["timestamp"] - - async def test_010__rescan( server: Callable[[], RestClient], known_clusters: dict, @@ -987,7 +922,7 @@ async def test_010__rescan( **manifest_alpha["classifiers"], **{"rescan": True, "origin_scan_id": manifest_alpha["scan_id"]}, } - _assert_manifests_equal_with_normalization(manifest_beta, manifest_alpha) + assert manifest_beta == manifest_alpha # continue on... 
await _after_scan_start_logic( rc, From 849156484d85115087b023c1d411a5586eeb4ab0 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 11:16:45 -0600 Subject: [PATCH 165/327] tests: update rescan --- tests/integration/test_rest_routes.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 7cb0b717..99767970 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -922,7 +922,12 @@ async def test_010__rescan( **manifest_alpha["classifiers"], **{"rescan": True, "origin_scan_id": manifest_alpha["scan_id"]}, } - assert manifest_beta == manifest_alpha + skip_keys = ["classifiers", "scan_id", "last_updated", "timestamp"] + assert {k: v for k, v in manifest_beta.items() if k not in skip_keys} == { + k: v for k, v in manifest_alpha.items() if k not in skip_keys + } + for sk in skip_keys: + assert manifest_beta[sk] != manifest_alpha[sk] # continue on... 
await _after_scan_start_logic( rc, From 827792ae2b96eb672f897bc80625e27b2bdfff9b Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 29 Jan 2025 20:24:01 +0000 Subject: [PATCH 166/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index e68e5f75..4622bd94 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -14,7 +14,7 @@ certifi==2024.12.14 cffi==1.17.1 charset-normalizer==3.4.1 cryptography==44.0.0 -dacite==1.8.1 +dacite==1.9.1 dnspython==2.7.0 durationpy==0.9 google-auth==2.38.0 @@ -69,7 +69,7 @@ skydriver-s3-sidecar-ewms-init-container │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] -├── dacite [required: Any, installed: 1.8.1] +├── dacite [required: Any, installed: 1.9.1] ├── humanfriendly [required: Any, installed: 10.0] ├── kubernetes [required: Any, installed: 32.0.0] │ ├── certifi [required: >=14.05.14, installed: 2024.12.14] From 1d6c02e0a79aea7d13dfe7fdb2e580c3a5bc3bfd Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 14:33:59 -0600 Subject: [PATCH 167/327] fix rescan backlog fetching --- skydriver/k8s/scan_backlog.py | 11 ++++++++--- skydriver/rest_handlers.py | 18 +++++++++--------- 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index bfdb5a75..d346903f 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -69,7 +69,12 @@ async def get_next( # grab the scan request object--it has other info scan_request_obj = await scan_request_client.find_one( # type: ignore[attr-defined] - {"scan_id": manifest.scan_id} + { + "$or": [ + {"scan_id": manifest.scan_id}, + {"rescan_ids": manifest.scan_id}, # one in 
a list + ] + } ) # grab the k8s @@ -86,7 +91,7 @@ async def run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, ewms_rc: RestClient, - s3_client: botocore.client.BaseClient, + s3_client: botocore.client.BaseClient, ) -> None: """Error-handling around the scan backlog runner loop.""" LOGGER.info("Started scan backlog runner.") @@ -108,7 +113,7 @@ async def _run( mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, ewms_rc: RestClient, - s3_client: botocore.client.BaseClient, + s3_client: botocore.client.BaseClient, ) -> None: """The (actual) main loop.""" manifest_client = database.interface.ManifestClient(mongo_client) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 87c2093d..e45d5f12 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -614,19 +614,19 @@ async def post(self, scan_id: str) -> None: # generate unique scan_id new_scan_id = uuid.uuid4().hex - # grab the original requester's 'scan_request_obj' + # grab the 'scan_request_obj' scan_request_obj = await self.scan_request_coll.find_one_and_update( - {"scan_id": scan_id}, + { + "$or": [ + # grab the original requester's 'scan_request_obj' + {"scan_id": scan_id}, + # -> backup plan: was this scan_id actually a rescan itself? + {"rescan_ids": scan_id}, # one in a list + ] + }, {"$push": {"rescan_ids": new_scan_id}}, return_document=ReturnDocument.AFTER, ) - # -> backup plan: was this scan_id actually a rescan itself? 
- if not scan_request_obj: - scan_request_obj = await self.scan_request_coll.find_one_and_update( - {"rescan_ids": scan_id}, # one in a list - {"$push": {"rescan_ids": new_scan_id}}, - return_document=ReturnDocument.AFTER, - ) # -> error: couldn't find it anywhere if not scan_request_obj: raise web.HTTPError( From fc6816abc71b4133228b71018e242e6612606a7c Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 14:35:27 -0600 Subject: [PATCH 168/327] pin `dacite<1.9` --- setup.cfg | 79 +++++++++++++++++++++++++++---------------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index 6174fed2..09ec6421 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.10 python_max = 3.13 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,51 +26,50 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools + aiocache + boto3 + dacite<1.9 + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools python_requires = >=3.10, <3.14 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio - flask + pytest + pytest-asyncio + pytest-mock + nest-asyncio + flask mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed 
[options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples - + test + tests + doc + docs + resource + resources + example + examples From a4f7e2efb33a5346847c16b7928dd5dc55b6c05a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 14:36:15 -0600 Subject: [PATCH 169/327] py 3.13 only --- .github/workflows/wipac-cicd.yml | 4 ++++ setup.cfg | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 9b5192e0..5ee8671b 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -177,6 +177,10 @@ jobs: - 27017:27017 steps: - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.py3 }} + - uses: docker/setup-buildx-action@v2 - uses: docker/build-push-action@v3 with: diff --git a/setup.cfg b/setup.cfg index 09ec6421..2bb26371 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [wipac:cicd_setup_builder] -python_min = 3.10 +python_min = 3.13 python_max = 3.13 patch_without_tag = False package_dirs = From 9f3a852a802c790a1445e448f84c4320b24dc05b Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 29 Jan 2025 20:36:46 +0000 Subject: [PATCH 170/327] update setup.cfg --- setup.cfg | 81 ++++++++++++++++++++++++++++--------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/setup.cfg b/setup.cfg index 2bb26371..d84b44ed 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,15 +3,15 @@ python_min = 3.13 python_max = 3.13 patch_without_tag = False package_dirs = - skydriver - s3_sidecar - ewms_init_container + skydriver + s3_sidecar + ewms_init_container [metadata] # generated by 
wipac:cicd_setup_builder: name, version, keywords version = attr: skydriver.__version__ keywords = - WIPAC - IceCube + WIPAC + IceCube name = skydriver-s3-sidecar-ewms-init-container [semantic_release] # fully-generated by wipac:cicd_setup_builder @@ -26,50 +26,51 @@ branch = main [options] # generated by wipac:cicd_setup_builder: python_requires, packages install_requires = - aiocache - boto3 - dacite<1.9 - humanfriendly - kubernetes - motor==3.3.2 - pymongo==4.6.1 - pyyaml - requests - tornado - typeguard - wipac-dev-tools - wipac-rest-tools -python_requires = >=3.10, <3.14 + aiocache + boto3 + dacite<1.9 + humanfriendly + kubernetes + motor==3.3.2 + pymongo==4.6.1 + pyyaml + requests + tornado + typeguard + wipac-dev-tools + wipac-rest-tools +python_requires = >=3.13, <3.14 packages = find: [options.extras_require] tests = - pytest - pytest-asyncio - pytest-mock - nest-asyncio - flask + pytest + pytest-asyncio + pytest-mock + nest-asyncio + flask mypy = - %(tests)s - texttable + %(tests)s + texttable [options.package_data] # generated by wipac:cicd_setup_builder: '*' * = py.typed [options.packages.find] # generated by wipac:cicd_setup_builder: include/exclude include = - skydriver - s3_sidecar - ewms_init_container - skydriver.* - s3_sidecar.* - ewms_init_container.* + skydriver + s3_sidecar + ewms_init_container + skydriver.* + s3_sidecar.* + ewms_init_container.* exclude = - test - tests - doc - docs - resource - resources - example - examples + test + tests + doc + docs + resource + resources + example + examples + From 09bd9f9433591fd155f0e647ea990542271362be Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 29 Jan 2025 20:41:49 +0000 Subject: [PATCH 171/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 4622bd94..007a8fa9 100644 --- a/dependencies-docker-skydriver.log +++ 
b/dependencies-docker-skydriver.log @@ -14,7 +14,7 @@ certifi==2024.12.14 cffi==1.17.1 charset-normalizer==3.4.1 cryptography==44.0.0 -dacite==1.9.1 +dacite==1.8.1 dnspython==2.7.0 durationpy==0.9 google-auth==2.38.0 @@ -69,7 +69,7 @@ skydriver-s3-sidecar-ewms-init-container │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] -├── dacite [required: Any, installed: 1.9.1] +├── dacite [required: <1.9, installed: 1.8.1] ├── humanfriendly [required: Any, installed: 10.0] ├── kubernetes [required: Any, installed: 32.0.0] │ ├── certifi [required: >=14.05.14, installed: 2024.12.14] From 4d84ca3e158376714e1aba1665942c35cf184551 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 15:33:17 -0600 Subject: [PATCH 172/327] continue redefining what finished/completed scan is - wip --- skydriver/rest_handlers.py | 17 +++++++---------- skydriver/utils.py | 19 +++++++++++++------ tests/unit/test_scan_state.py | 13 +++++++------ 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index e45d5f12..190c85a1 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -26,7 +26,7 @@ from tornado import web from wipac_dev_tools import argparse_tools -from . import database, ewms, images, k8s, utils +from . 
import database, ewms, images, k8s from .config import ( DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, DEFAULT_MAX_WORKER_RUNTIME, @@ -741,11 +741,8 @@ async def delete(self, scan_id: str) -> None: # check DB states manifest = await self.manifests.get(scan_id, True) - if ( - manifest.progress - and manifest.progress.processing_stats.finished - and not args.delete_completed_scan - ): + _, scan_complete = await get_scan_state(manifest, self.ewms_rc, self.results) + if scan_complete: msg = "Attempted to delete a completed scan (must use `delete_completed_scan=True`)" raise web.HTTPError( 400, @@ -1049,7 +1046,9 @@ async def get(self, scan_id: str) -> None: LOGGER.exception(e) # scan state - scan_state = await get_scan_state(manifest, self.ewms_rc, self.results) + scan_state, scan_complete = await get_scan_state( + manifest, self.ewms_rc, self.results + ) # ewms if ( @@ -1066,9 +1065,7 @@ async def get(self, scan_id: str) -> None: resp = { "scan_state": scan_state, "is_deleted": manifest.is_deleted, - "scan_complete": bool( - scan_state == utils._ScanState.SCAN_FINISHED_SUCCESSFULLY.name - ), + "scan_complete": scan_complete, "pods": pods_411, "clusters": clusters, } diff --git a/skydriver/utils.py b/skydriver/utils.py index 7ba5a083..ec3900e1 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -18,6 +18,7 @@ class _ScanState(enum.Enum): IN_PROGRESS__PARTIAL_RESULT_GENERATED = enum.auto() IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO = enum.auto() + PENDING__WAITING_ON_SCANNER_SERVER_STARTUP = enum.auto() PENDING__PRESTARTUP = enum.auto() @@ -26,11 +27,16 @@ async def get_scan_state( manifest: Manifest, ewms_rc: RestClient, results: database.interface.ResultClient, -) -> str: - """Determine the state of the scan by parsing attributes and talking with EWMS.""" +) -> tuple[str, bool]: + """Determine the state of the scan by parsing attributes and talking with EWMS. + + Returns tuple: + 1. the state as a human-readable string + 2. 
a bool for whether the scan was successful or not + """ if (await results.get(manifest.scan_id)).is_final: # NOTE: see note on 'SCAN_FINISHED_SUCCESSFULLY' above - return _ScanState.SCAN_FINISHED_SUCCESSFULLY.name + return _ScanState.SCAN_FINISHED_SUCCESSFULLY.name, True def _has_cleared_backlog() -> bool: return bool( @@ -73,11 +79,12 @@ def get_nonfinished_state() -> _ScanState: and isinstance(manifest.ewms_task, dict) and manifest.ewms_task.get("complete") ): - return f"STOPPED__{state.split('__')[1]}" # we didn't have info on what kind of stop + # we didn't have info on what kind of stop + return f"STOPPED__{state.split('__')[1]}", False # has EWMS ceased running the scan workers? elif dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id): # -> yes, the ewms workflow has been deactivated - return f"{dtype.upper()}__{state.split('__')[1]}" + return f"{dtype.upper()}__{state.split('__')[1]}", False else: # -> no, this is a non-finished scan - return state + return state, False diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index 2c5b05a3..d38b445e 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -41,8 +41,9 @@ async def test_00__scan_finished_successfully( ), ) - assert ( - await get_scan_state(manifest, ewms_rc, results) == "SCAN_FINISHED_SUCCESSFULLY" + assert await get_scan_state(manifest, ewms_rc, results) == ( + "SCAN_FINISHED_SUCCESSFULLY", + True, ) @@ -82,7 +83,7 @@ async def test_10__partial_result_generated(ewms_dtype: str | None, state: str) ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc, results) == state + assert await get_scan_state(manifest, ewms_rc, results) == (state, False) @pytest.mark.parametrize( @@ -123,7 +124,7 @@ async def test_20__waiting_on_first_pixel_reco( ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, 
ewms_rc, results) == state + assert await get_scan_state(manifest, ewms_rc, results) == (state, False) @pytest.mark.parametrize( @@ -154,7 +155,7 @@ async def test_40__waiting_on_scanner_server_startup( ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc, results) == state + assert await get_scan_state(manifest, ewms_rc, results) == (state, False) @pytest.mark.parametrize( @@ -183,4 +184,4 @@ async def test_50__prestartup(ewms_dtype: str | None, state: str) -> None: ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc, results) == state + assert await get_scan_state(manifest, ewms_rc, results) == (state, False) From bdee3d9ba40cacecf5bbe3b72b75a2b957f76998 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 15:43:12 -0600 Subject: [PATCH 173/327] continue redefining what finished/completed scan is - final --- README.md | 4 ++-- skydriver/rest_handlers.py | 13 ++++++------- skydriver/utils.py | 23 +++++++++++++---------- tests/unit/test_scan_state.py | 19 ++++++++----------- 4 files changed, 29 insertions(+), 30 deletions(-) diff --git a/README.md b/README.md index 088271b4..822b0a8d 100644 --- a/README.md +++ b/README.md @@ -325,8 +325,8 @@ None There are several codes for `scan_state`: -- Successful state - * `SCAN_FINISHED_SUCCESSFULLY` +- Successful state (completed scan) + * `SCAN_HAS_FINAL_RESULT` - Non-finished scan states (in reverse order of occurrence) * `IN_PROGRESS__PARTIAL_RESULT_GENERATED` * `IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO` diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 190c85a1..91014e01 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -42,7 +42,7 @@ from .ewms import request_stop_on_ewms from .k8s.scan_backlog import put_on_backlog from .k8s.scanner_instance import SkyScanK8sJobFactory -from .utils import get_scan_state +from 
.utils import does_scan_state_indicate_final_result_received, get_scan_state LOGGER = logging.getLogger(__name__) @@ -741,8 +741,9 @@ async def delete(self, scan_id: str) -> None: # check DB states manifest = await self.manifests.get(scan_id, True) - _, scan_complete = await get_scan_state(manifest, self.ewms_rc, self.results) - if scan_complete: + if does_scan_state_indicate_final_result_received( + await get_scan_state(manifest, self.ewms_rc, self.results) + ): msg = "Attempted to delete a completed scan (must use `delete_completed_scan=True`)" raise web.HTTPError( 400, @@ -1046,9 +1047,7 @@ async def get(self, scan_id: str) -> None: LOGGER.exception(e) # scan state - scan_state, scan_complete = await get_scan_state( - manifest, self.ewms_rc, self.results - ) + scan_state = await get_scan_state(manifest, self.ewms_rc, self.results) # ewms if ( @@ -1065,7 +1064,7 @@ async def get(self, scan_id: str) -> None: resp = { "scan_state": scan_state, "is_deleted": manifest.is_deleted, - "scan_complete": scan_complete, + "scan_complete": does_scan_state_indicate_final_result_received(scan_state), "pods": pods_411, "clusters": clusters, } diff --git a/skydriver/utils.py b/skydriver/utils.py index ec3900e1..6e43f095 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -11,7 +11,7 @@ class _ScanState(enum.Enum): """A non-persisted scan state.""" - SCAN_FINISHED_SUCCESSFULLY = enum.auto() + SCAN_HAS_FINAL_RESULT = enum.auto() # ^^^ indicates the scanner sent finished results. 
in reality, the scanner or ewms # could've crashed immediately after BUT the user only cares about the RESULTS--so, # this would still be considered a SUCCESS in *this* context @@ -23,20 +23,23 @@ class _ScanState(enum.Enum): PENDING__PRESTARTUP = enum.auto() +def does_scan_state_indicate_final_result_received(state: str) -> bool: + """Does the scan state indicate has result?""" + return state == _ScanState.SCAN_HAS_FINAL_RESULT.value + + async def get_scan_state( manifest: Manifest, ewms_rc: RestClient, results: database.interface.ResultClient, -) -> tuple[str, bool]: +) -> str: """Determine the state of the scan by parsing attributes and talking with EWMS. - Returns tuple: - 1. the state as a human-readable string - 2. a bool for whether the scan was successful or not + Returns the state as a human-readable string """ if (await results.get(manifest.scan_id)).is_final: - # NOTE: see note on 'SCAN_FINISHED_SUCCESSFULLY' above - return _ScanState.SCAN_FINISHED_SUCCESSFULLY.name, True + # NOTE: see note on 'SCAN_HAS_FINAL_RESULT' above + return _ScanState.SCAN_HAS_FINAL_RESULT.name def _has_cleared_backlog() -> bool: return bool( @@ -80,11 +83,11 @@ def get_nonfinished_state() -> _ScanState: and manifest.ewms_task.get("complete") ): # we didn't have info on what kind of stop - return f"STOPPED__{state.split('__')[1]}", False + return f"STOPPED__{state.split('__')[1]}" # has EWMS ceased running the scan workers? 
elif dtype := await ewms.get_deactivated_type(ewms_rc, manifest.ewms_workflow_id): # -> yes, the ewms workflow has been deactivated - return f"{dtype.upper()}__{state.split('__')[1]}", False + return f"{dtype.upper()}__{state.split('__')[1]}" else: # -> no, this is a non-finished scan - return state, False + return state diff --git a/tests/unit/test_scan_state.py b/tests/unit/test_scan_state.py index d38b445e..e154247a 100644 --- a/tests/unit/test_scan_state.py +++ b/tests/unit/test_scan_state.py @@ -13,12 +13,12 @@ "processing_stats_is_finished", [True, False], ) -async def test_00__scan_finished_successfully( +async def test_00__scan_has_final_result( processing_stats_is_finished: bool, ) -> None: - """Test with SCAN_FINISHED_SUCCESSFULLY. + """Test with SCAN_HAS_FINAL_RESULT. - `processing_stats.is_finished` does not affect "SCAN_FINISHED_SUCCESSFULLY" + `processing_stats.is_finished` does not affect "SCAN_HAS_FINAL_RESULT" """ ewms_rc = MagicMock() results = MagicMock(get=AsyncMock(return_value=MagicMock(is_final=True))) @@ -41,10 +41,7 @@ async def test_00__scan_finished_successfully( ), ) - assert await get_scan_state(manifest, ewms_rc, results) == ( - "SCAN_FINISHED_SUCCESSFULLY", - True, - ) + assert await get_scan_state(manifest, ewms_rc, results) == "SCAN_HAS_FINAL_RESULT" @pytest.mark.parametrize( @@ -83,7 +80,7 @@ async def test_10__partial_result_generated(ewms_dtype: str | None, state: str) ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc, results) == (state, False) + assert await get_scan_state(manifest, ewms_rc, results) == state @pytest.mark.parametrize( @@ -124,7 +121,7 @@ async def test_20__waiting_on_first_pixel_reco( ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc, results) == (state, False) + assert await get_scan_state(manifest, ewms_rc, results) == state @pytest.mark.parametrize( @@ -155,7 
+152,7 @@ async def test_40__waiting_on_scanner_server_startup( ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc, results) == (state, False) + assert await get_scan_state(manifest, ewms_rc, results) == state @pytest.mark.parametrize( @@ -184,4 +181,4 @@ async def test_50__prestartup(ewms_dtype: str | None, state: str) -> None: ) with patch("skydriver.ewms.get_deactivated_type", return_value=ewms_dtype): - assert await get_scan_state(manifest, ewms_rc, results) == (state, False) + assert await get_scan_state(manifest, ewms_rc, results) == state From 8a43972f984540a07cd51b5251bc8e0bce2fb5a2 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 29 Jan 2025 21:47:55 +0000 Subject: [PATCH 174/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 007a8fa9..e3117f5c 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.8 -botocore==1.36.8 +boto3==1.36.9 +botocore==1.36.9 cachetools==5.5.1 certifi==2024.12.14 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.8] -│ ├── botocore [required: >=1.36.8,<1.37.0, installed: 1.36.8] +├── boto3 [required: Any, installed: 1.36.9] +│ ├── botocore [required: >=1.36.9,<1.37.0, installed: 1.36.9] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── 
jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.8] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.9] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 978b5e8c1ed0ca8cca95efe1036649baed0f7830 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 16:08:01 -0600 Subject: [PATCH 175/327] continue redefining what finished/completed scan is - 2 --- skydriver/utils.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index 6e43f095..14675a98 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -3,6 +3,7 @@ import enum from rest_tools.client import RestClient +from tornado import web from . import database, ewms from .database.schema import DEPRECATED_EWMS_TASK, Manifest, PENDING_EWMS_WORKFLOW @@ -37,9 +38,14 @@ async def get_scan_state( Returns the state as a human-readable string """ - if (await results.get(manifest.scan_id)).is_final: - # NOTE: see note on 'SCAN_HAS_FINAL_RESULT' above - return _ScanState.SCAN_HAS_FINAL_RESULT.name + try: + if (await results.get(manifest.scan_id)).is_final: + # NOTE: see note on 'SCAN_HAS_FINAL_RESULT' above + return _ScanState.SCAN_HAS_FINAL_RESULT.name + except web.HTTPError as e: + # get() raises 404 when no result found + if e.status_code != 404: + raise def _has_cleared_backlog() -> bool: return bool( From ec9344aa88d6d3fafa8ebe7b74de53a39528d704 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 16:10:24 -0600 Subject: [PATCH 176/327] flake8 --- skydriver/utils.py | 70 ++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 34 deletions(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index 14675a98..7cfa0f2b 100644 --- 
a/skydriver/utils.py +++ b/skydriver/utils.py @@ -29,6 +29,41 @@ def does_scan_state_indicate_final_result_received(state: str) -> bool: return state == _ScanState.SCAN_HAS_FINAL_RESULT.value +def _has_cleared_backlog(manifest: Manifest) -> bool: + return bool( + ( # has a real workflow id + manifest.ewms_workflow_id + and manifest.ewms_workflow_id != PENDING_EWMS_WORKFLOW + ) + or ( # backward compatibility... + manifest.ewms_task != DEPRECATED_EWMS_TASK + and isinstance(manifest.ewms_task, dict) + and manifest.ewms_task.get("clusters") + ) + ) + + +def _get_nonfinished_state(manifest: Manifest) -> _ScanState: + """Get the ScanState of the scan, only by parsing attributes.""" + # has scan cleared the backlog? (aka, has been *submitted* EWMS?) + if _has_cleared_backlog(manifest): + # has the scanner server started? + if manifest.progress: + # how far along is the scanner server? + # seen some pixels -> aka clients have processed pixels + if manifest.progress.processing_stats.rate: + return _ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED + # 0% -> aka clients haven't finished any pixels (yet) + else: + return _ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO + # no -> hasn't started yet + else: + return _ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP + # no -> still in backlog (or aborted while in backlog) + else: + return _ScanState.PENDING__PRESTARTUP + + async def get_scan_state( manifest: Manifest, ewms_rc: RestClient, @@ -47,40 +82,7 @@ async def get_scan_state( if e.status_code != 404: raise - def _has_cleared_backlog() -> bool: - return bool( - ( # has a real workflow id - manifest.ewms_workflow_id - and manifest.ewms_workflow_id != PENDING_EWMS_WORKFLOW - ) - or ( # backward compatibility... 
- manifest.ewms_task != DEPRECATED_EWMS_TASK - and isinstance(manifest.ewms_task, dict) - and manifest.ewms_task.get("clusters") - ) - ) - - def get_nonfinished_state() -> _ScanState: - """Get the ScanState of the scan, only by parsing attributes.""" - # has scan cleared the backlog? (aka, has been *submitted* EWMS?) - if _has_cleared_backlog(): - # has the scanner server started? - if manifest.progress: - # how far along is the scanner server? - # seen some pixels -> aka clients have processed pixels - if manifest.progress.processing_stats.rate: - return _ScanState.IN_PROGRESS__PARTIAL_RESULT_GENERATED - # 0% -> aka clients haven't finished any pixels (yet) - else: - return _ScanState.IN_PROGRESS__WAITING_ON_FIRST_PIXEL_RECO - # no -> hasn't started yet - else: - return _ScanState.PENDING__WAITING_ON_SCANNER_SERVER_STARTUP - # no -> still in backlog (or aborted while in backlog) - else: - return _ScanState.PENDING__PRESTARTUP - - state = get_nonfinished_state().name # start here, augment if needed + state = _get_nonfinished_state(manifest).name # start here, augment if needed # AUGMENT STATUS... if ( # Backward Compatibility: is this an old/pre-ewms scan? 
From 6bcf730a17e76fb85991939a72149c0d37115408 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 16:15:27 -0600 Subject: [PATCH 177/327] continue redefining what finished/completed scan is - 3 --- skydriver/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index 7cfa0f2b..623cf9b0 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -25,8 +25,8 @@ class _ScanState(enum.Enum): def does_scan_state_indicate_final_result_received(state: str) -> bool: - """Does the scan state indicate has result?""" - return state == _ScanState.SCAN_HAS_FINAL_RESULT.value + """Has the scan ended with a final result?""" + return state == _ScanState.SCAN_HAS_FINAL_RESULT.name def _has_cleared_backlog(manifest: Manifest) -> bool: From b91fa1a85da815d71747db45383cc0a5869833dc Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 16:21:22 -0600 Subject: [PATCH 178/327] continue redefining what finished/completed scan is - 4 --- skydriver/rest_handlers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 91014e01..0454456b 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -741,8 +741,11 @@ async def delete(self, scan_id: str) -> None: # check DB states manifest = await self.manifests.get(scan_id, True) - if does_scan_state_indicate_final_result_received( - await get_scan_state(manifest, self.ewms_rc, self.results) + if ( + not args.delete_completed_scan + and does_scan_state_indicate_final_result_received( + await get_scan_state(manifest, self.ewms_rc, self.results) + ) ): msg = "Attempted to delete a completed scan (must use `delete_completed_scan=True`)" raise web.HTTPError( From fa7de703b89475f5e9d9d4de524ca73521b27238 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 17:22:39 -0600 Subject: [PATCH 179/327] misc --- resources/rewrite-db.sh | 1 + 1 file changed, 1 
insertion(+) diff --git a/resources/rewrite-db.sh b/resources/rewrite-db.sh index 17b33d24..4cc5dfa4 100755 --- a/resources/rewrite-db.sh +++ b/resources/rewrite-db.sh @@ -1,4 +1,5 @@ #!/bin/bash +set -euo pipefail set -ex ######################################################################################## From c7ed96635ac11dbbbe6fe4cd221207f745a01244 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 29 Jan 2025 17:27:46 -0600 Subject: [PATCH 180/327] remove pylint comments --- skydriver/rest_handlers.py | 28 ++++++++++++------------ tests/integration/test_backlog_runner.py | 2 -- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 0454456b..94f915fa 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -117,10 +117,10 @@ def _arg_dict_strict(val: Any) -> dict: # handlers -class BaseSkyDriverHandler(RestHandler): # pylint: disable=W0223 +class BaseSkyDriverHandler(RestHandler): """BaseSkyDriverHandler is a RestHandler for all SkyDriver routes.""" - def initialize( # type: ignore # pylint: disable=W0221 + def initialize( # type: ignore self, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] k8s_batch_api: kubernetes.client.BatchV1Api, @@ -130,7 +130,7 @@ def initialize( # type: ignore # pylint: disable=W0221 ) -> None: """Initialize a BaseSkyDriverHandler object.""" super().initialize(*args, **kwargs) # type: ignore[no-untyped-call] - # pylint: disable=W0201 + self.manifests = database.interface.ManifestClient(mongo_client) self.results = database.interface.ResultClient(mongo_client) self.scan_backlog = database.interface.ScanBacklogClient(mongo_client) @@ -159,7 +159,7 @@ def initialize( # type: ignore # pylint: disable=W0221 # ---------------------------------------------------------------------------- -class MainHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class MainHandler(BaseSkyDriverHandler): """MainHandler is a BaseSkyDriverHandler that 
handles the root route.""" ROUTE = r"/$" @@ -173,7 +173,7 @@ async def get(self) -> None: # ----------------------------------------------------------------------------- -class ScansFindHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScansFindHandler(BaseSkyDriverHandler): """Handles finding scans by attributes.""" ROUTE = r"/scans/find$" @@ -217,7 +217,7 @@ async def post(self) -> None: # ----------------------------------------------------------------------------- -class ScanBacklogHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanBacklogHandler(BaseSkyDriverHandler): """Handles looking at backlog.""" ROUTE = r"/scans/backlog$" @@ -335,7 +335,7 @@ def _data_size_parse(val: Any) -> int: raise argparse.ArgumentTypeError("invalid data size") -class ScanLauncherHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanLauncherHandler(BaseSkyDriverHandler): """Handles starting new scans.""" ROUTE = r"/scan$" @@ -595,7 +595,7 @@ async def _start_scan( # ----------------------------------------------------------------------------- -class ScanRescanHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanRescanHandler(BaseSkyDriverHandler): """Handles actions on copying a scan's manifest and starting that.""" ROUTE = r"/scan/(?P\w+)/actions/rescan$" @@ -717,7 +717,7 @@ async def get_result_safely( # ----------------------------------------------------------------------------- -class ScanHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanHandler(BaseSkyDriverHandler): """Handles actions on scan's manifest.""" ROUTE = r"/scan/(?P\w+)$" @@ -804,7 +804,7 @@ async def get(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- -class ScanManifestHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanManifestHandler(BaseSkyDriverHandler): """Handles actions on scan's manifest.""" ROUTE = r"/scan/(?P\w+)/manifest$" @@ -900,7 +900,7 @@ def 
from_dict_wrapper_or_none(data_class: Type[T], val: Any) -> T | None: # ----------------------------------------------------------------------------- -class ScanI3EventHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanI3EventHandler(BaseSkyDriverHandler): """Handles grabbing i3 events using scan ids.""" ROUTE = r"/scan/(?P\w+)/i3-event$" @@ -942,7 +942,7 @@ async def get(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- -class ScanResultHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanResultHandler(BaseSkyDriverHandler): """Handles actions on persisted scan results.""" ROUTE = r"/scan/(?P\w+)/result$" @@ -1012,7 +1012,7 @@ async def put(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- -class ScanStatusHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanStatusHandler(BaseSkyDriverHandler): """Handles relying statuses for scans.""" ROUTE = r"/scan/(?P\w+)/status$" @@ -1083,7 +1083,7 @@ async def get(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- -class ScanLogsHandler(BaseSkyDriverHandler): # pylint: disable=W0223 +class ScanLogsHandler(BaseSkyDriverHandler): """Handles relaying logs for scans.""" ROUTE = r"/scan/(?P\w+)/logs$" diff --git a/tests/integration/test_backlog_runner.py b/tests/integration/test_backlog_runner.py index e482c559..f29f3ee1 100644 --- a/tests/integration/test_backlog_runner.py +++ b/tests/integration/test_backlog_runner.py @@ -1,7 +1,5 @@ """Integration tests for backlog runner.""" -# pylint: disable=redefined-outer-name - import asyncio import json from typing import Any, Callable From 539e13a627f3204a673a70ccd64ab592e0d07d07 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 30 Jan 2025 14:44:37 -0600 Subject: [PATCH 181/327] add fix from #127 --- skydriver/k8s/scan_backlog.py | 5 +++++ 1 file changed, 5 
insertions(+) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index d346903f..6d3b724e 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -190,3 +190,8 @@ async def _run( # remove from backlog now that startup succeeded await backlog_client.remove(entry) # TODO: remove k8s job doc? + + # wait so to not overwhelm resources (also, see `sleep()` at top) + await asyncio.sleep( + ENV.SCAN_BACKLOG_RUNNER_DELAY - ENV.SCAN_BACKLOG_RUNNER_SHORT_DELAY + ) From 67d8718fc26fecee1a2d1039f03dbdb4627f3d74 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 30 Jan 2025 14:56:13 -0600 Subject: [PATCH 182/327] remove unused env vars --- skydriver/config.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index aa5d52e6..189276d2 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -104,11 +104,6 @@ class EnvConfig: SKYSCAN_LOG: Optional[str] = None SKYSCAN_LOG_THIRD_PARTY: Optional[str] = None - # EWMS (forwarded) - EWMS_PILOT_QUARANTINE_TIME: Optional[int] = None - EWMS_TMS_S3_BUCKET: str = "" - EWMS_TMS_S3_URL: str = "" - def __post_init__(self) -> None: object.__setattr__(self, "LOG_LEVEL", self.LOG_LEVEL.upper()) # b/c frozen From fbb5880ebe2c474762237319cff50c011731fd63 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 30 Jan 2025 15:45:38 -0600 Subject: [PATCH 183/327] remove unused env vars - 2 --- skydriver/config.py | 2 -- skydriver/k8s/scanner_instance.py | 5 ----- tests/integration/test_rest_routes.py | 1 - 3 files changed, 8 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 189276d2..360c440e 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -91,8 +91,6 @@ class EnvConfig: # keycloak KEYCLOAK_OIDC_URL: str = "" - KEYCLOAK_CLIENT_ID_BROKER: str = "" - KEYCLOAK_CLIENT_SECRET_BROKER: str = "" KEYCLOAK_CLIENT_ID_SKYDRIVER_REST: str = "" KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST: str = "" diff --git 
a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 941f7f00..28ae5fdd 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -291,11 +291,6 @@ def make_skyscan_server_envvars( # 3. generate & add auth tokens tokens = { - "SKYSCAN_BROKER_AUTH": SkyScanK8sJobFactory._get_token_from_keycloak( - ENV.KEYCLOAK_OIDC_URL, - ENV.KEYCLOAK_CLIENT_ID_BROKER, - ENV.KEYCLOAK_CLIENT_SECRET_BROKER, - ), "SKYSCAN_SKYDRIVER_AUTH": SkyScanK8sJobFactory._get_token_from_keycloak( ENV.KEYCLOAK_OIDC_URL, ENV.KEYCLOAK_CLIENT_ID_SKYDRIVER_REST, diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 99767970..0a47a2eb 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -249,7 +249,6 @@ async def _assert_db_skyscank8sjobs_coll( "name": "SKYSCAN_MQ_CLIENT_LOG", "value": "WARNING", }, - {"name": "SKYSCAN_BROKER_AUTH", "value": ""}, {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, ] + [ # add those from 'post_scan_body' From 7db95fa5070061af5de022302d47185c5f6bece9 Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 30 Jan 2025 21:49:08 +0000 Subject: [PATCH 184/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index e3117f5c..022de6d1 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.9 -botocore==1.36.9 +boto3==1.36.10 +botocore==1.36.10 cachetools==5.5.1 certifi==2024.12.14 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.9] -│ 
├── botocore [required: >=1.36.9,<1.37.0, installed: 1.36.9] +├── boto3 [required: Any, installed: 1.36.10] +│ ├── botocore [required: >=1.36.10,<1.37.0, installed: 1.36.10] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.9] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.10] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From c69c4eb77c46b88f5c21b6a0a35e4076010c4e4d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 30 Jan 2025 18:24:26 -0600 Subject: [PATCH 185/327] add env vars for k8s resource requests --- skydriver/config.py | 17 ++++++++++++++--- skydriver/k8s/scanner_instance.py | 20 ++++++++++++++------ skydriver/rest_handlers.py | 3 +-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 360c440e..b3d61f8f 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -14,9 +14,6 @@ # Constants -DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES: int = humanfriendly.parse_size( - "1024M" -) DEFAULT_WORKER_MEMORY_BYTES: int = humanfriendly.parse_size("8GB") DEFAULT_WORKER_DISK_BYTES: int = humanfriendly.parse_size("1GB") DEFAULT_MAX_WORKER_RUNTIME = 4 * 60 * 60 @@ -88,6 +85,20 @@ class EnvConfig: K8S_APPLICATION_NAME: str = "" K8S_TTL_SECONDS_AFTER_FINISHED: int = 10 * 60 K8S_ACTIVE_DEADLINE_SECONDS: int = 24 * 60 * 60 + # + K8S_SCANNER_MEM_REQUEST: str = "1024M" # note: this is also used as the limit + K8S_SCANNER_CPU_LIMIT: float = 1.0 + K8S_SCANNER_CPU_REQUEST: float = 
0.10 + # + K8S_SCANNER_INIT_MEM_LIMIT: str = "1Mi" + K8S_SCANNER_INIT_CPU_LIMIT: float = 0.05 + K8S_SCANNER_INIT_MEM_REQUEST: str = "1Mi" + K8S_SCANNER_INIT_CPU_REQUEST: float = 0.10 + # + K8S_SCANNER_SIDECAR_S3_MEM_LIMIT: str = "8Mi" + K8S_SCANNER_SIDECAR_S3_CPU_LIMIT: float = 0.05 + K8S_SCANNER_SIDECAR_S3_MEM_REQUEST: str = "1Mi" + K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.10 # keycloak KEYCLOAK_OIDC_URL: str = "" diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 28ae5fdd..3f89d074 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -148,6 +148,14 @@ def _make_job( command: ["python", "-m", "ewms_init_container"] args: ["{scan_id}", "--json-out", "{SkyScanK8sJobFactory._EWMS_JSON_FPATH}"] env: {_to_inline_yaml(init_ewms_envvars)} + resources: + limits: + memory: "{ENV.K8S_SCANNER_INIT_MEM_LIMIT}" + cpu: "{ENV.K8S_SCANNER_INIT_CPU_LIMIT}" + requests: + memory: "{ENV.K8S_SCANNER_INIT_MEM_REQUEST}" + cpu: "{ENV.K8S_SCANNER_INIT_CPU_REQUEST}" + ephemeral-storage: "1M" containers: - name: skyscan-server-{scan_id} image: {images.get_skyscan_docker_image(docker_tag)} @@ -157,10 +165,10 @@ def _make_job( resources: limits: memory: "{scanner_server_memory_bytes}" - cpu: "1" + cpu: "{ENV.K8S_SCANNER_CPU_LIMIT}" requests: memory: "{scanner_server_memory_bytes}" - cpu: "1" + cpu: "{ENV.K8S_SCANNER_CPU_REQUEST}" ephemeral-storage: "1M" volumeMounts: - name: common-space-volume @@ -189,11 +197,11 @@ def _make_job( value: "{s3.make_object_key(scan_id)}" resources: limits: - memory: "256Mi" - cpu: "0.25" + memory: "{ENV.K8S_SCANNER_SIDECAR_S3_MEM_LIMIT}" + cpu: "{ENV.K8S_SCANNER_SIDECAR_S3_CPU_LIMIT}" requests: - memory: "256Mi" - cpu: "0.25" + memory: "{ENV.K8S_SCANNER_SIDECAR_S3_MEM_REQUEST}" + cpu: "{ENV.K8S_SCANNER_SIDECAR_S3_CPU_REQUEST}" ephemeral-storage: "1M" volumeMounts: - name: common-space-volume diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 
94f915fa..9a95448b 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -28,7 +28,6 @@ from . import database, ewms, images, k8s from .config import ( - DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, DEFAULT_MAX_WORKER_RUNTIME, DEFAULT_WORKER_DISK_BYTES, DEFAULT_WORKER_MEMORY_BYTES, @@ -354,7 +353,7 @@ async def post(self) -> None: arghand.add_argument( "scanner_server_memory", type=_data_size_parse, - default=DEFAULT_K8S_CONTAINER_MEMORY_SKYSCAN_SERVER_BYTES, + default=humanfriendly.parse_size(ENV.K8S_SCANNER_MEM_REQUEST), ) # client worker args arghand.add_argument( From da40324ebc834b8e5ccc9e867951b79bf20565c6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 30 Jan 2025 18:26:09 -0600 Subject: [PATCH 186/327] remove unused vars --- skydriver/config.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index b3d61f8f..41d2f19e 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -22,12 +22,10 @@ K8S_CONTAINER_MEMORY_CLUSTER_STOPPER_BYTES: int = humanfriendly.parse_size("256M") K8S_CONTAINER_MEMORY_CLUSTER_STARTER_BYTES: int = humanfriendly.parse_size("256M") -CLUSTER_STOPPER_K8S_TTL_SECONDS_AFTER_FINISHED = 1 * 60 * 60 -CLUSTER_STOPPER_K8S_JOB_N_RETRIES = 6 - SCAN_MIN_PRIORITY_TO_START_ASAP = 100 -QUEUE_ALIAS_TOCLIENT = "to-client-queue" # this *needs* to stay constant, stored in db +# WARNING: these values must remain constant, they are cross-referenced in the db +QUEUE_ALIAS_TOCLIENT = "to-client-queue" # '' QUEUE_ALIAS_FROMCLIENT = "from-client-queue" # '' From 81396286e801eacfdfda2db88d005dcb2a5a634b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 30 Jan 2025 18:26:48 -0600 Subject: [PATCH 187/327] remove unused vars - 2 --- skydriver/config.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 41d2f19e..231cf101 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -18,10 +18,6 @@ 
DEFAULT_WORKER_DISK_BYTES: int = humanfriendly.parse_size("1GB") DEFAULT_MAX_WORKER_RUNTIME = 4 * 60 * 60 -K8S_CONTAINER_MEMORY_DEFAULT_BYTES: int = humanfriendly.parse_size("64M") -K8S_CONTAINER_MEMORY_CLUSTER_STOPPER_BYTES: int = humanfriendly.parse_size("256M") -K8S_CONTAINER_MEMORY_CLUSTER_STARTER_BYTES: int = humanfriendly.parse_size("256M") - SCAN_MIN_PRIORITY_TO_START_ASAP = 100 # WARNING: these values must remain constant, they are cross-referenced in the db From 2dcea7f0a396128933c9f277b1cd007d2d6de93b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 30 Jan 2025 18:38:02 -0600 Subject: [PATCH 188/327] add env vars for ewms worker resources --- skydriver/config.py | 12 ++++++------ skydriver/ewms.py | 2 +- skydriver/rest_handlers.py | 11 ++++------- 3 files changed, 11 insertions(+), 14 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 231cf101..a1df6c8e 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -5,7 +5,6 @@ import logging from typing import Any, Optional -import humanfriendly from wipac_dev_tools import from_environment_as_dataclass, logging_tools sdict = dict[str, Any] @@ -14,10 +13,6 @@ # Constants -DEFAULT_WORKER_MEMORY_BYTES: int = humanfriendly.parse_size("8GB") -DEFAULT_WORKER_DISK_BYTES: int = humanfriendly.parse_size("1GB") -DEFAULT_MAX_WORKER_RUNTIME = 4 * 60 * 60 - SCAN_MIN_PRIORITY_TO_START_ASAP = 100 # WARNING: these values must remain constant, they are cross-referenced in the db @@ -80,7 +75,7 @@ class EnvConfig: K8S_TTL_SECONDS_AFTER_FINISHED: int = 10 * 60 K8S_ACTIVE_DEADLINE_SECONDS: int = 24 * 60 * 60 # - K8S_SCANNER_MEM_REQUEST: str = "1024M" # note: this is also used as the limit + K8S_SCANNER_MEM_REQUEST__DEFAULT: str = "1024M" # note: also used as the limit def. 
K8S_SCANNER_CPU_LIMIT: float = 1.0 K8S_SCANNER_CPU_REQUEST: float = 0.10 # @@ -94,6 +89,11 @@ class EnvConfig: K8S_SCANNER_SIDECAR_S3_MEM_REQUEST: str = "1Mi" K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.10 + # EWMS optional config + EWMS_WORKER_MEMORY__DEFAULT: str = "8GB" + EWMS_WORKER_DISK__DEFAULT: int = "1GB" + EWMS_MAX_WORKER_RUNTIME__DEFAULT: int = 4 * 60 * 60 # 4 hours + # keycloak KEYCLOAK_OIDC_URL: str = "" KEYCLOAK_CLIENT_ID_SKYDRIVER_REST: str = "" diff --git a/skydriver/ewms.py b/skydriver/ewms.py index b741087e..b005531b 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -74,7 +74,7 @@ async def request_workflow_on_ewms( }, "worker_config": { "do_transfer_worker_stdouterr": True, # toggle? - "max_worker_runtime": 6 * 60 * 60, # 6 hours + "max_worker_runtime": scan_request_obj["max_worker_runtime"], "n_cores": 1, "priority": scan_request_obj["priority"], "worker_disk": scan_request_obj["worker_disk_bytes"], diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 9a95448b..eb1e3efa 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -28,9 +28,6 @@ from . 
import database, ewms, images, k8s from .config import ( - DEFAULT_MAX_WORKER_RUNTIME, - DEFAULT_WORKER_DISK_BYTES, - DEFAULT_WORKER_MEMORY_BYTES, DebugMode, ENV, KNOWN_CLUSTERS, @@ -353,13 +350,13 @@ async def post(self) -> None: arghand.add_argument( "scanner_server_memory", type=_data_size_parse, - default=humanfriendly.parse_size(ENV.K8S_SCANNER_MEM_REQUEST), + default=humanfriendly.parse_size(ENV.K8S_SCANNER_MEM_REQUEST__DEFAULT), ) # client worker args arghand.add_argument( "worker_memory", type=_data_size_parse, - default=DEFAULT_WORKER_MEMORY_BYTES, + default=humanfriendly.parse_size(ENV.EWMS_WORKER_MEMORY__DEFAULT), ) arghand.add_argument( # NOTE - DEPRECATED "memory", @@ -375,7 +372,7 @@ async def post(self) -> None: arghand.add_argument( "worker_disk", type=_data_size_parse, - default=DEFAULT_WORKER_DISK_BYTES, + default=humanfriendly.parse_size(ENV.EWMS_WORKER_DISK__DEFAULT), ) arghand.add_argument( "cluster", @@ -415,7 +412,7 @@ async def post(self) -> None: arghand.add_argument( "max_worker_runtime", type=int, - default=DEFAULT_MAX_WORKER_RUNTIME, + default=ENV.EWMS_MAX_WORKER_RUNTIME__DEFAULT, ) arghand.add_argument( # TODO - remove when TMS is handling workforce-scaling From 8cf7a1f82e210101593a15382459328eb77a813b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 31 Jan 2025 14:32:10 -0600 Subject: [PATCH 189/327] set min k8s mem resources --- skydriver/config.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index a1df6c8e..04bee464 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -13,6 +13,9 @@ # Constants +K8S_MIN_MEM_LIMIT = "100M" +K8S_MIN_MEM_REQUEST = "10M" + SCAN_MIN_PRIORITY_TO_START_ASAP = 100 # WARNING: these values must remain constant, they are cross-referenced in the db @@ -79,14 +82,14 @@ class EnvConfig: K8S_SCANNER_CPU_LIMIT: float = 1.0 K8S_SCANNER_CPU_REQUEST: float = 0.10 # - K8S_SCANNER_INIT_MEM_LIMIT: str = "1Mi" + 
K8S_SCANNER_INIT_MEM_LIMIT: str = K8S_MIN_MEM_LIMIT K8S_SCANNER_INIT_CPU_LIMIT: float = 0.05 - K8S_SCANNER_INIT_MEM_REQUEST: str = "1Mi" + K8S_SCANNER_INIT_MEM_REQUEST: str = K8S_MIN_MEM_REQUEST K8S_SCANNER_INIT_CPU_REQUEST: float = 0.10 # - K8S_SCANNER_SIDECAR_S3_MEM_LIMIT: str = "8Mi" + K8S_SCANNER_SIDECAR_S3_MEM_LIMIT: str = K8S_MIN_MEM_LIMIT K8S_SCANNER_SIDECAR_S3_CPU_LIMIT: float = 0.05 - K8S_SCANNER_SIDECAR_S3_MEM_REQUEST: str = "1Mi" + K8S_SCANNER_SIDECAR_S3_MEM_REQUEST: str = K8S_MIN_MEM_REQUEST K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.10 # EWMS optional config From 5abe2635bad18d8336c219e05801e8ca0314d113 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 31 Jan 2025 20:35:50 +0000 Subject: [PATCH 190/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 022de6d1..f61d47ee 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -10,7 +10,7 @@ aiocache==0.12.3 boto3==1.36.10 botocore==1.36.10 cachetools==5.5.1 -certifi==2024.12.14 +certifi==2025.1.31 cffi==1.17.1 charset-normalizer==3.4.1 cryptography==44.0.0 @@ -72,7 +72,7 @@ skydriver-s3-sidecar-ewms-init-container ├── dacite [required: <1.9, installed: 1.8.1] ├── humanfriendly [required: Any, installed: 10.0] ├── kubernetes [required: Any, installed: 32.0.0] -│ ├── certifi [required: >=14.05.14, installed: 2024.12.14] +│ ├── certifi [required: >=14.05.14, installed: 2025.1.31] │ ├── durationpy [required: >=0.7, installed: 0.9] │ ├── google-auth [required: >=1.0.1, installed: 2.38.0] │ │ ├── cachetools [required: >=2.0.0,<6.0, installed: 5.5.1] @@ -85,14 +85,14 @@ skydriver-s3-sidecar-ewms-init-container │ │ └── six [required: >=1.5, installed: 1.17.0] │ ├── PyYAML [required: >=5.4.1, installed: 6.0.2] │ ├── requests [required: Any, installed: 2.32.3] -│ │ ├── certifi [required: 
>=2017.4.17, installed: 2024.12.14] +│ │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ │ ├── idna [required: >=2.5,<4, installed: 3.10] │ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] │ ├── requests-oauthlib [required: Any, installed: 2.0.0] │ │ ├── oauthlib [required: >=3.0.0, installed: 3.2.2] │ │ └── requests [required: >=2.0.0, installed: 2.32.3] -│ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] +│ │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ │ ├── idna [required: >=2.5,<4, installed: 3.10] │ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] @@ -106,7 +106,7 @@ skydriver-s3-sidecar-ewms-init-container │ └── dnspython [required: >=1.16.0,<3.0.0, installed: 2.7.0] ├── PyYAML [required: Any, installed: 6.0.2] ├── requests [required: Any, installed: 2.32.3] -│ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] +│ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] @@ -115,7 +115,7 @@ skydriver-s3-sidecar-ewms-init-container │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] ├── wipac-dev-tools [required: Any, installed: 1.15.1] │ ├── requests [required: Any, installed: 2.32.3] -│ │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] +│ │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ │ ├── idna [required: >=2.5,<4, installed: 3.10] │ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] @@ -125,13 +125,13 @@ skydriver-s3-sidecar-ewms-init-container ├── PyJWT [required: !=2.6.0, installed: 2.10.1] ├── qrcode [required: Any, installed: 8.0] ├── requests [required: Any, installed: 2.32.3] - │ ├── certifi 
[required: >=2017.4.17, installed: 2024.12.14] + │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── requests-futures [required: Any, installed: 1.0.2] │ └── requests [required: >=1.2.0, installed: 2.32.3] - │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] + │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] @@ -139,7 +139,7 @@ skydriver-s3-sidecar-ewms-init-container ├── urllib3 [required: >=2.0.4, installed: 2.3.0] └── wipac-dev-tools [required: Any, installed: 1.15.1] ├── requests [required: Any, installed: 2.32.3] - │ ├── certifi [required: >=2017.4.17, installed: 2024.12.14] + │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ ├── idna [required: >=2.5,<4, installed: 3.10] │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] From 5dafee801c7af3fac7f5841e2ef86f7839c37227 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 31 Jan 2025 14:47:11 -0600 Subject: [PATCH 191/327] add env vars for ewms worker resources - 2 --- skydriver/config.py | 10 +++++----- tests/integration/test_rest_routes.py | 27 ++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/skydriver/config.py b/skydriver/config.py index 04bee464..9c7e2559 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -83,18 +83,18 @@ class EnvConfig: K8S_SCANNER_CPU_REQUEST: float = 0.10 # K8S_SCANNER_INIT_MEM_LIMIT: str = K8S_MIN_MEM_LIMIT - K8S_SCANNER_INIT_CPU_LIMIT: float = 0.05 + K8S_SCANNER_INIT_CPU_LIMIT: float = 0.10 K8S_SCANNER_INIT_MEM_REQUEST: str = K8S_MIN_MEM_REQUEST - K8S_SCANNER_INIT_CPU_REQUEST: float = 0.10 + 
K8S_SCANNER_INIT_CPU_REQUEST: float = 0.05 # K8S_SCANNER_SIDECAR_S3_MEM_LIMIT: str = K8S_MIN_MEM_LIMIT - K8S_SCANNER_SIDECAR_S3_CPU_LIMIT: float = 0.05 + K8S_SCANNER_SIDECAR_S3_CPU_LIMIT: float = 0.10 K8S_SCANNER_SIDECAR_S3_MEM_REQUEST: str = K8S_MIN_MEM_REQUEST - K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.10 + K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.05 # EWMS optional config EWMS_WORKER_MEMORY__DEFAULT: str = "8GB" - EWMS_WORKER_DISK__DEFAULT: int = "1GB" + EWMS_WORKER_DISK__DEFAULT: str = "1GB" EWMS_MAX_WORKER_RUNTIME__DEFAULT: int = 4 * 60 * 60 # 4 hours # keycloak diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 0a47a2eb..22631184 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -260,9 +260,12 @@ async def _assert_db_skyscank8sjobs_coll( "image": f"icecube/skymap_scanner:{docker_tag_expected}", "name": f'skyscan-server-{post_resp["scan_id"]}', "resources": { - "limits": {"cpu": "1", "memory": "1024000000"}, + "limits": { + "cpu": "1.0", + "memory": "1024000000", + }, "requests": { - "cpu": "1", + "cpu": "0.1", "ephemeral-storage": "1M", "memory": "1024000000", }, @@ -316,11 +319,14 @@ async def _assert_db_skyscank8sjobs_coll( "image": os.environ["THIS_IMAGE_WITH_TAG"], "name": f"sidecar-s3-{post_resp['scan_id']}", "resources": { - "limits": {"cpu": "0.25", "memory": "256Mi"}, + "limits": { + "cpu": "0.1", + "memory": "100M", + }, "requests": { - "cpu": "0.25", + "cpu": "0.05", "ephemeral-storage": "1M", - "memory": "256Mi", + "memory": "10M", }, }, "restartPolicy": "OnFailure", @@ -373,6 +379,17 @@ async def _assert_db_skyscank8sjobs_coll( ], "image": os.environ["THIS_IMAGE_WITH_TAG"], "name": f"init-ewms-{post_resp['scan_id']}", + "resources": { + "limits": { + "cpu": "0.1", + "memory": "100M", + }, + "requests": { + "cpu": "0.05", + "ephemeral-storage": "1M", + "memory": "10M", + }, + }, } ], "restartPolicy": "Never", From 
ece2a1b89a4ad3c86826cafb5c1b0b2582aeef88 Mon Sep 17 00:00:00 2001 From: github-actions Date: Fri, 31 Jan 2025 20:51:29 +0000 Subject: [PATCH 192/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index f61d47ee..63fa3789 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.10 -botocore==1.36.10 +boto3==1.36.11 +botocore==1.36.11 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.10] -│ ├── botocore [required: >=1.36.10,<1.37.0, installed: 1.36.10] +├── boto3 [required: Any, installed: 1.36.11] +│ ├── botocore [required: >=1.36.11,<1.37.0, installed: 1.36.11] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.10] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.11] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 0c8b7b592b27ac85ceef6b7d570202d17495ae6d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 31 Jan 2025 17:01:46 -0600 Subject: [PATCH 193/327] comment --- skydriver/config.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/skydriver/config.py b/skydriver/config.py index 9c7e2559..e0607a22 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -96,6 +96,7 @@ class EnvConfig: EWMS_WORKER_MEMORY__DEFAULT: str = "8GB" EWMS_WORKER_DISK__DEFAULT: str = "1GB" EWMS_MAX_WORKER_RUNTIME__DEFAULT: int = 4 * 60 * 60 # 4 hours + # note: other EWMS vars at top of class # keycloak KEYCLOAK_OIDC_URL: str = "" From 011a021c729bc8eef9c7c209acc738fc58bc9520 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 4 Feb 2025 19:42:41 +0000 Subject: [PATCH 194/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 63fa3789..5553d8ee 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.11 -botocore==1.36.11 +boto3==1.36.12 +botocore==1.36.12 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.11] -│ ├── botocore [required: >=1.36.11,<1.37.0, installed: 1.36.11] +├── boto3 [required: Any, installed: 1.36.12] +│ ├── botocore [required: >=1.36.12,<1.37.0, installed: 1.36.12] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.11] +│ └── 
botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.12] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 2df3bfcf87ca92a0669df9ffb4a1291c8af97183 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 14:06:16 -0600 Subject: [PATCH 195/327] fix docker tag verification --- skydriver/images.py | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/skydriver/images.py b/skydriver/images.py index 96f61b6e..ee98a2f2 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -85,7 +85,7 @@ def _parse_image_ts(info: dict) -> float: @cachetools.func.lru_cache() # cache it forever def min_skymap_scanner_tag_ts() -> float: """Get the timestamp for when the `MIN_SKYMAP_SCANNER_TAG` image was created.""" - info = get_info_from_docker_hub(ENV.MIN_SKYMAP_SCANNER_TAG.lstrip("v")) + info, _ = get_info_from_docker_hub(ENV.MIN_SKYMAP_SCANNER_TAG) return _parse_image_ts(info) @@ -107,11 +107,7 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: ValueError -- if `docker_tag` doesn't exist on Docker Hub ValueError -- if there's an issue communicating w/ Docker Hub API """ - info = get_info_from_docker_hub(docker_tag) - - if VERSION_REGEX_MAJMINPATCH.fullmatch(docker_tag): - return docker_tag - + info, docker_tag = get_info_from_docker_hub(docker_tag) # check that the image is not too old if _parse_image_ts(info) < min_skymap_scanner_tag_ts(): raise ValueError( @@ -119,6 +115,9 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: f"'{ENV.MIN_SKYMAP_SCANNER_TAG}'. 
Contact admins for more info" ) + # make sure tag is fully qualified + if VERSION_REGEX_MAJMINPATCH.fullmatch(docker_tag): + return docker_tag # already full version # match sha to vX.Y.Z try: if majminpatch := _match_sha_to_majminpatch(info["digest"]): @@ -130,8 +129,15 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: raise ValueError("Image tag could not resolve to a full version") -def get_info_from_docker_hub(docker_tag: str) -> dict: - """Get the json dict from GET @ Docker Hub.""" +def get_info_from_docker_hub(docker_tag: str) -> tuple[dict, str]: + """Get the json dict from GET @ Docker Hub, and the non v-prefixed tag (see below). + + Accepts v-prefixed tags, like 'v2.3.4', 'v4', etc. + """ + if VERSION_REGEX_PREFIX_V.fullmatch(docker_tag): + # v4 -> 4; v5.1 -> 5.1; v3.6.9 -> 3.6.9 + docker_tag = docker_tag.lstrip("v") + _error = ValueError(f"Image tag not on Docker Hub: {docker_tag}") if not docker_tag or not docker_tag.strip(): @@ -146,7 +152,7 @@ def get_info_from_docker_hub(docker_tag: str) -> dict: if not resp.ok: raise _error - return resp.json() + return resp.json(), docker_tag def resolve_docker_tag(docker_tag: str) -> str: @@ -156,18 +162,8 @@ def resolve_docker_tag(docker_tag: str) -> str: off & retry until the image exists """ LOGGER.info(f"checking docker tag: {docker_tag}") - try: - - if docker_tag == "latest": # 'latest' doesn't exist in CVMFS - return _try_resolve_to_majminpatch_docker_hub("latest") - - if VERSION_REGEX_PREFIX_V.fullmatch(docker_tag): - # v4 -> 4; v5.1 -> 5.1; v3.6.9 -> 3.6.9 - docker_tag = docker_tag.lstrip("v") - return _try_resolve_to_majminpatch_docker_hub(docker_tag) - except Exception as e: LOGGER.exception(e) raise e From 6a205cfa9e8877382f08e231724e20dde93c3dea Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 14:32:15 -0600 Subject: [PATCH 196/327] prod_tester: add `--skyscan-docker-tag` --- resources/prod_tester/test_runner.py | 3 ++- resources/prod_tester/test_suit_prod.py | 10 
++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 8a9800ff..7d17a139 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -54,6 +54,7 @@ async def launch_a_scan( cluster: str, n_workers: int, reco_algo: str, + skyscan_docker_tag: str, ) -> dict: """Request to SkyDriver to scan an event.""" body = { @@ -63,7 +64,7 @@ async def launch_a_scan( "real_or_simulated_event": "real", "predictive_scanning_threshold": 1, # 0.3, "cluster": {cluster: n_workers}, - "docker_tag": "latest", + "docker_tag": skyscan_docker_tag, "max_pixel_reco_time": 30 * 60, # seconds "scanner_server_memory": "1G", "priority": 100, diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index 1c35a7b6..660c802b 100644 --- a/resources/prod_tester/test_suit_prod.py +++ b/resources/prod_tester/test_suit_prod.py @@ -112,6 +112,7 @@ async def launch_scans( rc: RestClient, cluster: str, n_workers: int, + skyscan_docker_tag: str, ) -> list[test_getter.TestParamSet]: for i, test in enumerate(tests): logging.info( @@ -135,6 +136,7 @@ async def launch_scans( cluster, n_workers, test.reco_algo, + skyscan_docker_tag, ) test.scan_id = manifest["scan_id"] except Exception as e: @@ -187,6 +189,7 @@ async def test_all( cluster: str, n_workers: int, rescans: list[test_getter.TestParamSet] | None, + skyscan_docker_tag: str, ) -> None: """Do all the tests.""" # setup @@ -200,6 +203,7 @@ async def test_all( rc, cluster, n_workers, + skyscan_docker_tag, ) with open(config.SANDBOX_MAP_FPATH, "w") as f: # dump to file json.dump([t.to_json() for t in tests], f, indent=4) @@ -255,6 +259,11 @@ async def main(): required=True, help="the cluster to use for running workers. 
Ex: sub-2", ) + parser.add_argument( + "--skyscan-docker-tag", + default="latest", + help="the skymap scanner docker tag to use", + ) parser.add_argument( "--n-workers", required=True, @@ -332,6 +341,7 @@ async def main(): args.cluster, args.n_workers, rescans, + args.skyscan_docker_tag, ) From ccdb2c1e42c0932db5dfb4878fd58ab9ed8ca6b0 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 4 Feb 2025 20:35:42 +0000 Subject: [PATCH 197/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 5553d8ee..b46a3dad 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.12 -botocore==1.36.12 +boto3==1.36.13 +botocore==1.36.13 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.12] -│ ├── botocore [required: >=1.36.12,<1.37.0, installed: 1.36.12] +├── boto3 [required: Any, installed: 1.36.13] +│ ├── botocore [required: >=1.36.13,<1.37.0, installed: 1.36.13] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.12] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.13] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── 
python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 96d30da2cf15d913c3f7b94ae99a36b27284924d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 15:49:28 -0600 Subject: [PATCH 198/327] logging on ewms fail --- skydriver/ewms.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index b005531b..45b7ed05 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -1,5 +1,6 @@ """Tools for interfacing with EMWS.""" +import json import logging import aiocache # type: ignore[import-untyped] @@ -65,7 +66,7 @@ async def request_workflow_on_ewms( "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": scan_request_obj[ "skyscan_mq_client_timeout_wait_for_first_message" ], - "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": 5 * 60, + "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, "EWMS_PILOT_CONTAINER_DEBUG": "True", # toggle? "EWMS_PILOT_INFILE_EXT": ".json", "EWMS_PILOT_OUTFILE_EXT": ".json", @@ -85,8 +86,14 @@ async def request_workflow_on_ewms( ], } - resp = await ewms_rc.request("POST", "/v0/workflows", body) - return resp["workflow"]["workflow_id"] + try: + resp = await ewms_rc.request("POST", "/v0/workflows", body) + except requests.exceptions.HTTPError: + LOGGER.error(f"request to ewms failed with:") + LOGGER.error(json.dumps(body, indent=4)) + raise + else: + return resp["workflow"]["workflow_id"] async def request_stop_on_ewms( From b3ca48728e66784fb6dce385a47ee7284804f769 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 16:04:08 -0600 Subject: [PATCH 199/327] don't give ewms falsy env vars --- skydriver/ewms.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 45b7ed05..0a686784 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -59,17 +59,21 @@ async def request_workflow_on_ewms( "pilot_config": { "tag": 
"latest", "environment": { - "EWMS_PILOT_INIT_TIMEOUT": 61, # 1 sec more than 'curl' timeout - "EWMS_PILOT_TASK_TIMEOUT": scan_request_obj[ - "max_pixel_reco_time" - ], - "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": scan_request_obj[ - "skyscan_mq_client_timeout_wait_for_first_message" - ], - "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, - "EWMS_PILOT_CONTAINER_DEBUG": "True", # toggle? - "EWMS_PILOT_INFILE_EXT": ".json", - "EWMS_PILOT_OUTFILE_EXT": ".json", + k: v + for k, v in { + "EWMS_PILOT_INIT_TIMEOUT": 61, # 1 sec more than 'curl' timeout + "EWMS_PILOT_TASK_TIMEOUT": scan_request_obj[ + "max_pixel_reco_time" + ], + "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": scan_request_obj[ + "skyscan_mq_client_timeout_wait_for_first_message" + ], + "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, + "EWMS_PILOT_CONTAINER_DEBUG": "True", # toggle? + "EWMS_PILOT_INFILE_EXT": ".json", + "EWMS_PILOT_OUTFILE_EXT": ".json", + }.items() + if v # filter out any falsy values }, "input_files": [], }, From a65a4fd98d702b8b1dbf1429a68f707415ff702d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 16:06:31 -0600 Subject: [PATCH 200/327] flake8 --- skydriver/ewms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 0a686784..71a1f040 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -93,7 +93,7 @@ async def request_workflow_on_ewms( try: resp = await ewms_rc.request("POST", "/v0/workflows", body) except requests.exceptions.HTTPError: - LOGGER.error(f"request to ewms failed with:") + LOGGER.error("request to ewms failed with:") LOGGER.error(json.dumps(body, indent=4)) raise else: From ae727dcc4f20f9327736de93e11d1e9c2f6ea6da Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 16:25:11 -0600 Subject: [PATCH 201/327] start k8s before ewms --- skydriver/k8s/scan_backlog.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 
insertions(+), 16 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 6d3b724e..04b60648 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -154,7 +154,24 @@ async def _run( timer_main_loop.fastforward() continue # there's no scan to start - # request a workflow on EWMS + LOGGER.info( + f"Starting Scanner Instance: ({entry.scan_id=}) ({entry.timestamp})" + ) + # NOTE: the job_obj is enormous, so don't log it + + # 1st: start k8s job -- this could be any k8s job (pre- or post-ewms switchover) + # ^^^ b/c this uses local resources, if something goes wrong, this limits exposure + try: + resp = KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) + LOGGER.info(resp) + except kubernetes.client.exceptions.ApiException as e: + # k8s job (backlog entry) will be revived & restarted in future iteration + LOGGER.exception(e) + timer_main_loop.fastforward() # nothing was started, so don't wait long + continue + + # 2nd: request a workflow on EWMS + # ^^^ do after k8s b/c now we know that that was successful try: workflow_id = await ewms.request_workflow_on_ewms( ewms_rc, @@ -172,21 +189,6 @@ async def _run( return_dclass=dict, ) - LOGGER.info( - f"Starting Scanner Instance: ({entry.scan_id=}) ({entry.timestamp})" - ) - # NOTE: the job_obj is enormous, so don't log it - - # start k8s job -- this could be any k8s job (pre- or post-ewms switchover) - try: - resp = KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) - LOGGER.info(resp) - except kubernetes.client.exceptions.ApiException as e: - # k8s job (backlog entry) will be revived & restarted in future iteration - LOGGER.exception(e) - timer_main_loop.fastforward() # nothing was started, so don't wait long - continue - # remove from backlog now that startup succeeded await backlog_client.remove(entry) # TODO: remove k8s job doc? 
From dccce27e6ce89aa62f0a869d6a795927d87cc51d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 16:26:31 -0600 Subject: [PATCH 202/327] start k8s before ewms - 2 --- skydriver/k8s/scan_backlog.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 04b60648..55c57435 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -183,11 +183,12 @@ async def _run( LOGGER.exception(e) timer_main_loop.fastforward() # nothing was started, so don't wait long continue - await manifest_client.collection.find_one_and_update( - {"scan_id": manifest.scan_id}, - {"$set": {"ewms_workflow_id": workflow_id}}, - return_dclass=dict, - ) + else: + await manifest_client.collection.find_one_and_update( + {"scan_id": manifest.scan_id}, + {"$set": {"ewms_workflow_id": workflow_id}}, + return_dclass=dict, + ) # remove from backlog now that startup succeeded await backlog_client.remove(entry) From 94fe2a6e1a8dfc7904f7f701abc18691fc86ea8c Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 16:29:53 -0600 Subject: [PATCH 203/327] fix k8s api call --- skydriver/k8s/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 854ad3a6..23402a4b 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -28,7 +28,7 @@ def start_job( raise ValueError("Job object not created") try: api_response = kubernetes.utils.create_from_dict( - k8s_batch_api, + k8s_batch_api.api_client, job_dict, namespace=ENV.K8S_NAMESPACE, ) From 42e5e6a8296f2dfce535ebf6bfe10e91844e5e95 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 16:44:00 -0600 Subject: [PATCH 204/327] logging --- skydriver/ewms.py | 2 +- skydriver/k8s/utils.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 71a1f040..1d7a3cc8 100644 --- a/skydriver/ewms.py 
+++ b/skydriver/ewms.py @@ -93,7 +93,7 @@ async def request_workflow_on_ewms( try: resp = await ewms_rc.request("POST", "/v0/workflows", body) except requests.exceptions.HTTPError: - LOGGER.error("request to ewms failed with:") + LOGGER.error("request to ewms failed using:") LOGGER.error(json.dumps(body, indent=4)) raise else: diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 23402a4b..fd18a61f 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -33,10 +33,12 @@ def start_job( namespace=ENV.K8S_NAMESPACE, ) LOGGER.info(api_response) - except ApiException as e: - LOGGER.exception(e) + except ApiException: + LOGGER.error("request to make k8s job failed using:") + LOGGER.error(json.dumps(job_dict, indent=4)) raise - return api_response + else: + return api_response @staticmethod def get_pods( From 1dd5096375f5744fd40074a5979163baf75a04af Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 16:59:14 -0600 Subject: [PATCH 205/327] fix k8s env vars: must all be strs --- skydriver/k8s/scanner_instance.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 3f89d074..7c3c546a 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -20,18 +20,17 @@ LOGGER = logging.getLogger(__name__) -def _to_inline_yaml(obj: list[str] | sdict) -> str: - """Convert obj-based attrs to yaml-syntax""" - # -> inline, compact formatting, no indenting needed +def _to_inline_yaml_str(obj: list[str] | sdict) -> str: + """Convert obj-based attrs to yaml-syntax where each value is a string.""" if isinstance(obj, dict): return yaml.safe_dump( - [{"name": k, "value": v} for k, v in obj.items()], - default_flow_style=True, + [{"name": str(k), "value": str(v)} for k, v in obj.items()], + default_flow_style=True, # inline, compact formatting, no indenting needed ) elif isinstance(obj, list): return yaml.safe_dump( - 
obj, - default_flow_style=True, + [str(o) for o in obj], + default_flow_style=True, # inline, compact formatting, no indenting needed ) else: raise TypeError(f"unsupported type {type(obj)}") @@ -105,6 +104,8 @@ def _make_job( NOTE: Let's keep definitions as straightforward as possible. """ + scanner_server_envvars = {k: str(v) for k, v in scanner_server_envvars.items()} + init_ewms_envvars = {} for k in ["SKYSCAN_SKYDRIVER_ADDRESS", "SKYSCAN_SKYDRIVER_AUTH"]: init_ewms_envvars[k] = scanner_server_envvars[k] @@ -147,7 +148,7 @@ def _make_job( image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "ewms_init_container"] args: ["{scan_id}", "--json-out", "{SkyScanK8sJobFactory._EWMS_JSON_FPATH}"] - env: {_to_inline_yaml(init_ewms_envvars)} + env: {_to_inline_yaml_str(init_ewms_envvars)} resources: limits: memory: "{ENV.K8S_SCANNER_INIT_MEM_LIMIT}" @@ -160,8 +161,8 @@ def _make_job( - name: skyscan-server-{scan_id} image: {images.get_skyscan_docker_image(docker_tag)} command: [] - args: {_to_inline_yaml(scanner_server_args.split())} - env: {_to_inline_yaml(scanner_server_envvars)} + args: {_to_inline_yaml_str(scanner_server_args.split())} + env: {_to_inline_yaml_str(scanner_server_envvars)} resources: limits: memory: "{scanner_server_memory_bytes}" From 944cc1777778292339bf4b304161965d8ce32c96 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 17:08:51 -0600 Subject: [PATCH 206/327] fix k8s env vars: must all be strs - 2 --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 22631184..5111008b 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -252,7 +252,7 @@ async def _assert_db_skyscank8sjobs_coll( {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, ] + [ # add those from 'post_scan_body' - {"name": k, "value": v} + {"name": k, "value": str(v)} for k, v in post_scan_body[ 
"scanner_server_env" ].items() From 47ddb5aac472aee1006e6af3ca01b5ccb94db520 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 17:15:07 -0600 Subject: [PATCH 207/327] logging - 2 --- skydriver/k8s/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index fd18a61f..4ee0fb00 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -33,7 +33,7 @@ def start_job( namespace=ENV.K8S_NAMESPACE, ) LOGGER.info(api_response) - except ApiException: + except Exception: # broad b/c re-raising LOGGER.error("request to make k8s job failed using:") LOGGER.error(json.dumps(job_dict, indent=4)) raise From 7ddb14a6e22c2b5563f2ea8139921da46fc15e37 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 17:18:42 -0600 Subject: [PATCH 208/327] logging - 3 --- skydriver/k8s/scan_backlog.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 55c57435..876c0b6b 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -102,6 +102,10 @@ async def run( await _run(mongo_client, k8s_batch_api, ewms_rc, s3_client) except Exception as e: LOGGER.exception(e) + LOGGER.error( + f"above error stopped the backlogger, " + f"resuming in {ENV.SCAN_BACKLOG_RUNNER_DELAY} seconds..." 
+ ) # wait hopefully log enough that any transient errors are resolved, # like a mongo pod failure and restart From 9f200138c1ac348ca85c48972f57e160dba21670 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 17:26:58 -0600 Subject: [PATCH 209/327] fix k8s error catching --- skydriver/k8s/scan_backlog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 876c0b6b..11bd9d9e 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -168,7 +168,7 @@ async def _run( try: resp = KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) LOGGER.info(resp) - except kubernetes.client.exceptions.ApiException as e: + except kubernetes.utils.FailToCreateError as e: # k8s job (backlog entry) will be revived & restarted in future iteration LOGGER.exception(e) timer_main_loop.fastforward() # nothing was started, so don't wait long From 9c3010fd851b20f22dc1efdfe50fb32129d4762e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 4 Feb 2025 17:27:55 -0600 Subject: [PATCH 210/327] flake8 --- skydriver/k8s/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 4ee0fb00..221ae6fc 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -5,7 +5,6 @@ from typing import Any, Iterator import kubernetes.client # type: ignore[import-untyped] -from kubernetes.client.rest import ApiException # type: ignore[import-untyped] from ..config import ENV, sdict From c6a02daa536e9b0ff4579cb9b7e61c64100a3186 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 12:31:56 -0600 Subject: [PATCH 211/327] prod_tester: `--one` --- resources/prod_tester/test_suit_prod.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index 660c802b..7439a568 100644 --- a/resources/prod_tester/test_suit_prod.py +++ 
b/resources/prod_tester/test_suit_prod.py @@ -190,10 +190,13 @@ async def test_all( n_workers: int, rescans: list[test_getter.TestParamSet] | None, skyscan_docker_tag: str, + run_one: bool, ) -> None: """Do all the tests.""" # setup tests = list(test_getter.setup_tests()) + if run_one: + tests = [tests[0]] if rescans: _match_rescans_to_tests(rescans, tests) @@ -282,7 +285,15 @@ async def main(): default=config.SANDBOX_DIR, help="the existing (previously ran) sandbox to submit rescans for", ) + parser.add_argument( + "--one", + default=False, + action="store_true", + help="just requests a single scan instead of the whole suite", + ) args = parser.parse_args() + if args.one and args.rescan: + raise RuntimeError("cannot give --one and --rescan together") if args.rescan: # grab json map @@ -342,6 +353,7 @@ async def main(): args.n_workers, rescans, args.skyscan_docker_tag, + args.one, ) From 1e4820b5c8b7a7cb0d1483b7d0fc683d5b6b4ad7 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 12:36:40 -0600 Subject: [PATCH 212/327] init container: logging --- ewms_init_container/__main__.py | 43 +++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 414b9050..3cfb44cb 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -2,24 +2,44 @@ import argparse import asyncio +import dataclasses as dc import json import logging -import os import time from pathlib import Path from rest_tools.client import ClientCredentialsAuth, RestClient +from wipac_dev_tools import from_environment_as_dataclass, logging_tools LOGGER = logging.getLogger(__package__) +@dc.dataclass(frozen=True) +class EnvConfig: + """Environment variables.""" + + SKYSCAN_SKYDRIVER_ADDRESS: str + SKYSCAN_SKYDRIVER_AUTH: str + + EWMS_ADDRESS: str + EWMS_TOKEN_URL: str + EWMS_CLIENT_ID: str + EWMS_CLIENT_SECRET: str + + QUEUE_ALIAS_TOCLIENT: str + QUEUE_ALIAS_FROMCLIENT: 
str + + +ENV = from_environment_as_dataclass(EnvConfig) + + async def get_workflow_id(scan_id: str) -> str: """Retrieve the workflow id for the scan (w/ `scan_id`).""" LOGGER.info(f"getting workflow id for scan {scan_id}...") skyd_rc = RestClient( - os.environ["SKYSCAN_SKYDRIVER_ADDRESS"], - os.environ["SKYSCAN_SKYDRIVER_AUTH"], + ENV.SKYSCAN_SKYDRIVER_ADDRESS, + ENV.SKYSCAN_SKYDRIVER_AUTH, logger=LOGGER, ) resp = await skyd_rc.request("GET", f"/scan/{scan_id}/manifest") @@ -34,10 +54,10 @@ async def get_ewms_attrs(workflow_id: str) -> dict[str, dict[str, str]]: LOGGER.info(f"getting EWMS attributes for workflow {workflow_id}...") ewms_rc = ClientCredentialsAuth( - os.environ["EWMS_ADDRESS"], - os.environ["EWMS_TOKEN_URL"], - os.environ["EWMS_CLIENT_ID"], - os.environ["EWMS_CLIENT_SECRET"], + ENV.EWMS_ADDRESS, + ENV.EWMS_TOKEN_URL, + ENV.EWMS_CLIENT_ID, + ENV.EWMS_CLIENT_SECRET, logger=LOGGER, ) @@ -59,12 +79,8 @@ async def get_ewms_attrs(workflow_id: str) -> dict[str, dict[str, str]]: LOGGER.info(f"mqprofiles: {mqprofiles}") # convert mqprofiles to dicts based on the queue aliases - toclient = next( - p for p in mqprofiles if p["mqid"] == os.environ["QUEUE_ALIAS_TOCLIENT"] - ) - fromclient = next( - p for p in mqprofiles if p["mqid"] == os.environ["QUEUE_ALIAS_FROMCLIENT"] - ) + toclient = next(p for p in mqprofiles if p["mqid"] == ENV.QUEUE_ALIAS_TOCLIENT) + fromclient = next(p for p in mqprofiles if p["mqid"] == ENV.QUEUE_ALIAS_FROMCLIENT) return { "toclient": { @@ -106,6 +122,7 @@ async def main() -> None: help="the json file to write the map of EWMS attributes to", ) args = parser.parse_args() + logging_tools.log_argparse_args(args) workflow_id = await get_workflow_id(args.scan_id) ewms_dict = await get_ewms_attrs(workflow_id) From 78380f38bbe2ca35e7d1a7619008df14f9073535 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 12:38:50 -0600 Subject: [PATCH 213/327] init container: logging - 2 --- ewms_init_container/__main__.py | 12 ++++++------ 1 file 
changed, 6 insertions(+), 6 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 3cfb44cb..072c5d24 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -64,12 +64,12 @@ async def get_ewms_attrs(workflow_id: str) -> dict[str, dict[str, str]]: # loop until mqprofiles is not empty and all "is_activated" fields are true while True: LOGGER.info("requesting EWMS mqprofiles...") - mqprofiles = ( - await ewms_rc.request( - "GET", - f"/v0/mqs/workflows/{workflow_id}/mq-profiles/public", - ) - )["mqprofiles"] + resp = await ewms_rc.request( + "GET", + f"/v0/mqs/workflows/{workflow_id}/mq-profiles/public", + ) + LOGGER.info(json.dumps(resp, indent=4)) + mqprofiles = resp["mqprofiles"] if mqprofiles and all(m["is_activated"] for m in mqprofiles): break else: From cf2b49779ca426d8a4d50259b0ba09f2d76dbb83 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 12:45:42 -0600 Subject: [PATCH 214/327] init container: logging - 3 (also sidecar) --- ewms_init_container/__main__.py | 7 +++++++ s3_sidecar/post.py | 37 +++++++++++++++++++++++++-------- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 072c5d24..8ed2c919 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -133,5 +133,12 @@ async def main() -> None: if __name__ == "__main__": + logging_tools.set_level( + "INFO", + first_party_loggers=LOGGER, + third_party_level="INFO", + future_third_parties=[], + specialty_loggers={"rest_tools": "INFO"}, + ) asyncio.run(main()) LOGGER.info("Done.") diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index 7979781c..ab09acdc 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -1,18 +1,33 @@ """Utilities for posting to an S3 bucket.""" import argparse +import dataclasses as dc import logging -import os import time from pathlib import Path import boto3 # type: 
ignore[import-untyped] import requests +from wipac_dev_tools import from_environment_as_dataclass, logging_tools from wipac_dev_tools.timing_tools import IntervalTimer LOGGER = logging.getLogger(__package__) +@dc.dataclass(frozen=True) +class EnvConfig: + """Environment variables.""" + + S3_URL: str + S3_ACCESS_KEY_ID: str + S3_SECRET_KEY: str + S3_BUCKET: str + S3_OBJECT_KEY: str + + +ENV = from_environment_as_dataclass(EnvConfig) + + def post(fpath: Path) -> None: """Post the file to the S3 bucket.""" if not fpath.exists(): @@ -25,17 +40,14 @@ def post(fpath: Path) -> None: s3_client = boto3.client( "s3", "us-east-1", - endpoint_url=os.environ["S3_URL"], - aws_access_key_id=os.environ["S3_ACCESS_KEY_ID"], - aws_secret_access_key=os.environ["S3_SECRET_KEY"], + endpoint_url=ENV.S3_URL, + aws_access_key_id=ENV.S3_ACCESS_KEY_ID, + aws_secret_access_key=ENV.S3_SECRET_KEY, ) # POST LOGGER.info("generating presigned post-url...") - upload_details = s3_client.generate_presigned_post( - os.environ["S3_BUCKET"], - os.environ["S3_OBJECT_KEY"], - ) + upload_details = s3_client.generate_presigned_post(ENV.S3_BUCKET, ENV.S3_OBJECT_KEY) LOGGER.info("posting file to s3...") with open(fpath, "rb") as f: response = requests.post( @@ -66,8 +78,8 @@ def main() -> None: default=False, help="whether to wait indefinitely for the file to exist", ) - args = parser.parse_args() + logging_tools.log_argparse_args(args) logger_timer = IntervalTimer(5, LOGGER) @@ -82,5 +94,12 @@ def main() -> None: if __name__ == "__main__": + logging_tools.set_level( + "INFO", + first_party_loggers=LOGGER, + third_party_level="INFO", + future_third_parties=[], + specialty_loggers={"rest_tools": "INFO"}, + ) main() LOGGER.info("Done.") From fc9a96e386082f340c3277a978ee181168cc8327 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 12:53:00 -0600 Subject: [PATCH 215/327] logging - 4 --- ewms_init_container/__main__.py | 8 ++++++++ s3_sidecar/post.py | 8 ++++++++ 2 files changed, 16 insertions(+) 
diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 8ed2c919..fd6dc0d9 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -133,6 +133,14 @@ async def main() -> None: if __name__ == "__main__": + hand = logging.StreamHandler() + hand.setFormatter( + logging.Formatter( + "%(asctime)s.%(msecs)03d [%(levelname)8s] %(name)s[%(process)d] %(message)s <%(filename)s:%(lineno)s/%(funcName)s()>", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) + logging.getLogger().addHandler(hand) logging_tools.set_level( "INFO", first_party_loggers=LOGGER, diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index ab09acdc..794a262d 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -94,6 +94,14 @@ def main() -> None: if __name__ == "__main__": + hand = logging.StreamHandler() + hand.setFormatter( + logging.Formatter( + "%(asctime)s.%(msecs)03d [%(levelname)8s] %(name)s[%(process)d] %(message)s <%(filename)s:%(lineno)s/%(funcName)s()>", + datefmt="%Y-%m-%d %H:%M:%S", + ) + ) + logging.getLogger().addHandler(hand) logging_tools.set_level( "INFO", first_party_loggers=LOGGER, From c6d9b67a6dfe419c89a2b4e18f923a1671741041 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 14:42:04 -0600 Subject: [PATCH 216/327] fix race condition --- ewms_init_container/__main__.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index fd6dc0d9..397d9294 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -42,8 +42,23 @@ async def get_workflow_id(scan_id: str) -> str: ENV.SKYSCAN_SKYDRIVER_AUTH, logger=LOGGER, ) - resp = await skyd_rc.request("GET", f"/scan/{scan_id}/manifest") - workflow_id = resp["ewms_workflow_id"] + + # get the id, with retries + while True: + resp = await skyd_rc.request("GET", f"/scan/{scan_id}/manifest") + match workflow_id := resp["ewms_workflow_id"]: + case None: + raise 
ValueError( + "workflow id is 'None', this indicates scan predates ewms-integration." + ) + case "pending-ewms": + LOGGER.warning( + "a workflow id has not yet been assigned for this scan. " + "Waiting, then trying again..." + ) + await asyncio.sleep(10) + case _: + break # got an actual id! LOGGER.info(f"workflow id: {workflow_id}") return workflow_id From 6255def4ede68997736b1e2b00ad98eb685237a0 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 14:43:58 -0600 Subject: [PATCH 217/327] fix mq-profile retrieval --- ewms_init_container/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 397d9294..d5bca9d1 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -94,8 +94,8 @@ async def get_ewms_attrs(workflow_id: str) -> dict[str, dict[str, str]]: LOGGER.info(f"mqprofiles: {mqprofiles}") # convert mqprofiles to dicts based on the queue aliases - toclient = next(p for p in mqprofiles if p["mqid"] == ENV.QUEUE_ALIAS_TOCLIENT) - fromclient = next(p for p in mqprofiles if p["mqid"] == ENV.QUEUE_ALIAS_FROMCLIENT) + toclient = next(p for p in mqprofiles if p["alias"] == ENV.QUEUE_ALIAS_TOCLIENT) + fromclient = next(p for p in mqprofiles if p["alias"] == ENV.QUEUE_ALIAS_FROMCLIENT) return { "toclient": { From 0def2da5bb86cf15a4e3143386800f6d65109d3e Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 14:45:35 -0600 Subject: [PATCH 218/327] note --- ewms_init_container/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index d5bca9d1..b5163b89 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -97,7 +97,7 @@ async def get_ewms_attrs(workflow_id: str) -> dict[str, dict[str, str]]: toclient = next(p for p in mqprofiles if p["alias"] == ENV.QUEUE_ALIAS_TOCLIENT) fromclient = next(p for p in 
mqprofiles if p["alias"] == ENV.QUEUE_ALIAS_FROMCLIENT) - return { + return { # NOTE: these fields are accessed by name in the skymap scanner "toclient": { "name": toclient["mqid"], "auth_token": toclient["auth_token"], From 558d5c6211c96f742a0ed00c69f2fdba2b488cda Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 5 Feb 2025 20:49:46 +0000 Subject: [PATCH 219/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index b46a3dad..888f2c0a 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.13 -botocore==1.36.13 +boto3==1.36.14 +botocore==1.36.14 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.13] -│ ├── botocore [required: >=1.36.13,<1.37.0, installed: 1.36.13] +├── boto3 [required: Any, installed: 1.36.14] +│ ├── botocore [required: >=1.36.14,<1.37.0, installed: 1.36.14] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.13] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.14] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six 
[required: >=1.5, installed: 1.17.0] From c3580602558e35b4952dd1bf7e705d64393e05c8 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 14:54:59 -0600 Subject: [PATCH 220/327] fix: bind common-space volume --- skydriver/k8s/scanner_instance.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 7c3c546a..dc30d80f 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -157,6 +157,9 @@ def _make_job( memory: "{ENV.K8S_SCANNER_INIT_MEM_REQUEST}" cpu: "{ENV.K8S_SCANNER_INIT_CPU_REQUEST}" ephemeral-storage: "1M" + volumeMounts: + - name: common-space-volume + mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" containers: - name: skyscan-server-{scan_id} image: {images.get_skyscan_docker_image(docker_tag)} From 3893e1e90fab7d52690d8f5e380f261d839c2a40 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 14:59:04 -0600 Subject: [PATCH 221/327] fix: bind common-space volume - tests --- tests/integration/test_rest_routes.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 5111008b..f4fea534 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -390,6 +390,12 @@ async def _assert_db_skyscank8sjobs_coll( "memory": "10M", }, }, + "volumeMounts": [ + { + "mountPath": "/common-space", + "name": "common-space-volume", + } + ], } ], "restartPolicy": "Never", From 61a09e50c69434025ed641da4879b65760d11540 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 15:28:52 -0600 Subject: [PATCH 222/327] fix: ewms request syntax --- skydriver/ewms.py | 6 +++++- tests/integration/dummy_ewms.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 1d7a3cc8..e34ed545 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -151,7 +151,11 @@ async def 
get_taskforce_phases( resp = await ewms_rc.request( "POST", "/v0/query/taskforces", - {"workflow_id": workflow_id}, + { + "query": { + "workflow_id": workflow_id, + } + }, ) return [ {"taskforce": tf["taskforce_uuid"], "phase": tf["phase"]} diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index cf5a8a6b..856a3ddd 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -56,7 +56,7 @@ def dummy_workflows_finished(workflow_id: str): @app.route("/v0/query/taskforces", methods=["POST"]) def dummy_query_taskforces(): - workflow_id = request.get_json("workflow_id") + workflow_id = request.get_json("query")["workflow_id"] # respond with correctly-syntaxed gibberish resp = { From 1ba2adfe0d543cb0ccf35ee4d5c6d182a5fc2352 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 15:32:21 -0600 Subject: [PATCH 223/327] flake8 --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index f4fea534..a6f62824 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -193,7 +193,7 @@ async def _assert_db_scanrequests_coll( return doc_sr["rest_address"] -async def _assert_db_skyscank8sjobs_coll( +async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] post_scan_body: dict, post_resp: dict, From 5e10bfce88ccc455fcd6daa098febcaef27e4c32 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 15:42:51 -0600 Subject: [PATCH 224/327] logging --- ewms_init_container/__main__.py | 2 +- s3_sidecar/post.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index b5163b89..1ac58c15 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -157,7 +157,7 @@ async def main() -> None: 
) logging.getLogger().addHandler(hand) logging_tools.set_level( - "INFO", + "DEBUG", first_party_loggers=LOGGER, third_party_level="INFO", future_third_parties=[], diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index 794a262d..735cf577 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -2,6 +2,7 @@ import argparse import dataclasses as dc +import json import logging import time from pathlib import Path @@ -32,6 +33,8 @@ def post(fpath: Path) -> None: """Post the file to the S3 bucket.""" if not fpath.exists(): raise FileNotFoundError(str(fpath)) + with open(fpath, "r") as f: + LOGGER.debug(json.load(f, indent=4)) LOGGER.info("file exists, waiting a bit longer just in case") time.sleep(5) # in case the file is currently being written (good enough logic?) @@ -103,7 +106,7 @@ def main() -> None: ) logging.getLogger().addHandler(hand) logging_tools.set_level( - "INFO", + "DEBUG", first_party_loggers=LOGGER, third_party_level="INFO", future_third_parties=[], From 5b7ff2da9eeda41b68a795783f2c089d03fcc93b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 16:00:38 -0600 Subject: [PATCH 225/327] prod_tester: get scan status --- resources/prod_tester/test_runner.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 7d17a139..7986e419 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -101,9 +101,8 @@ async def monitor(rc: RestClient, scan_id: str, log_file: Path | None = None) -> print( pformat(result_resp), file=out, flush=True ) # pprint doesn't have flush - done = result_resp["is_final"] except Exception as e: # 404 (scanner not yet online) - print(f"ok: {repr(e)}", file=out, flush=True) + print(f"suppressed error: {repr(e)}", file=out, flush=True) # get progress try: @@ -117,11 +116,22 @@ async def monitor(rc: RestClient, scan_id: str, log_file: Path | None = None) -> 
print(json.dumps(progress, indent=4), file=out, flush=True) except Exception as e: # 404 (scanner not yet online) or KeyError (no progress yet) - print(f"ok: {repr(e)}", file=out, flush=True) + print(f"suppressed error: {repr(e)}", file=out, flush=True) + + # get status + try: + result_resp = await rc.request("GET", f"/scan/{scan_id}/status") + print( + pformat(result_resp), file=out, flush=True + ) # pprint doesn't have flush + done = result_resp["scan_complete"] + except Exception as e: + print(f"suppressed error: {repr(e)}", file=out, flush=True) # done? else, wait print(scan_id, file=out, flush=True) if done: print("scan is done!", file=out, flush=True) return result_resp["skyscan_result"] - await asyncio.sleep(60) + else: + await asyncio.sleep(60) From e0c718b6fc97e51ad974a7acc46a255087db2e16 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 16:07:57 -0600 Subject: [PATCH 226/327] fix: ewms request syntax - 2 (tests) --- tests/integration/dummy_ewms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index 856a3ddd..a61c49c7 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -56,13 +56,14 @@ def dummy_workflows_finished(workflow_id: str): @app.route("/v0/query/taskforces", methods=["POST"]) def dummy_query_taskforces(): - workflow_id = request.get_json("query")["workflow_id"] + query = request.get_json("query") + print(query) # respond with correctly-syntaxed gibberish resp = { "taskforces": [ { - "taskforce_uuid": f"TF-{workflow_id['workflow_id']}", + "taskforce_uuid": f"TF-{query['workflow_id']}", "phase": "the-best-phase-ever", } ] From 13a278acb5e811be138050530514d1a6538ccf31 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 16:22:12 -0600 Subject: [PATCH 227/327] fix: ewms request syntax - 3 (tests) --- tests/integration/dummy_ewms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index a61c49c7..c549aaf3 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -57,7 +57,7 @@ def dummy_workflows_finished(workflow_id: str): @app.route("/v0/query/taskforces", methods=["POST"]) def dummy_query_taskforces(): query = request.get_json("query") - print(query) + print(f"the query: {query}") # respond with correctly-syntaxed gibberish resp = { From bd22f7e1d15de49bd9e8400ddf5a12552e493a1f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 16:23:14 -0600 Subject: [PATCH 228/327] fix: ewms request syntax - 4 (tests) --- tests/integration/dummy_ewms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index c549aaf3..95c9a295 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -56,14 +56,14 @@ def dummy_workflows_finished(workflow_id: str): @app.route("/v0/query/taskforces", methods=["POST"]) def dummy_query_taskforces(): - query = request.get_json("query") - print(f"the query: {query}") + req_json = request.get_json() + pprint.pprint(req_json) # respond with correctly-syntaxed gibberish resp = { "taskforces": [ { - "taskforce_uuid": f"TF-{query['workflow_id']}", + "taskforce_uuid": f"TF-{req_json['query']['workflow_id']}", "phase": "the-best-phase-ever", } ] From 08552949b449a799e43939ca7c2cf623238d645f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 16:36:02 -0600 Subject: [PATCH 229/327] give ewms the cvmfs image path --- skydriver/ewms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index e34ed545..6e367754 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -29,7 +29,7 @@ async def request_workflow_on_ewms( raise TypeError("Scan is not designated for EWMS") s3_url_get = s3.generate_s3_get_url(s3_client, manifest.scan_id) - image = 
images.get_skyscan_docker_image(scan_request_obj["docker_tag"]) + image = images.get_skyscan_cvmfs_singularity_image(scan_request_obj["docker_tag"]) body = { "public_queue_aliases": [QUEUE_ALIAS_TOCLIENT, QUEUE_ALIAS_FROMCLIENT], From 001c767de2aeeb709488e60679b18f8b5b2ac0f6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 16:50:15 -0600 Subject: [PATCH 230/327] logging - fix --- s3_sidecar/post.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/s3_sidecar/post.py b/s3_sidecar/post.py index 735cf577..24bbf539 100644 --- a/s3_sidecar/post.py +++ b/s3_sidecar/post.py @@ -34,7 +34,7 @@ def post(fpath: Path) -> None: if not fpath.exists(): raise FileNotFoundError(str(fpath)) with open(fpath, "r") as f: - LOGGER.debug(json.load(f, indent=4)) + LOGGER.debug(json.dumps(json.load(f), indent=4)) LOGGER.info("file exists, waiting a bit longer just in case") time.sleep(5) # in case the file is currently being written (good enough logic?) From 629648bbe57bfed7e1b7cef43bab5e717d735bdd Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 17:17:50 -0600 Subject: [PATCH 231/327] add more fields to scan/.../status --- README.md | 6 +++--- skydriver/ewms.py | 11 ++++++++++- skydriver/rest_handlers.py | 14 +++++++++----- tests/integration/dummy_ewms.py | 1 + 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 822b0a8d..e4454a07 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull 
requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) +[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) # SkyDriver v1 @@ -313,11 +313,11 @@ None "scan_state": str, # a short human-readable code "is_deleted": bool, "scan_complete": bool, # workforce is done - "pods": { # field is included only if `include_pod_statuses == True` + "k8s_pods": { # field is included only if `include_pod_statuses == True` "pod_status": dict, # a large k8s status object "pod_status_message": str, # a human-readable message explaining the pod status retrieval } - "clusters": list, # same as Manifest's clusters field + "ewms_workforce": list, # statuses on ewms' components that run scanner clients } ``` diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 6e367754..683f5aaa 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -158,6 +158,15 @@ async def get_taskforce_phases( }, ) return [ - {"taskforce": tf["taskforce_uuid"], "phase": tf["phase"]} + { + k: tf[k] + for k in [ + "taskforce_uuid", + "phase", + "phase_change_log", + "compound_statuses", + "top_task_errors", + ] + } for tf in resp["taskforces"] ] diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 06f4de1c..fe27aeea 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -294,8 +294,12 @@ 
def _classifiers_validator(val: Any) -> dict[str, str | bool | float | int | Non # type checks if not isinstance(val, dict): raise argparse.ArgumentTypeError("must be a dict") - if any(v for v in val.values() if not isinstance(v, str | bool | float | int | None)): - raise argparse.ArgumentTypeError("entry must be 'str | bool | float | int | None'") + if any( + v for v in val.values() if not isinstance(v, str | bool | float | int | None) + ): + raise argparse.ArgumentTypeError( + "entry must be 'str | bool | float | int | None'" + ) # size check if len(val) > MAX_CLASSIFIERS_LEN: @@ -1064,11 +1068,11 @@ async def get(self, scan_id: str) -> None: "scan_state": scan_state, "is_deleted": manifest.is_deleted, "scan_complete": does_scan_state_indicate_final_result_received(scan_state), - "pods": pods_411, - "clusters": clusters, + "k8s_pods": pods_411, + "ewms_workforce": clusters, } if not args.include_pod_statuses: - resp.pop("pods") + resp.pop("k8s_pods") self.write(resp) # diff --git a/tests/integration/dummy_ewms.py b/tests/integration/dummy_ewms.py index 95c9a295..bdf22965 100644 --- a/tests/integration/dummy_ewms.py +++ b/tests/integration/dummy_ewms.py @@ -65,6 +65,7 @@ def dummy_query_taskforces(): { "taskforce_uuid": f"TF-{req_json['query']['workflow_id']}", "phase": "the-best-phase-ever", + # IRL, there are other attrs here } ] } From 3f846d40dbec485270ad2d3e2871846b097e2c3d Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 5 Feb 2025 23:18:23 +0000 Subject: [PATCH 232/327] update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e4454a07..300b3348 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of 
code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) +[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) # SkyDriver v1 From cbeed8a63befd13fc154b8b05ad1614f3c7a6077 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 5 Feb 2025 17:44:49 -0600 Subject: [PATCH 233/327] add more fields to scan/.../status - 2 --- skydriver/ewms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 683f5aaa..e57b69ef 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -159,7 +159,7 @@ async def get_taskforce_phases( ) return [ { - k: tf[k] + k: tf.get(k) for k in [ "taskforce_uuid", "phase", From af17b26284e9192e32a44ec9872997bca216912a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 6 Feb 2025 11:54:28 -0600 Subject: [PATCH 234/327] s3-sidecar: use __main__ --- s3_sidecar/{post.py => __main__.py} | 0 skydriver/k8s/scanner_instance.py | 2 +- tests/integration/test_rest_routes.py | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename 
s3_sidecar/{post.py => __main__.py} (100%) diff --git a/s3_sidecar/post.py b/s3_sidecar/__main__.py similarity index 100% rename from s3_sidecar/post.py rename to s3_sidecar/__main__.py diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index dc30d80f..fbc3a1cf 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -180,7 +180,7 @@ def _make_job( - name: sidecar-s3-{scan_id} restartPolicy: OnFailure image: {ENV.THIS_IMAGE_WITH_TAG} - command: ["python", "-m", "s3_sidecar.post"] + command: ["python", "-m", "s3_sidecar"] args: ["{SkyScanK8sJobFactory._STARTUP_JSON_FPATH}", "--wait-indefinitely"] env: - name: S3_URL diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index a6f62824..c0cc8a4a 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -282,7 +282,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "/common-space/startup.json", "--wait-indefinitely", ], - "command": ["python", "-m", "s3_sidecar.post"], + "command": ["python", "-m", "s3_sidecar"], "env": [ {"name": "S3_URL", "value": os.environ["S3_URL"]}, { From b8ba6fa580e047507479aae022a54b5231986465 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 6 Feb 2025 12:09:53 -0600 Subject: [PATCH 235/327] s3-sidecar: add lifetime timer --- s3_sidecar/__main__.py | 18 ++++++++++++++++-- skydriver/config.py | 1 + skydriver/k8s/scanner_instance.py | 2 ++ tests/integration/test_rest_routes.py | 4 ++++ 4 files changed, 23 insertions(+), 2 deletions(-) diff --git a/s3_sidecar/__main__.py b/s3_sidecar/__main__.py index 24bbf539..f3cc82b1 100644 --- a/s3_sidecar/__main__.py +++ b/s3_sidecar/__main__.py @@ -24,6 +24,7 @@ class EnvConfig: S3_SECRET_KEY: str S3_BUCKET: str S3_OBJECT_KEY: str + K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS: int ENV = from_environment_as_dataclass(EnvConfig) @@ -84,13 +85,26 @@ def main() -> None: args = parser.parse_args() 
logging_tools.log_argparse_args(args) - logger_timer = IntervalTimer(5, LOGGER) + housekeeping_timer = IntervalTimer( + 5, + logging.getLogger(f"{LOGGER.name}.housekeeping"), + ) + lifetime_timer = IntervalTimer( + ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS, + logging.getLogger(f"{LOGGER.name}.lifetime_timer"), + ) if args.wait_indefinitely: LOGGER.info("Waiting for file to exist...") while not args.fpath.exists(): - if logger_timer.has_interval_elapsed(): + if housekeeping_timer.has_interval_elapsed(): + # log LOGGER.info("still waiting...") + # has it been too long? + if lifetime_timer.has_interval_elapsed(): + raise RuntimeError( + f"lifetime timer has expired: {lifetime_timer} seconds" + ) time.sleep(1) post(args.fpath) diff --git a/skydriver/config.py b/skydriver/config.py index e0607a22..43d568a8 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -91,6 +91,7 @@ class EnvConfig: K8S_SCANNER_SIDECAR_S3_CPU_LIMIT: float = 0.10 K8S_SCANNER_SIDECAR_S3_MEM_REQUEST: str = K8S_MIN_MEM_REQUEST K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.05 + K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS: int = 15 * 60 # 15 mins # EWMS optional config EWMS_WORKER_MEMORY__DEFAULT: str = "8GB" diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index fbc3a1cf..8f4851c2 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -199,6 +199,8 @@ def _make_job( value: "{ENV.S3_BUCKET}" - name: S3_OBJECT_KEY value: "{s3.make_object_key(scan_id)}" + - name: K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS + value: {ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS} resources: limits: memory: "{ENV.K8S_SCANNER_SIDECAR_S3_MEM_LIMIT}" diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index c0cc8a4a..dc70c65b 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -315,6 +315,10 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "name": 
"S3_OBJECT_KEY", "value": f"{post_resp['scan_id']}-s3-object", }, + { + "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", + "value": 15 * 60, + }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], "name": f"sidecar-s3-{post_resp['scan_id']}", From f0981f987e21ac869d41fb8960c228b2a59fdd08 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 6 Feb 2025 12:10:28 -0600 Subject: [PATCH 236/327] (test: faulty scanner) --- skydriver/k8s/scanner_instance.py | 1 - 1 file changed, 1 deletion(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 8f4851c2..0109d429 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -165,7 +165,6 @@ def _make_job( image: {images.get_skyscan_docker_image(docker_tag)} command: [] args: {_to_inline_yaml_str(scanner_server_args.split())} - env: {_to_inline_yaml_str(scanner_server_envvars)} resources: limits: memory: "{scanner_server_memory_bytes}" From 53ae4d3cec89a1ab00ce9882d3af3a198a004c53 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 6 Feb 2025 12:19:09 -0600 Subject: [PATCH 237/327] s3-sidecar: add lifetime timer - 2 --- skydriver/k8s/scanner_instance.py | 2 +- tests/integration/test_rest_routes.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 0109d429..2dfb9ab8 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -199,7 +199,7 @@ def _make_job( - name: S3_OBJECT_KEY value: "{s3.make_object_key(scan_id)}" - name: K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS - value: {ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS} + value: "{ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS}" resources: limits: memory: "{ENV.K8S_SCANNER_SIDECAR_S3_MEM_LIMIT}" diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index dc70c65b..07219e5b 100644 --- a/tests/integration/test_rest_routes.py +++ 
b/tests/integration/test_rest_routes.py @@ -317,7 +317,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 }, { "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": 15 * 60, + "value": str(15 * 60), }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], From a5737788420973d65464de5553ca68003b545f32 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 6 Feb 2025 12:42:19 -0600 Subject: [PATCH 238/327] Revert "(test: faulty scanner)" This reverts commit f0981f987e21ac869d41fb8960c228b2a59fdd08. --- skydriver/k8s/scanner_instance.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 2dfb9ab8..28e69fb3 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -165,6 +165,7 @@ def _make_job( image: {images.get_skyscan_docker_image(docker_tag)} command: [] args: {_to_inline_yaml_str(scanner_server_args.split())} + env: {_to_inline_yaml_str(scanner_server_envvars)} resources: limits: memory: "{scanner_server_memory_bytes}" From a48b673a2d54f3f4998ba17317b3efe1033157b9 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 6 Feb 2025 12:43:03 -0600 Subject: [PATCH 239/327] s3-sidecar: add lifetime timer - 3 --- s3_sidecar/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/s3_sidecar/__main__.py b/s3_sidecar/__main__.py index f3cc82b1..14a88df3 100644 --- a/s3_sidecar/__main__.py +++ b/s3_sidecar/__main__.py @@ -103,7 +103,7 @@ def main() -> None: # has it been too long? 
if lifetime_timer.has_interval_elapsed(): raise RuntimeError( - f"lifetime timer has expired: {lifetime_timer} seconds" + f"lifetime timer has expired: {lifetime_timer.seconds} seconds" ) time.sleep(1) From 7922853616e84b29e9b4944b84a7f07ac456f14a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 10 Feb 2025 14:19:09 -0600 Subject: [PATCH 240/327] fix s3-url generation --- skydriver/ewms.py | 25 +++++++++++++++++++-- skydriver/k8s/scanner_instance.py | 4 ++-- skydriver/s3.py | 28 ------------------------ tests/integration/test_backlog_runner.py | 26 +++++++++++----------- 4 files changed, 38 insertions(+), 45 deletions(-) delete mode 100644 skydriver/s3.py diff --git a/skydriver/ewms.py b/skydriver/ewms.py index e57b69ef..89c2bf79 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -8,7 +8,7 @@ import requests from rest_tools.client import RestClient -from . import database, images, s3 +from . import database, images from .config import ENV, QUEUE_ALIAS_FROMCLIENT, QUEUE_ALIAS_TOCLIENT from .database.schema import PENDING_EWMS_WORKFLOW @@ -28,7 +28,7 @@ async def request_workflow_on_ewms( else: # None raise TypeError("Scan is not designated for EWMS") - s3_url_get = s3.generate_s3_get_url(s3_client, manifest.scan_id) + s3_url_get = generate_presigned_s3_get_url(s3_client, manifest.scan_id) image = images.get_skyscan_cvmfs_singularity_image(scan_request_obj["docker_tag"]) body = { @@ -170,3 +170,24 @@ async def get_taskforce_phases( } for tf in resp["taskforces"] ] + + +def make_s3_object_key(scan_id: str) -> str: + """Construct the object key from the scan_id (deterministic).""" + return f"{scan_id}-s3-object" + + +def generate_presigned_s3_get_url( + s3_client: botocore.client.BaseClient, scan_id: str +) -> str: + """Generate a pre-signed S3 url for retrieving shared files.""" + # get GET url + get_url = s3_client.generate_presigned_url( + "get_object", + Params={ + "Bucket": ENV.S3_BUCKET, + "Key": make_s3_object_key(scan_id), + }, + ExpiresIn=24 * 60 * 
60, # seconds + ) + return get_url diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 28e69fb3..2e9e5d1a 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -8,7 +8,7 @@ import yaml from rest_tools.client import ClientCredentialsAuth -from .. import images, s3 +from .. import ewms, images from ..config import ( DebugMode, ENV, @@ -198,7 +198,7 @@ def _make_job( - name: S3_BUCKET value: "{ENV.S3_BUCKET}" - name: S3_OBJECT_KEY - value: "{s3.make_object_key(scan_id)}" + value: "{ewms.make_s3_object_key(scan_id)}" - name: K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS value: "{ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS}" resources: diff --git a/skydriver/s3.py b/skydriver/s3.py deleted file mode 100644 index 1f84a1c5..00000000 --- a/skydriver/s3.py +++ /dev/null @@ -1,28 +0,0 @@ -"""Utilities for interacting with S3 buckets.""" - -import logging - -import botocore.client # type: ignore[import-untyped] - -from .config import ENV - -LOGGER = logging.getLogger(__name__) - - -def make_object_key(scan_id: str) -> str: - """Construct the object key from the scan_id (deterministic).""" - return f"{scan_id}-s3-object" - - -def generate_s3_get_url(s3_client: botocore.client.BaseClient, object_key: str) -> str: - """Generate a pre-signed S3 url for retrieving shared files.""" - # get GET url - get_url = s3_client.generate_presigned_url( - "get_object", - Params={ - "Bucket": ENV.S3_BUCKET, - "Key": object_key, - }, - ExpiresIn=24 * 60 * 60, # seconds - ) - return get_url diff --git a/tests/integration/test_backlog_runner.py b/tests/integration/test_backlog_runner.py index f29f3ee1..8d43883c 100644 --- a/tests/integration/test_backlog_runner.py +++ b/tests/integration/test_backlog_runner.py @@ -36,10 +36,10 @@ def print_it(obj: Any) -> None: @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -@mock.patch("skydriver.s3.generate_s3_get_url") +@mock.patch("skydriver.ewms.generate_presigned_s3_get_url") 
async def test_00( kapitsj_mock: Mock, - s3gs3gurl_mock: Mock, + gps3geturl_mock: Mock, server: Callable[[], RestClient], ) -> None: """Test backlog job starting.""" @@ -51,17 +51,17 @@ async def test_00( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) # call counts - s3gs3gurl_mock.assert_called_once() + gps3geturl_mock.assert_called_once() kapitsj_mock.assert_called_once() print_it(await rc.request("GET", "/scans/backlog")) @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -@mock.patch("skydriver.s3.generate_s3_get_url") +@mock.patch("skydriver.ewms.generate_presigned_s3_get_url") async def test_01( kapitsj_mock: Mock, - s3gs3gurl_mock: Mock, + gps3geturl_mock: Mock, server: Callable[[], RestClient], ) -> None: """Test backlog job starting with multiple.""" @@ -78,26 +78,26 @@ async def test_01( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) # call counts - assert s3gs3gurl_mock.call_count >= i + 1 # in case runner is faster + assert gps3geturl_mock.call_count >= i + 1 # in case runner is faster assert kapitsj_mock.call_count >= i + 1 # in case runner is faster # call counts - assert s3gs3gurl_mock.call_count == N_JOBS + assert gps3geturl_mock.call_count == N_JOBS assert kapitsj_mock.call_count == N_JOBS await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) # any extra calls? 
- assert s3gs3gurl_mock.call_count == N_JOBS + assert gps3geturl_mock.call_count == N_JOBS assert kapitsj_mock.call_count == N_JOBS print_it(await rc.request("GET", "/scans/backlog")) @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -@mock.patch("skydriver.s3.generate_s3_get_url") +@mock.patch("skydriver.ewms.generate_presigned_s3_get_url") async def test_10( kapitsj_mock: Mock, - s3gs3gurl_mock: Mock, + gps3geturl_mock: Mock, server: Callable[[], RestClient], ) -> None: """Test backlog job starting with multiple cancels.""" @@ -123,16 +123,16 @@ async def test_10( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) # call counts - assert s3gs3gurl_mock.call_count >= i + 1 # in case runner is faster + assert gps3geturl_mock.call_count >= i + 1 # in case runner is faster assert kapitsj_mock.call_count >= i + 1 # in case runner is faster # call counts - assert s3gs3gurl_mock.call_count == N_JOBS - 2 + assert gps3geturl_mock.call_count == N_JOBS - 2 assert kapitsj_mock.call_count == N_JOBS - 2 await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) # any extra calls? 
- assert s3gs3gurl_mock.call_count == N_JOBS - 2 + assert gps3geturl_mock.call_count == N_JOBS - 2 assert kapitsj_mock.call_count == N_JOBS - 2 print_it(await rc.request("GET", "/scans/backlog")) From 27ee350d4592a79ae96b58eaee1ecb159bb4b544 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 10 Feb 2025 20:23:25 +0000 Subject: [PATCH 241/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 888f2c0a..766f9acf 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.14 -botocore==1.36.14 +boto3==1.36.16 +botocore==1.36.16 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -53,18 +53,18 @@ cryptography==44.0.0 └── pycparser [required: Any, installed: 2.22] pipdeptree==2.25.0 ├── packaging [required: >=24.1, installed: 24.2] -└── pip [required: >=24.2, installed: 25.0] +└── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.14] -│ ├── botocore [required: >=1.36.14,<1.37.0, installed: 1.36.14] +├── boto3 [required: Any, installed: 1.36.16] +│ ├── botocore [required: >=1.36.16,<1.37.0, installed: 1.36.16] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.14] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.16] │ ├── 
jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 331b1d9608dde239c9ce1dc2daaf03b1109616a1 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 10 Feb 2025 14:45:35 -0600 Subject: [PATCH 242/327] fix s3-url generation - 2 --- s3_sidecar/__main__.py | 1 + skydriver/config.py | 1 + skydriver/ewms.py | 14 ++++++++------ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/s3_sidecar/__main__.py b/s3_sidecar/__main__.py index 14a88df3..c4b57011 100644 --- a/s3_sidecar/__main__.py +++ b/s3_sidecar/__main__.py @@ -52,6 +52,7 @@ def post(fpath: Path) -> None: # POST LOGGER.info("generating presigned post-url...") upload_details = s3_client.generate_presigned_post(ENV.S3_BUCKET, ENV.S3_OBJECT_KEY) + LOGGER.info(json.dumps(upload_details, indent=4)) LOGGER.info("posting file to s3...") with open(fpath, "rb") as f: response = requests.post( diff --git a/skydriver/config.py b/skydriver/config.py index 43d568a8..9a8d4702 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -47,6 +47,7 @@ class EnvConfig: S3_SECRET_KEY: str # the actual value S3_SECRET_KEY__K8S_SECRET_KEY: str # the key used in the k8s secrets.yml S3_BUCKET: str + S3_EXPIRES_IN: int = 7 * 24 * 60 * 60 # 7 days # misc AUTH_AUDIENCE: str = "skydriver" diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 89c2bf79..a871ceba 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -181,13 +181,15 @@ def generate_presigned_s3_get_url( s3_client: botocore.client.BaseClient, scan_id: str ) -> str: """Generate a pre-signed S3 url for retrieving shared files.""" - # get GET url + params = { + "Bucket": ENV.S3_BUCKET, + "Key": make_s3_object_key(scan_id), + } + LOGGER.info(f"generating presigned s3-url for scan {scan_id} ({params})...") get_url = s3_client.generate_presigned_url( "get_object", - Params={ - "Bucket": ENV.S3_BUCKET, - "Key": make_s3_object_key(scan_id), - }, 
- ExpiresIn=24 * 60 * 60, # seconds + Params=params, + ExpiresIn=ENV.S3_EXPIRES_IN, # seconds ) + LOGGER.info(get_url) return get_url From dcbeedd4094a5d1dfa34e0c45513ca49a699ee77 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 10 Feb 2025 14:58:17 -0600 Subject: [PATCH 243/327] logging --- skydriver/k8s/scan_backlog.py | 3 +-- skydriver/k8s/utils.py | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 11bd9d9e..88fd8977 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -166,8 +166,7 @@ async def _run( # 1st: start k8s job -- this could be any k8s job (pre- or post-ewms switchover) # ^^^ b/c this uses local resources, if something goes wrong, this limits exposure try: - resp = KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) - LOGGER.info(resp) + KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) except kubernetes.utils.FailToCreateError as e: # k8s job (backlog entry) will be revived & restarted in future iteration LOGGER.exception(e) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 221ae6fc..620cc4d0 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -26,18 +26,18 @@ def start_job( if not job_dict: raise ValueError("Job object not created") try: - api_response = kubernetes.utils.create_from_dict( + resp = kubernetes.utils.create_from_dict( k8s_batch_api.api_client, job_dict, namespace=ENV.K8S_NAMESPACE, ) - LOGGER.info(api_response) + LOGGER.info(json.dumps(resp, indent=0)) # otherwise huge except Exception: # broad b/c re-raising LOGGER.error("request to make k8s job failed using:") LOGGER.error(json.dumps(job_dict, indent=4)) raise else: - return api_response + return resp @staticmethod def get_pods( From a3654fa7a82e3e59caca991d5110e38be6482cae Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 10 Feb 2025 22:20:47 +0000 Subject: [PATCH 244/327] update dependencies*.log files(s) --- 
dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 766f9acf..15c756c1 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.16 -botocore==1.36.16 +boto3==1.36.17 +botocore==1.36.17 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.16] -│ ├── botocore [required: >=1.36.16,<1.37.0, installed: 1.36.16] +├── boto3 [required: Any, installed: 1.36.17] +│ ├── botocore [required: >=1.36.17,<1.37.0, installed: 1.36.17] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.16] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.17] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From e6dcfcda86ae28ee7c49816783a7b8cf427c953f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 10 Feb 2025 17:23:57 -0600 Subject: [PATCH 245/327] supply url to grafana dashboard for scanner server logs --- README.md | 18 ++++----- skydriver/config.py | 2 + skydriver/k8s/scanner_instance.py | 32 +++++++++++++++- skydriver/k8s/utils.py | 55 +-------------------------- 
skydriver/rest_handlers.py | 62 ++++++------------------------- 5 files changed, 55 insertions(+), 114 deletions(-) diff --git a/README.md b/README.md index 300b3348..9e3ccfca 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) +[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) # SkyDriver v1 @@ -298,9 +298,7 @@ _Retrieve the status of a scan_ #### Arguments -| Argument | Type | Required/Default | Description | -|--------------------------|------|------------------|----------------------------------------------------------------------------------------------------------------| -| `"include_pod_statuses"` | bool | `False` | whether to include the k8s pod statuses for the clientmanager & central server -- expends additional resources +None #### 
SkyDriver Effects @@ -313,10 +311,9 @@ None "scan_state": str, # a short human-readable code "is_deleted": bool, "scan_complete": bool, # workforce is done - "k8s_pods": { # field is included only if `include_pod_statuses == True` - "pod_status": dict, # a large k8s status object - "pod_status_message": str, # a human-readable message explaining the pod status retrieval - } + "scanner_server_logs": { + "url": str, # a url to a web dashboard for viewing dashboards + }, "ewms_workforce": list, # statuses on ewms' components that run scanner clients } ``` @@ -356,8 +353,9 @@ None ``` { - "pod_container_logs": str | list[ dict[str,str] ], # list - "pod_container_logs_message": str, # a human-readable message explaining the log retrieval + "scanner_server": { + "url": str, # a url to a web dashboard for viewing dashboards + } } ``` diff --git a/skydriver/config.py b/skydriver/config.py index 9a8d4702..186ca4da 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -94,6 +94,8 @@ class EnvConfig: K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.05 K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS: int = 15 * 60 # 15 mins + GRAFANA_DASHBOARD_BASEURL: str = "" + # EWMS optional config EWMS_WORKER_MEMORY__DEFAULT: str = "8GB" EWMS_WORKER_DISK__DEFAULT: str = "1GB" diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 2e9e5d1a..08702607 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -5,9 +5,11 @@ import textwrap from pathlib import Path +import kubernetes import yaml from rest_tools.client import ClientCredentialsAuth +from .utils import KubeAPITools from .. 
import ewms, images from ..config import ( DebugMode, @@ -20,6 +22,11 @@ LOGGER = logging.getLogger(__name__) +def get_skyscan_server_container_name(scan_id: str) -> str: + """Get the k8s container name for the scanner server from the scan_id (deterministic).""" + return f"skyscan-server-{scan_id}" + + def _to_inline_yaml_str(obj: list[str] | sdict) -> str: """Convert obj-based attrs to yaml-syntax where each value is a string.""" if isinstance(obj, dict): @@ -161,7 +168,7 @@ def _make_job( - name: common-space-volume mountPath: "{SkyScanK8sJobFactory.COMMON_SPACE_VOLUME_PATH}" containers: - - name: skyscan-server-{scan_id} + - name: {get_skyscan_server_container_name(scan_id)} image: {images.get_skyscan_docker_image(docker_tag)} command: [] args: {_to_inline_yaml_str(scanner_server_args.split())} @@ -317,3 +324,26 @@ def make_skyscan_server_envvars( env.update(scanner_server_env_from_user) return env + + +def assemble_scanner_server_logs_url( + k8s_batch_api: kubernetes.client.BatchV1Api, + scan_id: str, +) -> str: + """Get the URL pointing to a web dashboard for viewing the scanner server's logs.""" + job_name = SkyScanK8sJobFactory.get_job_name(scan_id) + k8s_core_api = kubernetes.client.CoreV1Api(api_client=k8s_batch_api.api_client) + + try: + for podname in KubeAPITools.get_pods(k8s_core_api, job_name, ENV.K8S_NAMESPACE): + # this is an iterator, but in reality, the job should only map to 1 pod + return ( + f"{ENV.GRAFANA_DASHBOARD_BASEURL}" + f"&var-namespace={ENV.K8S_NAMESPACE}" + f"&var-pod={podname}" + f"&var-container={get_skyscan_server_container_name(scan_id)}" + ) + except Exception as e: + LOGGER.error(f"there was an issue retrieving k8s pod(s) for {scan_id=}") + LOGGER.exception(e) + return "404" # don't return exception info for security reasons diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 620cc4d0..4bdb2708 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -44,7 +44,7 @@ def get_pods( k8s_core_api: 
kubernetes.client.CoreV1Api, job_name: str, namespace: str, - ) -> Iterator[kubernetes.client.V1Pod]: + ) -> Iterator[str]: """Get each pod corresponding to the job. Raises `ValueError` if there are no pods for the job. @@ -55,55 +55,4 @@ def get_pods( if not pods.items: raise ValueError(f"Job {job_name} has no pods") for pod in pods.items: - yield pod - - @staticmethod - def get_pod_status( - k8s_batch_api: kubernetes.client.BatchV1Api, - job_name: str, - namespace: str, - ) -> dict[str, dict[str, Any]]: - """Get the status of the k8s pod(s) and their containers. - - Raises `ValueError` if there are no pods for the job. - """ - LOGGER.info(f"getting pod status for {job_name=} {namespace=}") - status = {} - - k8s_core_api = kubernetes.client.CoreV1Api(api_client=k8s_batch_api.api_client) - - for pod in KubeAPITools.get_pods(k8s_core_api, job_name, namespace): - pod_status: kubernetes.client.V1PodStatus = pod.status - # pod status has non-serializable things like datetime objects - serializable = json.loads(json.dumps(pod_status.to_dict(), default=str)) - status[pod.metadata.name] = serializable - - return status - - @staticmethod - def get_container_logs( - k8s_batch_api: kubernetes.client.BatchV1Api, - job_name: str, - namespace: str, - ) -> dict[str, dict[str, str]]: - """Grab the logs for all containers. - - Raises `ValueError` if there are no pods for the job. 
- """ - LOGGER.info(f"getting logs for {job_name=} {namespace=}") - logs = {} - - k8s_core_api = kubernetes.client.CoreV1Api(api_client=k8s_batch_api.api_client) - - for pod in KubeAPITools.get_pods(k8s_core_api, job_name, namespace): - these_logs = {} - for container in pod.spec.containers: - these_logs[container.name] = k8s_core_api.read_namespaced_pod_log( - pod.metadata.name, - namespace, - container=container.name, - timestamps=True, - ) - logs[pod.metadata.name] = these_logs - - return logs + yield pod.metadata.name diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index fe27aeea..281192c8 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -26,7 +26,7 @@ from tornado import web from wipac_dev_tools import argparse_tools -from . import database, ewms, images, k8s +from . import database, ewms, images from .config import ( DebugMode, ENV, @@ -37,7 +37,7 @@ from .database.schema import PENDING_EWMS_WORKFLOW from .ewms import request_stop_on_ewms from .k8s.scan_backlog import put_on_backlog -from .k8s.scanner_instance import SkyScanK8sJobFactory +from .k8s.scanner_instance import SkyScanK8sJobFactory, assemble_scanner_server_logs_url from .utils import does_scan_state_indicate_final_result_received, get_scan_state LOGGER = logging.getLogger(__name__) @@ -1020,35 +1020,8 @@ class ScanStatusHandler(BaseSkyDriverHandler): @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get a scan's status.""" - arghand = ArgumentHandler(ArgumentSource.QUERY_ARGUMENTS, self) - arghand.add_argument( - "include_pod_statuses", - type=bool, - default=False, - ) - args = arghand.parse_args() - manifest = await self.manifests.get(scan_id, incl_del=True) - # get pod status - pods_411: dict[str, Any] = {} - if args.include_pod_statuses: - try: - pods_411["pod_status"] = k8s.utils.KubeAPITools.get_pod_status( - self.k8s_batch_api, - SkyScanK8sJobFactory.get_job_name(scan_id), 
- ENV.K8S_NAMESPACE, - ) - pods_411["pod_message"] = "retrieved" - except (kubernetes.client.rest.ApiException, ValueError) as e: - if await self.scan_backlog.is_in_backlog(scan_id): - pods_411["pod_status"] = {} - pods_411["pod_message"] = "in backlog" - else: - pods_411["pod_status"] = {} - pods_411["pod_message"] = "pod(s) not found" - LOGGER.exception(e) - # scan state scan_state = await get_scan_state(manifest, self.ewms_rc, self.results) @@ -1068,11 +1041,13 @@ async def get(self, scan_id: str) -> None: "scan_state": scan_state, "is_deleted": manifest.is_deleted, "scan_complete": does_scan_state_indicate_final_result_received(scan_state), - "k8s_pods": pods_411, + "scanner_server_logs": { + "url": assemble_scanner_server_logs_url( + self.k8s_batch_api, manifest.scan_id + ), + }, "ewms_workforce": clusters, } - if not args.include_pod_statuses: - resp.pop("k8s_pods") self.write(resp) # @@ -1091,26 +1066,13 @@ class ScanLogsHandler(BaseSkyDriverHandler): @service_account_auth(roles=[USER_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get a scan's logs.""" - try: - pod_container_logs = k8s.utils.KubeAPITools.get_container_logs( - self.k8s_batch_api, - SkyScanK8sJobFactory.get_job_name(scan_id), - ENV.K8S_NAMESPACE, - ) - pod_container_logs_message = "retrieved" - except (kubernetes.client.rest.ApiException, ValueError) as e: - if await self.scan_backlog.is_in_backlog(scan_id): - pod_container_logs = {} - pod_container_logs_message = "in backlog" - else: - pod_container_logs = {} - pod_container_logs_message = "pod(s) not found" - LOGGER.exception(e) - self.write( { - "pod_container_logs": pod_container_logs, - "pod_container_logs_message": pod_container_logs_message, + "scanner_server": { + "url": assemble_scanner_server_logs_url( + self.k8s_batch_api, scan_id + ), + } } ) From c4e75a3e8230777fbacf8da9772ae08caabff9be Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 10 Feb 2025 23:24:28 +0000 Subject: [PATCH 246/327] update README.md 
--- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9e3ccfca..0ce19c41 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) +[![GitHub release (latest by date including pre-releases)](https://img.shields.io/github/v/release/WIPACrepo/SkyDriver?include_prereleases)](https://github.com/WIPACrepo/SkyDriver/) [![Lines of code](https://img.shields.io/tokei/lines/github/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/) [![GitHub issues](https://img.shields.io/github/issues/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/issues?q=is%3Aissue+sort%3Aupdated-desc+is%3Aopen) [![GitHub pull requests](https://img.shields.io/github/issues-pr/WIPACrepo/SkyDriver)](https://github.com/WIPACrepo/SkyDriver/pulls?q=is%3Apr+sort%3Aupdated-desc+is%3Aopen) # SkyDriver v1 From b3d1d0b37f29c44e2fac7890a21c8584e596e065 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 10 Feb 2025 17:42:37 -0600 Subject: [PATCH 247/327] add to `get_scan.py` script --- resources/get_scan.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/resources/get_scan.py b/resources/get_scan.py index e04e963d..c6bc8bc6 100644 --- a/resources/get_scan.py +++ b/resources/get_scan.py @@ -54,8 +54,12 @@ async def main(): rc = get_rest_client(args.skydriver_url) 
logging.info(f"getting manifest for scan {args.scan_id}") - manifest = await rc.request("GET", f"/scan/{args.scan_id}/manifest") - print(json.dumps(manifest, indent=4), flush=True) + resp = await rc.request("GET", f"/scan/{args.scan_id}/manifest") + print(json.dumps(resp, indent=4), flush=True) + + logging.info(f"getting statuses for scan {args.scan_id}") + resp = await rc.request("GET", f"/scan/{args.scan_id}/status") + print(json.dumps(resp, indent=4), flush=True) if __name__ == "__main__": From 2f430e204a43ce3532eb5b05d200d95d551fad36 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 10 Feb 2025 17:43:56 -0600 Subject: [PATCH 248/327] mypy --- skydriver/k8s/scanner_instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 08702607..b9e02852 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -5,7 +5,7 @@ import textwrap from pathlib import Path -import kubernetes +import kubernetes.client # type: ignore[import-untyped] import yaml from rest_tools.client import ClientCredentialsAuth From 3c75f13f09e732db6845d4f0c981ad8ca895dca7 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 10 Feb 2025 17:48:22 -0600 Subject: [PATCH 249/327] mypy - 2 --- skydriver/k8s/scanner_instance.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index b9e02852..c02693fe 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -346,4 +346,6 @@ def assemble_scanner_server_logs_url( except Exception as e: LOGGER.error(f"there was an issue retrieving k8s pod(s) for {scan_id=}") LOGGER.exception(e) - return "404" # don't return exception info for security reasons + + # fall-through + return "404" # don't return exception info for security reasons From 068a02fb5619c325b1a449bb32c8d1fb0bab66b6 Mon Sep 17 00:00:00 2001 From: ric-evans 
Date: Tue, 11 Feb 2025 11:54:03 -0600 Subject: [PATCH 250/327] logging --- skydriver/images.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/skydriver/images.py b/skydriver/images.py index ee98a2f2..ca891b8f 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -1,5 +1,6 @@ """Utilities for dealing with docker/cvmfs/singularity images.""" +import json import logging import re from pathlib import Path @@ -144,7 +145,9 @@ def get_info_from_docker_hub(docker_tag: str) -> tuple[dict, str]: raise _error try: - resp = requests.get(f"{DOCKERHUB_API_URL}/{docker_tag}") + url = f"{DOCKERHUB_API_URL}/{docker_tag}" + LOGGER.info(f"looking at {url}...") + resp = requests.get(url) except Exception as e: LOGGER.exception(e) raise ValueError("Image tag verification failed") @@ -152,6 +155,7 @@ def get_info_from_docker_hub(docker_tag: str) -> tuple[dict, str]: if not resp.ok: raise _error + LOGGER.debug(json.dumps(resp.json(), indent=0)) return resp.json(), docker_tag From 16712df97f682f90a7a7246028909354c6bb70d5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 11:57:46 -0600 Subject: [PATCH 251/327] request timeout --- skydriver/images.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/images.py b/skydriver/images.py index ca891b8f..70a8f3c5 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -147,7 +147,7 @@ def get_info_from_docker_hub(docker_tag: str) -> tuple[dict, str]: try: url = f"{DOCKERHUB_API_URL}/{docker_tag}" LOGGER.info(f"looking at {url}...") - resp = requests.get(url) + resp = requests.get(url, timeout=10) except Exception as e: LOGGER.exception(e) raise ValueError("Image tag verification failed") From a635a29928ca8aeb6fb84ff9c38d2ab4988cdeb5 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 11 Feb 2025 18:01:44 +0000 Subject: [PATCH 252/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 15c756c1..e9e27b42 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -13,7 +13,7 @@ cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 charset-normalizer==3.4.1 -cryptography==44.0.0 +cryptography==44.0.1 dacite==1.8.1 dnspython==2.7.0 durationpy==0.9 @@ -48,7 +48,7 @@ wipac-rest-tools==1.8.5 ######################################################################## # pipdeptree ######################################################################## -cryptography==44.0.0 +cryptography==44.0.1 └── cffi [required: >=1.12, installed: 1.17.1] └── pycparser [required: Any, installed: 2.22] pipdeptree==2.25.0 From 48d2b87c6beb85c94333577a1b31b4bf5cc70670 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 12:20:54 -0600 Subject: [PATCH 253/327] logging - 2 --- skydriver/images.py | 53 ++++++++++++++++++++++++++++++--------------- 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/skydriver/images.py b/skydriver/images.py index 70a8f3c5..b018b8af 100644 --- a/skydriver/images.py +++ b/skydriver/images.py @@ -1,6 +1,5 @@ """Utilities for dealing with docker/cvmfs/singularity images.""" -import json import logging import re from pathlib import Path @@ -21,7 +20,7 @@ _IMAGE = "skymap_scanner" _SKYSCAN_DOCKER_IMAGE_NO_TAG = f"icecube/{_IMAGE}" -DOCKERHUB_API_URL = ( +SKYSCAN_DOCKERHUB_API_URL = ( f"https://hub.docker.com/v2/repositories/{_SKYSCAN_DOCKER_IMAGE_NO_TAG}/tags" ) @@ -54,24 +53,42 @@ def get_skyscan_docker_image(tag: str) -> str: # utils -def _match_sha_to_majminpatch(sha: str) -> str | None: +def _match_sha_to_majminpatch(target_sha: str) -> str | None: """Finds the image w/ same SHA and has a version tag like '#.#.#'. No error handling """ - url = DOCKERHUB_API_URL - while True: - resp = requests.get(url).json() + LOGGER.debug( + f"finding an image that has a version tag like '#.#.#' for sha={target_sha}..." 
+ ) + + url = SKYSCAN_DOCKERHUB_API_URL + while True: # loop for pagination + LOGGER.info(f"looking at {url}...") + resp = requests.get(url, timeout=10).json() + + # look at each result on this page for result in resp["results"]: - if sha != result.get("digest", result["images"][0]["digest"]): - # some old ones have their 'digest' in their 'images' list entry + result_sha = result.get("digest", result["images"][0]["digest"]) + # ^^^ some old ones have their 'digest' in their 'images' list entry + LOGGER.debug(f"an api image: sha={result_sha} ({result})") + if target_sha != result_sha: + LOGGER.debug("-> no match, looking at next...") continue - if VERSION_REGEX_MAJMINPATCH.fullmatch(result["name"]): + elif VERSION_REGEX_MAJMINPATCH.fullmatch(result["name"]): + LOGGER.debug("-> success! matches AND has a full version tag") return result["name"] # type: ignore[no-any-return] - if not resp["next"]: - break - url = resp["next"] - return None + else: + LOGGER.debug("-> matches, but not a full version tag") + + # what now? get url for the next page + if not resp["next"]: # no more -> no match! 
+ LOGGER.debug( + f"-> could not find a full version tag matching sha={target_sha}" + ) + return None + else: + url = resp["next"] def _parse_image_ts(info: dict) -> float: @@ -127,7 +144,7 @@ def _try_resolve_to_majminpatch_docker_hub(docker_tag: str) -> str: return docker_tag except Exception as e: LOGGER.exception(e) - raise ValueError("Image tag could not resolve to a full version") + raise ValueError("Error validating image on Docker Hub") def get_info_from_docker_hub(docker_tag: str) -> tuple[dict, str]: @@ -145,7 +162,7 @@ def get_info_from_docker_hub(docker_tag: str) -> tuple[dict, str]: raise _error try: - url = f"{DOCKERHUB_API_URL}/{docker_tag}" + url = f"{SKYSCAN_DOCKERHUB_API_URL}/{docker_tag}" LOGGER.info(f"looking at {url}...") resp = requests.get(url, timeout=10) except Exception as e: @@ -155,7 +172,7 @@ def get_info_from_docker_hub(docker_tag: str) -> tuple[dict, str]: if not resp.ok: raise _error - LOGGER.debug(json.dumps(resp.json(), indent=0)) + LOGGER.debug(resp) return resp.json(), docker_tag @@ -167,7 +184,9 @@ def resolve_docker_tag(docker_tag: str) -> str: """ LOGGER.info(f"checking docker tag: {docker_tag}") try: - return _try_resolve_to_majminpatch_docker_hub(docker_tag) + out_image = _try_resolve_to_majminpatch_docker_hub(docker_tag) + LOGGER.info(f"resolved tag: {docker_tag} -> {out_image}") + return out_image except Exception as e: LOGGER.exception(e) raise e From 7331c341fc8bc07e442c01e07685b73f3a5b4d0a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 12:44:25 -0600 Subject: [PATCH 254/327] logging - 3 --- skydriver/ewms.py | 1 + skydriver/k8s/scan_backlog.py | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index a871ceba..430f6bb3 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -91,6 +91,7 @@ async def request_workflow_on_ewms( } try: + LOGGER.info("requesting to ewms...") resp = await ewms_rc.request("POST", "/v0/workflows", body) except 
requests.exceptions.HTTPError: LOGGER.error("request to ewms failed using:") diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 88fd8977..e4f081c7 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -166,6 +166,7 @@ async def _run( # 1st: start k8s job -- this could be any k8s job (pre- or post-ewms switchover) # ^^^ b/c this uses local resources, if something goes wrong, this limits exposure try: + LOGGER.info(f"Starting K8s job: scan_id={manifest.scan_id}") KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) except kubernetes.utils.FailToCreateError as e: # k8s job (backlog entry) will be revived & restarted in future iteration @@ -176,6 +177,7 @@ async def _run( # 2nd: request a workflow on EWMS # ^^^ do after k8s b/c now we know that that was successful try: + LOGGER.info(f"Requesting EWMS Workflow: scan_id={manifest.scan_id}") workflow_id = await ewms.request_workflow_on_ewms( ewms_rc, s3_client, @@ -187,6 +189,7 @@ async def _run( timer_main_loop.fastforward() # nothing was started, so don't wait long continue else: + LOGGER.info(f"-> {workflow_id=}: scan_id={manifest.scan_id}") await manifest_client.collection.find_one_and_update( {"scan_id": manifest.scan_id}, {"$set": {"ewms_workflow_id": workflow_id}}, @@ -194,6 +197,7 @@ async def _run( ) # remove from backlog now that startup succeeded + LOGGER.info(f"Scan successfully started: scan_id={manifest.scan_id}") await backlog_client.remove(entry) # TODO: remove k8s job doc? 
From 84227b5c84904190e4455147935b2d378ab73a9d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 12:51:16 -0600 Subject: [PATCH 255/327] add timeout to ewms-init container --- ewms_init_container/__main__.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 1ac58c15..c6464e4d 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -9,10 +9,12 @@ from pathlib import Path from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment_as_dataclass, logging_tools +from wipac_dev_tools import from_environment_as_dataclass, logging_tools, timing_tools LOGGER = logging.getLogger(__package__) +TIMEOUT = 5 * 60 + @dc.dataclass(frozen=True) class EnvConfig: @@ -43,9 +45,19 @@ async def get_workflow_id(scan_id: str) -> str: logger=LOGGER, ) - # get the id, with retries + timeout_timer = timing_tools.IntervalTimer( + TIMEOUT, logging.getLogger(f"{LOGGER.name}.timeout_timer") + ) + + # get the id, with retries -- this id should be available pretty quickly while True: + if timeout_timer.has_interval_elapsed(): + raise TimeoutError( + f"EWMS workflow_id could not be retrieved within {timeout_timer.seconds} seconds" + ) + # get id resp = await skyd_rc.request("GET", f"/scan/{scan_id}/manifest") + # parse match workflow_id := resp["ewms_workflow_id"]: case None: raise ValueError( From b22d92da41b1a718c451ed6d73d8d485aa652d50 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 15:41:48 -0600 Subject: [PATCH 256/327] move ewms request to init-container - 1 (wip) --- ewms_init_container/__main__.py | 185 +++++++++++++++++++++--------- skydriver/config.py | 4 - skydriver/ewms.py | 108 +---------------- skydriver/k8s/scan_backlog.py | 3 + skydriver/k8s/scanner_instance.py | 146 ++++++++++++++--------- 5 files changed, 230 insertions(+), 216 deletions(-) diff --git 
a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index c6464e4d..45832b07 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -8,12 +8,17 @@ import time from pathlib import Path +import botocore.client # type: ignore[import-untyped] +import requests from rest_tools.client import ClientCredentialsAuth, RestClient -from wipac_dev_tools import from_environment_as_dataclass, logging_tools, timing_tools +from wipac_dev_tools import from_environment_as_dataclass, logging_tools LOGGER = logging.getLogger(__package__) -TIMEOUT = 5 * 60 + +# WARNING: these values must remain constant, they are cross-referenced in the db +QUEUE_ALIAS_TOCLIENT = "to-client-queue" # '' +QUEUE_ALIAS_FROMCLIENT = "from-client-queue" # '' @dc.dataclass(frozen=True) @@ -28,66 +33,128 @@ class EnvConfig: EWMS_CLIENT_ID: str EWMS_CLIENT_SECRET: str + EWMS_TASK_IMAGE: str + QUEUE_ALIAS_TOCLIENT: str QUEUE_ALIAS_FROMCLIENT: str + S3_URL: str + S3_ACCESS_KEY_ID: str + S3_SECRET_KEY: str + S3_BUCKET: str + S3_OBJECT_KEY: str + S3_EXPIRES_IN: int -ENV = from_environment_as_dataclass(EnvConfig) +ENV = from_environment_as_dataclass(EnvConfig) -async def get_workflow_id(scan_id: str) -> str: - """Retrieve the workflow id for the scan (w/ `scan_id`).""" - LOGGER.info(f"getting workflow id for scan {scan_id}...") - skyd_rc = RestClient( - ENV.SKYSCAN_SKYDRIVER_ADDRESS, - ENV.SKYSCAN_SKYDRIVER_AUTH, - logger=LOGGER, - ) - - timeout_timer = timing_tools.IntervalTimer( - TIMEOUT, logging.getLogger(f"{LOGGER.name}.timeout_timer") +def generate_presigned_s3_get_url( + s3_client: botocore.client.BaseClient, scan_id: str +) -> str: + """Generate a pre-signed S3 url for retrieving shared files.""" + params = { + "Bucket": ENV.S3_BUCKET, + "Key": ENV.S3_OBJECT_KEY, + } + LOGGER.info(f"generating presigned s3-url for scan {scan_id} ({params})...") + get_url = s3_client.generate_presigned_url( + "get_object", + Params=params, + ExpiresIn=ENV.S3_EXPIRES_IN, # seconds ) 
+ LOGGER.info(get_url) + return get_url + + +async def request_workflow_on_ewms( + ewms_rc: RestClient, + s3_client: botocore.client.BaseClient, + manifest: dict, + scan_request_obj: dict, +) -> str: + """Request a workflow in EWMS.""" + if manifest["ewms_workflow_id"] != database.schema.PENDING_EWMS_WORKFLOW: + if manifest["ewms_workflow_id"]: + raise TypeError("Scan has already been sent to EWMS") + else: # None + raise TypeError("Scan is not designated for EWMS") + + s3_url_get = generate_presigned_s3_get_url(s3_client, manifest.scan_id) + + body = { + "public_queue_aliases": [QUEUE_ALIAS_TOCLIENT, QUEUE_ALIAS_FROMCLIENT], + "tasks": [ + { + "cluster_locations": [ + cname for cname, _ in scan_request_obj["request_clusters"] + ], + "input_queue_aliases": [QUEUE_ALIAS_TOCLIENT], + "output_queue_aliases": [QUEUE_ALIAS_FROMCLIENT], + "task_image": ENV.EWMS_TASK_IMAGE, + "task_args": ( + "python -m skymap_scanner.client " + "--infile {{INFILE}} --outfile {{OUTFILE}} " + "--client-startup-json {{DATA_HUB}}/startup.json" + ), + "init_image": ENV.EWMS_TASK_IMAGE, # piggyback this image since it's already present + "init_args": ( + "bash -c " + '"' # quote for bash -c "..." + "curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json " + f"'{s3_url_get}'" # single-quote the url + '"' # unquote for bash -c "..." + ), + "n_workers": scan_request_obj["request_clusters"][0][1], + # TODO: ^^^ pass on varying # of workers per cluster + "pilot_config": { + "tag": "latest", + "environment": { + k: v + for k, v in { + "EWMS_PILOT_INIT_TIMEOUT": 61, # 1 sec more than 'curl' timeout + "EWMS_PILOT_TASK_TIMEOUT": ENV.EWMS_PILOT_TASK_TIMEOUT, + "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": ENV.EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE, + "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, + "EWMS_PILOT_CONTAINER_DEBUG": "True", # toggle? 
+ "EWMS_PILOT_INFILE_EXT": ".json", + "EWMS_PILOT_OUTFILE_EXT": ".json", + }.items() + if v # filter out any falsy values + }, + "input_files": [], + }, + "worker_config": { + "do_transfer_worker_stdouterr": True, # toggle? + "max_worker_runtime": scan_request_obj["max_worker_runtime"], + "n_cores": 1, + "priority": scan_request_obj["priority"], + "worker_disk": scan_request_obj["worker_disk_bytes"], + "worker_memory": scan_request_obj["worker_memory_bytes"], + "condor_requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", + }, + } + ], + } - # get the id, with retries -- this id should be available pretty quickly - while True: - if timeout_timer.has_interval_elapsed(): - raise TimeoutError( - f"EWMS workflow_id could not be retrieved within {timeout_timer.seconds} seconds" - ) - # get id - resp = await skyd_rc.request("GET", f"/scan/{scan_id}/manifest") - # parse - match workflow_id := resp["ewms_workflow_id"]: - case None: - raise ValueError( - "workflow id is 'None', this indicates scan predates ewms-integration." - ) - case "pending-ewms": - LOGGER.warning( - "a workflow id has not yet been assigned for this scan. " - "Waiting, then trying again..." - ) - await asyncio.sleep(10) - case _: - break # got an actual id! 
- - LOGGER.info(f"workflow id: {workflow_id}") - return workflow_id - - -async def get_ewms_attrs(workflow_id: str) -> dict[str, dict[str, str]]: + try: + LOGGER.info("requesting to ewms...") + resp = await ewms_rc.request("POST", "/v0/workflows", body) + except requests.exceptions.HTTPError: + LOGGER.error("request to ewms failed using:") + LOGGER.error(json.dumps(body, indent=4)) + raise + else: + return resp["workflow"]["workflow_id"] + + +async def get_ewms_attrs( + ewms_rc: RestClient, + workflow_id: str, +) -> dict[str, dict[str, str]]: """Retrieve the EWMS attributes for the workflow.""" LOGGER.info(f"getting EWMS attributes for workflow {workflow_id}...") - ewms_rc = ClientCredentialsAuth( - ENV.EWMS_ADDRESS, - ENV.EWMS_TOKEN_URL, - ENV.EWMS_CLIENT_ID, - ENV.EWMS_CLIENT_SECRET, - logger=LOGGER, - ) - # loop until mqprofiles is not empty and all "is_activated" fields are true while True: LOGGER.info("requesting EWMS mqprofiles...") @@ -151,8 +218,22 @@ async def main() -> None: args = parser.parse_args() logging_tools.log_argparse_args(args) - workflow_id = await get_workflow_id(args.scan_id) - ewms_dict = await get_ewms_attrs(workflow_id) + ewms_rc = ClientCredentialsAuth( + ENV.EWMS_ADDRESS, + ENV.EWMS_TOKEN_URL, + ENV.EWMS_CLIENT_ID, + ENV.EWMS_CLIENT_SECRET, + logger=LOGGER, + ) + skyd_rc = RestClient( + ENV.SKYSCAN_SKYDRIVER_ADDRESS, + ENV.SKYSCAN_SKYDRIVER_AUTH, + logger=LOGGER, + ) + + workflow_id = await request_workflow_on_ewms(ewms_rc, args.scan_id) + await send_workflow_id_to_skydriver(skyd_rc, workflow_id) + ewms_dict = await get_ewms_attrs(ewms_rc, workflow_id) LOGGER.info(f"dumping EWMS attributes to '{args.json_out}'...") with open(args.json_out, "w") as f: diff --git a/skydriver/config.py b/skydriver/config.py index 186ca4da..95481a64 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -18,10 +18,6 @@ SCAN_MIN_PRIORITY_TO_START_ASAP = 100 -# WARNING: these values must remain constant, they are cross-referenced in the db 
-QUEUE_ALIAS_TOCLIENT = "to-client-queue" # '' -QUEUE_ALIAS_FROMCLIENT = "from-client-queue" # '' - @enum.unique class DebugMode(enum.Enum): diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 430f6bb3..aebe0cab 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -1,6 +1,5 @@ """Tools for interfacing with EMWS.""" -import json import logging import aiocache # type: ignore[import-untyped] @@ -8,99 +7,12 @@ import requests from rest_tools.client import RestClient -from . import database, images -from .config import ENV, QUEUE_ALIAS_FROMCLIENT, QUEUE_ALIAS_TOCLIENT +from .config import ENV from .database.schema import PENDING_EWMS_WORKFLOW LOGGER = logging.Logger(__name__) -async def request_workflow_on_ewms( - ewms_rc: RestClient, - s3_client: botocore.client.BaseClient, - manifest: database.schema.Manifest, - scan_request_obj: dict, -) -> str: - """Request a workflow in EWMS.""" - if manifest.ewms_workflow_id != database.schema.PENDING_EWMS_WORKFLOW: - if manifest.ewms_workflow_id: - raise TypeError("Scan has already been sent to EWMS") - else: # None - raise TypeError("Scan is not designated for EWMS") - - s3_url_get = generate_presigned_s3_get_url(s3_client, manifest.scan_id) - image = images.get_skyscan_cvmfs_singularity_image(scan_request_obj["docker_tag"]) - - body = { - "public_queue_aliases": [QUEUE_ALIAS_TOCLIENT, QUEUE_ALIAS_FROMCLIENT], - "tasks": [ - { - "cluster_locations": [ - cname for cname, _ in scan_request_obj["request_clusters"] - ], - "input_queue_aliases": [QUEUE_ALIAS_TOCLIENT], - "output_queue_aliases": [QUEUE_ALIAS_FROMCLIENT], - "task_image": image, - "task_args": ( - "python -m skymap_scanner.client " - "--infile {{INFILE}} --outfile {{OUTFILE}} " - "--client-startup-json {{DATA_HUB}}/startup.json" - ), - "init_image": image, # piggyback this image since it's already present - "init_args": ( - "bash -c " - '"' # quote for bash -c "..." 
- "curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json " - f"'{s3_url_get}'" # single-quote the url - '"' # unquote for bash -c "..." - ), - "n_workers": scan_request_obj["request_clusters"][0][1], - # TODO: ^^^ pass on varying # of workers per cluster - "pilot_config": { - "tag": "latest", - "environment": { - k: v - for k, v in { - "EWMS_PILOT_INIT_TIMEOUT": 61, # 1 sec more than 'curl' timeout - "EWMS_PILOT_TASK_TIMEOUT": scan_request_obj[ - "max_pixel_reco_time" - ], - "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": scan_request_obj[ - "skyscan_mq_client_timeout_wait_for_first_message" - ], - "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, - "EWMS_PILOT_CONTAINER_DEBUG": "True", # toggle? - "EWMS_PILOT_INFILE_EXT": ".json", - "EWMS_PILOT_OUTFILE_EXT": ".json", - }.items() - if v # filter out any falsy values - }, - "input_files": [], - }, - "worker_config": { - "do_transfer_worker_stdouterr": True, # toggle? - "max_worker_runtime": scan_request_obj["max_worker_runtime"], - "n_cores": 1, - "priority": scan_request_obj["priority"], - "worker_disk": scan_request_obj["worker_disk_bytes"], - "worker_memory": scan_request_obj["worker_memory_bytes"], - "condor_requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", - }, - } - ], - } - - try: - LOGGER.info("requesting to ewms...") - resp = await ewms_rc.request("POST", "/v0/workflows", body) - except requests.exceptions.HTTPError: - LOGGER.error("request to ewms failed using:") - LOGGER.error(json.dumps(body, indent=4)) - raise - else: - return resp["workflow"]["workflow_id"] - - async def request_stop_on_ewms( ewms_rc: RestClient, workflow_id: str, @@ -176,21 +88,3 @@ async def get_taskforce_phases( def make_s3_object_key(scan_id: str) -> str: """Construct the object key from the scan_id (deterministic).""" return f"{scan_id}-s3-object" - - -def generate_presigned_s3_get_url( - s3_client: botocore.client.BaseClient, scan_id: str -) -> str: - """Generate a 
pre-signed S3 url for retrieving shared files.""" - params = { - "Bucket": ENV.S3_BUCKET, - "Key": make_s3_object_key(scan_id), - } - LOGGER.info(f"generating presigned s3-url for scan {scan_id} ({params})...") - get_url = s3_client.generate_presigned_url( - "get_object", - Params=params, - ExpiresIn=ENV.S3_EXPIRES_IN, # seconds - ) - LOGGER.info(get_url) - return get_url diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index e4f081c7..75731da1 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -185,6 +185,9 @@ async def _run( scan_request_obj, ) except Exception as e: + # TODO: if this fails, then the k8s have already started. so, next loop, either kill the og k8s or somehow re-use -- no timeout on ewms-init? + # option 1: move this request thing to the ewms-init + # option 2: add a second backlogger that only does ewms -- may have timing issues LOGGER.exception(e) timer_main_loop.fastforward() # nothing was started, so don't wait long continue diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index c02693fe..cf16a3ed 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -4,6 +4,7 @@ import logging import textwrap from pathlib import Path +from typing import Any import kubernetes.client # type: ignore[import-untyped] import yaml @@ -14,10 +15,9 @@ from ..config import ( DebugMode, ENV, - QUEUE_ALIAS_FROMCLIENT, - QUEUE_ALIAS_TOCLIENT, sdict, ) +from ..images import get_skyscan_cvmfs_singularity_image LOGGER = logging.getLogger(__name__) @@ -27,20 +27,12 @@ def get_skyscan_server_container_name(scan_id: str) -> str: return f"skyscan-server-{scan_id}" -def _to_inline_yaml_str(obj: list[str] | sdict) -> str: - """Convert obj-based attrs to yaml-syntax where each value is a string.""" - if isinstance(obj, dict): - return yaml.safe_dump( - [{"name": str(k), "value": str(v)} for k, v in obj.items()], - default_flow_style=True, # inline, compact 
formatting, no indenting needed - ) - elif isinstance(obj, list): - return yaml.safe_dump( - [str(o) for o in obj], - default_flow_style=True, # inline, compact formatting, no indenting needed - ) - else: - raise TypeError(f"unsupported type {type(obj)}") +def _to_inline_yaml_str(obj: Any) -> str: + """Convert obj to one-line yaml-syntax.""" + return yaml.safe_dump( + obj, + default_flow_style=True, # inline, compact formatting, no indenting needed + ) class SkyScanK8sJobFactory: @@ -81,13 +73,19 @@ def make( is_real_event=is_real_event, predictive_scanning_threshold=predictive_scanning_threshold, ) - scanner_server_envvars = SkyScanK8sJobFactory.make_skyscan_server_envvars( + scanner_server_envvars = EnvVarFactory.make_skyscan_server_envvars( rest_address=rest_address, scan_id=scan_id, skyscan_mq_client_timeout_wait_for_first_message=skyscan_mq_client_timeout_wait_for_first_message, scanner_server_env_from_user=scanner_server_env_from_user, ) + ewms_envvars = EnvVarFactory.make_ewms_envvars( + docker_tag, + skyscan_mq_client_timeout_wait_for_first_message=skyscan_mq_client_timeout_wait_for_first_message, + max_pixel_reco_time=max_pixel_reco_time, + ) + # assemble the job job_dict = SkyScanK8sJobFactory._make_job( scan_id, @@ -95,6 +93,7 @@ def make( scanner_server_memory_bytes, scanner_server_args, scanner_server_envvars, + ewms_envvars, ) return job_dict, scanner_server_args @@ -105,27 +104,29 @@ def _make_job( docker_tag: str, scanner_server_memory_bytes: int, scanner_server_args: str, - scanner_server_envvars: sdict, + scanner_server_envvars: list[sdict], + ewms_envvars: list[sdict], ) -> sdict: """Create the K8s job manifest. NOTE: Let's keep definitions as straightforward as possible. 
""" - scanner_server_envvars = {k: str(v) for k, v in scanner_server_envvars.items()} - - init_ewms_envvars = {} - for k in ["SKYSCAN_SKYDRIVER_ADDRESS", "SKYSCAN_SKYDRIVER_AUTH"]: - init_ewms_envvars[k] = scanner_server_envvars[k] - init_ewms_envvars.update( + ewms_init_envvars = ( + [ + envvar + for envvar in scanner_server_envvars + if envvar["name"] + in ["SKYSCAN_SKYDRIVER_ADDRESS", "SKYSCAN_SKYDRIVER_AUTH"] + ] + + ewms_envvars + + EnvVarFactory.make_s3_envvars(scan_id) + ) + s3_sidecar_envvars = [ { - "EWMS_ADDRESS": ENV.EWMS_ADDRESS, - "EWMS_TOKEN_URL": ENV.EWMS_TOKEN_URL, - "EWMS_CLIENT_ID": ENV.EWMS_CLIENT_ID, - "EWMS_CLIENT_SECRET": ENV.EWMS_CLIENT_SECRET, - "QUEUE_ALIAS_TOCLIENT": QUEUE_ALIAS_TOCLIENT, - "QUEUE_ALIAS_FROMCLIENT": QUEUE_ALIAS_FROMCLIENT, + "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", + "value": "ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", } - ) + ] + EnvVarFactory.make_s3_envvars(scan_id) # now, assemble job_yaml = textwrap.dedent( # fixes """-indentation @@ -155,7 +156,7 @@ def _make_job( image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "ewms_init_container"] args: ["{scan_id}", "--json-out", "{SkyScanK8sJobFactory._EWMS_JSON_FPATH}"] - env: {_to_inline_yaml_str(init_ewms_envvars)} + env: {_to_inline_yaml_str(ewms_init_envvars)} resources: limits: memory: "{ENV.K8S_SCANNER_INIT_MEM_LIMIT}" @@ -189,25 +190,7 @@ def _make_job( image: {ENV.THIS_IMAGE_WITH_TAG} command: ["python", "-m", "s3_sidecar"] args: ["{SkyScanK8sJobFactory._STARTUP_JSON_FPATH}", "--wait-indefinitely"] - env: - - name: S3_URL - value: "{ENV.S3_URL}" - - name: S3_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: {ENV.K8S_SECRET_NAME} - key: {ENV.S3_ACCESS_KEY_ID__K8S_SECRET_KEY} - - name: S3_SECRET_KEY - valueFrom: - secretKeyRef: - name: {ENV.K8S_SECRET_NAME} - key: {ENV.S3_SECRET_KEY__K8S_SECRET_KEY} - - name: S3_BUCKET - value: "{ENV.S3_BUCKET}" - - name: S3_OBJECT_KEY - value: "{ewms.make_s3_object_key(scan_id)}" - - name: 
K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS - value: "{ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS}" + env: {_to_inline_yaml_str(s3_sidecar_envvars)} resources: limits: memory: "{ENV.K8S_SCANNER_SIDECAR_S3_MEM_LIMIT}" @@ -254,6 +237,63 @@ def get_scanner_server_args( ) return args + +class EnvVarFactory: + """Factory class for assembling k8s environment-variable objects.""" + + @staticmethod + def make_ewms_envvars( + docker_tag: str, + skyscan_mq_client_timeout_wait_for_first_message: int, + max_pixel_reco_time: int, + ) -> list[sdict]: + return [ + {"name": str(k), "value": str(v)} + for k, v in { + "EWMS_ADDRESS": ENV.EWMS_ADDRESS, + "EWMS_TOKEN_URL": ENV.EWMS_TOKEN_URL, + "EWMS_CLIENT_ID": ENV.EWMS_CLIENT_ID, + "EWMS_CLIENT_SECRET": ENV.EWMS_CLIENT_SECRET, + # + "EWMS_TASK_IMAGE": get_skyscan_cvmfs_singularity_image(docker_tag), + # + "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": skyscan_mq_client_timeout_wait_for_first_message, + "EWMS_PILOT_TASK_TIMEOUT": max_pixel_reco_time, + }.items() + if v is not None + ] + + @staticmethod + def make_s3_envvars(scan_id: str) -> list[sdict]: + return [ + {"name": "S3_URL", "value": ENV.S3_URL}, + { + "name": "S3_ACCESS_KEY_ID", + "valueFrom": { + "secretKeyRef": { + "name": ENV.K8S_SECRET_NAME, + "key": ENV.S3_ACCESS_KEY_ID__K8S_SECRET_KEY, + } + }, + }, + { + "name": "S3_SECRET_KEY", + "valueFrom": { + "secretKeyRef": { + "name": ENV.K8S_SECRET_NAME, + "key": ENV.S3_SECRET_KEY__K8S_SECRET_KEY, + } + }, + }, + {"name": "S3_EXPIRES_IN", "value": ENV.S3_EXPIRES_IN}, + {"name": "S3_BUCKET", "value": ENV.S3_BUCKET}, + {"name": "S3_OBJECT_KEY", "value": ewms.make_s3_object_key(scan_id)}, + { + "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", + "value": ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS, + }, + ] + @staticmethod def _get_token_from_keycloak( token_url: str, @@ -277,7 +317,7 @@ def make_skyscan_server_envvars( scan_id: str, skyscan_mq_client_timeout_wait_for_first_message: int | None, scanner_server_env_from_user: 
dict, - ) -> sdict: + ) -> list[sdict]: """Get the environment variables provided to the skyscan server.""" LOGGER.debug(f"making scanner server env vars for {scan_id=}") env = {} @@ -323,7 +363,7 @@ def make_skyscan_server_envvars( # 4. Add user's env env.update(scanner_server_env_from_user) - return env + return [{"name": str(k), "value": str(v)} for k, v in env.items()] def assemble_scanner_server_logs_url( From f9dbd7bdd9cadd1a8f241c6d60ee0a6f5dfe83e6 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 11 Feb 2025 21:45:48 +0000 Subject: [PATCH 257/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index e9e27b42..949584e9 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.17 -botocore==1.36.17 +boto3==1.36.18 +botocore==1.36.18 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.17] -│ ├── botocore [required: >=1.36.17,<1.37.0, installed: 1.36.17] +├── boto3 [required: Any, installed: 1.36.18] +│ ├── botocore [required: >=1.36.18,<1.37.0, installed: 1.36.18] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.17] +│ └── botocore 
[required: >=1.36.0,<2.0a.0, installed: 1.36.18] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 2f2a02f12e276a7d6a22319624a41f27b46044d7 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 15:58:41 -0600 Subject: [PATCH 258/327] move ewms request to init-container - 2 (wip) --- ewms_init_container/__main__.py | 51 +++++++++++++++---------------- skydriver/k8s/scanner_instance.py | 36 ++++++++++++++++++++-- skydriver/rest_handlers.py | 6 ++++ 3 files changed, 63 insertions(+), 30 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 45832b07..70f039fd 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -8,6 +8,7 @@ import time from pathlib import Path +import boto3 # type: ignore[import-untyped] import botocore.client # type: ignore[import-untyped] import requests from rest_tools.client import ClientCredentialsAuth, RestClient @@ -49,10 +50,17 @@ class EnvConfig: ENV = from_environment_as_dataclass(EnvConfig) -def generate_presigned_s3_get_url( - s3_client: botocore.client.BaseClient, scan_id: str -) -> str: +def generate_presigned_s3_get_url(scan_id: str) -> str: """Generate a pre-signed S3 url for retrieving shared files.""" + LOGGER.info("connecting to s3...") + s3_client = boto3.client( + "s3", + "us-east-1", + endpoint_url=ENV.S3_URL, + aws_access_key_id=ENV.S3_ACCESS_KEY_ID, + aws_secret_access_key=ENV.S3_SECRET_KEY, + ) + params = { "Bucket": ENV.S3_BUCKET, "Key": ENV.S3_OBJECT_KEY, @@ -67,28 +75,13 @@ def generate_presigned_s3_get_url( return get_url -async def request_workflow_on_ewms( - ewms_rc: RestClient, - s3_client: botocore.client.BaseClient, - manifest: dict, - scan_request_obj: dict, -) -> str: +async def request_workflow_on_ewms(ewms_rc: RestClient, s3_url_get: str) -> str: """Request a workflow in EWMS.""" - if 
manifest["ewms_workflow_id"] != database.schema.PENDING_EWMS_WORKFLOW: - if manifest["ewms_workflow_id"]: - raise TypeError("Scan has already been sent to EWMS") - else: # None - raise TypeError("Scan is not designated for EWMS") - - s3_url_get = generate_presigned_s3_get_url(s3_client, manifest.scan_id) - body = { "public_queue_aliases": [QUEUE_ALIAS_TOCLIENT, QUEUE_ALIAS_FROMCLIENT], "tasks": [ { - "cluster_locations": [ - cname for cname, _ in scan_request_obj["request_clusters"] - ], + "cluster_locations": ENV.EWMS_CLUSTERS, "input_queue_aliases": [QUEUE_ALIAS_TOCLIENT], "output_queue_aliases": [QUEUE_ALIAS_FROMCLIENT], "task_image": ENV.EWMS_TASK_IMAGE, @@ -105,8 +98,7 @@ async def request_workflow_on_ewms( f"'{s3_url_get}'" # single-quote the url '"' # unquote for bash -c "..." ), - "n_workers": scan_request_obj["request_clusters"][0][1], - # TODO: ^^^ pass on varying # of workers per cluster + "n_workers": ENV.EWMS_N_WORKERS, "pilot_config": { "tag": "latest", "environment": { @@ -126,11 +118,11 @@ async def request_workflow_on_ewms( }, "worker_config": { "do_transfer_worker_stdouterr": True, # toggle? - "max_worker_runtime": scan_request_obj["max_worker_runtime"], + "max_worker_runtime": ENV.EWMS_WORKER_MAX_WORKER_RUNTIME, "n_cores": 1, - "priority": scan_request_obj["priority"], - "worker_disk": scan_request_obj["worker_disk_bytes"], - "worker_memory": scan_request_obj["worker_memory_bytes"], + "priority": ENV.EWMS_WORKER_PRIORITY, + "worker_disk": ENV.EWMS_WORKER_DISK_BYTES, + "worker_memory": ENV.EWMS_WORKER_MEMORY_BYTES, "condor_requirements": "HAS_CVMFS_icecube_opensciencegrid_org && has_avx && has_avx2", }, } @@ -231,8 +223,13 @@ async def main() -> None: logger=LOGGER, ) - workflow_id = await request_workflow_on_ewms(ewms_rc, args.scan_id) + # 1. talk to ewms + workflow_id = await request_workflow_on_ewms( + ewms_rc, generate_presigned_s3_get_url(args.scan_id) + ) + # 2. 
update skydriver await send_workflow_id_to_skydriver(skyd_rc, workflow_id) + # 3. talk to ewms (again) ewms_dict = await get_ewms_attrs(ewms_rc, workflow_id) LOGGER.info(f"dumping EWMS attributes to '{args.json_out}'...") diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index cf16a3ed..ff04c26c 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -59,6 +59,12 @@ def make( rest_address: str, skyscan_mq_client_timeout_wait_for_first_message: int | None, scanner_server_env_from_user: dict, + request_clusters: list, + max_pixel_reco_time: int, + max_worker_runtime: int, + priority: int, + worker_disk_bytes: int, + worker_memory_bytes: int, ) -> tuple[sdict, str]: """Make the K8s job dict. @@ -82,8 +88,16 @@ def make( ewms_envvars = EnvVarFactory.make_ewms_envvars( docker_tag, - skyscan_mq_client_timeout_wait_for_first_message=skyscan_mq_client_timeout_wait_for_first_message, - max_pixel_reco_time=max_pixel_reco_time, + # + request_clusters, + # + skyscan_mq_client_timeout_wait_for_first_message, + max_pixel_reco_time, + # + max_worker_runtime, + priority, + worker_disk_bytes, + worker_memory_bytes, ) # assemble the job @@ -244,8 +258,16 @@ class EnvVarFactory: @staticmethod def make_ewms_envvars( docker_tag: str, - skyscan_mq_client_timeout_wait_for_first_message: int, + # + request_clusters: list, + # + skyscan_mq_client_timeout_wait_for_first_message: int | None, max_pixel_reco_time: int, + # + max_worker_runtime: int, + priority: int, + worker_disk_bytes: int, + worker_memory_bytes: int, ) -> list[sdict]: return [ {"name": str(k), "value": str(v)} @@ -255,10 +277,18 @@ def make_ewms_envvars( "EWMS_CLIENT_ID": ENV.EWMS_CLIENT_ID, "EWMS_CLIENT_SECRET": ENV.EWMS_CLIENT_SECRET, # + "EWMS_CLUSTERS": [cname for cname, _ in request_clusters], + "EWMS_N_WORKERS": request_clusters[0][1], + # "EWMS_TASK_IMAGE": get_skyscan_cvmfs_singularity_image(docker_tag), # 
"EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": skyscan_mq_client_timeout_wait_for_first_message, "EWMS_PILOT_TASK_TIMEOUT": max_pixel_reco_time, + # + "EWMS_WORKER_MAX_WORKER_RUNTIME": max_worker_runtime, + "EWMS_WORKER_PRIORITY": priority, + "EWMS_WORKER_DISK_BYTES": worker_disk_bytes, + "EWMS_WORKER_MEMORY_BYTES": worker_memory_bytes, }.items() if v is not None ] diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 281192c8..67b364a8 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -559,6 +559,12 @@ async def _start_scan( "skyscan_mq_client_timeout_wait_for_first_message" ], scanner_server_env_from_user=scan_request_obj["scanner_server_env_from_user"], + request_clusters=scan_request_obj["request_clusters"], + max_pixel_reco_time=scan_request_obj["max_pixel_reco_time"], + max_worker_runtime=scan_request_obj["max_worker_runtime"], + priority=scan_request_obj["priority"], + worker_disk_bytes=scan_request_obj["worker_disk_bytes"], + worker_memory_bytes=scan_request_obj["worker_memory_bytes"], ) # put in db (do before k8s start so if k8s fail, we can debug using db's info) From 8b85e0047ec863ece77efd434a65106895316513 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 16:06:32 -0600 Subject: [PATCH 259/327] move ewms request to init-container - 3 (wip) --- ewms_init_container/__main__.py | 24 ++++++++++++++++++++---- skydriver/k8s/scanner_instance.py | 3 ++- 2 files changed, 22 insertions(+), 5 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 70f039fd..a78faf9f 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -21,6 +21,8 @@ QUEUE_ALIAS_TOCLIENT = "to-client-queue" # '' QUEUE_ALIAS_FROMCLIENT = "from-client-queue" # '' +CURL_TIMEOUT = 60 + @dc.dataclass(frozen=True) class EnvConfig: @@ -35,6 +37,16 @@ class EnvConfig: EWMS_CLIENT_SECRET: str EWMS_TASK_IMAGE: str + EWMS_CLUSTERS: list[str] # auto-parsed from 
space-delimited string + EWMS_N_WORKERS: int + + EWMS_PILOT_TASK_TIMEOUT: int + EWMS_PILOT_TIMEOUT_QUEUE_INCOMING: int + + EWMS_WORKER_MAX_WORKER_RUNTIME: int + EWMS_WORKER_PRIORITY: int + EWMS_WORKER_DISK_BYTES: int + EWMS_WORKER_MEMORY_BYTES: int QUEUE_ALIAS_TOCLIENT: str QUEUE_ALIAS_FROMCLIENT: str @@ -46,6 +58,8 @@ class EnvConfig: S3_OBJECT_KEY: str S3_EXPIRES_IN: int + EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE: int | None = None + ENV = from_environment_as_dataclass(EnvConfig) @@ -91,10 +105,12 @@ async def request_workflow_on_ewms(ewms_rc: RestClient, s3_url_get: str) -> str: "--client-startup-json {{DATA_HUB}}/startup.json" ), "init_image": ENV.EWMS_TASK_IMAGE, # piggyback this image since it's already present - "init_args": ( + "init_args": ( # to get the s3 object/file "bash -c " '"' # quote for bash -c "..." - "curl --fail-with-body --max-time 60 -o {{DATA_HUB}}/startup.json " + "curl --fail-with-body " + f"--max-time {CURL_TIMEOUT} " + "-o {{DATA_HUB}}/startup.json " f"'{s3_url_get}'" # single-quote the url '"' # unquote for bash -c "..." ), @@ -104,10 +120,10 @@ async def request_workflow_on_ewms(ewms_rc: RestClient, s3_url_get: str) -> str: "environment": { k: v for k, v in { - "EWMS_PILOT_INIT_TIMEOUT": 61, # 1 sec more than 'curl' timeout + "EWMS_PILOT_INIT_TIMEOUT": CURL_TIMEOUT + 1, "EWMS_PILOT_TASK_TIMEOUT": ENV.EWMS_PILOT_TASK_TIMEOUT, "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": ENV.EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE, - "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, + "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.EWMS_PILOT_TIMEOUT_QUEUE_INCOMING, "EWMS_PILOT_CONTAINER_DEBUG": "True", # toggle? 
"EWMS_PILOT_INFILE_EXT": ".json", "EWMS_PILOT_OUTFILE_EXT": ".json", diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index ff04c26c..3a768919 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -277,12 +277,13 @@ def make_ewms_envvars( "EWMS_CLIENT_ID": ENV.EWMS_CLIENT_ID, "EWMS_CLIENT_SECRET": ENV.EWMS_CLIENT_SECRET, # - "EWMS_CLUSTERS": [cname for cname, _ in request_clusters], + "EWMS_CLUSTERS": " ".join(cname for cname, _ in request_clusters), "EWMS_N_WORKERS": request_clusters[0][1], # "EWMS_TASK_IMAGE": get_skyscan_cvmfs_singularity_image(docker_tag), # "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": skyscan_mq_client_timeout_wait_for_first_message, + "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, "EWMS_PILOT_TASK_TIMEOUT": max_pixel_reco_time, # "EWMS_WORKER_MAX_WORKER_RUNTIME": max_worker_runtime, From c10b2e4e8cb050ed660b3374892b18da2f013ef6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 16:33:28 -0600 Subject: [PATCH 260/327] move ewms request to init-container - 4 (ready) --- ewms_init_container/__main__.py | 16 +++++-- skydriver/k8s/scan_backlog.py | 30 ++----------- skydriver/rest_handlers.py | 74 +++++++++++++++++++++++++++++---- skydriver/server.py | 1 + 4 files changed, 82 insertions(+), 39 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index a78faf9f..d0363308 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -239,15 +239,25 @@ async def main() -> None: logger=LOGGER, ) + # 0. check that a workflow has not already been requested for this scan + resp = await skyd_rc.request("GET", f"/scan/{args.scan_id}/ewms/workflow-id") + if not resp["is_pending_ewms_workflow"]: + raise ValueError("this scan is not pending an EWMS workflow") + # # 1. 
talk to ewms workflow_id = await request_workflow_on_ewms( ewms_rc, generate_presigned_s3_get_url(args.scan_id) ) + # # 2. update skydriver - await send_workflow_id_to_skydriver(skyd_rc, workflow_id) - # 3. talk to ewms (again) + await skyd_rc.request( + "POST", + f"/scan/{args.scan_id}/ewms/workflow-id", + {"workflow_id": workflow_id}, + ) + # + # 3. talk to ewms (again) & write to file ewms_dict = await get_ewms_attrs(ewms_rc, workflow_id) - LOGGER.info(f"dumping EWMS attributes to '{args.json_out}'...") with open(args.json_out, "w") as f: json.dump(ewms_dict, f) diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 75731da1..8bbafc56 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -12,7 +12,7 @@ from wipac_dev_tools.timing_tools import IntervalTimer from .utils import KubeAPITools -from .. import database, ewms +from .. import database from ..config import ENV LOGGER = logging.getLogger(__name__) @@ -163,8 +163,7 @@ async def _run( ) # NOTE: the job_obj is enormous, so don't log it - # 1st: start k8s job -- this could be any k8s job (pre- or post-ewms switchover) - # ^^^ b/c this uses local resources, if something goes wrong, this limits exposure + # start k8s job -- this could be any k8s job (pre- or post-ewms switchover) try: LOGGER.info(f"Starting K8s job: scan_id={manifest.scan_id}") KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) @@ -174,30 +173,7 @@ async def _run( timer_main_loop.fastforward() # nothing was started, so don't wait long continue - # 2nd: request a workflow on EWMS - # ^^^ do after k8s b/c now we know that that was successful - try: - LOGGER.info(f"Requesting EWMS Workflow: scan_id={manifest.scan_id}") - workflow_id = await ewms.request_workflow_on_ewms( - ewms_rc, - s3_client, - manifest, - scan_request_obj, - ) - except Exception as e: - # TODO: if this fails, then the k8s have already started. 
so, next loop, either kill the og k8s or somehow re-use -- no timeout on ewms-init? - # option 1: move this request thing to the ewms-init - # option 2: add a second backlogger that only does ewms -- may have timing issues - LOGGER.exception(e) - timer_main_loop.fastforward() # nothing was started, so don't wait long - continue - else: - LOGGER.info(f"-> {workflow_id=}: scan_id={manifest.scan_id}") - await manifest_client.collection.find_one_and_update( - {"scan_id": manifest.scan_id}, - {"$set": {"ewms_workflow_id": workflow_id}}, - return_dclass=dict, - ) + # NOTE: DO NOT ADD ANYMORE ACTIONS THAT CAN POSSIBLY FAIL -- THINK STATELESSNESS # remove from backlog now that startup succeeded LOGGER.info(f"Scan successfully started: scan_id={manifest.scan_id}") diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 67b364a8..3fe03e9a 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -34,6 +34,7 @@ is_testing, ) from .database import schema +from .database.mongodc import DocumentNotFoundException from .database.schema import PENDING_EWMS_WORKFLOW from .ewms import request_stop_on_ewms from .k8s.scan_backlog import put_on_backlog @@ -58,7 +59,7 @@ USER_ACCT = "user" -SKYMAP_SCANNER_ACCT = "system" +INTERNAL_ACCT = "system" if is_testing(): @@ -77,7 +78,7 @@ async def wrapper(self, *args, **kwargs): # type: ignore[no-untyped-def] service_account_auth = token_attribute_role_mapping_auth( # type: ignore[no-untyped-call] role_attrs={ USER_ACCT: ["groups=/institutions/IceCube.*"], - SKYMAP_SCANNER_ACCT: ["skydriver_role=system"], + INTERNAL_ACCT: ["skydriver_role=system"], } ) @@ -781,7 +782,7 @@ async def delete(self, scan_id: str) -> None: } ) - @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore + @service_account_auth(roles=[USER_ACCT, INTERNAL_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get manifest & result.""" arghand = ArgumentHandler(ArgumentSource.QUERY_ARGUMENTS, self) @@ 
-815,7 +816,7 @@ class ScanManifestHandler(BaseSkyDriverHandler): ROUTE = r"/scan/(?P\w+)/manifest$" - @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore + @service_account_auth(roles=[USER_ACCT, INTERNAL_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get scan progress.""" arghand = ArgumentHandler(ArgumentSource.QUERY_ARGUMENTS, self) @@ -839,7 +840,7 @@ async def get(self, scan_id: str) -> None: # Include the whole event dict in the response like the 'old' manifest. # This overrides the manifest's field which should be an id. if ( - self.auth_roles[0] == SKYMAP_SCANNER_ACCT # type: ignore + self.auth_roles[0] == INTERNAL_ACCT # type: ignore and "event_i3live_json_dict" in args.projection and manifest.i3_event_id # if no id, then event already in manifest ): @@ -861,7 +862,7 @@ async def get(self, scan_id: str) -> None: resp = dict_projection(dc.asdict(manifest), args.projection) self.write(resp) - @service_account_auth(roles=[SKYMAP_SCANNER_ACCT]) # type: ignore + @service_account_auth(roles=[INTERNAL_ACCT]) # type: ignore async def patch(self, scan_id: str) -> None: """Update scan progress.""" arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) @@ -911,7 +912,7 @@ class ScanI3EventHandler(BaseSkyDriverHandler): ROUTE = r"/scan/(?P\w+)/i3-event$" - @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore + @service_account_auth(roles=[USER_ACCT, INTERNAL_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get scan's i3 event.""" manifest = await self.manifests.get(scan_id, True) @@ -973,7 +974,7 @@ async def get(self, scan_id: str) -> None: self.write(dc.asdict(result) if result else {}) - @service_account_auth(roles=[SKYMAP_SCANNER_ACCT]) # type: ignore + @service_account_auth(roles=[INTERNAL_ACCT]) # type: ignore async def put(self, scan_id: str) -> None: """Put (persist) a scan's result.""" arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) @@ 
-1023,7 +1024,7 @@ class ScanStatusHandler(BaseSkyDriverHandler): ROUTE = r"/scan/(?P\w+)/status$" - @service_account_auth(roles=[USER_ACCT, SKYMAP_SCANNER_ACCT]) # type: ignore + @service_account_auth(roles=[USER_ACCT, INTERNAL_ACCT]) # type: ignore async def get(self, scan_id: str) -> None: """Get a scan's status.""" manifest = await self.manifests.get(scan_id, incl_del=True) @@ -1088,3 +1089,58 @@ async def get(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- + + +class ScanActionEWMSWorkflowIDHandler(BaseSkyDriverHandler): + """Handles actions on scan's ewms workflow id.""" + + ROUTE = r"/scan/(?P\w+)/ewms/workflow-id$" + + @service_account_auth(roles=[USER_ACCT, INTERNAL_ACCT]) # type: ignore + async def get(self, scan_id: str) -> None: + """Get the ewms workflow_id.""" + manifest = await self.manifests.get(scan_id, incl_del=True) + self.write( + { + "workflow_id": manifest.ewms_workflow_id, + "is_pending_ewms_workflow": ( + manifest.ewms_workflow_id == PENDING_EWMS_WORKFLOW + ), + } + ) + + @service_account_auth(roles=[INTERNAL_ACCT]) # type: ignore + async def post(self, scan_id: str) -> None: + """Update the ewms workflow_id.""" + arghand = ArgumentHandler(ArgumentSource.JSON_BODY_ARGUMENTS, self) + arghand.add_argument( + "workflow_id", + required=True, + type=str, + ) + args = arghand.parse_args() + + try: + manifest = await self.manifests.collection.find_one_and_update( + { + "scan_id": scan_id, + "ewms_workflow_id": PENDING_EWMS_WORKFLOW, + "is_deleted": False, + }, + {"$set": {"ewms_workflow_id": args.workflow_id}}, + return_document=ReturnDocument.AFTER, + return_dclass=dict, + ) + except DocumentNotFoundException: + raise web.HTTPError( + 404, + log_message=( + "Could not find a scan manifest to update " + "(either the scan_id does not exist or its EWMS workflow_id cannot be updated)" + ), + ) + else: + self.write(manifest) + + +# 
----------------------------------------------------------------------------- diff --git a/skydriver/server.py b/skydriver/server.py index df3b8223..6e2657f2 100644 --- a/skydriver/server.py +++ b/skydriver/server.py @@ -51,6 +51,7 @@ async def make( rest_handlers.ScanRescanHandler, rest_handlers.ScanStatusHandler, rest_handlers.ScanLogsHandler, + rest_handlers.ScanActionEWMSWorkflowIDHandler, ]: try: rs.add_route(getattr(klass, "ROUTE"), klass, args) From 62999b9bdd0bb29c7600527d65eb0833c32e1c20 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 16:35:12 -0600 Subject: [PATCH 261/327] flake8 --- ewms_init_container/__main__.py | 1 - skydriver/ewms.py | 1 - 2 files changed, 2 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index d0363308..4cdf6167 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -9,7 +9,6 @@ from pathlib import Path import boto3 # type: ignore[import-untyped] -import botocore.client # type: ignore[import-untyped] import requests from rest_tools.client import ClientCredentialsAuth, RestClient from wipac_dev_tools import from_environment_as_dataclass, logging_tools diff --git a/skydriver/ewms.py b/skydriver/ewms.py index aebe0cab..2a654b54 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -3,7 +3,6 @@ import logging import aiocache # type: ignore[import-untyped] -import botocore.client # type: ignore[import-untyped] import requests from rest_tools.client import RestClient From 3e1cffce8f3d80de8f3c9cf42371c5f9c7ee605d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 16:35:53 -0600 Subject: [PATCH 262/327] fix call --- skydriver/k8s/scanner_instance.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index 3a768919..e28da2da 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -383,7 +383,7 @@ def 
make_skyscan_server_envvars( # 3. generate & add auth tokens tokens = { - "SKYSCAN_SKYDRIVER_AUTH": SkyScanK8sJobFactory._get_token_from_keycloak( + "SKYSCAN_SKYDRIVER_AUTH": EnvVarFactory._get_token_from_keycloak( ENV.KEYCLOAK_OIDC_URL, ENV.KEYCLOAK_CLIENT_ID_SKYDRIVER_REST, ENV.KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST, From 93da37b88dfc04c4e3b3c31765913b47da540a25 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 16:40:46 -0600 Subject: [PATCH 263/327] fix unit tests --- tests/integration/test_backlog_runner.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/tests/integration/test_backlog_runner.py b/tests/integration/test_backlog_runner.py index 8d43883c..c7c5ddbb 100644 --- a/tests/integration/test_backlog_runner.py +++ b/tests/integration/test_backlog_runner.py @@ -36,10 +36,8 @@ def print_it(obj: Any) -> None: @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -@mock.patch("skydriver.ewms.generate_presigned_s3_get_url") async def test_00( kapitsj_mock: Mock, - gps3geturl_mock: Mock, server: Callable[[], RestClient], ) -> None: """Test backlog job starting.""" @@ -51,17 +49,14 @@ async def test_00( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) # call counts - gps3geturl_mock.assert_called_once() kapitsj_mock.assert_called_once() print_it(await rc.request("GET", "/scans/backlog")) @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -@mock.patch("skydriver.ewms.generate_presigned_s3_get_url") async def test_01( kapitsj_mock: Mock, - gps3geturl_mock: Mock, server: Callable[[], RestClient], ) -> None: """Test backlog job starting with multiple.""" @@ -78,26 +73,21 @@ async def test_01( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) # call counts - assert gps3geturl_mock.call_count >= i + 1 # in case runner is faster assert kapitsj_mock.call_count >= i + 1 # in case runner is faster # call counts - assert 
gps3geturl_mock.call_count == N_JOBS assert kapitsj_mock.call_count == N_JOBS await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) # any extra calls? - assert gps3geturl_mock.call_count == N_JOBS assert kapitsj_mock.call_count == N_JOBS print_it(await rc.request("GET", "/scans/backlog")) @mock.patch("skydriver.k8s.utils.KubeAPITools.start_job") -@mock.patch("skydriver.ewms.generate_presigned_s3_get_url") async def test_10( kapitsj_mock: Mock, - gps3geturl_mock: Mock, server: Callable[[], RestClient], ) -> None: """Test backlog job starting with multiple cancels.""" @@ -123,16 +113,13 @@ async def test_10( await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 1.01) print_it(await rc.request("GET", "/scans/backlog")) # call counts - assert gps3geturl_mock.call_count >= i + 1 # in case runner is faster assert kapitsj_mock.call_count >= i + 1 # in case runner is faster # call counts - assert gps3geturl_mock.call_count == N_JOBS - 2 assert kapitsj_mock.call_count == N_JOBS - 2 await asyncio.sleep(skydriver.config.ENV.SCAN_BACKLOG_RUNNER_DELAY * 2) # any extra calls? 
- assert gps3geturl_mock.call_count == N_JOBS - 2 assert kapitsj_mock.call_count == N_JOBS - 2 print_it(await rc.request("GET", "/scans/backlog")) From 28983363cd0bef3ac628c023fe9b44d23630a065 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:04:43 -0600 Subject: [PATCH 264/327] update integration tests - 2 --- skydriver/k8s/scanner_instance.py | 2 +- tests/integration/test_rest_routes.py | 53 ++++++++++++++++++++++++--- 2 files changed, 48 insertions(+), 7 deletions(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index e28da2da..a05cd32d 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -138,7 +138,7 @@ def _make_job( s3_sidecar_envvars = [ { "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": "ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", + "value": ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS, } ] + EnvVarFactory.make_s3_envvars(scan_id) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 07219e5b..777660e4 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -307,6 +307,10 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 } }, }, + { + "name": "S3_EXPIRES_IN", + "value": str(7 * 24 * 60 * 60), + }, { "name": "S3_BUCKET", "value": os.environ["S3_BUCKET"], @@ -317,7 +321,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 }, { "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": str(15 * 60), + "value": "900", }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], @@ -355,7 +359,10 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "name": "SKYSCAN_SKYDRIVER_ADDRESS", "value": rest_address, }, - {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, + { + "name": "SKYSCAN_SKYDRIVER_AUTH", + "value": "", + }, { "name": "EWMS_ADDRESS", "value": os.environ["EWMS_ADDRESS"], @@ -373,12 +380,46 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: 
MFL000 "value": os.environ["EWMS_CLIENT_SECRET"], }, { - "name": "QUEUE_ALIAS_TOCLIENT", - "value": "to-client-queue", + "name": "EWMS_CLUSTERS", + "value": " ".join( + list(post_scan_body["cluster"].keys()) + if isinstance( + post_scan_body["cluster"], dict + ) + else [ + c[0] for c in post_scan_body["cluster"] + ] + ), + }, + { + "name": "EWMS_N_WORKERS", + "value": "1", + }, + { + "name": "EWMS_TASK_IMAGE", + "value": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:3.21.2", + }, + { + "name": "EWMS_PILOT_TASK_TIMEOUT", + "value": str( + post_scan_body["max_pixel_reco_time"] + ), + }, + { + "name": "EWMS_WORKER_MAX_WORKER_RUNTIME", + "value": "14400", + }, + { + "name": "EWMS_WORKER_PRIORITY", + "value": "0", + }, + { + "name": "EWMS_WORKER_DISK_BYTES", + "value": "1000000000", }, { - "name": "QUEUE_ALIAS_FROMCLIENT", - "value": "from-client-queue", + "name": "EWMS_WORKER_MEMORY_BYTES", + "value": "8000000000", }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], From ba9f8902dbbb023a58cadd9f91048b2641e4ebdb Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:17:06 -0600 Subject: [PATCH 265/327] update integration tests - 3 --- skydriver/k8s/scanner_instance.py | 4 +- tests/integration/test_rest_routes.py | 75 ++++++++++++++++----------- 2 files changed, 47 insertions(+), 32 deletions(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index a05cd32d..d1c75276 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -316,12 +316,12 @@ def make_s3_envvars(scan_id: str) -> list[sdict]: } }, }, - {"name": "S3_EXPIRES_IN", "value": ENV.S3_EXPIRES_IN}, + {"name": "S3_EXPIRES_IN", "value": str(ENV.S3_EXPIRES_IN)}, {"name": "S3_BUCKET", "value": ENV.S3_BUCKET}, {"name": "S3_OBJECT_KEY", "value": ewms.make_s3_object_key(scan_id)}, { "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS, + "value": 
str(ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS), }, ] diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 777660e4..58fb971d 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -251,7 +251,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 }, {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, ] - + [ # add those from 'post_scan_body' + + [ {"name": k, "value": str(v)} for k, v in post_scan_body[ "scanner_server_env" @@ -260,10 +260,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "image": f"icecube/skymap_scanner:{docker_tag_expected}", "name": f'skyscan-server-{post_resp["scan_id"]}', "resources": { - "limits": { - "cpu": "1.0", - "memory": "1024000000", - }, + "limits": {"cpu": "1.0", "memory": "1024000000"}, "requests": { "cpu": "0.1", "ephemeral-storage": "1M", @@ -307,10 +304,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 } }, }, - { - "name": "S3_EXPIRES_IN", - "value": str(7 * 24 * 60 * 60), - }, + {"name": "S3_EXPIRES_IN", "value": str(604800)}, { "name": "S3_BUCKET", "value": os.environ["S3_BUCKET"], @@ -321,16 +315,13 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 }, { "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": "900", + "value": str(900), }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], "name": f"sidecar-s3-{post_resp['scan_id']}", "resources": { - "limits": { - "cpu": "0.1", - "memory": "100M", - }, + "limits": {"cpu": "0.1", "memory": "100M"}, "requests": { "cpu": "0.05", "ephemeral-storage": "1M", @@ -359,10 +350,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "name": "SKYSCAN_SKYDRIVER_ADDRESS", "value": rest_address, }, - { - "name": "SKYSCAN_SKYDRIVER_AUTH", - "value": "", - }, + {"name": "SKYSCAN_SKYDRIVER_AUTH", "value": ""}, { "name": "EWMS_ADDRESS", "value": os.environ["EWMS_ADDRESS"], @@ -391,10 +379,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 ] ), }, - 
{ - "name": "EWMS_N_WORKERS", - "value": "1", - }, + {"name": "EWMS_N_WORKERS", "value": "1"}, { "name": "EWMS_TASK_IMAGE", "value": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:3.21.2", @@ -409,10 +394,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "name": "EWMS_WORKER_MAX_WORKER_RUNTIME", "value": "14400", }, - { - "name": "EWMS_WORKER_PRIORITY", - "value": "0", - }, + {"name": "EWMS_WORKER_PRIORITY", "value": "0"}, { "name": "EWMS_WORKER_DISK_BYTES", "value": "1000000000", @@ -421,14 +403,47 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "name": "EWMS_WORKER_MEMORY_BYTES", "value": "8000000000", }, + {"name": "S3_URL", "value": os.environ["S3_URL"]}, + { + "name": "S3_ACCESS_KEY_ID", + "valueFrom": { + "secretKeyRef": { + "key": os.environ[ + "S3_ACCESS_KEY_ID__K8S_SECRET_KEY" + ], + "name": os.environ["K8S_SECRET_NAME"], + } + }, + }, + { + "name": "S3_SECRET_KEY", + "valueFrom": { + "secretKeyRef": { + "key": os.environ[ + "S3_SECRET_KEY__K8S_SECRET_KEY" + ], + "name": os.environ["K8S_SECRET_NAME"], + } + }, + }, + {"name": "S3_EXPIRES_IN", "value": str(604800)}, + { + "name": "S3_BUCKET", + "value": os.environ["S3_BUCKET"], + }, + { + "name": "S3_OBJECT_KEY", + "value": f"{post_resp['scan_id']}-s3-object", + }, + { + "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", + "value": str(900), + }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], "name": f"init-ewms-{post_resp['scan_id']}", "resources": { - "limits": { - "cpu": "0.1", - "memory": "100M", - }, + "limits": {"cpu": "0.1", "memory": "100M"}, "requests": { "cpu": "0.05", "ephemeral-storage": "1M", From bec27184c84bee489e7416dea69c5a89caf6777a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:21:42 -0600 Subject: [PATCH 266/327] update integration tests - 4 --- skydriver/k8s/scanner_instance.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py 
index d1c75276..ce429e81 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -138,7 +138,7 @@ def _make_job( s3_sidecar_envvars = [ { "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS, + "value": str(ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS), } ] + EnvVarFactory.make_s3_envvars(scan_id) @@ -319,10 +319,6 @@ def make_s3_envvars(scan_id: str) -> list[sdict]: {"name": "S3_EXPIRES_IN", "value": str(ENV.S3_EXPIRES_IN)}, {"name": "S3_BUCKET", "value": ENV.S3_BUCKET}, {"name": "S3_OBJECT_KEY", "value": ewms.make_s3_object_key(scan_id)}, - { - "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": str(ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS), - }, ] @staticmethod From 6d8128ce40026d8c5dacdba4ff6fecb29a5a7353 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:25:11 -0600 Subject: [PATCH 267/327] update integration tests - 5 --- tests/integration/test_rest_routes.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 58fb971d..1a5b1fb0 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -435,10 +435,6 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "name": "S3_OBJECT_KEY", "value": f"{post_resp['scan_id']}-s3-object", }, - { - "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": str(900), - }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], "name": f"init-ewms-{post_resp['scan_id']}", From 3b018c20a2610652874a9e67ad66399543e022ec Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:28:57 -0600 Subject: [PATCH 268/327] update integration tests - 6 --- tests/integration/test_rest_routes.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 1a5b1fb0..f9476d8c 100644 --- 
a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -281,6 +281,10 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 ], "command": ["python", "-m", "s3_sidecar"], "env": [ + { + "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", + "value": str(900), + }, {"name": "S3_URL", "value": os.environ["S3_URL"]}, { "name": "S3_ACCESS_KEY_ID", @@ -313,10 +317,6 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 "name": "S3_OBJECT_KEY", "value": f"{post_resp['scan_id']}-s3-object", }, - { - "name": "K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS", - "value": str(900), - }, ], "image": os.environ["THIS_IMAGE_WITH_TAG"], "name": f"sidecar-s3-{post_resp['scan_id']}", From 943e6bd301d8dd43218a7c907fe014085803ad14 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:48:46 -0600 Subject: [PATCH 269/327] update integration tests - 7 --- tests/integration/test_rest_routes.py | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index f9476d8c..9228daa9 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -6,6 +6,7 @@ import pprint import re import time +import uuid from typing import Any, Callable import humanfriendly @@ -908,8 +909,29 @@ async def _after_scan_start_logic( # wait backlogger to request to ewms assert int(os.environ["SCAN_BACKLOG_RUNNER_DELAY"]) await asyncio.sleep(int(os.environ["SCAN_BACKLOG_RUNNER_DELAY"]) * 5) # extra - manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert RE_UUID4HEX.fullmatch(manifest["ewms_workflow_id"]) + + # mimic the ewms-init container... 
+ # -> before + assert (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ + "ewms_workflow_id" + ] == skydriver.database.schema.PENDING_EWMS_WORKFLOW + assert (await rc.request("GET", f"/scan/{scan_id}/ewms/workflow-id")) == { + "workflow_id": skydriver.database.schema.PENDING_EWMS_WORKFLOW, + "is_pending_ewms_workflow": True, + } + # -> update workflow_id + impromptu_uuid = uuid.uuid4().hex + await rc.request( + "POST", f"/scan/{scan_id}/ewms/workflow-id", {"workflow_id": impromptu_uuid} + ) + # -> after + assert (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ + "ewms_workflow_id" + ] == impromptu_uuid + assert (await rc.request("GET", f"/scan/{scan_id}/ewms/workflow-id")) == { + "workflow_id": impromptu_uuid, + "is_pending_ewms_workflow": True, + } # # INITIAL UPDATES From a1517e334f8b36b0bf6e215f84202da9eea3572b Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:52:16 -0600 Subject: [PATCH 270/327] mypy --- tests/integration/test_rest_routes.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 9228daa9..6f8e33d9 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -17,6 +17,8 @@ import skydriver.images # noqa: F401 # export +PENDING_EWMS_WORKFLOW = "pending-ewms" + LOGGER = logging.getLogger(__name__) skydriver.config.config_logging() @@ -104,7 +106,7 @@ async def _launch_scan( progress=None, scanner_server_args=post_resp["scanner_server_args"], # see below ewms_task="use 'ewms_workflow_id'", - ewms_workflow_id="pending-ewms", + ewms_workflow_id=PENDING_EWMS_WORKFLOW, classifiers=post_scan_body["classifiers"], last_updated=post_resp["last_updated"], # see below priority=0, @@ -914,9 +916,9 @@ async def _after_scan_start_logic( # -> before assert (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ "ewms_workflow_id" - ] == skydriver.database.schema.PENDING_EWMS_WORKFLOW + ] == 
PENDING_EWMS_WORKFLOW assert (await rc.request("GET", f"/scan/{scan_id}/ewms/workflow-id")) == { - "workflow_id": skydriver.database.schema.PENDING_EWMS_WORKFLOW, + "workflow_id": PENDING_EWMS_WORKFLOW, "is_pending_ewms_workflow": True, } # -> update workflow_id From 77043373792f76cb82c9d760596c719a99b1c97f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:53:56 -0600 Subject: [PATCH 271/327] pop "_id" --- skydriver/rest_handlers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 3fe03e9a..44d80b59 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -1131,6 +1131,7 @@ async def post(self, scan_id: str) -> None: return_document=ReturnDocument.AFTER, return_dclass=dict, ) + manifest.pop("_id") except DocumentNotFoundException: raise web.HTTPError( 404, From 6e7d5c2cf86b97468a011c0d17d1b570862ee0f3 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 17:58:02 -0600 Subject: [PATCH 272/327] update integration tests - 8 --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 6f8e33d9..565e84e6 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -932,7 +932,7 @@ async def _after_scan_start_logic( ] == impromptu_uuid assert (await rc.request("GET", f"/scan/{scan_id}/ewms/workflow-id")) == { "workflow_id": impromptu_uuid, - "is_pending_ewms_workflow": True, + "is_pending_ewms_workflow": False, } # From 0aade5e89f5af5230ba6cc84821afac5e00def7d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 18:01:49 -0600 Subject: [PATCH 273/327] update integration tests - 9 --- tests/integration/test_rest_routes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 
565e84e6..913e5491 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -927,9 +927,8 @@ async def _after_scan_start_logic( "POST", f"/scan/{scan_id}/ewms/workflow-id", {"workflow_id": impromptu_uuid} ) # -> after - assert (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ - "ewms_workflow_id" - ] == impromptu_uuid + manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") + assert manifest["ewms_workflow_id"] == impromptu_uuid assert (await rc.request("GET", f"/scan/{scan_id}/ewms/workflow-id")) == { "workflow_id": impromptu_uuid, "is_pending_ewms_workflow": False, From 7c490b37147720d1eda22a970e11f90b93bc845c Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 18:02:17 -0600 Subject: [PATCH 274/327] update integration tests - 10 --- tests/integration/test_rest_routes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 913e5491..e5b89872 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -914,9 +914,8 @@ async def _after_scan_start_logic( # mimic the ewms-init container... 
# -> before - assert (await rc.request("GET", f"/scan/{scan_id}/manifest"))[ - "ewms_workflow_id" - ] == PENDING_EWMS_WORKFLOW + manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") + assert manifest["ewms_workflow_id"] == PENDING_EWMS_WORKFLOW assert (await rc.request("GET", f"/scan/{scan_id}/ewms/workflow-id")) == { "workflow_id": PENDING_EWMS_WORKFLOW, "is_pending_ewms_workflow": True, From 1d6d1c55e3aa10437fb6e6ffbd428fc6578cdd91 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 18:16:25 -0600 Subject: [PATCH 275/327] update integration tests - 11 (ewms connect) --- tests/integration/test_rest_routes.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index e5b89872..38e33e43 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -6,7 +6,6 @@ import pprint import re import time -import uuid from typing import Any, Callable import humanfriendly @@ -16,6 +15,7 @@ from rest_tools.client import RestClient import skydriver.images # noqa: F401 # export +from skydriver.__main__ import setup_ewms_client PENDING_EWMS_WORKFLOW = "pending-ewms" @@ -921,15 +921,16 @@ async def _after_scan_start_logic( "is_pending_ewms_workflow": True, } # -> update workflow_id - impromptu_uuid = uuid.uuid4().hex + resp = await setup_ewms_client().request("POST", "/v0/workflows", {"foo": "bar"}) + workflow_id = resp["workflow"]["workflow_id"] await rc.request( - "POST", f"/scan/{scan_id}/ewms/workflow-id", {"workflow_id": impromptu_uuid} + "POST", f"/scan/{scan_id}/ewms/workflow-id", {"workflow_id": workflow_id} ) # -> after manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") - assert manifest["ewms_workflow_id"] == impromptu_uuid + assert manifest["ewms_workflow_id"] == workflow_id assert (await rc.request("GET", f"/scan/{scan_id}/ewms/workflow-id")) == { - "workflow_id": impromptu_uuid, + "workflow_id": 
workflow_id, "is_pending_ewms_workflow": False, } From 0100dac4f9e9f72c9b4c1ba02b0fa19339c8e90c Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 18:20:20 -0600 Subject: [PATCH 276/327] update integration tests - 12 --- tests/integration/test_rest_routes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 38e33e43..447d6a75 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -385,7 +385,7 @@ async def _assert_db_skyscank8sjobs_coll( # noqa: MFL000 {"name": "EWMS_N_WORKERS", "value": "1"}, { "name": "EWMS_TASK_IMAGE", - "value": "/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:3.21.2", + "value": f"/cvmfs/icecube.opensciencegrid.org/containers/realtime/skymap_scanner:{docker_tag_expected}", }, { "name": "EWMS_PILOT_TASK_TIMEOUT", From d3520cfbca819e8c81203bf68e02542179591bc1 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 18:32:31 -0600 Subject: [PATCH 277/327] remove extra env vars --- ewms_init_container/__main__.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 4cdf6167..8d9cc289 100644 --- a/ewms_init_container/__main__.py +++ b/ewms_init_container/__main__.py @@ -47,9 +47,6 @@ class EnvConfig: EWMS_WORKER_DISK_BYTES: int EWMS_WORKER_MEMORY_BYTES: int - QUEUE_ALIAS_TOCLIENT: str - QUEUE_ALIAS_FROMCLIENT: str - S3_URL: str S3_ACCESS_KEY_ID: str S3_SECRET_KEY: str From 8811df02123a44a7f116872a76341539889fd4c0 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 11 Feb 2025 18:38:54 -0600 Subject: [PATCH 278/327] remove extra env vars - 2 --- ewms_init_container/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ewms_init_container/__main__.py b/ewms_init_container/__main__.py index 8d9cc289..4638d18a 100644 --- a/ewms_init_container/__main__.py +++ 
b/ewms_init_container/__main__.py @@ -177,8 +177,8 @@ async def get_ewms_attrs( LOGGER.info(f"mqprofiles: {mqprofiles}") # convert mqprofiles to dicts based on the queue aliases - toclient = next(p for p in mqprofiles if p["alias"] == ENV.QUEUE_ALIAS_TOCLIENT) - fromclient = next(p for p in mqprofiles if p["alias"] == ENV.QUEUE_ALIAS_FROMCLIENT) + toclient = next(p for p in mqprofiles if p["alias"] == QUEUE_ALIAS_TOCLIENT) + fromclient = next(p for p in mqprofiles if p["alias"] == QUEUE_ALIAS_FROMCLIENT) return { # NOTE: these fields are accessed by name in the skymap scanner "toclient": { From 6fc6a82c7adad5fa8b5684afca1aac4c9f9e1318 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 11:56:32 -0600 Subject: [PATCH 279/327] prod-tester: fix result getter --- resources/prod_tester/test_runner.py | 35 +++++++++------------------- 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 7986e419..db3fd2d2 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -87,51 +87,38 @@ async def monitor(rc: RestClient, scan_id: str, log_file: Path | None = None) -> Return the result. 
""" out = open(log_file, "w") if log_file else sys.stdout - result_resp = {} - resp = await rc.request("GET", f"/scan/{scan_id}/manifest") print(json.dumps(resp, indent=4), file=out, flush=True) # loop w/ sleep - done = False while True: # get result try: - result_resp = await rc.request("GET", f"/scan/{scan_id}/result") - print( - pformat(result_resp), file=out, flush=True - ) # pprint doesn't have flush + resp = await rc.request("GET", f"/scan/{scan_id}/status") + print(pformat(resp), file=out, flush=True) # pprint doesn't have flush except Exception as e: # 404 (scanner not yet online) print(f"suppressed error: {repr(e)}", file=out, flush=True) # get progress try: resp = await rc.request("GET", f"/scan/{scan_id}/manifest") - progress = resp["progress"] - print( - json.dumps(progress["processing_stats"].pop("rate"), indent=4), - file=out, - flush=True, - ) - print(json.dumps(progress, indent=4), file=out, flush=True) + print(json.dumps(resp["progress"], indent=4), file=out, flush=True) except Exception as e: # 404 (scanner not yet online) or KeyError (no progress yet) print(f"suppressed error: {repr(e)}", file=out, flush=True) # get status try: - result_resp = await rc.request("GET", f"/scan/{scan_id}/status") - print( - pformat(result_resp), file=out, flush=True - ) # pprint doesn't have flush - done = result_resp["scan_complete"] + resp = await rc.request("GET", f"/scan/{scan_id}/result") + print(pformat(resp), file=out, flush=True) # pprint doesn't have flush except Exception as e: print(f"suppressed error: {repr(e)}", file=out, flush=True) + else: + if resp["scan_complete"]: + print("scan is done!", file=out, flush=True) + print(scan_id, file=out, flush=True) + return resp["skyscan_result"] # done? 
else, wait print(scan_id, file=out, flush=True) - if done: - print("scan is done!", file=out, flush=True) - return result_resp["skyscan_result"] - else: - await asyncio.sleep(60) + await asyncio.sleep(60) From f8cca1cea68f703eabfe2ff20a5fdde3dda36de4 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:04:01 -0600 Subject: [PATCH 280/327] end-game/stopper: fix & logging --- skydriver/ewms.py | 2 ++ skydriver/rest_handlers.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 2a654b54..b481afc4 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -23,11 +23,13 @@ async def request_stop_on_ewms( """ try: if abort: + LOGGER.info(f"sending 'abort' signal to ewms for {workflow_id=}...") await ewms_rc.request( "POST", f"/v0/workflows/{workflow_id}/actions/abort", ) else: + LOGGER.info(f"sending 'finished' signal to ewms for {workflow_id=}...") await ewms_rc.request( "POST", f"/v0/workflows/{workflow_id}/actions/finished", diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 44d80b59..0f144ddf 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -670,6 +670,7 @@ async def stop_skyscan_workers( ) -> database.schema.Manifest: """Stop all parts of the Scanner instance (if running) and mark in DB.""" manifest = await manifests.get(scan_id, True) + LOGGER.info(f"stopping (ewms) workers for {scan_id=}...") # request to ewms if manifest.ewms_workflow_id: @@ -1000,7 +1001,7 @@ async def put(self, scan_id: str) -> None: self.write(dc.asdict(result_dc)) # END # - self.finish() + await self.finish() # AFTER RESPONSE # # when we get the final result, it's time to tear down From 2a953e3abed5b334ffebe0b286f406061aa319ca Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:04:54 -0600 Subject: [PATCH 281/327] formatting --- skydriver/rest_handlers.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/skydriver/rest_handlers.py 
b/skydriver/rest_handlers.py index 0f144ddf..cf4be4a2 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -765,7 +765,12 @@ async def delete(self, scan_id: str) -> None: # mark as deleted -> also stops backlog from starting manifest = await self.manifests.mark_as_deleted(scan_id) # abort - await stop_skyscan_workers(self.manifests, scan_id, self.ewms_rc, abort=True) + await stop_skyscan_workers( + self.manifests, + scan_id, + self.ewms_rc, + abort=True, + ) try: result_dict = dc.asdict(await self.results.get(scan_id)) From abfac154fa9d972d76d2a105382372d70f7022d4 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:08:37 -0600 Subject: [PATCH 282/327] formatting - 2 --- resources/prod_tester/test_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index db3fd2d2..34d85e56 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -92,6 +92,7 @@ async def monitor(rc: RestClient, scan_id: str, log_file: Path | None = None) -> # loop w/ sleep while True: + print("-" * 60, file=out, flush=True) # get result try: resp = await rc.request("GET", f"/scan/{scan_id}/status") From 3217824764e79da3456f7ff1442692ac020df8f2 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:14:17 -0600 Subject: [PATCH 283/327] prod-tester: prints --- resources/prod_tester/test_runner.py | 32 ++++++++++++++++++---------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 34d85e56..15fba689 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -87,39 +87,49 @@ async def monitor(rc: RestClient, scan_id: str, log_file: Path | None = None) -> Return the result. 
""" out = open(log_file, "w") if log_file else sys.stdout + + def print_now(string: str): + print(string, file=out, flush=True) # fyi: pprint doesn't have flush + resp = await rc.request("GET", f"/scan/{scan_id}/manifest") - print(json.dumps(resp, indent=4), file=out, flush=True) + print_now(json.dumps(resp, indent=4)) + + prev_result = {} # loop w/ sleep while True: - print("-" * 60, file=out, flush=True) + print_now("-" * 60) # get result try: resp = await rc.request("GET", f"/scan/{scan_id}/status") - print(pformat(resp), file=out, flush=True) # pprint doesn't have flush + print_now(pformat(resp)) # pprint doesn't have flush except Exception as e: # 404 (scanner not yet online) - print(f"suppressed error: {repr(e)}", file=out, flush=True) + print_now(f"suppressed error: {repr(e)}") # get progress try: resp = await rc.request("GET", f"/scan/{scan_id}/manifest") - print(json.dumps(resp["progress"], indent=4), file=out, flush=True) + print_now(json.dumps(resp["progress"], indent=4)) except Exception as e: # 404 (scanner not yet online) or KeyError (no progress yet) - print(f"suppressed error: {repr(e)}", file=out, flush=True) + print_now(f"suppressed error: {repr(e)}") # get status try: resp = await rc.request("GET", f"/scan/{scan_id}/result") - print(pformat(resp), file=out, flush=True) # pprint doesn't have flush + if prev_result != resp: + print_now(pformat(resp)) + prev_result = resp + else: + print_now("") except Exception as e: - print(f"suppressed error: {repr(e)}", file=out, flush=True) + print_now(f"suppressed error: {repr(e)}") else: if resp["scan_complete"]: - print("scan is done!", file=out, flush=True) - print(scan_id, file=out, flush=True) + print_now("scan is done!") + print_now(scan_id) return resp["skyscan_result"] # done? 
else, wait - print(scan_id, file=out, flush=True) + print_now(scan_id) await asyncio.sleep(60) From 9ae73c37d2bc6174dc85991c7642e40d87652e55 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:16:34 -0600 Subject: [PATCH 284/327] prod-tester: fix result getter - 2 --- resources/prod_tester/test_runner.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 15fba689..db8026e8 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -122,13 +122,12 @@ def print_now(string: str): prev_result = resp else: print_now("") - except Exception as e: - print_now(f"suppressed error: {repr(e)}") - else: if resp["scan_complete"]: print_now("scan is done!") print_now(scan_id) return resp["skyscan_result"] + except Exception as e: + print_now(f"suppressed error: {repr(e)}") # done? else, wait print_now(scan_id) From 86a74486b7740999d3b6ef64822574a3865411ef Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:36:38 -0600 Subject: [PATCH 285/327] end-game/stopper: no need to wait - 2 --- skydriver/rest_handlers.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index cf4be4a2..9fbd69ca 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -1,7 +1,6 @@ """Handlers for the SkyDriver REST API server interface.""" import argparse -import asyncio import dataclasses as dc import json import logging @@ -52,7 +51,6 @@ MAX_CLASSIFIERS_LEN = 25 -WAIT_BEFORE_TEARDOWN = 60 # ----------------------------------------------------------------------------- # REST requestor auth @@ -1003,17 +1001,9 @@ async def put(self, scan_id: str) -> None: args.skyscan_result, args.is_final, ) - self.write(dc.asdict(result_dc)) - - # END # - await self.finish() - # AFTER RESPONSE # # when we get the final result, it's time to tear down if 
args.is_final: - await asyncio.sleep( - WAIT_BEFORE_TEARDOWN - ) # regular time.sleep() sleeps the entire server await stop_skyscan_workers( self.manifests, scan_id, @@ -1021,6 +1011,8 @@ async def put(self, scan_id: str) -> None: abort=False, ) + self.write(dc.asdict(result_dc)) + # ----------------------------------------------------------------------------- From 3b12c565b03f86dec1fa8a9eaf7164dff7365eae Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:40:33 -0600 Subject: [PATCH 286/327] fix logging --- skydriver/k8s/utils.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 4bdb2708..2b32952d 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -25,16 +25,17 @@ def start_job( """ if not job_dict: raise ValueError("Job object not created") + + LOGGER.info(json.dumps(job_dict, indent=0)) # otherwise huge + try: resp = kubernetes.utils.create_from_dict( k8s_batch_api.api_client, job_dict, namespace=ENV.K8S_NAMESPACE, ) - LOGGER.info(json.dumps(resp, indent=0)) # otherwise huge except Exception: # broad b/c re-raising - LOGGER.error("request to make k8s job failed using:") - LOGGER.error(json.dumps(job_dict, indent=4)) + LOGGER.error("request to make k8s job failed above job_dict") raise else: return resp From dea4609e820644f9ca57db4a2a6f3fdf2c961f94 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:41:06 -0600 Subject: [PATCH 287/327] fix logging - 2 --- skydriver/k8s/utils.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 2b32952d..49337fc2 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -1,6 +1,5 @@ """An interface to the Kubernetes cluster.""" -import json import logging from typing import Any, Iterator @@ -26,7 +25,7 @@ def start_job( if not job_dict: raise ValueError("Job object not created") - LOGGER.info(json.dumps(job_dict, indent=0)) # 
otherwise huge + LOGGER.info(job_dict) try: resp = kubernetes.utils.create_from_dict( From 8a7bd7b02ca9a9d5071297124fee879fdac66f9f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:41:27 -0600 Subject: [PATCH 288/327] fix logging - 3 --- skydriver/k8s/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 49337fc2..363976de 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -25,6 +25,7 @@ def start_job( if not job_dict: raise ValueError("Job object not created") + LOGGER.info("K8s Job:") LOGGER.info(job_dict) try: From 3b89e0133d37dc56ad6c4f1ca3b25db650b808b5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:43:22 -0600 Subject: [PATCH 289/327] flake8 --- resources/prod_tester/test_runner.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index db8026e8..c679cdcf 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -81,7 +81,11 @@ async def launch_a_scan( return manifest # type: ignore[no-any-return] -async def monitor(rc: RestClient, scan_id: str, log_file: Path | None = None) -> dict: +async def monitor( # noqa: MFL000 + rc: RestClient, + scan_id: str, + log_file: Path | None = None, +) -> dict: """Monitor the event scan until its done. Return the result. 
From 38fbf886fe16df3af897e41a446fc118229c91aa Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:43:54 -0600 Subject: [PATCH 290/327] mypy --- resources/prod_tester/test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index c679cdcf..8d09f7cd 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -98,7 +98,7 @@ def print_now(string: str): resp = await rc.request("GET", f"/scan/{scan_id}/manifest") print_now(json.dumps(resp, indent=4)) - prev_result = {} + prev_result: dict = {} # loop w/ sleep while True: From 4a10dd074c44dab02a64c0120dc247e670125a6a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:45:55 -0600 Subject: [PATCH 291/327] end-game/stopper: no need to wait - 3 --- resources/prod_tester/test_runner.py | 2 +- tests/integration/conftest.py | 17 +++-------------- tests/integration/test_rest_routes.py | 11 ++--------- 3 files changed, 6 insertions(+), 24 deletions(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 8d09f7cd..67b09945 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -100,7 +100,7 @@ def print_now(string: str): prev_result: dict = {} - # loop w/ sleep + # loop w/ sleeps while True: print_now("-" * 60) # get result diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 01fe905c..5df85406 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -109,14 +109,6 @@ def known_clusters() -> dict: return KNOWN_CLUSTERS -TEST_WAIT_BEFORE_TEARDOWN = 2.0 - - -@pytest.fixture(scope="session") -def test_wait_before_teardown() -> float: - return TEST_WAIT_BEFORE_TEARDOWN - - @pytest_asyncio.fixture async def mongo_client() -> AsyncIOMotorClient: # type: ignore[valid-type] """A fixture to keep number of mongo connections to a minimum (aka 
1).""" @@ -141,16 +133,13 @@ async def server( async def _server( - monkeypatch: Any, - port: int, - mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] + monkeypatch: Any, + port: int, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> AsyncIterator[Callable[[], RestClient]]: # patch at directly named import that happens before running the test monkeypatch.setattr(skydriver.rest_handlers, "KNOWN_CLUSTERS", KNOWN_CLUSTERS) monkeypatch.setattr(skydriver.config, "KNOWN_CLUSTERS", KNOWN_CLUSTERS) - monkeypatch.setattr( - skydriver.rest_handlers, "WAIT_BEFORE_TEARDOWN", TEST_WAIT_BEFORE_TEARDOWN - ) k8s_batch_api = Mock() ewms_rc = setup_ewms_client() diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 447d6a75..60181bfc 100644 --- a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -871,7 +871,6 @@ async def test_000( docker_tag_expected: str, server: Callable[[], RestClient], known_clusters: dict, - test_wait_before_teardown: float, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: """Test normal scan creation and retrieval.""" @@ -891,14 +890,12 @@ async def test_000( await _after_scan_start_logic( rc, manifest, - test_wait_before_teardown, ) async def _after_scan_start_logic( rc: RestClient, manifest: sdict, - test_wait_before_teardown: float, ): scan_id = manifest["scan_id"] @@ -965,7 +962,7 @@ async def _after_scan_start_logic( assert not await _is_scan_complete(rc, manifest["scan_id"]) # workforce is not done result = await _send_result(rc, scan_id, manifest, True) # wait as long as the server, so it'll mark as complete - await asyncio.sleep(test_wait_before_teardown + 1) + await asyncio.sleep(1) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") assert await _is_scan_complete(rc, manifest["scan_id"]) # workforce is done @@ -989,7 +986,6 @@ async def _after_scan_start_logic( async def test_010__rescan( server: Callable[[], 
RestClient], known_clusters: dict, - test_wait_before_teardown: float, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: rc = server() @@ -1010,7 +1006,6 @@ async def test_010__rescan( await _after_scan_start_logic( rc, manifest_alpha, - test_wait_before_teardown, ) # RESCAN @@ -1033,7 +1028,6 @@ async def test_010__rescan( await _after_scan_start_logic( rc, manifest_beta, - test_wait_before_teardown, ) @@ -1043,7 +1037,6 @@ async def test_010__rescan( async def test_100__bad_data( server: Callable[[], RestClient], known_clusters: dict, - test_wait_before_teardown: float, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: """Failure-test scan creation and retrieval.""" @@ -1259,7 +1252,7 @@ async def test_100__bad_data( # OK result = await _send_result(rc, scan_id, manifest, True) # wait as long as the server, so it'll mark as complete - await asyncio.sleep(test_wait_before_teardown) + await asyncio.sleep(1) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") assert await _is_scan_complete(rc, manifest["scan_id"]) # workforce is done From b86844a38dcf2ebac7851d87f622e45810e4f39d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:47:01 -0600 Subject: [PATCH 292/327] flake8 --- resources/prod_tester/test_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 67b09945..4f68a684 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -81,7 +81,7 @@ async def launch_a_scan( return manifest # type: ignore[no-any-return] -async def monitor( # noqa: MFL000 +async def monitor( # noqa: C901 rc: RestClient, scan_id: str, log_file: Path | None = None, From 5c346d40495e8e41864376700fec04f7dbdc2c96 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:49:25 -0600 Subject: [PATCH 293/327] Revert "end-game/stopper: no need to wait - 3" This reverts commit 
4a10dd074c44dab02a64c0120dc247e670125a6a. --- resources/prod_tester/test_runner.py | 2 +- tests/integration/conftest.py | 17 ++++++++++++++--- tests/integration/test_rest_routes.py | 11 +++++++++-- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 4f68a684..26af5d5b 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -100,7 +100,7 @@ def print_now(string: str): prev_result: dict = {} - # loop w/ sleeps + # loop w/ sleep while True: print_now("-" * 60) # get result diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 5df85406..01fe905c 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -109,6 +109,14 @@ def known_clusters() -> dict: return KNOWN_CLUSTERS +TEST_WAIT_BEFORE_TEARDOWN = 2.0 + + +@pytest.fixture(scope="session") +def test_wait_before_teardown() -> float: + return TEST_WAIT_BEFORE_TEARDOWN + + @pytest_asyncio.fixture async def mongo_client() -> AsyncIOMotorClient: # type: ignore[valid-type] """A fixture to keep number of mongo connections to a minimum (aka 1).""" @@ -133,13 +141,16 @@ async def server( async def _server( - monkeypatch: Any, - port: int, - mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] + monkeypatch: Any, + port: int, + mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> AsyncIterator[Callable[[], RestClient]]: # patch at directly named import that happens before running the test monkeypatch.setattr(skydriver.rest_handlers, "KNOWN_CLUSTERS", KNOWN_CLUSTERS) monkeypatch.setattr(skydriver.config, "KNOWN_CLUSTERS", KNOWN_CLUSTERS) + monkeypatch.setattr( + skydriver.rest_handlers, "WAIT_BEFORE_TEARDOWN", TEST_WAIT_BEFORE_TEARDOWN + ) k8s_batch_api = Mock() ewms_rc = setup_ewms_client() diff --git a/tests/integration/test_rest_routes.py b/tests/integration/test_rest_routes.py index 60181bfc..447d6a75 100644 --- 
a/tests/integration/test_rest_routes.py +++ b/tests/integration/test_rest_routes.py @@ -871,6 +871,7 @@ async def test_000( docker_tag_expected: str, server: Callable[[], RestClient], known_clusters: dict, + test_wait_before_teardown: float, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: """Test normal scan creation and retrieval.""" @@ -890,12 +891,14 @@ async def test_000( await _after_scan_start_logic( rc, manifest, + test_wait_before_teardown, ) async def _after_scan_start_logic( rc: RestClient, manifest: sdict, + test_wait_before_teardown: float, ): scan_id = manifest["scan_id"] @@ -962,7 +965,7 @@ async def _after_scan_start_logic( assert not await _is_scan_complete(rc, manifest["scan_id"]) # workforce is not done result = await _send_result(rc, scan_id, manifest, True) # wait as long as the server, so it'll mark as complete - await asyncio.sleep(1) + await asyncio.sleep(test_wait_before_teardown + 1) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") assert await _is_scan_complete(rc, manifest["scan_id"]) # workforce is done @@ -986,6 +989,7 @@ async def _after_scan_start_logic( async def test_010__rescan( server: Callable[[], RestClient], known_clusters: dict, + test_wait_before_teardown: float, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: rc = server() @@ -1006,6 +1010,7 @@ async def test_010__rescan( await _after_scan_start_logic( rc, manifest_alpha, + test_wait_before_teardown, ) # RESCAN @@ -1028,6 +1033,7 @@ async def test_010__rescan( await _after_scan_start_logic( rc, manifest_beta, + test_wait_before_teardown, ) @@ -1037,6 +1043,7 @@ async def test_010__rescan( async def test_100__bad_data( server: Callable[[], RestClient], known_clusters: dict, + test_wait_before_teardown: float, mongo_client: AsyncIOMotorClient, # type: ignore[valid-type] ) -> None: """Failure-test scan creation and retrieval.""" @@ -1252,7 +1259,7 @@ async def test_100__bad_data( # OK result = await _send_result(rc, 
scan_id, manifest, True) # wait as long as the server, so it'll mark as complete - await asyncio.sleep(1) + await asyncio.sleep(test_wait_before_teardown) manifest = await rc.request("GET", f"/scan/{scan_id}/manifest") assert await _is_scan_complete(rc, manifest["scan_id"]) # workforce is done From 887bc9056245fb2785ced6276937c66fb4b04da6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 12:49:29 -0600 Subject: [PATCH 294/327] Revert "end-game/stopper: no need to wait - 2" This reverts commit 86a74486b7740999d3b6ef64822574a3865411ef. --- skydriver/rest_handlers.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index 9fbd69ca..cf4be4a2 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -1,6 +1,7 @@ """Handlers for the SkyDriver REST API server interface.""" import argparse +import asyncio import dataclasses as dc import json import logging @@ -51,6 +52,7 @@ MAX_CLASSIFIERS_LEN = 25 +WAIT_BEFORE_TEARDOWN = 60 # ----------------------------------------------------------------------------- # REST requestor auth @@ -1001,9 +1003,17 @@ async def put(self, scan_id: str) -> None: args.skyscan_result, args.is_final, ) + self.write(dc.asdict(result_dc)) + + # END # + await self.finish() + # AFTER RESPONSE # # when we get the final result, it's time to tear down if args.is_final: + await asyncio.sleep( + WAIT_BEFORE_TEARDOWN + ) # regular time.sleep() sleeps the entire server await stop_skyscan_workers( self.manifests, scan_id, @@ -1011,8 +1021,6 @@ async def put(self, scan_id: str) -> None: abort=False, ) - self.write(dc.asdict(result_dc)) - # ----------------------------------------------------------------------------- From e5fefa42d74aa91527e167f430cf9d12de390fc6 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 12 Feb 2025 16:33:49 -0600 Subject: [PATCH 295/327] prod-tester: fix result getter - 3 --- resources/prod_tester/test_runner.py | 22 
+++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 26af5d5b..0c773ea9 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -101,12 +101,15 @@ def print_now(string: str): prev_result: dict = {} # loop w/ sleep - while True: + done = False + while not done: print_now("-" * 60) - # get result + + # get status try: resp = await rc.request("GET", f"/scan/{scan_id}/status") print_now(pformat(resp)) # pprint doesn't have flush + done = resp["scan_complete"] # loop control except Exception as e: # 404 (scanner not yet online) print_now(f"suppressed error: {repr(e)}") @@ -118,7 +121,7 @@ def print_now(string: str): # 404 (scanner not yet online) or KeyError (no progress yet) print_now(f"suppressed error: {repr(e)}") - # get status + # get result try: resp = await rc.request("GET", f"/scan/{scan_id}/result") if prev_result != resp: @@ -126,13 +129,14 @@ def print_now(string: str): prev_result = resp else: print_now("") - if resp["scan_complete"]: - print_now("scan is done!") - print_now(scan_id) - return resp["skyscan_result"] except Exception as e: print_now(f"suppressed error: {repr(e)}") # done? 
else, wait - print_now(scan_id) - await asyncio.sleep(60) + if not done: + print_now(scan_id) + await asyncio.sleep(60) + + print_now("scan is done!") + print_now(scan_id) + return (await rc.request("GET", f"/scan/{scan_id}/result"))["skyscan_result"] From dda2dd355fa691904f5a6222ca6734e12daaf0b2 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 12 Feb 2025 22:37:44 +0000 Subject: [PATCH 296/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 949584e9..ec8b1427 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.18 -botocore==1.36.18 +boto3==1.36.19 +botocore==1.36.19 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.18] -│ ├── botocore [required: >=1.36.18,<1.37.0, installed: 1.36.18] +├── boto3 [required: Any, installed: 1.36.19] +│ ├── botocore [required: >=1.36.19,<1.37.0, installed: 1.36.19] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.18] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.19] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, 
installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 749254bc853196fc8b751379e0ff542db8e96db8 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 13 Feb 2025 14:56:21 -0600 Subject: [PATCH 297/327] use `EWMS_PILOT_TIMEOUT_QUEUE_INCOMING` --- resources/prod_tester/test_suit_prod.py | 2 ++ skydriver/config.py | 14 +++++++------- skydriver/k8s/scanner_instance.py | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index 7439a568..4776631c 100644 --- a/resources/prod_tester/test_suit_prod.py +++ b/resources/prod_tester/test_suit_prod.py @@ -5,6 +5,7 @@ import os import shutil import subprocess +import sys import tarfile from datetime import datetime from pathlib import Path @@ -359,4 +360,5 @@ async def main(): # Run the asyncio event loop if __name__ == "__main__": + subprocess.check_call([sys.executable, "-m", "pip", "install", "icecube-skyreader"]) asyncio.run(main()) diff --git a/skydriver/config.py b/skydriver/config.py index 95481a64..76704bef 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -3,7 +3,7 @@ import dataclasses as dc import enum import logging -from typing import Any, Optional +from typing import Any from wipac_dev_tools import from_environment_as_dataclass, logging_tools @@ -96,6 +96,7 @@ class EnvConfig: EWMS_WORKER_MEMORY__DEFAULT: str = "8GB" EWMS_WORKER_DISK__DEFAULT: str = "1GB" EWMS_MAX_WORKER_RUNTIME__DEFAULT: int = 4 * 60 * 60 # 4 hours + EWMS_PILOT_TIMEOUT_QUEUE_INCOMING: int | None = None # note: other EWMS vars at top of class # keycloak @@ -104,12 +105,11 @@ class EnvConfig: KEYCLOAK_CLIENT_SECRET_SKYDRIVER_REST: str = "" # skyscan (forwarded) - SKYSCAN_PROGRESS_INTERVAL_SEC: Optional[int] = None - SKYSCAN_RESULT_INTERVAL_SEC: Optional[int] = None - SKYSCAN_MQ_TIMEOUT_TO_CLIENTS: Optional[int] = None - SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS: Optional[int] = None - SKYSCAN_LOG: Optional[str] = None - 
SKYSCAN_LOG_THIRD_PARTY: Optional[str] = None + SKYSCAN_PROGRESS_INTERVAL_SEC: int | None = None + SKYSCAN_RESULT_INTERVAL_SEC: int | None = None + SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS: int | None = None + SKYSCAN_LOG: str | None = None + SKYSCAN_LOG_THIRD_PARTY: str | None = None def __post_init__(self) -> None: object.__setattr__(self, "LOG_LEVEL", self.LOG_LEVEL.upper()) # b/c frozen diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index ce429e81..ee3c14ff 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -283,7 +283,7 @@ def make_ewms_envvars( "EWMS_TASK_IMAGE": get_skyscan_cvmfs_singularity_image(docker_tag), # "EWMS_PILOT_TIMEOUT_QUEUE_WAIT_FOR_FIRST_MESSAGE": skyscan_mq_client_timeout_wait_for_first_message, - "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, + "EWMS_PILOT_TIMEOUT_QUEUE_INCOMING": ENV.EWMS_PILOT_TIMEOUT_QUEUE_INCOMING, "EWMS_PILOT_TASK_TIMEOUT": max_pixel_reco_time, # "EWMS_WORKER_MAX_WORKER_RUNTIME": max_worker_runtime, @@ -364,7 +364,6 @@ def make_skyscan_server_envvars( "SKYSCAN_PROGRESS_INTERVAL_SEC": ENV.SKYSCAN_PROGRESS_INTERVAL_SEC, "SKYSCAN_RESULT_INTERVAL_SEC": ENV.SKYSCAN_RESULT_INTERVAL_SEC, # - "SKYSCAN_MQ_TIMEOUT_TO_CLIENTS": ENV.SKYSCAN_MQ_TIMEOUT_TO_CLIENTS, "SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS": ENV.SKYSCAN_MQ_TIMEOUT_FROM_CLIENTS, # "SKYSCAN_LOG": ENV.SKYSCAN_LOG, From 476ded15600f1a037a46ab5dae144ed16a0e240c Mon Sep 17 00:00:00 2001 From: github-actions Date: Thu, 13 Feb 2025 21:00:26 +0000 Subject: [PATCH 298/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index ec8b1427..0f66410b 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze 
######################################################################## aiocache==0.12.3 -boto3==1.36.19 -botocore==1.36.19 +boto3==1.36.20 +botocore==1.36.20 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.19] -│ ├── botocore [required: >=1.36.19,<1.37.0, installed: 1.36.19] +├── boto3 [required: Any, installed: 1.36.20] +│ ├── botocore [required: >=1.36.20,<1.37.0, installed: 1.36.20] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.19] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.20] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From 3ba4dab3eeb1361adf99cacb4580069f6bd8b2a2 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 13 Feb 2025 15:50:51 -0600 Subject: [PATCH 299/327] add endpoints for querying info on workers --- skydriver/ewms.py | 59 ++++++++++++++++++++++++---------- skydriver/rest_handlers.py | 65 ++++++++++++++++++++++++++++++-------- skydriver/server.py | 4 ++- 3 files changed, 97 insertions(+), 31 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index b481afc4..ae834f96 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -1,12 +1,13 @@ """Tools for interfacing with EMWS.""" import logging +from collections import defaultdict import aiocache # type: ignore[import-untyped] import 
requests from rest_tools.client import RestClient -from .config import ENV +from .config import ENV, sdict from .database.schema import PENDING_EWMS_WORKFLOW LOGGER = logging.Logger(__name__) @@ -57,11 +58,14 @@ async def get_deactivated_type(ewms_rc: RestClient, workflow_id: str) -> str | N @aiocache.cached(ttl=1 * 60) # don't cache too long, but avoid spamming ewms -async def get_taskforce_phases( +async def get_taskforce_infos( ewms_rc: RestClient, workflow_id: str, -) -> list[dict[str, str]]: - """Get all the states of all the taskforces associated with the workflow.""" +) -> list[sdict]: + """Get all info of all the taskforces associated with the workflow.""" + if workflow_id == PENDING_EWMS_WORKFLOW or (not workflow_id): + return [] + resp = await ewms_rc.request( "POST", "/v0/query/taskforces", @@ -71,19 +75,40 @@ async def get_taskforce_phases( } }, ) - return [ - { - k: tf.get(k) - for k in [ - "taskforce_uuid", - "phase", - "phase_change_log", - "compound_statuses", - "top_task_errors", - ] - } - for tf in resp["taskforces"] - ] + return resp["taskforces"] + + +async def get_workforce_statuses( + ewms_rc: RestClient, + workflow_id: str, +) -> dict[str, dict[str, dict[str, int]] | int]: + """Get the compound statuses for the entire workflow's workforce (aka its taskforces), + along with the number of currently running workers. 
+ + Example: + from ewms: + >>> {'IDLE': {'null': 1}, 'RUNNING': {'Tasking': 24}} + >>> {'IDLE': {'foo': 99}, 'RUNNING': {'Tasking': 20}} + >>> {'RUNNING': {'Processing': 7}, 'REMOVED': {'Error': 1}} + out: + >>> {'IDLE': {'null': 1, 'foo': 99}, 'RUNNING': {'Tasking': 44, 'Processing': 7}, 'REMOVED': {'Error': 1}} + """ + tf_state_dicts = await get_taskforce_infos(ewms_rc, workflow_id) + + # merge & sum the compound statuses + merged = defaultdict(lambda: defaultdict(int)) + for state in tf_state_dicts: + d = state["compound_statuses"] + for outer_key, inner_dict in d.items(): + for inner_key, value in inner_dict.items(): + merged[outer_key][inner_key] += value + + return { + "statuses": {k: dict(v) for k, v in merged.items()}, # convert to dict + "n_running": sum(merged.get("RUNNING", {}).values()), + # NOTE: it's tempting to sum other statuses' counts, but not all + # statuses are mutually exclusive -- iow, ewms may double count for some jobs + } def make_s3_object_key(scan_id: str) -> str: diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index cf4be4a2..e1d82830 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -1038,17 +1038,6 @@ async def get(self, scan_id: str) -> None: # scan state scan_state = await get_scan_state(manifest, self.ewms_rc, self.results) - # ewms - if ( - manifest.ewms_workflow_id - and manifest.ewms_workflow_id != PENDING_EWMS_WORKFLOW - ): - clusters = await ewms.get_taskforce_phases( - self.ewms_rc, manifest.ewms_workflow_id - ) - else: - clusters = [] - # respond resp = { "scan_state": scan_state, @@ -1059,7 +1048,10 @@ async def get(self, scan_id: str) -> None: self.k8s_batch_api, manifest.scan_id ), }, - "ewms_workforce": clusters, + "ewms_workforce": await ewms.get_workforce_statuses( + self.ewms_rc, manifest.ewms_workflow_id + ), + # ^^^ same as '/scan//ewms/workforce' } self.write(resp) @@ -1097,7 +1089,7 @@ async def get(self, scan_id: str) -> None: # 
----------------------------------------------------------------------------- -class ScanActionEWMSWorkflowIDHandler(BaseSkyDriverHandler): +class ScanEWMSWorkflowIDHandler(BaseSkyDriverHandler): """Handles actions on scan's ewms workflow id.""" ROUTE = r"/scan/(?P\w+)/ewms/workflow-id$" @@ -1151,3 +1143,50 @@ async def post(self, scan_id: str) -> None: # ----------------------------------------------------------------------------- + + +class ScanEWMSWorkforceHandler(BaseSkyDriverHandler): + """Handles actions for a scan's ewms workforce (condor workers).""" + + ROUTE = r"/scan/(?P\w+)/ewms/workforce$" + + @service_account_auth(roles=[USER_ACCT, INTERNAL_ACCT]) # type: ignore + async def get(self, scan_id: str) -> None: + """GET. + + This is a high-level utility, which removes unnecessary EWMS semantics. + """ + manifest = await self.manifests.get(scan_id, incl_del=True) + + self.write( + await ewms.get_workforce_statuses( + self.ewms_rc, + manifest.ewms_workflow_id, + ) + ) + + +# ----------------------------------------------------------------------------- + + +class ScanEWMSTaskforcesHandler(BaseSkyDriverHandler): + """Handles actions for a scan's ewms taskforces (condor job submissions/clusters).""" + + ROUTE = r"/scan/(?P\w+)/ewms/taskforces$" + + @service_account_auth(roles=[USER_ACCT, INTERNAL_ACCT]) # type: ignore + async def get(self, scan_id: str) -> None: + """GET. + + This is useful for debugging by seeing what was sent to condor. 
+ """ + manifest = await self.manifests.get(scan_id, incl_del=True) + + self.write( + { + "taskforces": await ewms.get_taskforce_infos( + self.ewms_rc, + manifest.ewms_workflow_id, + ) + } + ) diff --git a/skydriver/server.py b/skydriver/server.py index 6e2657f2..38acbe80 100644 --- a/skydriver/server.py +++ b/skydriver/server.py @@ -51,7 +51,9 @@ async def make( rest_handlers.ScanRescanHandler, rest_handlers.ScanStatusHandler, rest_handlers.ScanLogsHandler, - rest_handlers.ScanActionEWMSWorkflowIDHandler, + rest_handlers.ScanEWMSWorkflowIDHandler, + rest_handlers.ScanEWMSWorkforceHandler, + rest_handlers.ScanEWMSTaskforcesHandler, ]: try: rs.add_route(getattr(klass, "ROUTE"), klass, args) From eb2c1ffd1a8ea638d63e4a53a721baf0a65ca903 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 13 Feb 2025 16:00:52 -0600 Subject: [PATCH 300/327] mypy --- skydriver/ewms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index ae834f96..4f37e543 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -60,7 +60,7 @@ async def get_deactivated_type(ewms_rc: RestClient, workflow_id: str) -> str | N @aiocache.cached(ttl=1 * 60) # don't cache too long, but avoid spamming ewms async def get_taskforce_infos( ewms_rc: RestClient, - workflow_id: str, + workflow_id: str | None, ) -> list[sdict]: """Get all info of all the taskforces associated with the workflow.""" if workflow_id == PENDING_EWMS_WORKFLOW or (not workflow_id): @@ -80,7 +80,7 @@ async def get_taskforce_infos( async def get_workforce_statuses( ewms_rc: RestClient, - workflow_id: str, + workflow_id: str | None, ) -> dict[str, dict[str, dict[str, int]] | int]: """Get the compound statuses for the entire workflow's workforce (aka its taskforces), along with the number of currently running workers. 
@@ -96,7 +96,7 @@ async def get_workforce_statuses( tf_state_dicts = await get_taskforce_infos(ewms_rc, workflow_id) # merge & sum the compound statuses - merged = defaultdict(lambda: defaultdict(int)) + merged: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) for state in tf_state_dicts: d = state["compound_statuses"] for outer_key, inner_dict in d.items(): From 73593694585c6b683655f4196af32473ce6df565 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 13 Feb 2025 16:02:06 -0600 Subject: [PATCH 301/327] null condition --- skydriver/ewms.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 4f37e543..5f7dca54 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -98,7 +98,8 @@ async def get_workforce_statuses( # merge & sum the compound statuses merged: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) for state in tf_state_dicts: - d = state["compound_statuses"] + if not (d := state.get("compound_statuses")): + continue for outer_key, inner_dict in d.items(): for inner_key, value in inner_dict.items(): merged[outer_key][inner_key] += value From 4cd73c11a8a58ecf3e1a2d4e4b4f3e0fc15e8740 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 13 Feb 2025 17:30:37 -0600 Subject: [PATCH 302/327] use epoch timestamp as prefix of scanids --- skydriver/rest_handlers.py | 10 +++++++--- skydriver/utils.py | 9 +++++++++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/skydriver/rest_handlers.py b/skydriver/rest_handlers.py index e1d82830..3ff74abd 100644 --- a/skydriver/rest_handlers.py +++ b/skydriver/rest_handlers.py @@ -39,7 +39,11 @@ from .ewms import request_stop_on_ewms from .k8s.scan_backlog import put_on_backlog from .k8s.scanner_instance import SkyScanK8sJobFactory, assemble_scanner_server_logs_url -from .utils import does_scan_state_indicate_final_result_received, get_scan_state +from .utils import ( + does_scan_state_indicate_final_result_received, + 
get_scan_state, + make_scan_id, +) LOGGER = logging.getLogger(__name__) @@ -473,7 +477,7 @@ async def post(self) -> None: ) # generate unique scan_id - scan_id = uuid.uuid4().hex + scan_id = make_scan_id() # Before doing anything else, persist in DB # -> store the event in its own collection to reduce redundancy @@ -619,7 +623,7 @@ async def post(self, scan_id: str) -> None: args = arghand.parse_args() # generate unique scan_id - new_scan_id = uuid.uuid4().hex + new_scan_id = make_scan_id() # grab the 'scan_request_obj' scan_request_obj = await self.scan_request_coll.find_one_and_update( diff --git a/skydriver/utils.py b/skydriver/utils.py index 623cf9b0..54abdd04 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -1,6 +1,8 @@ """Utility functions that don't fit anywhere else.""" import enum +import time +import uuid from rest_tools.client import RestClient from tornado import web @@ -9,6 +11,13 @@ from .database.schema import DEPRECATED_EWMS_TASK, Manifest, PENDING_EWMS_WORKFLOW +def make_scan_id() -> str: + """Make a new scan id.""" + hex_time = str(hex(int(time.time()))).removeprefix("0x") + hex_uuid_less = uuid.uuid4().hex[len(hex_time) :] + return f"{hex_time}{hex_uuid_less}" + + class _ScanState(enum.Enum): """A non-persisted scan state.""" From 8e87e67a02cd2f1c15d4a44c705ddd30135f1b28 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 13 Feb 2025 17:42:14 -0600 Subject: [PATCH 303/327] don't include 'FatalError' counts in 'n_running' --- skydriver/ewms.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 5f7dca54..bbb47de6 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -104,9 +104,16 @@ async def get_workforce_statuses( for inner_key, value in inner_dict.items(): merged[outer_key][inner_key] += value + # compute `n_running`, excluding 'FatalError' + n_running = sum( + count + for substatus, count in merged.get("RUNNING", {}).items() + if substatus != "FatalError" + ) + 
return { "statuses": {k: dict(v) for k, v in merged.items()}, # convert to dict - "n_running": sum(merged.get("RUNNING", {}).values()), + "n_running": n_running, # NOTE: it's tempting to sum other statuses' counts, but not all # statuses are mutually exclusive -- iow, ewms may double count for some jobs } From d295e9c912f613d485106e7ac2eb5fc92fb23152 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Thu, 13 Feb 2025 18:00:50 -0600 Subject: [PATCH 304/327] use epoch timestamp as prefix of scanids - 2 --- skydriver/utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index 54abdd04..cceb81be 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -12,10 +12,11 @@ def make_scan_id() -> str: - """Make a new scan id.""" - hex_time = str(hex(int(time.time()))).removeprefix("0x") - hex_uuid_less = uuid.uuid4().hex[len(hex_time) :] - return f"{hex_time}{hex_uuid_less}" + """Make a new scan id, chronological when sorted.""" + big_time = int(time.time() * 100) + hex_big_time = str(hex(big_time)).removeprefix("0x") + hex_uuid_short = uuid.uuid4().hex[len(hex_big_time) :] + return f"{hex_big_time}{hex_uuid_short}" class _ScanState(enum.Enum): From b0699649132419ef2a8d83086bdd27a9982e8f3d Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 14 Feb 2025 11:00:19 -0600 Subject: [PATCH 305/327] remove unused func --- skydriver/database/interface.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/skydriver/database/interface.py b/skydriver/database/interface.py index bda75082..8ad249d0 100644 --- a/skydriver/database/interface.py +++ b/skydriver/database/interface.py @@ -358,13 +358,3 @@ async def get_all(self) -> AsyncIterator[dict]: return_dclass=dict, ): yield entry - - async def is_in_backlog(self, scan_id: str) -> bool: - """Return whether the scan id is in the backlog.""" - LOGGER.debug(f"looking for {scan_id} in backlog") - async for _ in self.collection.find( - {"scan_id": scan_id}, - 
return_dclass=dict, - ): - return True - return False From 0322df4bb898aed650e847270c8e3b07a22c3324 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 14 Feb 2025 13:00:20 -0600 Subject: [PATCH 306/327] add retry+delay when k8s job quota is surpassed --- resources/prod_tester/test_suit_prod.py | 8 ++-- skydriver/config.py | 2 + skydriver/k8s/scan_backlog.py | 4 +- skydriver/k8s/scanner_instance.py | 5 +- skydriver/k8s/utils.py | 62 ++++++++++++++++++------- 5 files changed, 56 insertions(+), 25 deletions(-) diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index 4776631c..c92ded07 100644 --- a/resources/prod_tester/test_suit_prod.py +++ b/resources/prod_tester/test_suit_prod.py @@ -154,12 +154,10 @@ def display_test_status(tests: list[test_getter.TestParamSet]): ) table = texttable.Texttable() - # Define column alignment and widths - table.set_cols_align(["r", "l", "l", "r", "l"]) - table.set_cols_width([2, 25, 20, 8, 10]) - - # Add the header row + # columns table.add_row(["#", "Event File", "Reco Algo", "Scan ID", "Status"]) + table.set_cols_align(["r", "l", "l", "r", "l"]) + table.set_cols_width([2, 25, 18, 10, 10]) # Add rows for each test for i, test in sorted_tests: diff --git a/skydriver/config.py b/skydriver/config.py index 76704bef..eb1a017f 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -90,6 +90,8 @@ class EnvConfig: K8S_SCANNER_SIDECAR_S3_CPU_REQUEST: float = 0.05 K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS: int = 15 * 60 # 15 mins + K8S_START_JOB_TRANSIENT_ERROR_RETRY_DELAY: int = 5 * 60 + GRAFANA_DASHBOARD_BASEURL: str = "" # EWMS optional config diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 8bbafc56..0d56fbe6 100644 --- a/skydriver/k8s/scan_backlog.py +++ b/skydriver/k8s/scan_backlog.py @@ -166,7 +166,9 @@ async def _run( # start k8s job -- this could be any k8s job (pre- or post-ewms switchover) try: LOGGER.info(f"Starting K8s job: 
scan_id={manifest.scan_id}") - KubeAPITools.start_job(k8s_batch_api, skyscan_k8s_job) + await KubeAPITools.start_job( + k8s_batch_api, skyscan_k8s_job, inf_retry_on_transient_errors=True + ) except kubernetes.utils.FailToCreateError as e: # k8s job (backlog entry) will be revived & restarted in future iteration LOGGER.exception(e) diff --git a/skydriver/k8s/scanner_instance.py b/skydriver/k8s/scanner_instance.py index ee3c14ff..131f4faa 100644 --- a/skydriver/k8s/scanner_instance.py +++ b/skydriver/k8s/scanner_instance.py @@ -410,8 +410,9 @@ def assemble_scanner_server_logs_url( f"&var-container={get_skyscan_server_container_name(scan_id)}" ) except Exception as e: - LOGGER.error(f"there was an issue retrieving k8s pod(s) for {scan_id=}") - LOGGER.exception(e) + LOGGER.error( + f"there was an issue retrieving k8s pod(s) for {scan_id=}: {repr(e)}" + ) # fall-through return "404" # don't return exception info for security reasons diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 363976de..0521407c 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -1,5 +1,7 @@ """An interface to the Kubernetes cluster.""" +import asyncio +import itertools import logging from typing import Any, Iterator @@ -10,35 +12,61 @@ LOGGER = logging.getLogger(__name__) +def is_known_k8s_transient_error(e: Exception) -> bool: + """Is this exception a known transient error in the k8s namespace. + + IOW, will this error go away if we try again in a bit? + """ + + # did the job exceed the job quota? 
if so, there will be fewer jobs in the future + # ex: kubernetes.utils.create_from_yaml.FailToCreateError: Error from server (Forbidden): {"kind":"Status","apiVersion":"v1","metadata":{},"status":"Failure","message":"jobs.batch \"skyscan-67af75da614147fe8a740bb96f4be08e\" is forbidden: exceeded quota: skydriver-dev-job-quota, requested: count/jobs.batch=1, used: count/jobs.batch=100, limited: count/jobs.batch=100","reason":"Forbidden","details":{"name":"skyscan-67af75da614147fe8a740bb96f4be08e","group":"batch","kind":"jobs"},"code":403} + if isinstance(e, kubernetes.utils.FailToCreateError) and bool( + f"exceeded quota: {ENV.K8S_APPLICATION_NAME}-job-quota" in str(e) + ): + return True + + # fall-through + return False + + class KubeAPITools: """A convenience wrapper around `kubernetes.client`.""" @staticmethod - def start_job( + async def start_job( k8s_batch_api: kubernetes.client.BatchV1Api, job_dict: sdict, + inf_retry_on_transient_errors: bool = False, ) -> Any: """Start the k8s job. Returns REST response. 
""" if not job_dict: - raise ValueError("Job object not created") - - LOGGER.info("K8s Job:") - LOGGER.info(job_dict) - - try: - resp = kubernetes.utils.create_from_dict( - k8s_batch_api.api_client, - job_dict, - namespace=ENV.K8S_NAMESPACE, - ) - except Exception: # broad b/c re-raising - LOGGER.error("request to make k8s job failed above job_dict") - raise - else: - return resp + raise ValueError("No job object to create") + + for i in itertools.count(): + LOGGER.info(f"K8s Job (attempt #{i+1}):") + LOGGER.info(job_dict) + try: + return kubernetes.utils.create_from_dict( + k8s_batch_api.api_client, + job_dict, + namespace=ENV.K8S_NAMESPACE, + ) + except Exception as e: # broad b/c re-raising + if inf_retry_on_transient_errors and is_known_k8s_transient_error(e): + LOGGER.warning( + f"encountered a transient error in the k8s namespace, " + f"trying again in {ENV.K8S_START_JOB_TRANSIENT_ERROR_RETRY_DELAY}s" + ": {repr(e)}" + ) + # maybe next time, it'll be ok + await asyncio.sleep(ENV.K8S_START_JOB_TRANSIENT_ERROR_RETRY_DELAY) + continue + else: + LOGGER.error("request to make k8s job failed above job_dict") + raise @staticmethod def get_pods( From d8bc7fee76c7676669065fe2255c8faea1020627 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 14 Feb 2025 14:08:00 -0600 Subject: [PATCH 307/327] prod-tester: format --- resources/prod_tester/test_suit_prod.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index c92ded07..779c313c 100644 --- a/resources/prod_tester/test_suit_prod.py +++ b/resources/prod_tester/test_suit_prod.py @@ -154,14 +154,16 @@ def display_test_status(tests: list[test_getter.TestParamSet]): ) table = texttable.Texttable() + scan_id_len = 10 + # columns table.add_row(["#", "Event File", "Reco Algo", "Scan ID", "Status"]) table.set_cols_align(["r", "l", "l", "r", "l"]) - table.set_cols_width([2, 25, 18, 10, 10]) + table.set_cols_width([2, 25, 18, 
scan_id_len, 10]) # Add rows for each test for i, test in sorted_tests: - scan_id = test.scan_id[:8] if test.scan_id else "N/A" + scan_id = test.scan_id[:scan_id_len] if test.scan_id else "N/A" status = test.test_status.name table.add_row([i, test.event_file.name, test.reco_algo, scan_id, status]) From 0d66b0b70db8b8b2c35cf3e52fabb429e5614be2 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 14 Feb 2025 14:16:17 -0600 Subject: [PATCH 308/327] comment --- skydriver/utils.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skydriver/utils.py b/skydriver/utils.py index cceb81be..4ca04ac6 100644 --- a/skydriver/utils.py +++ b/skydriver/utils.py @@ -13,10 +13,10 @@ def make_scan_id() -> str: """Make a new scan id, chronological when sorted.""" - big_time = int(time.time() * 100) - hex_big_time = str(hex(big_time)).removeprefix("0x") - hex_uuid_short = uuid.uuid4().hex[len(hex_big_time) :] - return f"{hex_big_time}{hex_uuid_short}" + big_time = int(time.time() * 100) # ex: 173956400135 + hex_big_time = str(hex(big_time)).removeprefix("0x") # ex: 28809c0407 + hex_uuid_short = uuid.uuid4().hex[len(hex_big_time) :] # ex: 4348a28a8554441b96bcf4 + return f"{hex_big_time}{hex_uuid_short}" # ex: 28809c04074348a28a8554441b96bcf4 class _ScanState(enum.Enum): From ddf2d013c476dda2299b78ca788c1518b5b8057f Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 14 Feb 2025 14:17:48 -0600 Subject: [PATCH 309/327] typo --- skydriver/k8s/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 0521407c..94c01b43 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -59,7 +59,7 @@ async def start_job( LOGGER.warning( f"encountered a transient error in the k8s namespace, " f"trying again in {ENV.K8S_START_JOB_TRANSIENT_ERROR_RETRY_DELAY}s" - ": {repr(e)}" + f": {repr(e)}" ) # maybe next time, it'll be ok await asyncio.sleep(ENV.K8S_START_JOB_TRANSIENT_ERROR_RETRY_DELAY) From 
ae9614dbab7effc4d99a8e05298af200e5620767 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Fri, 14 Feb 2025 14:22:15 -0600 Subject: [PATCH 310/327] logging - 2 --- skydriver/k8s/utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/skydriver/k8s/utils.py b/skydriver/k8s/utils.py index 94c01b43..02d54e9b 100644 --- a/skydriver/k8s/utils.py +++ b/skydriver/k8s/utils.py @@ -49,11 +49,13 @@ async def start_job( LOGGER.info(f"K8s Job (attempt #{i+1}):") LOGGER.info(job_dict) try: - return kubernetes.utils.create_from_dict( + resp = kubernetes.utils.create_from_dict( k8s_batch_api.api_client, job_dict, namespace=ENV.K8S_NAMESPACE, ) + LOGGER.info("k8s job successfully created!") + return resp except Exception as e: # broad b/c re-raising if inf_retry_on_transient_errors and is_known_k8s_transient_error(e): LOGGER.warning( From d41ba33b041376c8922d04886b43f2efe793c7d5 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Mon, 17 Feb 2025 15:23:47 -0600 Subject: [PATCH 311/327] update `IntervalTimer` use --- s3_sidecar/__main__.py | 8 ++------ skydriver/k8s/scan_backlog.py | 8 ++++++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/s3_sidecar/__main__.py b/s3_sidecar/__main__.py index c4b57011..de895598 100644 --- a/s3_sidecar/__main__.py +++ b/s3_sidecar/__main__.py @@ -86,13 +86,9 @@ def main() -> None: args = parser.parse_args() logging_tools.log_argparse_args(args) - housekeeping_timer = IntervalTimer( - 5, - logging.getLogger(f"{LOGGER.name}.housekeeping"), - ) + housekeeping_timer = IntervalTimer(5, f"{LOGGER.name}.housekeeping") lifetime_timer = IntervalTimer( - ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS, - logging.getLogger(f"{LOGGER.name}.lifetime_timer"), + ENV.K8S_SCANNER_SIDECAR_S3_LIFETIME_SECONDS, f"{LOGGER.name}.lifetime_timer" ) if args.wait_indefinitely: diff --git a/skydriver/k8s/scan_backlog.py b/skydriver/k8s/scan_backlog.py index 0d56fbe6..01cf0506 100644 --- a/skydriver/k8s/scan_backlog.py +++ 
b/skydriver/k8s/scan_backlog.py @@ -135,8 +135,12 @@ async def _run( ) ) - timer_main_loop = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) - timer_logging = IntervalTimer(ENV.SCAN_BACKLOG_RUNNER_DELAY, LOGGER) + timer_main_loop = IntervalTimer( + ENV.SCAN_BACKLOG_RUNNER_DELAY, f"{LOGGER.name}.timer" + ) + timer_logging = IntervalTimer( + ENV.SCAN_BACKLOG_RUNNER_DELAY, f"{LOGGER.name}.heartbeat_timer" + ) # main loop while True: From 9e77f3eccccb05f2b16ef1621bb999084e3d45e5 Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 17 Feb 2025 22:07:40 +0000 Subject: [PATCH 312/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 0f66410b..d6b94322 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.20 -botocore==1.36.20 +boto3==1.36.22 +botocore==1.36.22 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -39,11 +39,11 @@ rsa==4.9 s3transfer==0.11.2 six==1.17.0 tornado==6.4.2 -typeguard==4.4.1 +typeguard==4.4.2 typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 -wipac-dev-tools==1.15.1 +wipac-dev-tools==1.15.2 wipac-rest-tools==1.8.5 ######################################################################## # pipdeptree @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.20] -│ ├── botocore [required: >=1.36.20,<1.37.0, installed: 1.36.20] +├── boto3 [required: Any, installed: 1.36.22] +│ ├── botocore [required: >=1.36.22,<1.37.0, installed: 1.36.22] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil 
[required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.20] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.22] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] @@ -111,9 +111,9 @@ skydriver-s3-sidecar-ewms-init-container │ ├── idna [required: >=2.5,<4, installed: 3.10] │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] -├── typeguard [required: Any, installed: 4.4.1] +├── typeguard [required: Any, installed: 4.4.2] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] -├── wipac-dev-tools [required: Any, installed: 1.15.1] +├── wipac-dev-tools [required: Any, installed: 1.15.2] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] @@ -137,7 +137,7 @@ skydriver-s3-sidecar-ewms-init-container │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── urllib3 [required: >=2.0.4, installed: 2.3.0] - └── wipac-dev-tools [required: Any, installed: 1.15.1] + └── wipac-dev-tools [required: Any, installed: 1.15.2] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] From d7b7f039bf860caf5d921e143231bc54c1483c73 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 19 Feb 2025 14:03:51 -0600 Subject: [PATCH 313/327] misc cicd --- .github/workflows/wipac-cicd.yml | 8 +++++--- 1 file changed, 5 
insertions(+), 3 deletions(-) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 5ee8671b..6962dc7e 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -40,7 +40,6 @@ jobs: # LINTERS ############################################################################# - flake8: runs-on: ubuntu-latest steps: @@ -69,7 +68,6 @@ jobs: # PACKAGING ############################################################################# - writable-branch-detect: runs-on: ubuntu-latest outputs: @@ -132,7 +130,6 @@ jobs: # TESTS ############################################################################# - unit-tests: needs: [ py-versions ] runs-on: ubuntu-latest @@ -257,6 +254,11 @@ jobs: file: Dockerfile tags: wipac/skydriver:local + + ############################################################################# + # RELEASE + ############################################################################# + release: # only run on main/master/default if: format('refs/heads/{0}', github.event.repository.default_branch) == github.ref From 12c06bc4e927a8b60fb3d839cb92c47f4334a740 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 19 Feb 2025 20:08:30 +0000 Subject: [PATCH 314/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index d6b94322..80775300 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.22 -botocore==1.36.22 +boto3==1.36.23 +botocore==1.36.23 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -21,7 +21,7 @@ google-auth==2.38.0 humanfriendly==10.0 idna==3.10 jmespath==1.0.1 -kubernetes==32.0.0 +kubernetes==32.0.1 motor==3.3.2 oauthlib==3.2.2 pyasn1==0.6.1 @@ -56,22 
+56,22 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.22] -│ ├── botocore [required: >=1.36.22,<1.37.0, installed: 1.36.22] +├── boto3 [required: Any, installed: 1.36.23] +│ ├── botocore [required: >=1.36.23,<1.37.0, installed: 1.36.23] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.22] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.23] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] ├── dacite [required: <1.9, installed: 1.8.1] ├── humanfriendly [required: Any, installed: 10.0] -├── kubernetes [required: Any, installed: 32.0.0] +├── kubernetes [required: Any, installed: 32.0.1] │ ├── certifi [required: >=14.05.14, installed: 2025.1.31] │ ├── durationpy [required: >=0.7, installed: 0.9] │ ├── google-auth [required: >=1.0.1, installed: 2.38.0] From 7f94f734c4f9080e5c6c908c87677d07cc3b5c71 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 19 Feb 2025 14:36:30 -0600 Subject: [PATCH 315/327] matrix flake8 --- .github/workflows/wipac-cicd.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 6962dc7e..234e4e25 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -42,9 +42,15 @@ jobs: flake8: runs-on: ubuntu-latest + 
strategy: + fail-fast: false + matrix: + py3: ${{ fromJSON(needs.py-versions.outputs.matrix) }} steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.py3 }} - uses: WIPACrepo/wipac-dev-flake8-action@v1.2 with: max-complexity: 10 From 131c8838a28b71b760d07d3deb392cea64ee24f7 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 19 Feb 2025 14:39:07 -0600 Subject: [PATCH 316/327] matrix flake8 - 2 --- .github/workflows/wipac-cicd.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/wipac-cicd.yml b/.github/workflows/wipac-cicd.yml index 234e4e25..4cb04564 100644 --- a/.github/workflows/wipac-cicd.yml +++ b/.github/workflows/wipac-cicd.yml @@ -41,6 +41,7 @@ jobs: ############################################################################# flake8: + needs: [ py-versions ] runs-on: ubuntu-latest strategy: fail-fast: false From 8b7acd82f3032ec096e704d6c31fbdf9366a3415 Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 19 Feb 2025 20:43:01 +0000 Subject: [PATCH 317/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 80775300..d89c46b0 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.23 -botocore==1.36.23 +boto3==1.36.24 +botocore==1.36.24 cachetools==5.5.1 certifi==2025.1.31 cffi==1.17.1 @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.23] -│ ├── botocore [required: >=1.36.23,<1.37.0, installed: 1.36.23] +├── boto3 [required: Any, installed: 1.36.24] +│ ├── botocore [required: 
>=1.36.24,<1.37.0, installed: 1.36.24] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.23] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.24] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] From c061d4ae02fc3c0e619ff26228251cc2e79fb15a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 25 Feb 2025 15:26:02 -0600 Subject: [PATCH 318/327] prod-tester: fix column typing --- resources/prod_tester/test_suit_prod.py | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/prod_tester/test_suit_prod.py b/resources/prod_tester/test_suit_prod.py index 779c313c..784b4242 100644 --- a/resources/prod_tester/test_suit_prod.py +++ b/resources/prod_tester/test_suit_prod.py @@ -160,6 +160,7 @@ def display_test_status(tests: list[test_getter.TestParamSet]): table.add_row(["#", "Event File", "Reco Algo", "Scan ID", "Status"]) table.set_cols_align(["r", "l", "l", "r", "l"]) table.set_cols_width([2, 25, 18, scan_id_len, 10]) + table.set_cols_dtype(["i", "t", "t", "t", "t"]) # Add rows for each test for i, test in sorted_tests: From 9a58dde709a4644fe3dbb82ce2e11ac9294cc3a8 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 25 Feb 2025 21:29:08 +0000 Subject: [PATCH 319/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index d89c46b0..46f5cb1e 100644 --- 
a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,9 +7,9 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.36.24 -botocore==1.36.24 -cachetools==5.5.1 +boto3==1.37.1 +botocore==1.37.1 +cachetools==5.5.2 certifi==2025.1.31 cffi==1.17.1 charset-normalizer==3.4.1 @@ -43,8 +43,8 @@ typeguard==4.4.2 typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 -wipac-dev-tools==1.15.2 -wipac-rest-tools==1.8.5 +wipac-dev-tools==1.15.3 +wipac-rest-tools==1.8.6 ######################################################################## # pipdeptree ######################################################################## @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.36.24] -│ ├── botocore [required: >=1.36.24,<1.37.0, installed: 1.36.24] +├── boto3 [required: Any, installed: 1.37.1] +│ ├── botocore [required: >=1.37.1,<1.38.0, installed: 1.37.1] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.36.24] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.37.1] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] @@ -75,7 +75,7 @@ skydriver-s3-sidecar-ewms-init-container │ ├── certifi [required: >=14.05.14, installed: 2025.1.31] │ ├── durationpy [required: >=0.7, installed: 0.9] │ ├── google-auth [required: 
>=1.0.1, installed: 2.38.0] -│ │ ├── cachetools [required: >=2.0.0,<6.0, installed: 5.5.1] +│ │ ├── cachetools [required: >=2.0.0,<6.0, installed: 5.5.2] │ │ ├── pyasn1_modules [required: >=0.2.1, installed: 0.4.1] │ │ │ └── pyasn1 [required: >=0.4.6,<0.7.0, installed: 0.6.1] │ │ └── rsa [required: >=3.1.4,<5, installed: 4.9] @@ -113,15 +113,15 @@ skydriver-s3-sidecar-ewms-init-container ├── tornado [required: Any, installed: 6.4.2] ├── typeguard [required: Any, installed: 4.4.2] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] -├── wipac-dev-tools [required: Any, installed: 1.15.2] +├── wipac-dev-tools [required: Any, installed: 1.15.3] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] │ │ ├── idna [required: >=2.5,<4, installed: 3.10] │ │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] │ └── typing_extensions [required: Any, installed: 4.12.2] -└── wipac-rest-tools [required: Any, installed: 1.8.5] - ├── cachetools [required: Any, installed: 5.5.1] +└── wipac-rest-tools [required: Any, installed: 1.8.6] + ├── cachetools [required: Any, installed: 5.5.2] ├── PyJWT [required: !=2.6.0, installed: 2.10.1] ├── qrcode [required: Any, installed: 8.0] ├── requests [required: Any, installed: 2.32.3] @@ -137,7 +137,7 @@ skydriver-s3-sidecar-ewms-init-container │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── urllib3 [required: >=2.0.4, installed: 2.3.0] - └── wipac-dev-tools [required: Any, installed: 1.15.2] + └── wipac-dev-tools [required: Any, installed: 1.15.3] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] From 5b5e6e8ecabd1e622160e6a5d19322abae717e62 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Tue, 25 Feb 2025 16:46:58 -0600 Subject: [PATCH 
320/327] prod-tester: fix env var --- resources/prod_tester/test_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/prod_tester/test_runner.py b/resources/prod_tester/test_runner.py index 0c773ea9..9d6783f5 100644 --- a/resources/prod_tester/test_runner.py +++ b/resources/prod_tester/test_runner.py @@ -70,6 +70,7 @@ async def launch_a_scan( "priority": 100, "scanner_server_env": { "SKYSCAN_MINI_TEST": True, + "_SKYSCAN_CI_MINI_TEST": True, # env var changed to this in the "skydriver 2"-ready scanner }, "classifiers": { "_TEST": True, From 17e2e47289c2ddc2e32e7cc9f7fb317a0fb07166 Mon Sep 17 00:00:00 2001 From: github-actions Date: Tue, 25 Feb 2025 22:53:27 +0000 Subject: [PATCH 321/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index 46f5cb1e..ec1ea658 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -43,7 +43,7 @@ typeguard==4.4.2 typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 -wipac-dev-tools==1.15.3 +wipac-dev-tools==1.15.4 wipac-rest-tools==1.8.6 ######################################################################## # pipdeptree @@ -113,7 +113,7 @@ skydriver-s3-sidecar-ewms-init-container ├── tornado [required: Any, installed: 6.4.2] ├── typeguard [required: Any, installed: 4.4.2] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] -├── wipac-dev-tools [required: Any, installed: 1.15.3] +├── wipac-dev-tools [required: Any, installed: 1.15.4] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] @@ -137,7 +137,7 @@ skydriver-s3-sidecar-ewms-init-container │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── urllib3 [required: 
>=2.0.4, installed: 2.3.0] - └── wipac-dev-tools [required: Any, installed: 1.15.3] + └── wipac-dev-tools [required: Any, installed: 1.15.4] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] From ae62d17250d948c7ba473a3c6ca545955e838510 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 26 Feb 2025 15:52:17 -0600 Subject: [PATCH 322/327] comment --- skydriver/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/config.py b/skydriver/config.py index eb1a017f..31a956ef 100644 --- a/skydriver/config.py +++ b/skydriver/config.py @@ -65,7 +65,7 @@ class EnvConfig: SCAN_BACKLOG_PENDING_ENTRY_TTL_REVIVE: int = 5 * 60 # entry is revived after N secs THIS_IMAGE_WITH_TAG: str = "" - MIN_SKYMAP_SCANNER_TAG: str = "v4.0.0" + MIN_SKYMAP_SCANNER_TAG: str = "v4.0.0" # TODO: update this either in k8s or here # k8s K8S_NAMESPACE: str = "" From 9a662552ea22087bb0a33a4594c52724370f788f Mon Sep 17 00:00:00 2001 From: github-actions Date: Wed, 26 Feb 2025 21:56:04 +0000 Subject: [PATCH 323/327] update dependencies*.log files(s) --- dependencies-docker-skydriver.log | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/dependencies-docker-skydriver.log b/dependencies-docker-skydriver.log index ec1ea658..6e25e78d 100644 --- a/dependencies-docker-skydriver.log +++ b/dependencies-docker-skydriver.log @@ -7,8 +7,8 @@ # pip freeze ######################################################################## aiocache==0.12.3 -boto3==1.37.1 -botocore==1.37.1 +boto3==1.37.2 +botocore==1.37.2 cachetools==5.5.2 certifi==2025.1.31 cffi==1.17.1 @@ -36,14 +36,14 @@ requests==2.32.3 requests-futures==1.0.2 requests-oauthlib==2.0.0 rsa==4.9 -s3transfer==0.11.2 +s3transfer==0.11.3 six==1.17.0 tornado==6.4.2 typeguard==4.4.2 typing_extensions==4.12.2 urllib3==2.3.0 websocket-client==1.8.0 -wipac-dev-tools==1.15.4 
+wipac-dev-tools==1.15.6 wipac-rest-tools==1.8.6 ######################################################################## # pipdeptree @@ -56,15 +56,15 @@ pipdeptree==2.25.0 └── pip [required: >=24.2, installed: 25.0.1] skydriver-s3-sidecar-ewms-init-container ├── aiocache [required: Any, installed: 0.12.3] -├── boto3 [required: Any, installed: 1.37.1] -│ ├── botocore [required: >=1.37.1,<1.38.0, installed: 1.37.1] +├── boto3 [required: Any, installed: 1.37.2] +│ ├── botocore [required: >=1.37.2,<1.38.0, installed: 1.37.2] │ │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ │ └── six [required: >=1.5, installed: 1.17.0] │ │ └── urllib3 [required: >=1.25.4,<3,!=2.2.0, installed: 2.3.0] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] -│ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.2] -│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.37.1] +│ └── s3transfer [required: >=0.11.0,<0.12.0, installed: 0.11.3] +│ └── botocore [required: >=1.36.0,<2.0a.0, installed: 1.37.2] │ ├── jmespath [required: >=0.7.1,<2.0.0, installed: 1.0.1] │ ├── python-dateutil [required: >=2.1,<3.0.0, installed: 2.9.0.post0] │ │ └── six [required: >=1.5, installed: 1.17.0] @@ -113,7 +113,7 @@ skydriver-s3-sidecar-ewms-init-container ├── tornado [required: Any, installed: 6.4.2] ├── typeguard [required: Any, installed: 4.4.2] │ └── typing_extensions [required: >=4.10.0, installed: 4.12.2] -├── wipac-dev-tools [required: Any, installed: 1.15.4] +├── wipac-dev-tools [required: Any, installed: 1.15.6] │ ├── requests [required: Any, installed: 2.32.3] │ │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] @@ -137,7 +137,7 @@ skydriver-s3-sidecar-ewms-init-container │ └── urllib3 [required: >=1.21.1,<3, installed: 2.3.0] ├── tornado [required: Any, installed: 6.4.2] ├── urllib3 [required: >=2.0.4, installed: 
2.3.0] - └── wipac-dev-tools [required: Any, installed: 1.15.4] + └── wipac-dev-tools [required: Any, installed: 1.15.6] ├── requests [required: Any, installed: 2.32.3] │ ├── certifi [required: >=2017.4.17, installed: 2025.1.31] │ ├── charset-normalizer [required: >=2,<4, installed: 3.4.1] From 8038d663d024a6f937d03af294ea0706e3cafc51 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 26 Feb 2025 16:15:45 -0600 Subject: [PATCH 324/327] small refactor --- skydriver/ewms.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index bbb47de6..0026c603 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -78,6 +78,19 @@ async def get_taskforce_infos( return resp["taskforces"] +def _increment_counts(target: defaultdict[str, int], source: dict[str, int]): + """Increment the counts in `target` by the corresponding values in `source`. + + Example: + target = {"Tasking": 24} + source = {"Tasking": 20, "Processing": 7} + _increment_counts(target, source) + # target becomes {"Tasking": 44, "Processing": 7} + """ + for inner_key, value in source.items(): + target[inner_key] += value + + async def get_workforce_statuses( ewms_rc: RestClient, workflow_id: str | None, @@ -96,23 +109,24 @@ async def get_workforce_statuses( tf_state_dicts = await get_taskforce_infos(ewms_rc, workflow_id) # merge & sum the compound statuses - merged: dict[str, dict[str, int]] = defaultdict(lambda: defaultdict(int)) + merged_statuses: defaultdict[str, defaultdict[str, int]] = defaultdict( + lambda: defaultdict(int) + ) for state in tf_state_dicts: if not (d := state.get("compound_statuses")): continue for outer_key, inner_dict in d.items(): - for inner_key, value in inner_dict.items(): - merged[outer_key][inner_key] += value + _increment_counts(merged_statuses[outer_key], inner_dict) # compute `n_running`, excluding 'FatalError' n_running = sum( count - for substatus, count in merged.get("RUNNING", {}).items() + for 
substatus, count in merged_statuses.get("RUNNING", {}).items() if substatus != "FatalError" ) return { - "statuses": {k: dict(v) for k, v in merged.items()}, # convert to dict + "statuses": {k: dict(v) for k, v in merged_statuses.items()}, # convert to dict "n_running": n_running, # NOTE: it's tempting to sum other statuses' counts, but not all # statuses are mutually exclusive -- iow, ewms may double count for some jobs From b2854f079fae39cd2dc8e497322043fa8bc7f639 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 26 Feb 2025 16:17:08 -0600 Subject: [PATCH 325/327] comments --- skydriver/ewms.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 0026c603..5fc98f2b 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -81,8 +81,11 @@ async def get_taskforce_infos( def _increment_counts(target: defaultdict[str, int], source: dict[str, int]): """Increment the counts in `target` by the corresponding values in `source`. + This function updates `target` (a `defaultdict(int)`) by adding values from `source`. + If a key in `source` is missing in `target`, it is implicitly initialized to 0 before addition. + Example: - target = {"Tasking": 24} + target = defaultdict(int, {"Tasking": 24}) source = {"Tasking": 20, "Processing": 7} _increment_counts(target, source) # target becomes {"Tasking": 44, "Processing": 7} @@ -95,20 +98,23 @@ async def get_workforce_statuses( ewms_rc: RestClient, workflow_id: str | None, ) -> dict[str, dict[str, dict[str, int]] | int]: - """Get the compound statuses for the entire workflow's workforce (aka its taskforces), - along with the number of currently running workers. + """Aggregate the compound statuses of all taskforces in a workflow. + + This function retrieves workforce information, merges taskforce statuses, + and computes the number of currently running workers, excluding 'FatalError'. 
Example: - from ewms: + Input from ewms: >>> {'IDLE': {'null': 1}, 'RUNNING': {'Tasking': 24}} >>> {'IDLE': {'foo': 99}, 'RUNNING': {'Tasking': 20}} >>> {'RUNNING': {'Processing': 7}, 'REMOVED': {'Error': 1}} - out: + + Aggregated output: >>> {'IDLE': {'null': 1, 'foo': 99}, 'RUNNING': {'Tasking': 44, 'Processing': 7}, 'REMOVED': {'Error': 1}} """ tf_state_dicts = await get_taskforce_infos(ewms_rc, workflow_id) - # merge & sum the compound statuses + # Merge & sum the compound statuses merged_statuses: defaultdict[str, defaultdict[str, int]] = defaultdict( lambda: defaultdict(int) ) @@ -118,7 +124,7 @@ async def get_workforce_statuses( for outer_key, inner_dict in d.items(): _increment_counts(merged_statuses[outer_key], inner_dict) - # compute `n_running`, excluding 'FatalError' + # Compute `n_running`, excluding 'FatalError' n_running = sum( count for substatus, count in merged_statuses.get("RUNNING", {}).items() @@ -126,10 +132,10 @@ async def get_workforce_statuses( ) return { - "statuses": {k: dict(v) for k, v in merged_statuses.items()}, # convert to dict + "statuses": {k: dict(v) for k, v in merged_statuses.items()}, # to dicts "n_running": n_running, - # NOTE: it's tempting to sum other statuses' counts, but not all - # statuses are mutually exclusive -- iow, ewms may double count for some jobs + # NOTE: It's tempting to sum other statuses' counts, but not all + # statuses are mutually exclusive—some jobs may be double-counted. 
} From dd97fbfdab0cbdfd9c4f55b36eaca811029a6305 Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 26 Feb 2025 16:24:32 -0600 Subject: [PATCH 326/327] add `top_errors` to status getter endpoint --- skydriver/ewms.py | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 5fc98f2b..2b7ca265 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -101,9 +101,10 @@ async def get_workforce_statuses( """Aggregate the compound statuses of all taskforces in a workflow. This function retrieves workforce information, merges taskforce statuses, - and computes the number of currently running workers, excluding 'FatalError'. + computes the number of currently running workers (excluding 'FatalError'), + and aggregates occurrences of top task errors. - Example: + Example (for "statuses" key): Input from ewms: >>> {'IDLE': {'null': 1}, 'RUNNING': {'Tasking': 24}} >>> {'IDLE': {'foo': 99}, 'RUNNING': {'Tasking': 20}} @@ -111,15 +112,23 @@ async def get_workforce_statuses( Aggregated output: >>> {'IDLE': {'null': 1, 'foo': 99}, 'RUNNING': {'Tasking': 44, 'Processing': 7}, 'REMOVED': {'Error': 1}} + + Example (for "top_errors" key): + Input: + >>> {'MemoryError': 3, 'TimeoutError': 2} + >>> {'MemoryError': 1, 'NetworkError': 4} + + Aggregated output: + >>> {'MemoryError': 4, 'TimeoutError': 2, 'NetworkError': 4} """ - tf_state_dicts = await get_taskforce_infos(ewms_rc, workflow_id) + tf_infos = await get_taskforce_infos(ewms_rc, workflow_id) # Merge & sum the compound statuses merged_statuses: defaultdict[str, defaultdict[str, int]] = defaultdict( lambda: defaultdict(int) ) - for state in tf_state_dicts: - if not (d := state.get("compound_statuses")): + for tfi in tf_infos: + if not (d := tfi.get("compound_statuses")): continue for outer_key, inner_dict in d.items(): _increment_counts(merged_statuses[outer_key], inner_dict) @@ -131,11 +140,19 @@ async def get_workforce_statuses( if substatus != 
"FatalError" ) + # Aggregate errors + top_errors: defaultdict[str, int] = defaultdict(int) + for tfi in tf_infos: + if not (d := tfi.get("top_task_errors")): # dict[str, int] + continue + _increment_counts(top_errors, d) + return { - "statuses": {k: dict(v) for k, v in merged_statuses.items()}, # to dicts + "statuses": {k: dict(v) for k, v in merged_statuses.items()}, "n_running": n_running, # NOTE: It's tempting to sum other statuses' counts, but not all # statuses are mutually exclusive—some jobs may be double-counted. + "top_errors": dict(top_errors), } From 61b957d024e929a1b65209dbe7e72f5feab77f0a Mon Sep 17 00:00:00 2001 From: ric-evans Date: Wed, 26 Feb 2025 16:29:18 -0600 Subject: [PATCH 327/327] mypy --- skydriver/ewms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skydriver/ewms.py b/skydriver/ewms.py index 2b7ca265..bc3a4d82 100644 --- a/skydriver/ewms.py +++ b/skydriver/ewms.py @@ -97,7 +97,7 @@ def _increment_counts(target: defaultdict[str, int], source: dict[str, int]): async def get_workforce_statuses( ewms_rc: RestClient, workflow_id: str | None, -) -> dict[str, dict[str, dict[str, int]] | int]: +) -> dict[str, dict[str, dict[str, int]] | int | dict[str, int]]: """Aggregate the compound statuses of all taskforces in a workflow. This function retrieves workforce information, merges taskforce statuses,