From e4790047e14dd79342174c844e609c9e3f38aae7 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 7 Aug 2024 17:39:01 -0500 Subject: [PATCH 1/7] Add script to create a dataset mdx file --- scripts/dataset.mdx | 5 ++ scripts/mdx.py | 133 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 scripts/dataset.mdx create mode 100644 scripts/mdx.py diff --git a/scripts/dataset.mdx b/scripts/dataset.mdx new file mode 100644 index 0000000..036cfa7 --- /dev/null +++ b/scripts/dataset.mdx @@ -0,0 +1,5 @@ + + + Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + + \ No newline at end of file diff --git a/scripts/mdx.py b/scripts/mdx.py new file mode 100644 index 0000000..794ad0d --- /dev/null +++ b/scripts/mdx.py @@ -0,0 +1,133 @@ +import yaml +import os + + +input_data = { + "collection": "climdex-tmaxxf-access-cm2-ssp370", + "bucket": "veda-data-store-staging", + "prefix": "climdex-tmaxxf-access-cm2-ssp370/", + "filename_regex": ".*-ssp370_(.*)_tmax.*.tif", + "datetime_group": ".*-ssp370_(.*)_tmax.*.tif", + "datetime_range": "year", + "assets": { + "tmax_above_86": { + "title": "Tmax Above 86", + "description": "Tmax Above 86", + "regex": ".*-ssp370_(.*)_tmax_above_86.tif" + }, + "tmax_above_90": { + "title": "Tmax Above 90", + "description": "Tmax Above 90", + "regex": ".*-ssp370_(.*)_tmax_above_90.tif" + }, + "tmax_above_100": { + "title": "Tmax Above 100", + "description": "Tmax Above 100", + "regex": ".*-ssp370_(.*)_tmax_above_100.tif" + }, + "tmax_above_110": { + "title": "Tmax Above 110", + "description": "Tmax Above 110", + "regex": ".*-ssp370_(.*)_tmax_above_110.tif" + }, + "tmax_above_115": { + "title": "Tmax Above 115", + "description": "Tmax Above 115", + "regex": ".*-ssp370_(.*)_tmax_above_115.tif" + } + } +} + + +def create_frontmatter(input_data): + collection_id = input_data["collection"] + + json_data = { + "id": collection_id, + "name": input_data.get("title", "Dataset Title"), + "featured": False, + "description": input_data.get("description", "Dataset Description"), + "media": { + "src": "https://bootstrap-cheatsheet.themeselection.com/assets/images/bs-images/img-2x1.png", + "alt": "Placeholder image", + "author": { + "name": "Media author", + "url": "" + } + }, + "taxonomy": [ + {"name": "Theme", "values": ["Greenhouse Gases"]}, + {"name": "Source", "values": ["NASA"]}, + ], + "infoDescription": "::markdown\n - **Temporal Extent:** 2015 - 2100\n - **Temporal Resolution:** Annual\n - **Spatial Extent:** Global\n - **Spatial Resolution:** 0.25 degrees x 0.25 degrees\n - **Data Units:** Days (Days per year above 90°F or 110°F)\n - **Data Type:** Research", + "layers": [] + } + + for asset_id, asset in input_data.get("assets", {}).items(): + layer = { + "id": f"{collection_id}-{asset_id}", + "stacCol": collection_id, + "name": asset.get("title", "Asset Title"), + "type": "raster", + "description": asset.get("description", "Asset Description"), + "zoomExtent": [0, 4], + "sourceParams": { + "assets": asset_id, + "resampling_method": "bilinear", + "colormap_name": "wistia", + "rescale": "0,365", + "maxzoom": 4 + }, + "compare": { + "datasetId": 
collection_id, + "layerId": asset_id, + "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}" + }, + "analysis": { + "exclude": False, + "metrics": ["mean"] + }, + "legend": { + "unit": {"label": "Days"}, + "type": "gradient", + "min": 0, + "max": 365, + "stops": ["#E4FF7A", "#FAED2D", "#FFCE0A", "#FFB100", "#FE9900", "#FC7F00"] + }, + "info": { + "source": "NASA", + "spatialExtent": "Global", + "temporalResolution": "Annual", + "unit": "Days" + } + } + json_data["layers"].append(layer) + yaml_data = yaml.dump(json_data, sort_keys=False) + + return yaml_data + + +def safe_open_w(path): + ''' Open "path" for writing, creating any parent directories as needed. + ''' + os.makedirs(os.path.dirname(path), exist_ok=True) + return open(path, 'w') + + +if __name__ == "__main__": + dataset_config = create_frontmatter(input_data) + front_matter = f"---\n{dataset_config}---\n" + + # Path to the existing file + file_path = "dataset.mdx" + + # Read the existing content of the file + with open(file_path, "r") as file: + existing_content = file.read() + + # Combine front matter and existing content + new_content = front_matter + existing_content + + # Write the combined content back to the file + with safe_open_w(f"../datasets/{input_data['collection']}.data.mdx") as file: + file.write(new_content) From a6b78a9c2eeda8899f892e33bbe20d37fd4c5266 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 7 Aug 2024 17:39:24 -0500 Subject: [PATCH 2/7] Add a workflow to run staging publish on pr --- .github/workflows/pr.yml | 123 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 .github/workflows/pr.yml diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 0000000..cb5e89f --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,123 @@ +name: Publish collection to staging and create dataset config PR + +on: + pull_request: + branches: + - main + paths: + - ingestion-data/staging/dataset-config/* + +jobs: + publish: + permissions: + pull-requests: read + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Get updated files + id: changed-files + uses: tj-actions/changed-files@v44 + with: + files: | + **.json + + - name: Get auth token + id: get-token + run: | + response=$(curl -X POST \ + https://${{ vars.STAGING_COGNITO_DOMAIN }}/oauth2/token \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=password" \ + -d "client_id=${{ vars.STAGING_CLIENT_ID }}" \ + -d "username=${{ vars.STAGING_USERNAME }}" \ + -d "password=${{ vars.STAGING_PASSWORD }}") + + # Extract tokens + access_token=$(echo "$response" | jq -r '.access_token') + echo "ACCESS_TOKEN=$access_token" >> $GITHUB_OUTPUT + + - name: Publish all updated collections + id: publish-collections + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AUTH_TOKEN: ${{ steps.get-token.outputs.ACCESS_TOKEN }} + run: | + # Ensure WORKFLOWS_URL is set + if [ -z "$WORKFLOWS_URL" ]; then + echo "WORKFLOWS_URL is not set" + exit 1 + fi + + publish_url="${WORKFLOWS_URL%/}/dataset/publish" + + # Iterate over the list of changed files + for file in "$@"; do + if [ -f "$file" ]; then + # Read the JSON content of the file + dataset_config=$(jq '.' 
"$file") + + # Send a POST request with the JSON content + response=$(curl -s -w "%{http_code}" -o response.txt -X POST \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $bearer_token" \ + -d "$dataset_config" \ + "$publish_url") + + # Extract the status code from the response + status_code=$(tail -n1 <<< "$response") + + if [ "$status_code" -ne 200 ] && [ "$status_code" -ne 201 ]; then + echo "Failed publishing $file because: $(cat response.txt)" + exit 1 + fi + else + echo "File $file does not exist" + exit 1 + fi + done + + CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} + COMMENT_ID=$(gh pr comment $CURRENT_PR_NUMBER --body "Collections published. You can view them at https://staging.openveda.cloud/api/stac/collections/. \n Items might take a while to show up.") + echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT + + + - name: Create dataset mdx for given collections + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + cd scripts + for file in "${ALL_CHANGED_FILES[@]}" + do + python3 process.py "$file" + done + + - name: Set up Git + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Clone veda-config + run: git clone https://github.com/NASA_IMPACT/veda-config.git target-repo + + - name: Create PR with changes + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} + COMMENT_ID: ${{ steps.publish-collections.outputs.COMMENT_ID }} + run: | + NEW_BRANCH=`add-dataset-{date}-hash(ALL_CHANGED_FILES)` + cd veda-config + git checkout -b $NEW_BRANCH + cp -r ../datasets/* datasets/ + git add . + git commit -m "Add dataset(s)" + git push origin $NEW_BRANCH + echo -e "Add datasets \nAutomatically created by Github action" > msg + PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets\nAutomatically created by Github action") --json url -q '.url') + + CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} + gh pr comment $CURRENT_PR_NUMBER --edit $COMMENT_ID --body "$(gh pr comment $CURRENT_PR_NUMBER --json body -q '.body')\n\nA PR has been created to [veda-config](https://github.com/NASA-IMPACT/veda-config) with the provided datasets.\n: ⛙ $PR_URL" From 02d0d4e9f6d67c20a6f0f0d4b0da2d4c66174863 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 7 Aug 2024 17:39:30 -0500 Subject: [PATCH 3/7] Add requirements --- scripts/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 scripts/requirements.txt diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..4818cc5 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +pyyaml \ No newline at end of file From 307780d48afc9874805f14ac5d8547463b3cb1e8 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 15:43:25 -0500 Subject: [PATCH 4/7] Clean up and add comments --- .github/workflows/pr.yml | 247 ++++++++++++++++++++++++++++++++------- 1 file changed, 203 insertions(+), 44 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index cb5e89f..7348860 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -1,43 +1,97 @@ +# This GitHub Actions workflow automates the process of +# publishing dataset collections to a staging environment +# and creating a pull request (PR) in the veda-config repository +# 
with the dataset configuration. +# It is triggered by a pull request to the main branch +# that modifies any files within the ingestion-data/dataset-config/ directory +# The workflow includes steps to +# - publish the datasets, +# - create a PR in veda-config repository, +# - constantly updates the status of the workflow in the PR comment + name: Publish collection to staging and create dataset config PR on: pull_request: branches: - - main + - main paths: - - ingestion-data/staging/dataset-config/* + # Run the workflow only if files inside this path are updated + - ingestion-data/dataset-config/* jobs: - publish: + dataset-publication-and-configuration: permissions: - pull-requests: read + pull-requests: write + contents: read runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + # Initializes the PR comment + # Edits existing or creates new comment + # Why? - Cleanliness! + - name: Initialize PR comment with workflow start + id: init-comment + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + body='### Workflow Status + **Starting workflow...** [View action run]($WORKFLOW_URL) + ' + + # Get the PR number + PR_NUMBER=${{ github.event.pull_request.number }} + + # Fetch existing comments + COMMENTS=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments --jq '.[] | select(.body | contains("### Workflow Status")) | {id: .id, body: .body}') + + # Check if a comment already exists + COMMENT_ID=$(echo "$COMMENTS" | jq -r '.id' | head -n 1) + + if [ -z "$COMMENT_ID" ]; then + # No existing comment, create a new one + COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments -f body="$body" --jq '.id') + else + # Comment exists, overwrite the existing comment + gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID -X PATCH -f body="$body" + fi + + echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT + + # Find only the updated files (file that differ from base) + # Only .json files + # The files are outputted to GITHUB_OUTPUT, which can be used in subsequent steps - name: Get updated files id: changed-files uses: tj-actions/changed-files@v44 with: files: | - **.json + **.json + # Uses service client creds to get token + # No username/password needed - name: Get auth token id: get-token run: | response=$(curl -X POST \ - https://${{ vars.STAGING_COGNITO_DOMAIN }}/oauth2/token \ - -H "Content-Type: application/x-www-form-urlencoded" \ - -d "grant_type=password" \ - -d "client_id=${{ vars.STAGING_CLIENT_ID }}" \ - -d "username=${{ vars.STAGING_USERNAME }}" \ - -d "password=${{ vars.STAGING_PASSWORD }}") - - # Extract tokens + ${{ vars.STAGING_COGNITO_DOMAIN }}/oauth2/token \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=client_credentials" \ + -d "client_id=${{ vars.STAGING_CLIENT_ID }}" \ + -d "client_secret=${{ secrets.STAGING_CLIENT_SECRET }}" + ) + access_token=$(echo "$response" | jq -r '.access_token') echo "ACCESS_TOKEN=$access_token" >> $GITHUB_OUTPUT + # Makes request to /dataset/publish endpoint + # Outputs only files that were successfully published + # Used by other steps + # If none of the requests are successful, workflow fails + # Updates the PR comment with status of collection publication - name: Publish all updated collections id: publish-collections env: @@ -45,54 +99,103 @@ jobs: WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} AUTH_TOKEN: ${{ 
steps.get-token.outputs.ACCESS_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} run: | - # Ensure WORKFLOWS_URL is set if [ -z "$WORKFLOWS_URL" ]; then echo "WORKFLOWS_URL is not set" exit 1 fi + if [ -z "$AUTH_TOKEN" ]; then + echo "AUTH_TOKEN is not set" + exit 1 + fi + publish_url="${WORKFLOWS_URL%/}/dataset/publish" - - # Iterate over the list of changed files - for file in "$@"; do + bearer_token=$AUTH_TOKEN + + # Track successful publications + all_failed=true + success_collections=() + status_message='### Collection Publication Status + ' + + for file in "${ALL_CHANGED_FILES[@]}"; do + echo $file if [ -f "$file" ]; then - # Read the JSON content of the file dataset_config=$(jq '.' "$file") - - # Send a POST request with the JSON content - response=$(curl -s -w "%{http_code}" -o response.txt -X POST \ + collection_id=$(jq -r '.collection' "$file") + + response=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer $bearer_token" \ - -d "$dataset_config" \ - "$publish_url") + -H "Authorization: Bearer $AUTH_TOKEN" \ + -d "$dataset_config" + ) - # Extract the status code from the response status_code=$(tail -n1 <<< "$response") - - if [ "$status_code" -ne 200 ] && [ "$status_code" -ne 201 ]; then - echo "Failed publishing $file because: $(cat response.txt)" - exit 1 + + # Update status message based on response code + if [ "$status_code" -eq 200 ] || [ "$status_code" -eq 201 ]; then + echo "$collection_id successfully published ✅" + status_message+="- **$collection_id**: Successfully published ✅ + " + success_collections+=("$file") + all_failed=false + else + echo "$collection_id failed to publish ❌" + status_message+="- **$collection_id**: Failed to publish ❌ + " fi else echo "File $file does not exist" exit 1 fi done + + # Exit workflow if all the requests fail + if [ "$all_failed" = true ]; then + echo "All collections failed to publish." + exit 1 + fi + + # Output only successful collections to be used in subsequent steps + echo "success_collections=$(IFS=','; echo "${success_collections[*]}")" >> $GITHUB_OUTPUT - CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} - COMMENT_ID=$(gh pr comment $CURRENT_PR_NUMBER --body "Collections published. You can view them at https://staging.openveda.cloud/api/stac/collections/. 
\n Items might take a while to show up.") - echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT + # Update PR comment + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + $status_message" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + # Update PR comment + - name: Update PR comment for PR creation + if: success() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + **Creating a PR in veda-config...**" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' + # Creates a slim dataset mdx file for each collection based on the dataset config json - name: Create dataset mdx for given collections env: - ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + PUBLISHED_COLLECTION_FILES: ${{ steps.publish-collections.outputs.success_collections }} run: | - cd scripts - for file in "${ALL_CHANGED_FILES[@]}" + pip install -r scripts/requirements.txt + for file in "${PUBLISHED_COLLECTION_FILES[@]}" do - python3 process.py "$file" + python3 scripts/mdx.py "$file" done - name: Set up Git @@ -100,24 +203,80 @@ jobs: git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" - - name: Clone veda-config - run: git clone https://github.com/NASA_IMPACT/veda-config.git target-repo + - name: Clone `veda-config` + env: + VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} + run: git clone https://${{ env.VEDA_CONFIG_GH_TOKEN }}@github.com/${{ vars.VEDA_CONFIG_REPO_ORG }}/${{ vars.VEDA_CONFIG_REPO_NAME }}.git + # Creates a PR in veda-config with the following changes: + # 1. the mdx files for all published collections + # 2. updates the stac/raster urls in .env file + # This step needs a GH_TOKEN that has permissions to create a PR in veda-config - name: Create PR with changes + id: create-pr env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} COMMENT_ID: ${{ steps.publish-collections.outputs.COMMENT_ID }} + PUBLISHED_COLLECTION_FILES: ${{ steps.publish-collections.outputs.success_collections }} run: | - NEW_BRANCH=`add-dataset-{date}-hash(ALL_CHANGED_FILES)` - cd veda-config + files_string=$(IFS=$'\n'; echo "${PUBLISHED_COLLECTION_FILES[*]}") + hash=$(echo -n "$files_string" | md5sum | cut -d ' ' -f 1) + NEW_BRANCH="add-dataset-$hash" + cd ${{ vars.VEDA_CONFIG_REPO_NAME }} + git fetch origin + if git ls-remote --exit-code --heads origin $NEW_BRANCH; then + git push origin --delete $NEW_BRANCH + fi git checkout -b $NEW_BRANCH + + # Update the env vars to staging based on env vars + sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|g" .env cp -r ../datasets/* datasets/ git add . 
git commit -m "Add dataset(s)" git push origin $NEW_BRANCH - echo -e "Add datasets \nAutomatically created by Github action" > msg - PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets\nAutomatically created by Github action") --json url -q '.url') + PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets (Automatically created by Github action)")) + + echo "PR_URL=$PR_URL" >> $GITHUB_OUTPUT + echo "PR creation succeeded" + + # Updates the comment with a link to the above PR + - name: Update PR comment with PR creation result + if: success() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + PR_URL=${{ steps.create-pr.outputs.PR_URL }} + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + **A PR has been created with the dataset configuration: 🗺️ [PR link]($PR_URL)**" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + + - name: Update PR comment on PR creation failure + if: failure() && steps.create-pr.outcome == 'failure' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + **Failed ❌ to create a PR with the dataset configuration. 😔 **" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + + # If the workflow fails at any point, the PR comment will be updated + - name: Update PR comment on overall workflow failure + if: failure() && steps.create-pr.outcome != 'failure' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY - CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} - gh pr comment $CURRENT_PR_NUMBER --edit $COMMENT_ID --body "$(gh pr comment $CURRENT_PR_NUMBER --json body -q '.body')\n\nA PR has been created to [veda-config](https://github.com/NASA-IMPACT/veda-config) with the provided datasets.\n: ⛙ $PR_URL" + ** ❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" From e0623dea1215e98d5b61e1380dcffcf16cc1b0f3 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 15:43:46 -0500 Subject: [PATCH 5/7] Add documentation about the automated workflow to README --- README.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/README.md b/README.md index ac3fb3c..3401abd 100644 --- a/README.md +++ b/README.md @@ -212,3 +212,58 @@ pip-compile ``` This will update `requirements.txt` with a complete, realized set of Python dependencies. 
+
+# Workflow automation
+The repository consists of an automated workflow for staging dataset publication and configuration.
+
+The workflow file can be found in [.github/workflows/pr.yml](.github/workflows/pr.yml).
+
+This GitHub Actions workflow automates the process of publishing dataset collections to a staging environment and creating a pull request (PR) in the veda-config repository with the dataset configuration. It is triggered by a pull request to the main branch that modifies any files within the `ingestion-data/dataset-config/` directory. The status of the workflow run is automatically updated in a comment in the PR.
+
+The following table includes a description and role for each repository variable and secret needed by the workflow.
+
+| **Type** | **Variable/Secret** | **Description** |
+|------------|----------------------------------|-----------------------------------------------------------------------------------------------------|
+| Variable | `vars.STAGING_COGNITO_DOMAIN` | The domain used for Cognito OAuth2 authentication, where the authentication requests are sent. |
+| Variable | `vars.STAGING_CLIENT_ID` | The client ID used in OAuth2 authentication to identify the application making the request. |
+| Variable | `vars.STAGING_WORKFLOWS_URL` | The base URL for accessing the staging environment's workflows, where dataset publishing occurs. |
+| Variable | `vars.VEDA_CONFIG_REPO_ORG` | The organization or user that owns the `veda-config` repository, used for repository cloning. |
+| Variable | `vars.VEDA_CONFIG_REPO_NAME` | The name of the `veda-config` repository, which stores the dataset configuration. |
+| Variable | `vars.ENV_FROM` | A substring of the current raster/stac URLs in the .env file that's to be replaced by `vars.ENV_TO`. |
+| Variable | `vars.ENV_TO` | A substring of the current raster/stac URLs in the .env file that replaces `vars.ENV_FROM`. |
+| Secret | `secrets.STAGING_CLIENT_SECRET` | The secret client key used in OAuth2 authentication, necessary for secure access to the API. |
+| Secret | `secrets.VEDA_CONFIG_GH_TOKEN` | The GitHub token with access rights to the `veda-config` repository, used for creating pull requests.|
+
+
+### `vars.ENV_FROM` and `vars.ENV_TO` usage
+These are used to overwrite the stac/raster URLs in the `.env` file in `veda-config` repository
+
+Command used: `sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|" .env`
+
+#### Example
+```
+vars.ENV_FROM = openveda
+vars.ENV_TO = staging.openveda
+```
+
+`.env` before
+```
+...
+# Endpoint for the Tiler server. No trailing slash.
+API_RASTER_ENDPOINT='https://openveda.cloud/api/raster'
+
+# Endpoint for the STAC server. No trailing slash.
+API_STAC_ENDPOINT='https://openveda.cloud/api/stac'
+...
+```
+
+`.env` after
+```
+...
+# Endpoint for the Tiler server. No trailing slash.
+API_RASTER_ENDPOINT='https://staging.openveda.cloud/api/raster'
+
+# Endpoint for the STAC server. No trailing slash.
+API_STAC_ENDPOINT='https://staging.openveda.cloud/api/stac'
+...
+``` From 81677614af4792f6edec5af8cec1675ee57e52d2 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 16:14:53 -0500 Subject: [PATCH 6/7] Update mdx.py --- scripts/mdx.py | 86 ++++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/scripts/mdx.py b/scripts/mdx.py index 794ad0d..8c91549 100644 --- a/scripts/mdx.py +++ b/scripts/mdx.py @@ -1,42 +1,7 @@ import yaml import os - - -input_data = { - "collection": "climdex-tmaxxf-access-cm2-ssp370", - "bucket": "veda-data-store-staging", - "prefix": "climdex-tmaxxf-access-cm2-ssp370/", - "filename_regex": ".*-ssp370_(.*)_tmax.*.tif", - "datetime_group": ".*-ssp370_(.*)_tmax.*.tif", - "datetime_range": "year", - "assets": { - "tmax_above_86": { - "title": "Tmax Above 86", - "description": "Tmax Above 86", - "regex": ".*-ssp370_(.*)_tmax_above_86.tif" - }, - "tmax_above_90": { - "title": "Tmax Above 90", - "description": "Tmax Above 90", - "regex": ".*-ssp370_(.*)_tmax_above_90.tif" - }, - "tmax_above_100": { - "title": "Tmax Above 100", - "description": "Tmax Above 100", - "regex": ".*-ssp370_(.*)_tmax_above_100.tif" - }, - "tmax_above_110": { - "title": "Tmax Above 110", - "description": "Tmax Above 110", - "regex": ".*-ssp370_(.*)_tmax_above_110.tif" - }, - "tmax_above_115": { - "title": "Tmax Above 115", - "description": "Tmax Above 115", - "regex": ".*-ssp370_(.*)_tmax_above_115.tif" - } - } -} +import json +import sys def create_frontmatter(input_data): @@ -50,20 +15,17 @@ def create_frontmatter(input_data): "media": { "src": "https://bootstrap-cheatsheet.themeselection.com/assets/images/bs-images/img-2x1.png", "alt": "Placeholder image", - "author": { - "name": "Media author", - "url": "" - } + "author": {"name": "Media author", "url": ""}, }, "taxonomy": [ {"name": "Theme", "values": ["Greenhouse Gases"]}, {"name": "Source", "values": ["NASA"]}, ], "infoDescription": "::markdown\n - **Temporal Extent:** 2015 - 2100\n - **Temporal Resolution:** Annual\n - **Spatial Extent:** Global\n - **Spatial Resolution:** 0.25 degrees x 0.25 degrees\n - **Data Units:** Days (Days per year above 90°F or 110°F)\n - **Data Type:** Research", - "layers": [] + "layers": [], } - for asset_id, asset in input_data.get("assets", {}).items(): + for asset_id, asset in input_data.get("item_assets", {}).items(): layer = { "id": f"{collection_id}-{asset_id}", "stacCol": collection_id, @@ -76,30 +38,34 @@ def create_frontmatter(input_data): "resampling_method": "bilinear", "colormap_name": "wistia", "rescale": "0,365", - "maxzoom": 4 + "maxzoom": 4, }, "compare": { "datasetId": collection_id, "layerId": asset_id, - "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}" - }, - "analysis": { - "exclude": False, - "metrics": ["mean"] + "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}", }, + "analysis": {"exclude": False, "metrics": ["mean"]}, "legend": { "unit": {"label": "Days"}, "type": "gradient", "min": 0, "max": 365, - "stops": ["#E4FF7A", "#FAED2D", "#FFCE0A", "#FFB100", "#FE9900", "#FC7F00"] + "stops": [ + "#E4FF7A", + "#FAED2D", + "#FFCE0A", + "#FFB100", + "#FE9900", + "#FC7F00", + ], }, "info": { "source": "NASA", "spatialExtent": "Global", "temporalResolution": "Annual", - "unit": "Days" - } + 
"unit": "Days", + }, } json_data["layers"].append(layer) yaml_data = yaml.dump(json_data, sort_keys=False) @@ -108,18 +74,19 @@ def create_frontmatter(input_data): def safe_open_w(path): - ''' Open "path" for writing, creating any parent directories as needed. - ''' + """Open "path" for writing, creating any parent directories as needed.""" os.makedirs(os.path.dirname(path), exist_ok=True) - return open(path, 'w') + return open(path, "w") if __name__ == "__main__": + input_data = json.load(open(sys.argv[1])) dataset_config = create_frontmatter(input_data) front_matter = f"---\n{dataset_config}---\n" # Path to the existing file - file_path = "dataset.mdx" + curr_directory = os.path.dirname(os.path.abspath(__file__)) + file_path = os.path.join(curr_directory, "dataset.mdx") # Read the existing content of the file with open(file_path, "r") as file: @@ -129,5 +96,8 @@ def safe_open_w(path): new_content = front_matter + existing_content # Write the combined content back to the file - with safe_open_w(f"../datasets/{input_data['collection']}.data.mdx") as file: - file.write(new_content) + output_filepath = os.path.join( + curr_directory, f"../datasets/{input_data['collection']}.data.mdx" + ) + with safe_open_w(output_filepath) as ofile: + ofile.write(new_content) From dedc8b677da6b4281d4024bfe7bedf7be0c574eb Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 16:30:34 -0500 Subject: [PATCH 7/7] Make the lint god happy --- .github/workflows/pr.yml | 42 ++++++++++++++++++++-------------------- README.md | 16 +++++++++------ scripts/mdx.py | 27 ++++++++++++++++++++++++-- scripts/requirements.txt | 2 +- 4 files changed, 57 insertions(+), 30 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7348860..0951ccf 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -1,4 +1,4 @@ -# This GitHub Actions workflow automates the process of +# This GitHub Actions workflow automates the process of # publishing dataset collections to a staging environment # and creating a pull request (PR) in the veda-config repository # with the dataset configuration. @@ -41,7 +41,7 @@ jobs: body='### Workflow Status **Starting workflow...** [View action run]($WORKFLOW_URL) ' - + # Get the PR number PR_NUMBER=${{ github.event.pull_request.number }} @@ -50,7 +50,7 @@ jobs: # Check if a comment already exists COMMENT_ID=$(echo "$COMMENTS" | jq -r '.id' | head -n 1) - + if [ -z "$COMMENT_ID" ]; then # No existing comment, create a new one COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments -f body="$body" --jq '.id') @@ -58,7 +58,7 @@ jobs: # Comment exists, overwrite the existing comment gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID -X PATCH -f body="$body" fi - + echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT # Find only the updated files (file that differ from base) @@ -110,30 +110,30 @@ jobs: echo "AUTH_TOKEN is not set" exit 1 fi - + publish_url="${WORKFLOWS_URL%/}/dataset/publish" bearer_token=$AUTH_TOKEN - + # Track successful publications all_failed=true success_collections=() status_message='### Collection Publication Status ' - + for file in "${ALL_CHANGED_FILES[@]}"; do echo $file if [ -f "$file" ]; then dataset_config=$(jq '.' 
"$file") collection_id=$(jq -r '.collection' "$file") - + response=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer $AUTH_TOKEN" \ -d "$dataset_config" ) - + status_code=$(tail -n1 <<< "$response") - + # Update status message based on response code if [ "$status_code" -eq 200 ] || [ "$status_code" -eq 201 ]; then echo "$collection_id successfully published ✅" @@ -151,20 +151,20 @@ jobs: exit 1 fi done - + # Exit workflow if all the requests fail if [ "$all_failed" = true ]; then echo "All collections failed to publish." exit 1 fi - + # Output only successful collections to be used in subsequent steps echo "success_collections=$(IFS=','; echo "${success_collections[*]}")" >> $GITHUB_OUTPUT # Update PR comment CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + $status_message" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -177,7 +177,7 @@ jobs: run: | CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + **Creating a PR in veda-config...**" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -202,12 +202,12 @@ jobs: run: | git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" - + - name: Clone `veda-config` env: VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} run: git clone https://${{ env.VEDA_CONFIG_GH_TOKEN }}@github.com/${{ vars.VEDA_CONFIG_REPO_ORG }}/${{ vars.VEDA_CONFIG_REPO_NAME }}.git - + # Creates a PR in veda-config with the following changes: # 1. the mdx files for all published collections # 2. updates the stac/raster urls in .env file @@ -237,10 +237,10 @@ jobs: git commit -m "Add dataset(s)" git push origin $NEW_BRANCH PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets (Automatically created by Github action)")) - + echo "PR_URL=$PR_URL" >> $GITHUB_OUTPUT echo "PR creation succeeded" - + # Updates the comment with a link to the above PR - name: Update PR comment with PR creation result if: success() @@ -251,7 +251,7 @@ jobs: PR_URL=${{ steps.create-pr.outputs.PR_URL }} CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + **A PR has been created with the dataset configuration: 🗺️ [PR link]($PR_URL)**" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -263,7 +263,7 @@ jobs: run: | CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + **Failed ❌ to create a PR with the dataset configuration. 
😔 **" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -277,6 +277,6 @@ jobs: WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + ** ❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" diff --git a/README.md b/README.md index 3401abd..f84394b 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,8 @@ pip-compile This will update `requirements.txt` with a complete, realized set of Python dependencies. -# Workflow automation +## Workflow automation + The repository consists of an automated workflow for staging dataset publication and configuration. The workflow file can be found in [.github/workflows/pr.yml](.github/workflows/pr.yml). @@ -234,20 +235,22 @@ The following table includes a description and role for each repository variable | Secret | `secrets.STAGING_CLIENT_SECRET` | The secret client key used in OAuth2 authentication, necessary for secure access to the API. | | Secret | `secrets.VEDA_CONFIG_GH_TOKEN` | The GitHub token with access rights to the `veda-config` repository, used for creating pull requests.| - ### `vars.ENV_FROM` and `vars.ENV_TO` usage + These are used to overwrite the stac/raster URLs in the `.env` ile in `veda-config` repository Command used: `sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|" .env` - + #### Example -``` + +```bash vars.ENV_FROM = openveda vars.ENV_TO = staging.openveda ``` `.env` before -``` + +```bash ... # Endpoint for the Tiler server. No trailing slash. API_RASTER_ENDPOINT='https://openveda.cloud/api/raster' @@ -258,7 +261,8 @@ API_STAC_ENDPOINT='https://openveda.cloud/api/stac' ``` `.env` after -``` + +```bash ... # Endpoint for the Tiler server. No trailing slash. 
API_RASTER_ENDPOINT='https://staging.openveda.cloud/api/raster' diff --git a/scripts/mdx.py b/scripts/mdx.py index 8c91549..f78be51 100644 --- a/scripts/mdx.py +++ b/scripts/mdx.py @@ -1,3 +1,9 @@ +""" +This file creates a minimal .data.mdx file +from the input dataset config json file +Dependency: `dataset.mdx` file +""" + import yaml import os import json @@ -5,6 +11,9 @@ def create_frontmatter(input_data): + """ + Creates json based on input dataset config + """ collection_id = input_data["collection"] json_data = { @@ -21,7 +30,14 @@ def create_frontmatter(input_data): {"name": "Theme", "values": ["Greenhouse Gases"]}, {"name": "Source", "values": ["NASA"]}, ], - "infoDescription": "::markdown\n - **Temporal Extent:** 2015 - 2100\n - **Temporal Resolution:** Annual\n - **Spatial Extent:** Global\n - **Spatial Resolution:** 0.25 degrees x 0.25 degrees\n - **Data Units:** Days (Days per year above 90°F or 110°F)\n - **Data Type:** Research", + "infoDescription": """::markdown + - **Temporal Extent:** 2015 - 2100 + - **Temporal Resolution:** Annual + - **Spatial Extent:** Global + - **Spatial Resolution:** 0.25 degrees x 0.25 degrees + - **Data Units:** Days (Days per year above 90°F or 110°F) + - **Data Type:** Research + """, "layers": [], } @@ -43,7 +59,12 @@ def create_frontmatter(input_data): "compare": { "datasetId": collection_id, "layerId": asset_id, - "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}", + "mapLabel": ( + "::js ({ dateFns, datetime, compareDatetime }) " + "=> {if (dateFns && datetime && compareDatetime)" + "return `${dateFns.format(datetime, 'yyyy')} " + "VS ${dateFns.format(compareDatetime, 'yyyy')}`;}" + ), }, "analysis": {"exclude": False, "metrics": ["mean"]}, "legend": { @@ -68,6 +89,8 @@ def create_frontmatter(input_data): }, } json_data["layers"].append(layer) + + # Convert json to yaml for frontmatter yaml_data = yaml.dump(json_data, sort_keys=False) return yaml_data diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 4818cc5..c3726e8 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1 +1 @@ -pyyaml \ No newline at end of file +pyyaml
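
For anyone exercising these patches locally, the MDX generation step can also be run outside the GitHub Action. The sketch below is illustrative rather than part of the patches: the config path `ingestion-data/dataset-config/example.json` is a hypothetical placeholder, and it assumes `scripts/mdx.py` in its final form above, which reads the JSON path from `sys.argv[1]`, expects top-level `collection` and `item_assets` keys, wraps the generated YAML front matter around `scripts/dataset.mdx`, and writes `datasets/<collection>.data.mdx` one level above `scripts/`.

```bash
# Local sketch of the "Create dataset mdx for given collections" workflow step.
# The config file name below is a placeholder; point it at a real dataset config JSON.
pip install -r scripts/requirements.txt
python3 scripts/mdx.py ingestion-data/dataset-config/example.json

# Inspect the generated front-matter file before the workflow would copy it into
# the veda-config checkout (the `cp -r ../datasets/* datasets/` PR-creation step).
ls datasets/
```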