diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml
new file mode 100644
index 0000000..0951ccf
--- /dev/null
+++ b/.github/workflows/pr.yml
@@ -0,0 +1,282 @@
+# This GitHub Actions workflow automates the process of
+# publishing dataset collections to a staging environment
+# and creating a pull request (PR) in the veda-config repository
+# with the dataset configuration.
+# It is triggered by a pull request to the main branch
+# that modifies any files within the ingestion-data/dataset-config/ directory.
+# The workflow includes steps to
+# - publish the datasets,
+# - create a PR in the veda-config repository,
+# - continuously update the status of the workflow in the PR comment
+
+name: Publish collection to staging and create dataset config PR
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      # Run the workflow only if files inside this path are updated
+      - ingestion-data/dataset-config/*
+
+jobs:
+  dataset-publication-and-configuration:
+    permissions:
+      pull-requests: write
+      contents: read
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+
+      # Initializes the PR comment
+      # Edits existing or creates new comment
+      # Why? - Cleanliness!
+      - name: Initialize PR comment with workflow start
+        id: init-comment
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          # Double quotes so $WORKFLOW_URL is expanded in the comment body
+          body="### Workflow Status
+          **Starting workflow...** [View action run]($WORKFLOW_URL)
+          "
+
+          # Get the PR number
+          PR_NUMBER=${{ github.event.pull_request.number }}
+
+          # Fetch existing comments
+          COMMENTS=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments --jq '.[] | select(.body | contains("### Workflow Status")) | {id: .id, body: .body}')
+
+          # Check if a comment already exists
+          COMMENT_ID=$(echo "$COMMENTS" | jq -r '.id' | head -n 1)
+
+          if [ -z "$COMMENT_ID" ]; then
+            # No existing comment, create a new one
+            COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments -f body="$body" --jq '.id')
+          else
+            # Comment exists, overwrite the existing comment
+            gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID -X PATCH -f body="$body"
+          fi
+
+          echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT
+
+      # Find only the updated files (files that differ from the base)
+      # Only .json files
+      # The files are written to GITHUB_OUTPUT, which can be used in subsequent steps
+      - name: Get updated files
+        id: changed-files
+        uses: tj-actions/changed-files@v44
+        with:
+          files: |
+            **.json
+
+      # Uses service client creds to get token
+      # No username/password needed
+      - name: Get auth token
+        id: get-token
+        run: |
+          response=$(curl -X POST \
+            ${{ vars.STAGING_COGNITO_DOMAIN }}/oauth2/token \
+            -H "Content-Type: application/x-www-form-urlencoded" \
+            -d "grant_type=client_credentials" \
+            -d "client_id=${{ vars.STAGING_CLIENT_ID }}" \
+            -d "client_secret=${{ secrets.STAGING_CLIENT_SECRET }}"
+          )
+
+          access_token=$(echo "$response" | jq -r '.access_token')
+          echo "ACCESS_TOKEN=$access_token" >> $GITHUB_OUTPUT
+
+      # Makes a request to the /dataset/publish endpoint
+      # Outputs only files that were successfully published
+      # Used by other steps
+      # If none of the requests are successful, the workflow fails
+      # Updates the PR comment with the status of collection publication
+      - name: Publish all updated collections
+        id: publish-collections
+        env:
+          ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }}
+          WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }}
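+          # AUTH_TOKEN below comes from the get-token step; COMMENT_ID from init-comment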
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          AUTH_TOKEN: ${{ steps.get-token.outputs.ACCESS_TOKEN }}
+          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
+        run: |
+          if [ -z "$WORKFLOWS_URL" ]; then
+            echo "WORKFLOWS_URL is not set"
+            exit 1
+          fi
+
+          if [ -z "$AUTH_TOKEN" ]; then
+            echo "AUTH_TOKEN is not set"
+            exit 1
+          fi
+
+          publish_url="${WORKFLOWS_URL%/}/dataset/publish"
+          bearer_token=$AUTH_TOKEN
+
+          # Track successful publications
+          all_failed=true
+          success_collections=()
+          status_message='### Collection Publication Status
+          '
+
+          # ALL_CHANGED_FILES is a space-separated list of paths, so let word splitting iterate over it
+          for file in $ALL_CHANGED_FILES; do
+            echo $file
+            if [ -f "$file" ]; then
+              dataset_config=$(jq '.' "$file")
+              collection_id=$(jq -r '.collection' "$file")
+
+              response=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \
+                -H "Content-Type: application/json" \
+                -H "Authorization: Bearer $AUTH_TOKEN" \
+                -d "$dataset_config"
+              )
+
+              status_code=$(tail -n1 <<< "$response")
+
+              # Update status message based on response code
+              if [ "$status_code" -eq 200 ] || [ "$status_code" -eq 201 ]; then
+                echo "$collection_id successfully published βœ…"
+                status_message+="- **$collection_id**: Successfully published βœ…
+          "
+                success_collections+=("$file")
+                all_failed=false
+              else
+                echo "$collection_id failed to publish ❌"
+                status_message+="- **$collection_id**: Failed to publish ❌
+          "
+              fi
+            else
+              echo "File $file does not exist"
+              exit 1
+            fi
+          done
+
+          # Exit workflow if all the requests fail
+          if [ "$all_failed" = true ]; then
+            echo "All collections failed to publish."
+            exit 1
+          fi
+
+          # Output only successful collections to be used in subsequent steps
+          echo "success_collections=$(IFS=','; echo "${success_collections[*]}")" >> $GITHUB_OUTPUT
+
+          # Update PR comment
+          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
+          UPDATED_BODY="$CURRENT_BODY
+
+          $status_message"
+          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
+
+      # Update PR comment
+      - name: Update PR comment for PR creation
+        if: success()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
+        run: |
+          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
+          UPDATED_BODY="$CURRENT_BODY
+
+          **Creating a PR in veda-config...**"
+          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.9'
+          cache: 'pip'
+
+      # Creates a slim dataset mdx file for each collection based on the dataset config json
+      - name: Create dataset mdx for given collections
+        env:
+          PUBLISHED_COLLECTION_FILES: ${{ steps.publish-collections.outputs.success_collections }}
+        run: |
+          pip install -r scripts/requirements.txt
+          # success_collections is a comma-separated string, so split it before iterating
+          IFS=',' read -ra FILES <<< "$PUBLISHED_COLLECTION_FILES"
+          for file in "${FILES[@]}"
+          do
+            python3 scripts/mdx.py "$file"
+          done
+
+      - name: Set up Git
+        run: |
+          git config --global user.name "github-actions[bot]"
+          git config --global user.email "github-actions[bot]@users.noreply.github.com"
+
+      - name: Clone `veda-config`
+        env:
+          VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }}
+        run: git clone https://${{ env.VEDA_CONFIG_GH_TOKEN }}@github.com/${{ vars.VEDA_CONFIG_REPO_ORG }}/${{ vars.VEDA_CONFIG_REPO_NAME }}.git
+
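+      # The clone above lands inside the workspace as a directory named after
+      # vars.VEDA_CONFIG_REPO_NAME; the step below cd's into it, so the generated
+      # mdx files are copied in from ../datasets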
+      # Creates a PR in veda-config with the following changes:
+      # 1. the mdx files for all published collections
+      # 2. updated stac/raster urls in the .env file
+      # This step needs a GH_TOKEN that has permissions to create a PR in veda-config
+      - name: Create PR with changes
+        id: create-pr
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }}
+          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
+          PUBLISHED_COLLECTION_FILES: ${{ steps.publish-collections.outputs.success_collections }}
+        run: |
+          # Branch name is derived from a hash of the published file list
+          files_string="$PUBLISHED_COLLECTION_FILES"
+          hash=$(echo -n "$files_string" | md5sum | cut -d ' ' -f 1)
+          NEW_BRANCH="add-dataset-$hash"
+          cd ${{ vars.VEDA_CONFIG_REPO_NAME }}
+          git fetch origin
+          if git ls-remote --exit-code --heads origin $NEW_BRANCH; then
+            git push origin --delete $NEW_BRANCH
+          fi
+          git checkout -b $NEW_BRANCH
+
+          # Update the env vars to staging based on env vars
+          sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|g" .env
+          cp -r ../datasets/* datasets/
+          git add .
+          git commit -m "Add dataset(s)"
+          git push origin $NEW_BRANCH
+          PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets (Automatically created by Github action)"))
+
+          echo "PR_URL=$PR_URL" >> $GITHUB_OUTPUT
+          echo "PR creation succeeded"
+
+      # Updates the comment with a link to the above PR
+      - name: Update PR comment with PR creation result
+        if: success()
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
+        run: |
+          PR_URL=${{ steps.create-pr.outputs.PR_URL }}
+          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
+          UPDATED_BODY="$CURRENT_BODY
+
+          **A PR has been created with the dataset configuration: πŸ—ΊοΈ [PR link]($PR_URL)**"
+          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
+
+      - name: Update PR comment on PR creation failure
+        if: failure() && steps.create-pr.outcome == 'failure'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
+        run: |
+          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
+          UPDATED_BODY="$CURRENT_BODY
+
+          **Failed ❌ to create a PR with the dataset configuration. πŸ˜” **"
+          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
+
+      # If the workflow fails at any point, the PR comment will be updated
+      - name: Update PR comment on overall workflow failure
+        if: failure() && steps.create-pr.outcome != 'failure'
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }}
+        run: |
+          WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+          CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body')
+          UPDATED_BODY="$CURRENT_BODY
+
+          ** ❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**"
+          gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY"
diff --git a/README.md b/README.md
index ac3fb3c..f84394b 100644
--- a/README.md
+++ b/README.md
@@ -212,3 +212,62 @@ pip-compile
 ```
 
 This will update `requirements.txt` with a complete, realized set of Python dependencies.
+
+## Workflow automation
+
+The repository includes an automated workflow for staging dataset publication and configuration.
+
+The workflow file can be found in [.github/workflows/pr.yml](.github/workflows/pr.yml).
+
+This GitHub Actions workflow automates the process of publishing dataset collections to a staging environment and creating a pull request (PR) in the veda-config repository with the dataset configuration. It is triggered by a pull request to the main branch that modifies any files within the `ingestion-data/dataset-config/` directory. The status of the workflow run is automatically updated in a comment in the PR.
+
+The following table includes a description and role for each repository variable and secret needed by the workflow.
+
+| **Type** | **Variable/Secret** | **Description** |
+|------------|----------------------------------|-----------------------------------------------------------------------------------------------------|
+| Variable | `vars.STAGING_COGNITO_DOMAIN` | The domain used for Cognito OAuth2 authentication, where the authentication requests are sent. |
+| Variable | `vars.STAGING_CLIENT_ID` | The client ID used in OAuth2 authentication to identify the application making the request. |
+| Variable | `vars.STAGING_WORKFLOWS_URL` | The base URL for accessing the staging environment's workflows, where dataset publishing occurs. |
+| Variable | `vars.VEDA_CONFIG_REPO_ORG` | The organization or user that owns the `veda-config` repository, used for repository cloning. |
+| Variable | `vars.VEDA_CONFIG_REPO_NAME` | The name of the `veda-config` repository, which stores the dataset configuration. |
+| Variable | `vars.ENV_FROM` | A substring of the current raster/stac URLs in the `.env` file that is to be replaced by `vars.ENV_TO`. |
+| Variable | `vars.ENV_TO` | The substring that replaces `vars.ENV_FROM` in the raster/stac URLs in the `.env` file. |
+| Secret | `secrets.STAGING_CLIENT_SECRET` | The secret client key used in OAuth2 authentication, necessary for secure access to the API. |
+| Secret | `secrets.VEDA_CONFIG_GH_TOKEN` | The GitHub token with access rights to the `veda-config` repository, used for creating pull requests.|
+
+### `vars.ENV_FROM` and `vars.ENV_TO` usage
+
+These are used to overwrite the stac/raster URLs in the `.env` file in the `veda-config` repository.
+
+Command used: `sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|g" .env`
+
+#### Example
+
+```bash
+vars.ENV_FROM = openveda
+vars.ENV_TO = staging.openveda
+```
+
+`.env` before
+
+```bash
+...
+# Endpoint for the Tiler server. No trailing slash.
+API_RASTER_ENDPOINT='https://openveda.cloud/api/raster'
+
+# Endpoint for the STAC server. No trailing slash.
+API_STAC_ENDPOINT='https://openveda.cloud/api/stac'
+...
+```
+
+`.env` after
+
+```bash
+...
+# Endpoint for the Tiler server. No trailing slash.
+API_RASTER_ENDPOINT='https://staging.openveda.cloud/api/raster'
+
+# Endpoint for the STAC server. No trailing slash.
+API_STAC_ENDPOINT='https://staging.openveda.cloud/api/stac'
+...
+```
diff --git a/scripts/dataset.mdx b/scripts/dataset.mdx
new file mode 100644
index 0000000..036cfa7
--- /dev/null
+++ b/scripts/dataset.mdx
@@ -0,0 +1,5 @@
+<Block>
+  <Prose>
+    Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
+  </Prose>
+</Block>
\ No newline at end of file
diff --git a/scripts/mdx.py b/scripts/mdx.py
new file mode 100644
index 0000000..f78be51
--- /dev/null
+++ b/scripts/mdx.py
@@ -0,0 +1,126 @@
+"""
+This file creates a minimal .data.mdx file
+from the input dataset config json file
+Dependency: `dataset.mdx` file
+"""
+
+import yaml
+import os
+import json
+import sys
+
+
+def create_frontmatter(input_data):
+    """
+    Creates the frontmatter dict from the input dataset config
+    and returns it serialized as YAML
+    """
+    collection_id = input_data["collection"]
+
+    json_data = {
+        "id": collection_id,
+        "name": input_data.get("title", "Dataset Title"),
+        "featured": False,
+        "description": input_data.get("description", "Dataset Description"),
+        "media": {
+            "src": "https://bootstrap-cheatsheet.themeselection.com/assets/images/bs-images/img-2x1.png",
+            "alt": "Placeholder image",
+            "author": {"name": "Media author", "url": ""},
+        },
+        "taxonomy": [
+            {"name": "Theme", "values": ["Greenhouse Gases"]},
+            {"name": "Source", "values": ["NASA"]},
+        ],
+        "infoDescription": """::markdown
+            - **Temporal Extent:** 2015 - 2100
+            - **Temporal Resolution:** Annual
+            - **Spatial Extent:** Global
+            - **Spatial Resolution:** 0.25 degrees x 0.25 degrees
+            - **Data Units:** Days (Days per year above 90Β°F or 110Β°F)
+            - **Data Type:** Research
+        """,
+        "layers": [],
+    }
+
+    for asset_id, asset in input_data.get("item_assets", {}).items():
+        layer = {
+            "id": f"{collection_id}-{asset_id}",
+            "stacCol": collection_id,
+            "name": asset.get("title", "Asset Title"),
+            "type": "raster",
+            "description": asset.get("description", "Asset Description"),
+            "zoomExtent": [0, 4],
+            "sourceParams": {
+                "assets": asset_id,
+                "resampling_method": "bilinear",
+                "colormap_name": "wistia",
+                "rescale": "0,365",
+                "maxzoom": 4,
+            },
+            "compare": {
+                "datasetId": collection_id,
+                "layerId": asset_id,
+                "mapLabel": (
+                    "::js ({ dateFns, datetime, compareDatetime }) "
+                    "=> {if (dateFns && datetime && compareDatetime)"
+                    "return `${dateFns.format(datetime, 'yyyy')} "
+                    "VS ${dateFns.format(compareDatetime, 'yyyy')}`;}"
+                ),
+            },
+            "analysis": {"exclude": False, "metrics": ["mean"]},
+            "legend": {
+                "unit": {"label": "Days"},
+                "type": "gradient",
+                "min": 0,
+                "max": 365,
+                "stops": [
+                    "#E4FF7A",
+                    "#FAED2D",
+                    "#FFCE0A",
+                    "#FFB100",
+                    "#FE9900",
+                    "#FC7F00",
+                ],
+            },
+            "info": {
+                "source": "NASA",
+                "spatialExtent": "Global",
+                "temporalResolution": "Annual",
+                "unit": "Days",
+            },
+        }
+        json_data["layers"].append(layer)
+
+    # Convert json to yaml for frontmatter
+    yaml_data = yaml.dump(json_data, sort_keys=False)
+
+    return yaml_data
+
+
+def safe_open_w(path):
+    """Open "path" for writing, creating any parent directories as needed."""
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    return open(path, "w")
+
+
+if __name__ == "__main__":
+    with open(sys.argv[1]) as f:
+        input_data = json.load(f)
+    dataset_config = create_frontmatter(input_data)
+    front_matter = f"---\n{dataset_config}---\n"
+
+    # Path to the existing dataset.mdx template file
+    curr_directory = os.path.dirname(os.path.abspath(__file__))
+    file_path = os.path.join(curr_directory, "dataset.mdx")
+
+    # Read the existing content of the file
+    with open(file_path, "r") as file:
+        existing_content = file.read()
+
+    # Combine front matter and existing content
+    new_content = front_matter + existing_content
+
+    # Write the combined content to the output .data.mdx file
+    output_filepath = os.path.join(
+        curr_directory, f"../datasets/{input_data['collection']}.data.mdx"
+    )
+    with safe_open_w(output_filepath) as ofile:
+        ofile.write(new_content)
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
new file mode 100644
index 0000000..c3726e8
--- /dev/null
+++ b/scripts/requirements.txt
@@ -0,0 +1 @@
+pyyaml
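
To sanity-check the mdx generation step locally, something along these lines should work. The JSON below is a minimal, purely illustrative config (the file name and field values are made up); `scripts/mdx.py` only strictly requires the `collection` key, with `title`, `description`, and `item_assets` being optional:

```bash
# Install the script's only dependency (pyyaml)
pip install -r scripts/requirements.txt

# Write a minimal, hypothetical dataset config to a temp file
cat > /tmp/example-collection.json <<'EOF'
{
  "collection": "example-collection",
  "title": "Example Collection",
  "description": "Minimal config used only to exercise scripts/mdx.py locally.",
  "item_assets": {
    "cog_default": {
      "title": "Default COG",
      "description": "Example asset."
    }
  }
}
EOF

# Generate the frontmatter and prepend it to scripts/dataset.mdx,
# writing the result to datasets/example-collection.data.mdx
python3 scripts/mdx.py /tmp/example-collection.json
```

This mirrors what the `Create dataset mdx for given collections` step runs for each successfully published config file.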