From e4790047e14dd79342174c844e609c9e3f38aae7 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 7 Aug 2024 17:39:01 -0500 Subject: [PATCH 1/7] Add script to create a dataset mdx file --- scripts/dataset.mdx | 5 ++ scripts/mdx.py | 133 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 scripts/dataset.mdx create mode 100644 scripts/mdx.py diff --git a/scripts/dataset.mdx b/scripts/dataset.mdx new file mode 100644 index 0000000..036cfa7 --- /dev/null +++ b/scripts/dataset.mdx @@ -0,0 +1,5 @@ + + + Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. + + \ No newline at end of file diff --git a/scripts/mdx.py b/scripts/mdx.py new file mode 100644 index 0000000..794ad0d --- /dev/null +++ b/scripts/mdx.py @@ -0,0 +1,133 @@ +import yaml +import os + + +input_data = { + "collection": "climdex-tmaxxf-access-cm2-ssp370", + "bucket": "veda-data-store-staging", + "prefix": "climdex-tmaxxf-access-cm2-ssp370/", + "filename_regex": ".*-ssp370_(.*)_tmax.*.tif", + "datetime_group": ".*-ssp370_(.*)_tmax.*.tif", + "datetime_range": "year", + "assets": { + "tmax_above_86": { + "title": "Tmax Above 86", + "description": "Tmax Above 86", + "regex": ".*-ssp370_(.*)_tmax_above_86.tif" + }, + "tmax_above_90": { + "title": "Tmax Above 90", + "description": "Tmax Above 90", + "regex": ".*-ssp370_(.*)_tmax_above_90.tif" + }, + "tmax_above_100": { + "title": "Tmax Above 100", + "description": "Tmax Above 100", + "regex": ".*-ssp370_(.*)_tmax_above_100.tif" + }, + "tmax_above_110": { + "title": "Tmax Above 110", + "description": "Tmax Above 110", + "regex": ".*-ssp370_(.*)_tmax_above_110.tif" + }, + "tmax_above_115": { + "title": "Tmax Above 115", + "description": "Tmax Above 115", + "regex": ".*-ssp370_(.*)_tmax_above_115.tif" + } + } +} + + +def create_frontmatter(input_data): + collection_id = input_data["collection"] + + json_data = { + "id": collection_id, + "name": input_data.get("title", "Dataset Title"), + "featured": False, + "description": input_data.get("description", "Dataset Description"), + "media": { + "src": "https://bootstrap-cheatsheet.themeselection.com/assets/images/bs-images/img-2x1.png", + "alt": "Placeholder image", + "author": { + "name": "Media author", + "url": "" + } + }, + "taxonomy": [ + {"name": "Theme", "values": ["Greenhouse Gases"]}, + {"name": "Source", "values": ["NASA"]}, + ], + "infoDescription": "::markdown\n - **Temporal Extent:** 2015 - 2100\n - **Temporal Resolution:** Annual\n - **Spatial Extent:** Global\n - **Spatial Resolution:** 0.25 degrees x 0.25 degrees\n - **Data Units:** Days (Days per year above 90°F or 110°F)\n - **Data Type:** Research", + "layers": [] + } + + for asset_id, asset in input_data.get("assets", {}).items(): + layer = { + "id": f"{collection_id}-{asset_id}", + "stacCol": collection_id, + "name": asset.get("title", "Asset Title"), + "type": "raster", + "description": asset.get("description", "Asset Description"), + "zoomExtent": [0, 4], + "sourceParams": { + "assets": asset_id, + "resampling_method": "bilinear", + "colormap_name": "wistia", + "rescale": "0,365", + "maxzoom": 4 + }, + "compare": { + "datasetId": 
collection_id, + "layerId": asset_id, + "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}" + }, + "analysis": { + "exclude": False, + "metrics": ["mean"] + }, + "legend": { + "unit": {"label": "Days"}, + "type": "gradient", + "min": 0, + "max": 365, + "stops": ["#E4FF7A", "#FAED2D", "#FFCE0A", "#FFB100", "#FE9900", "#FC7F00"] + }, + "info": { + "source": "NASA", + "spatialExtent": "Global", + "temporalResolution": "Annual", + "unit": "Days" + } + } + json_data["layers"].append(layer) + yaml_data = yaml.dump(json_data, sort_keys=False) + + return yaml_data + + +def safe_open_w(path): + ''' Open "path" for writing, creating any parent directories as needed. + ''' + os.makedirs(os.path.dirname(path), exist_ok=True) + return open(path, 'w') + + +if __name__ == "__main__": + dataset_config = create_frontmatter(input_data) + front_matter = f"---\n{dataset_config}---\n" + + # Path to the existing file + file_path = "dataset.mdx" + + # Read the existing content of the file + with open(file_path, "r") as file: + existing_content = file.read() + + # Combine front matter and existing content + new_content = front_matter + existing_content + + # Write the combined content back to the file + with safe_open_w(f"../datasets/{input_data['collection']}.data.mdx") as file: + file.write(new_content) From a6b78a9c2eeda8899f892e33bbe20d37fd4c5266 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 7 Aug 2024 17:39:24 -0500 Subject: [PATCH 2/7] Add a workflow to run staging publish on pr --- .github/workflows/pr.yml | 123 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 .github/workflows/pr.yml diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml new file mode 100644 index 0000000..cb5e89f --- /dev/null +++ b/.github/workflows/pr.yml @@ -0,0 +1,123 @@ +name: Publish collection to staging and create dataset config PR + +on: + pull_request: + branches: + - main + paths: + - ingestion-data/staging/dataset-config/* + +jobs: + publish: + permissions: + pull-requests: read + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + + - name: Get updated files + id: changed-files + uses: tj-actions/changed-files@v44 + with: + files: | + **.json + + - name: Get auth token + id: get-token + run: | + response=$(curl -X POST \ + https://${{ vars.STAGING_COGNITO_DOMAIN }}/oauth2/token \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=password" \ + -d "client_id=${{ vars.STAGING_CLIENT_ID }}" \ + -d "username=${{ vars.STAGING_USERNAME }}" \ + -d "password=${{ vars.STAGING_PASSWORD }}") + + # Extract tokens + access_token=$(echo "$response" | jq -r '.access_token') + echo "ACCESS_TOKEN=$access_token" >> $GITHUB_OUTPUT + + - name: Publish all updated collections + id: publish-collections + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + AUTH_TOKEN: ${{ steps.get-token.outputs.ACCESS_TOKEN }} + run: | + # Ensure WORKFLOWS_URL is set + if [ -z "$WORKFLOWS_URL" ]; then + echo "WORKFLOWS_URL is not set" + exit 1 + fi + + publish_url="${WORKFLOWS_URL%/}/dataset/publish" + + # Iterate over the list of changed files + for file in "$@"; do + if [ -f "$file" ]; then + # Read the JSON content of the file + dataset_config=$(jq '.' 
"$file") + + # Send a POST request with the JSON content + response=$(curl -s -w "%{http_code}" -o response.txt -X POST \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $bearer_token" \ + -d "$dataset_config" \ + "$publish_url") + + # Extract the status code from the response + status_code=$(tail -n1 <<< "$response") + + if [ "$status_code" -ne 200 ] && [ "$status_code" -ne 201 ]; then + echo "Failed publishing $file because: $(cat response.txt)" + exit 1 + fi + else + echo "File $file does not exist" + exit 1 + fi + done + + CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} + COMMENT_ID=$(gh pr comment $CURRENT_PR_NUMBER --body "Collections published. You can view them at https://staging.openveda.cloud/api/stac/collections/. \n Items might take a while to show up.") + echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT + + + - name: Create dataset mdx for given collections + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + cd scripts + for file in "${ALL_CHANGED_FILES[@]}" + do + python3 process.py "$file" + done + + - name: Set up Git + run: | + git config --global user.name "github-actions[bot]" + git config --global user.email "github-actions[bot]@users.noreply.github.com" + + - name: Clone veda-config + run: git clone https://github.com/NASA_IMPACT/veda-config.git target-repo + + - name: Create PR with changes + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} + COMMENT_ID: ${{ steps.publish-collections.outputs.COMMENT_ID }} + run: | + NEW_BRANCH=`add-dataset-{date}-hash(ALL_CHANGED_FILES)` + cd veda-config + git checkout -b $NEW_BRANCH + cp -r ../datasets/* datasets/ + git add . + git commit -m "Add dataset(s)" + git push origin $NEW_BRANCH + echo -e "Add datasets \nAutomatically created by Github action" > msg + PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets\nAutomatically created by Github action") --json url -q '.url') + + CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} + gh pr comment $CURRENT_PR_NUMBER --edit $COMMENT_ID --body "$(gh pr comment $CURRENT_PR_NUMBER --json body -q '.body')\n\nA PR has been created to [veda-config](https://github.com/NASA-IMPACT/veda-config) with the provided datasets.\n: ⛙ $PR_URL" From 02d0d4e9f6d67c20a6f0f0d4b0da2d4c66174863 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Wed, 7 Aug 2024 17:39:30 -0500 Subject: [PATCH 3/7] Add requirements --- scripts/requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 scripts/requirements.txt diff --git a/scripts/requirements.txt b/scripts/requirements.txt new file mode 100644 index 0000000..4818cc5 --- /dev/null +++ b/scripts/requirements.txt @@ -0,0 +1 @@ +pyyaml \ No newline at end of file From 307780d48afc9874805f14ac5d8547463b3cb1e8 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 15:43:25 -0500 Subject: [PATCH 4/7] Clean up and add comments --- .github/workflows/pr.yml | 247 ++++++++++++++++++++++++++++++++------- 1 file changed, 203 insertions(+), 44 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index cb5e89f..7348860 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -1,43 +1,97 @@ +# This GitHub Actions workflow automates the process of +# publishing dataset collections to a staging environment +# and creating a pull request (PR) in the veda-config repository +# 
with the dataset configuration. +# It is triggered by a pull request to the main branch +# that modifies any files within the ingestion-data/dataset-config/ directory +# The workflow includes steps to +# - publish the datasets, +# - create a PR in veda-config repository, +# - constantly updates the status of the workflow in the PR comment + name: Publish collection to staging and create dataset config PR on: pull_request: branches: - - main + - main paths: - - ingestion-data/staging/dataset-config/* + # Run the workflow only if files inside this path are updated + - ingestion-data/dataset-config/* jobs: - publish: + dataset-publication-and-configuration: permissions: - pull-requests: read + pull-requests: write + contents: read runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + # Initializes the PR comment + # Edits existing or creates new comment + # Why? - Cleanliness! + - name: Initialize PR comment with workflow start + id: init-comment + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + body='### Workflow Status + **Starting workflow...** [View action run]($WORKFLOW_URL) + ' + + # Get the PR number + PR_NUMBER=${{ github.event.pull_request.number }} + + # Fetch existing comments + COMMENTS=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments --jq '.[] | select(.body | contains("### Workflow Status")) | {id: .id, body: .body}') + + # Check if a comment already exists + COMMENT_ID=$(echo "$COMMENTS" | jq -r '.id' | head -n 1) + + if [ -z "$COMMENT_ID" ]; then + # No existing comment, create a new one + COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments -f body="$body" --jq '.id') + else + # Comment exists, overwrite the existing comment + gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID -X PATCH -f body="$body" + fi + + echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT + + # Find only the updated files (file that differ from base) + # Only .json files + # The files are outputted to GITHUB_OUTPUT, which can be used in subsequent steps - name: Get updated files id: changed-files uses: tj-actions/changed-files@v44 with: files: | - **.json + **.json + # Uses service client creds to get token + # No username/password needed - name: Get auth token id: get-token run: | response=$(curl -X POST \ - https://${{ vars.STAGING_COGNITO_DOMAIN }}/oauth2/token \ - -H "Content-Type: application/x-www-form-urlencoded" \ - -d "grant_type=password" \ - -d "client_id=${{ vars.STAGING_CLIENT_ID }}" \ - -d "username=${{ vars.STAGING_USERNAME }}" \ - -d "password=${{ vars.STAGING_PASSWORD }}") - - # Extract tokens + ${{ vars.STAGING_COGNITO_DOMAIN }}/oauth2/token \ + -H "Content-Type: application/x-www-form-urlencoded" \ + -d "grant_type=client_credentials" \ + -d "client_id=${{ vars.STAGING_CLIENT_ID }}" \ + -d "client_secret=${{ secrets.STAGING_CLIENT_SECRET }}" + ) + access_token=$(echo "$response" | jq -r '.access_token') echo "ACCESS_TOKEN=$access_token" >> $GITHUB_OUTPUT + # Makes request to /dataset/publish endpoint + # Outputs only files that were successfully published + # Used by other steps + # If none of the requests are successful, workflow fails + # Updates the PR comment with status of collection publication - name: Publish all updated collections id: publish-collections env: @@ -45,54 +99,103 @@ jobs: WORKFLOWS_URL: ${{ vars.STAGING_WORKFLOWS_URL }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} AUTH_TOKEN: ${{ 
steps.get-token.outputs.ACCESS_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} run: | - # Ensure WORKFLOWS_URL is set if [ -z "$WORKFLOWS_URL" ]; then echo "WORKFLOWS_URL is not set" exit 1 fi + if [ -z "$AUTH_TOKEN" ]; then + echo "AUTH_TOKEN is not set" + exit 1 + fi + publish_url="${WORKFLOWS_URL%/}/dataset/publish" - - # Iterate over the list of changed files - for file in "$@"; do + bearer_token=$AUTH_TOKEN + + # Track successful publications + all_failed=true + success_collections=() + status_message='### Collection Publication Status + ' + + for file in "${ALL_CHANGED_FILES[@]}"; do + echo $file if [ -f "$file" ]; then - # Read the JSON content of the file dataset_config=$(jq '.' "$file") - - # Send a POST request with the JSON content - response=$(curl -s -w "%{http_code}" -o response.txt -X POST \ + collection_id=$(jq -r '.collection' "$file") + + response=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \ -H "Content-Type: application/json" \ - -H "Authorization: Bearer $bearer_token" \ - -d "$dataset_config" \ - "$publish_url") + -H "Authorization: Bearer $AUTH_TOKEN" \ + -d "$dataset_config" + ) - # Extract the status code from the response status_code=$(tail -n1 <<< "$response") - - if [ "$status_code" -ne 200 ] && [ "$status_code" -ne 201 ]; then - echo "Failed publishing $file because: $(cat response.txt)" - exit 1 + + # Update status message based on response code + if [ "$status_code" -eq 200 ] || [ "$status_code" -eq 201 ]; then + echo "$collection_id successfully published ✅" + status_message+="- **$collection_id**: Successfully published ✅ + " + success_collections+=("$file") + all_failed=false + else + echo "$collection_id failed to publish ❌" + status_message+="- **$collection_id**: Failed to publish ❌ + " fi else echo "File $file does not exist" exit 1 fi done + + # Exit workflow if all the requests fail + if [ "$all_failed" = true ]; then + echo "All collections failed to publish." + exit 1 + fi + + # Output only successful collections to be used in subsequent steps + echo "success_collections=$(IFS=','; echo "${success_collections[*]}")" >> $GITHUB_OUTPUT - CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} - COMMENT_ID=$(gh pr comment $CURRENT_PR_NUMBER --body "Collections published. You can view them at https://staging.openveda.cloud/api/stac/collections/. 
\n Items might take a while to show up.") - echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT + # Update PR comment + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + $status_message" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + # Update PR comment + - name: Update PR comment for PR creation + if: success() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + **Creating a PR in veda-config...**" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.9' + cache: 'pip' + # Creates a slim dataset mdx file for each collection based on the dataset config json - name: Create dataset mdx for given collections env: - ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + PUBLISHED_COLLECTION_FILES: ${{ steps.publish-collections.outputs.success_collections }} run: | - cd scripts - for file in "${ALL_CHANGED_FILES[@]}" + pip install -r scripts/requirements.txt + for file in "${PUBLISHED_COLLECTION_FILES[@]}" do - python3 process.py "$file" + python3 scripts/mdx.py "$file" done - name: Set up Git @@ -100,24 +203,80 @@ jobs: git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" - - name: Clone veda-config - run: git clone https://github.com/NASA_IMPACT/veda-config.git target-repo + - name: Clone `veda-config` + env: + VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} + run: git clone https://${{ env.VEDA_CONFIG_GH_TOKEN }}@github.com/${{ vars.VEDA_CONFIG_REPO_ORG }}/${{ vars.VEDA_CONFIG_REPO_NAME }}.git + # Creates a PR in veda-config with the following changes: + # 1. the mdx files for all published collections + # 2. updates the stac/raster urls in .env file + # This step needs a GH_TOKEN that has permissions to create a PR in veda-config - name: Create PR with changes + id: create-pr env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} COMMENT_ID: ${{ steps.publish-collections.outputs.COMMENT_ID }} + PUBLISHED_COLLECTION_FILES: ${{ steps.publish-collections.outputs.success_collections }} run: | - NEW_BRANCH=`add-dataset-{date}-hash(ALL_CHANGED_FILES)` - cd veda-config + files_string=$(IFS=$'\n'; echo "${PUBLISHED_COLLECTION_FILES[*]}") + hash=$(echo -n "$files_string" | md5sum | cut -d ' ' -f 1) + NEW_BRANCH="add-dataset-$hash" + cd ${{ vars.VEDA_CONFIG_REPO_NAME }} + git fetch origin + if git ls-remote --exit-code --heads origin $NEW_BRANCH; then + git push origin --delete $NEW_BRANCH + fi git checkout -b $NEW_BRANCH + + # Update the env vars to staging based on env vars + sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|g" .env cp -r ../datasets/* datasets/ git add . 
git commit -m "Add dataset(s)" git push origin $NEW_BRANCH - echo -e "Add datasets \nAutomatically created by Github action" > msg - PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets\nAutomatically created by Github action") --json url -q '.url') + PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets (Automatically created by Github action)")) + + echo "PR_URL=$PR_URL" >> $GITHUB_OUTPUT + echo "PR creation succeeded" + + # Updates the comment with a link to the above PR + - name: Update PR comment with PR creation result + if: success() + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + PR_URL=${{ steps.create-pr.outputs.PR_URL }} + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + **A PR has been created with the dataset configuration: 🗺️ [PR link]($PR_URL)**" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + + - name: Update PR comment on PR creation failure + if: failure() && steps.create-pr.outcome == 'failure' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY + + **Failed ❌ to create a PR with the dataset configuration. 😔 **" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" + + # If the workflow fails at any point, the PR comment will be updated + - name: Update PR comment on overall workflow failure + if: failure() && steps.create-pr.outcome != 'failure' + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + COMMENT_ID: ${{ steps.init-comment.outputs.COMMENT_ID }} + run: | + WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') + UPDATED_BODY="$CURRENT_BODY - CURRENT_PR_NUMBER=${{ github.event.pull_request.number }} - gh pr comment $CURRENT_PR_NUMBER --edit $COMMENT_ID --body "$(gh pr comment $CURRENT_PR_NUMBER --json body -q '.body')\n\nA PR has been created to [veda-config](https://github.com/NASA-IMPACT/veda-config) with the provided datasets.\n: ⛙ $PR_URL" + ** ❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**" + gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" From e0623dea1215e98d5b61e1380dcffcf16cc1b0f3 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 15:43:46 -0500 Subject: [PATCH 5/7] Add documentation about the automated workflow to README --- README.md | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/README.md b/README.md index ac3fb3c..3401abd 100644 --- a/README.md +++ b/README.md @@ -212,3 +212,58 @@ pip-compile ``` This will update `requirements.txt` with a complete, realized set of Python dependencies. 
+
+# Workflow automation
+The repository consists of an automated workflow for staging dataset publication and configuration.
+
+The workflow file can be found in [.github/workflows/pr.yml](.github/workflows/pr.yml).
+
+This GitHub Actions workflow automates the process of publishing dataset collections to a staging environment and creating a pull request (PR) in the veda-config repository with the dataset configuration. It is triggered by a pull request to the main branch that modifies any files within the `ingestion-data/dataset-config/` directory. The status of the workflow run is automatically updated in a comment in the PR.
+
+The following table includes a description and role for each repository variable and secret needed by the workflow.
+
+| **Type** | **Variable/Secret** | **Description** |
+|------------|----------------------------------|-----------------------------------------------------------------------------------------------------|
+| Variable | `vars.STAGING_COGNITO_DOMAIN` | The domain used for Cognito OAuth2 authentication, where the authentication requests are sent. |
+| Variable | `vars.STAGING_CLIENT_ID` | The client ID used in OAuth2 authentication to identify the application making the request. |
+| Variable | `vars.STAGING_WORKFLOWS_URL` | The base URL for accessing the staging environment's workflows, where dataset publishing occurs. |
+| Variable | `vars.VEDA_CONFIG_REPO_ORG` | The organization or user that owns the `veda-config` repository, used for repository cloning. |
+| Variable | `vars.VEDA_CONFIG_REPO_NAME` | The name of the `veda-config` repository, which stores the dataset configuration. |
+| Variable | `vars.ENV_FROM` | A substring of the current raster/stac URLs in the .env file that's to be replaced by `vars.ENV_TO`. |
+| Variable | `vars.ENV_TO` | A substring of the current raster/stac URLs in the .env file that replaces `vars.ENV_FROM`. |
+| Secret | `secrets.STAGING_CLIENT_SECRET` | The secret client key used in OAuth2 authentication, necessary for secure access to the API. |
+| Secret | `secrets.VEDA_CONFIG_GH_TOKEN` | The GitHub token with access rights to the `veda-config` repository, used for creating pull requests.|
+
+
+### `vars.ENV_FROM` and `vars.ENV_TO` usage
+These are used to overwrite the stac/raster URLs in the `.env` file in `veda-config` repository
+
+Command used: `sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|" .env`
+
+#### Example
+```
+vars.ENV_FROM = openveda
+vars.ENV_TO = staging.openveda
+```
+
+`.env` before
+```
+...
+# Endpoint for the Tiler server. No trailing slash.
+API_RASTER_ENDPOINT='https://openveda.cloud/api/raster'
+
+# Endpoint for the STAC server. No trailing slash.
+API_STAC_ENDPOINT='https://openveda.cloud/api/stac'
+...
+```
+
+`.env` after
+```
+...
+# Endpoint for the Tiler server. No trailing slash.
+API_RASTER_ENDPOINT='https://staging.openveda.cloud/api/raster'
+
+# Endpoint for the STAC server. No trailing slash.
+API_STAC_ENDPOINT='https://staging.openveda.cloud/api/stac'
+...
+``` From 81677614af4792f6edec5af8cec1675ee57e52d2 Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 16:14:53 -0500 Subject: [PATCH 6/7] Update mdx.py --- scripts/mdx.py | 86 ++++++++++++++++---------------------------------- 1 file changed, 28 insertions(+), 58 deletions(-) diff --git a/scripts/mdx.py b/scripts/mdx.py index 794ad0d..8c91549 100644 --- a/scripts/mdx.py +++ b/scripts/mdx.py @@ -1,42 +1,7 @@ import yaml import os - - -input_data = { - "collection": "climdex-tmaxxf-access-cm2-ssp370", - "bucket": "veda-data-store-staging", - "prefix": "climdex-tmaxxf-access-cm2-ssp370/", - "filename_regex": ".*-ssp370_(.*)_tmax.*.tif", - "datetime_group": ".*-ssp370_(.*)_tmax.*.tif", - "datetime_range": "year", - "assets": { - "tmax_above_86": { - "title": "Tmax Above 86", - "description": "Tmax Above 86", - "regex": ".*-ssp370_(.*)_tmax_above_86.tif" - }, - "tmax_above_90": { - "title": "Tmax Above 90", - "description": "Tmax Above 90", - "regex": ".*-ssp370_(.*)_tmax_above_90.tif" - }, - "tmax_above_100": { - "title": "Tmax Above 100", - "description": "Tmax Above 100", - "regex": ".*-ssp370_(.*)_tmax_above_100.tif" - }, - "tmax_above_110": { - "title": "Tmax Above 110", - "description": "Tmax Above 110", - "regex": ".*-ssp370_(.*)_tmax_above_110.tif" - }, - "tmax_above_115": { - "title": "Tmax Above 115", - "description": "Tmax Above 115", - "regex": ".*-ssp370_(.*)_tmax_above_115.tif" - } - } -} +import json +import sys def create_frontmatter(input_data): @@ -50,20 +15,17 @@ def create_frontmatter(input_data): "media": { "src": "https://bootstrap-cheatsheet.themeselection.com/assets/images/bs-images/img-2x1.png", "alt": "Placeholder image", - "author": { - "name": "Media author", - "url": "" - } + "author": {"name": "Media author", "url": ""}, }, "taxonomy": [ {"name": "Theme", "values": ["Greenhouse Gases"]}, {"name": "Source", "values": ["NASA"]}, ], "infoDescription": "::markdown\n - **Temporal Extent:** 2015 - 2100\n - **Temporal Resolution:** Annual\n - **Spatial Extent:** Global\n - **Spatial Resolution:** 0.25 degrees x 0.25 degrees\n - **Data Units:** Days (Days per year above 90°F or 110°F)\n - **Data Type:** Research", - "layers": [] + "layers": [], } - for asset_id, asset in input_data.get("assets", {}).items(): + for asset_id, asset in input_data.get("item_assets", {}).items(): layer = { "id": f"{collection_id}-{asset_id}", "stacCol": collection_id, @@ -76,30 +38,34 @@ def create_frontmatter(input_data): "resampling_method": "bilinear", "colormap_name": "wistia", "rescale": "0,365", - "maxzoom": 4 + "maxzoom": 4, }, "compare": { "datasetId": collection_id, "layerId": asset_id, - "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}" - }, - "analysis": { - "exclude": False, - "metrics": ["mean"] + "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}", }, + "analysis": {"exclude": False, "metrics": ["mean"]}, "legend": { "unit": {"label": "Days"}, "type": "gradient", "min": 0, "max": 365, - "stops": ["#E4FF7A", "#FAED2D", "#FFCE0A", "#FFB100", "#FE9900", "#FC7F00"] + "stops": [ + "#E4FF7A", + "#FAED2D", + "#FFCE0A", + "#FFB100", + "#FE9900", + "#FC7F00", + ], }, "info": { "source": "NASA", "spatialExtent": "Global", "temporalResolution": "Annual", - "unit": "Days" - } + 
"unit": "Days", + }, } json_data["layers"].append(layer) yaml_data = yaml.dump(json_data, sort_keys=False) @@ -108,18 +74,19 @@ def create_frontmatter(input_data): def safe_open_w(path): - ''' Open "path" for writing, creating any parent directories as needed. - ''' + """Open "path" for writing, creating any parent directories as needed.""" os.makedirs(os.path.dirname(path), exist_ok=True) - return open(path, 'w') + return open(path, "w") if __name__ == "__main__": + input_data = json.load(open(sys.argv[1])) dataset_config = create_frontmatter(input_data) front_matter = f"---\n{dataset_config}---\n" # Path to the existing file - file_path = "dataset.mdx" + curr_directory = os.path.dirname(os.path.abspath(__file__)) + file_path = os.path.join(curr_directory, "dataset.mdx") # Read the existing content of the file with open(file_path, "r") as file: @@ -129,5 +96,8 @@ def safe_open_w(path): new_content = front_matter + existing_content # Write the combined content back to the file - with safe_open_w(f"../datasets/{input_data['collection']}.data.mdx") as file: - file.write(new_content) + output_filepath = os.path.join( + curr_directory, f"../datasets/{input_data['collection']}.data.mdx" + ) + with safe_open_w(output_filepath) as ofile: + ofile.write(new_content) From dedc8b677da6b4281d4024bfe7bedf7be0c574eb Mon Sep 17 00:00:00 2001 From: Slesa Adhikari Date: Thu, 15 Aug 2024 16:30:34 -0500 Subject: [PATCH 7/7] Make the lint god happy --- .github/workflows/pr.yml | 42 ++++++++++++++++++++-------------------- README.md | 16 +++++++++------ scripts/mdx.py | 27 ++++++++++++++++++++++++-- scripts/requirements.txt | 2 +- 4 files changed, 57 insertions(+), 30 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 7348860..0951ccf 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -1,4 +1,4 @@ -# This GitHub Actions workflow automates the process of +# This GitHub Actions workflow automates the process of # publishing dataset collections to a staging environment # and creating a pull request (PR) in the veda-config repository # with the dataset configuration. @@ -41,7 +41,7 @@ jobs: body='### Workflow Status **Starting workflow...** [View action run]($WORKFLOW_URL) ' - + # Get the PR number PR_NUMBER=${{ github.event.pull_request.number }} @@ -50,7 +50,7 @@ jobs: # Check if a comment already exists COMMENT_ID=$(echo "$COMMENTS" | jq -r '.id' | head -n 1) - + if [ -z "$COMMENT_ID" ]; then # No existing comment, create a new one COMMENT_ID=$(gh api repos/${{ github.repository }}/issues/${PR_NUMBER}/comments -f body="$body" --jq '.id') @@ -58,7 +58,7 @@ jobs: # Comment exists, overwrite the existing comment gh api repos/${{ github.repository }}/issues/comments/$COMMENT_ID -X PATCH -f body="$body" fi - + echo "COMMENT_ID=$COMMENT_ID" >> $GITHUB_OUTPUT # Find only the updated files (file that differ from base) @@ -110,30 +110,30 @@ jobs: echo "AUTH_TOKEN is not set" exit 1 fi - + publish_url="${WORKFLOWS_URL%/}/dataset/publish" bearer_token=$AUTH_TOKEN - + # Track successful publications all_failed=true success_collections=() status_message='### Collection Publication Status ' - + for file in "${ALL_CHANGED_FILES[@]}"; do echo $file if [ -f "$file" ]; then dataset_config=$(jq '.' 
"$file") collection_id=$(jq -r '.collection' "$file") - + response=$(curl -s -w "%{http_code}" -o response.txt -X POST "$publish_url" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer $AUTH_TOKEN" \ -d "$dataset_config" ) - + status_code=$(tail -n1 <<< "$response") - + # Update status message based on response code if [ "$status_code" -eq 200 ] || [ "$status_code" -eq 201 ]; then echo "$collection_id successfully published ✅" @@ -151,20 +151,20 @@ jobs: exit 1 fi done - + # Exit workflow if all the requests fail if [ "$all_failed" = true ]; then echo "All collections failed to publish." exit 1 fi - + # Output only successful collections to be used in subsequent steps echo "success_collections=$(IFS=','; echo "${success_collections[*]}")" >> $GITHUB_OUTPUT # Update PR comment CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + $status_message" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -177,7 +177,7 @@ jobs: run: | CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + **Creating a PR in veda-config...**" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -202,12 +202,12 @@ jobs: run: | git config --global user.name "github-actions[bot]" git config --global user.email "github-actions[bot]@users.noreply.github.com" - + - name: Clone `veda-config` env: VEDA_CONFIG_GH_TOKEN: ${{ secrets.VEDA_CONFIG_GH_TOKEN }} run: git clone https://${{ env.VEDA_CONFIG_GH_TOKEN }}@github.com/${{ vars.VEDA_CONFIG_REPO_ORG }}/${{ vars.VEDA_CONFIG_REPO_NAME }}.git - + # Creates a PR in veda-config with the following changes: # 1. the mdx files for all published collections # 2. updates the stac/raster urls in .env file @@ -237,10 +237,10 @@ jobs: git commit -m "Add dataset(s)" git push origin $NEW_BRANCH PR_URL=$(GITHUB_TOKEN=$VEDA_CONFIG_GH_TOKEN gh pr create -H $NEW_BRANCH -B develop --title 'Add dataset [Automated workflow]' --body-file <(echo "Add datasets (Automatically created by Github action)")) - + echo "PR_URL=$PR_URL" >> $GITHUB_OUTPUT echo "PR creation succeeded" - + # Updates the comment with a link to the above PR - name: Update PR comment with PR creation result if: success() @@ -251,7 +251,7 @@ jobs: PR_URL=${{ steps.create-pr.outputs.PR_URL }} CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + **A PR has been created with the dataset configuration: 🗺️ [PR link]($PR_URL)**" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -263,7 +263,7 @@ jobs: run: | CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + **Failed ❌ to create a PR with the dataset configuration. 
😔 **" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" @@ -277,6 +277,6 @@ jobs: WORKFLOW_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" CURRENT_BODY=$(gh api -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID --jq '.body') UPDATED_BODY="$CURRENT_BODY - + ** ❌ The workflow run failed. [See logs here]($WORKFLOW_URL)**" gh api -X PATCH -H "Authorization: token $GITHUB_TOKEN" /repos/${{ github.repository }}/issues/comments/$COMMENT_ID -f body="$UPDATED_BODY" diff --git a/README.md b/README.md index 3401abd..f84394b 100644 --- a/README.md +++ b/README.md @@ -213,7 +213,8 @@ pip-compile This will update `requirements.txt` with a complete, realized set of Python dependencies. -# Workflow automation +## Workflow automation + The repository consists of an automated workflow for staging dataset publication and configuration. The workflow file can be found in [.github/workflows/pr.yml](.github/workflows/pr.yml). @@ -234,20 +235,22 @@ The following table includes a description and role for each repository variable | Secret | `secrets.STAGING_CLIENT_SECRET` | The secret client key used in OAuth2 authentication, necessary for secure access to the API. | | Secret | `secrets.VEDA_CONFIG_GH_TOKEN` | The GitHub token with access rights to the `veda-config` repository, used for creating pull requests.| - ### `vars.ENV_FROM` and `vars.ENV_TO` usage + These are used to overwrite the stac/raster URLs in the `.env` ile in `veda-config` repository Command used: `sed -i "s|${{ vars.ENV_FROM }}|${{ vars.ENV_TO }}|" .env` - + #### Example -``` + +```bash vars.ENV_FROM = openveda vars.ENV_TO = staging.openveda ``` `.env` before -``` + +```bash ... # Endpoint for the Tiler server. No trailing slash. API_RASTER_ENDPOINT='https://openveda.cloud/api/raster' @@ -258,7 +261,8 @@ API_STAC_ENDPOINT='https://openveda.cloud/api/stac' ``` `.env` after -``` + +```bash ... # Endpoint for the Tiler server. No trailing slash. 
API_RASTER_ENDPOINT='https://staging.openveda.cloud/api/raster' diff --git a/scripts/mdx.py b/scripts/mdx.py index 8c91549..f78be51 100644 --- a/scripts/mdx.py +++ b/scripts/mdx.py @@ -1,3 +1,9 @@ +""" +This file creates a minimal .data.mdx file +from the input dataset config json file +Dependency: `dataset.mdx` file +""" + import yaml import os import json @@ -5,6 +11,9 @@ def create_frontmatter(input_data): + """ + Creates json based on input dataset config + """ collection_id = input_data["collection"] json_data = { @@ -21,7 +30,14 @@ def create_frontmatter(input_data): {"name": "Theme", "values": ["Greenhouse Gases"]}, {"name": "Source", "values": ["NASA"]}, ], - "infoDescription": "::markdown\n - **Temporal Extent:** 2015 - 2100\n - **Temporal Resolution:** Annual\n - **Spatial Extent:** Global\n - **Spatial Resolution:** 0.25 degrees x 0.25 degrees\n - **Data Units:** Days (Days per year above 90°F or 110°F)\n - **Data Type:** Research", + "infoDescription": """::markdown + - **Temporal Extent:** 2015 - 2100 + - **Temporal Resolution:** Annual + - **Spatial Extent:** Global + - **Spatial Resolution:** 0.25 degrees x 0.25 degrees + - **Data Units:** Days (Days per year above 90°F or 110°F) + - **Data Type:** Research + """, "layers": [], } @@ -43,7 +59,12 @@ def create_frontmatter(input_data): "compare": { "datasetId": collection_id, "layerId": asset_id, - "mapLabel": "::js ({ dateFns, datetime, compareDatetime }) => {if (dateFns && datetime && compareDatetime) return `${dateFns.format(datetime, 'yyyy')} VS ${dateFns.format(compareDatetime, 'yyyy')}`;}", + "mapLabel": ( + "::js ({ dateFns, datetime, compareDatetime }) " + "=> {if (dateFns && datetime && compareDatetime)" + "return `${dateFns.format(datetime, 'yyyy')} " + "VS ${dateFns.format(compareDatetime, 'yyyy')}`;}" + ), }, "analysis": {"exclude": False, "metrics": ["mean"]}, "legend": { @@ -68,6 +89,8 @@ def create_frontmatter(input_data): }, } json_data["layers"].append(layer) + + # Convert json to yaml for frontmatter yaml_data = yaml.dump(json_data, sort_keys=False) return yaml_data diff --git a/scripts/requirements.txt b/scripts/requirements.txt index 4818cc5..c3726e8 100644 --- a/scripts/requirements.txt +++ b/scripts/requirements.txt @@ -1 +1 @@ -pyyaml \ No newline at end of file +pyyaml
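
For anyone exercising these patches locally, the MDX generation step can also be run outside the GitHub Action. The sketch below is illustrative rather than part of the patches: the config path `ingestion-data/dataset-config/example.json` is a hypothetical placeholder, and it assumes `scripts/mdx.py` in its final form above, which reads the JSON path from `sys.argv[1]`, expects top-level `collection` and `item_assets` keys, wraps the generated YAML front matter around `scripts/dataset.mdx`, and writes `datasets/<collection>.data.mdx` one level above `scripts/`.

```bash
# Local sketch of the "Create dataset mdx for given collections" workflow step.
# The config file name below is a placeholder; point it at a real dataset config JSON.
pip install -r scripts/requirements.txt
python3 scripts/mdx.py ingestion-data/dataset-config/example.json

# Inspect the generated front-matter file before the workflow would copy it into
# the veda-config checkout (the `cp -r ../datasets/* datasets/` PR-creation step).
ls datasets/
```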