From 0af34b317f936dde791fa94130abf64bf2e6a5ba Mon Sep 17 00:00:00 2001 From: Jennifer Tran Date: Mon, 8 Apr 2024 16:41:52 -0700 Subject: [PATCH 1/2] Add validation notebook --- .../validate-updated-collections.ipynb | 251 ++++++++++++++++++ 1 file changed, 251 insertions(+) create mode 100644 transformation-scripts/validate-updated-collections.ipynb diff --git a/transformation-scripts/validate-updated-collections.ipynb b/transformation-scripts/validate-updated-collections.ipynb new file mode 100644 index 0000000..db8050f --- /dev/null +++ b/transformation-scripts/validate-updated-collections.ipynb @@ -0,0 +1,251 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "import boto3\n", + "import os\n", + "import glob\n", + "import shutil\n", + "\n", + "from stac_validator import stac_validator" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "AWS_ACCESS_KEY_ID = \"[CHANGE ME]\"\n", + "AWS_SECRET_ACCESS_KEY = \"[CHANGE ME]\"\n", + "AWS_SESSION_TOKEN = \"[CHANGE ME]\"" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "s3_client = boto3.client(\n", + " \"s3\",\n", + " aws_access_key_id=AWS_ACCESS_KEY_ID,\n", + " aws_secret_access_key=AWS_SECRET_ACCESS_KEY,\n", + " aws_session_token=AWS_SESSION_TOKEN,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# Define the directory path\n", + "local_dir = \"./collections-to-validate\"" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "def download_collections(bucket_name, s3_prefixes):\n", + " s3 = s3_client\n", + " s3_responses = []\n", + " json_keys = []\n", + "\n", + " for s3_prefix in s3_prefixes:\n", + " response = s3.list_objects_v2(Bucket=bucket_name, Prefix=s3_prefix)\n", + " s3_responses.append(response)\n", + "\n", + " for item in s3_responses:\n", + " for item in item[\"Contents\"]:\n", + " if item[\"Key\"].endswith(\".json\"):\n", + " json_keys.append(item[\"Key\"])\n", + "\n", + " # Create the directory if it doesn't exist\n", + " if not os.path.exists(local_dir):\n", + " os.makedirs(local_dir)\n", + " print(\"Directory created successfully at\", local_dir)\n", + " else:\n", + " print(\"Directory already exists at\", local_dir)\n", + "\n", + " # Download json files to local directory\n", + " for key in json_keys:\n", + " local_file_path = os.path.join(local_dir, os.path.basename(key))\n", + " s3.download_file(bucket_name, key, local_file_path)\n", + "\n", + "\n", + "def validate_collections():\n", + " json_files = glob.glob(os.path.join(local_dir, \"*.json\"))\n", + " for json_file in json_files:\n", + " print(f\"Validating file: {json_file}\")\n", + " stac = stac_validator.StacValidate(json_file)\n", + " stac.run()\n", + " valid_stac = stac.message[0][\"valid_stac\"]\n", + " print(valid_stac)\n", + " if not valid_stac:\n", + " print(f\"Invalid STAC. Error message {stac.message[0]['error_message']}\")\n", + "\n", + "\n", + "def cleanup():\n", + " shutil.rmtree(local_dir)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Directory created successfully at ./collections-to-validate\n" + ] + } + ], + "source": [ + "bucket_name = \"veda-data-store\"\n", + "s3_prefixes = [\"hlsl30-ej-reprocessed\", \"hlss30-ej-reprocessed\"]\n", + "download_collections(bucket_name, s3_prefixes)" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017221T144403.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021250T163901.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021210T163901.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021200T163901.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017173T144347.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021220T163901.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017293T150709.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017205T144356.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021290T164311.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T15RYP.2021230T163215.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017218T150721.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017333T150709.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021185T163839.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017323T150709.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T15RYP.2021246T163220.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021195T163839.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T15RYP.2021182T163159.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021270T164051.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017317T144417.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017278T150721.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T15RYP.2021262T163223.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021295T164339.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017193T150719.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017301T144420.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T15RYP.2021278T163229.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017285T144419.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T15RYP.2021198T163201.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017157T144341.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021280T164201.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017318T150721.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021205T163839.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021235T163839.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021225T163839.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021245T163839.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021285T164139.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021265T163909.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T15RYP.2021294T163232.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T15RYP.2021300T164411.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017333T144411.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017237T144407.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.L30.T19QHA.2017269T144414.v2.0_stac-ej-reprocessed.json\n", + "True\n", + "Validating file: ./collections-to-validate/HLS.S30.T19QHA.2017233T150719.v2.0_stac-ej-reprocessed.json\n", + "True\n" + ] + } + ], + "source": [ + "validate_collections()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [], + "source": [ + "cleanup()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 287622e24f26a945a5d147c7f38c0a7081f91920 Mon Sep 17 00:00:00 2001 From: Jennifer Tran Date: Mon, 8 Apr 2024 16:57:30 -0700 Subject: [PATCH 2/2] Add stac-validator package --- requirements.in | 1 + requirements.txt | 19 +++++++++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/requirements.in b/requirements.in index 8f9b637..72e4543 100644 --- a/requirements.in +++ b/requirements.in @@ -4,3 +4,4 @@ pre-commit pystac[validation] pytest ruff +stac-validator diff --git a/requirements.txt b/requirements.txt index 46cb9d7..721878f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,7 +2,7 @@ # This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile +# pip-compile requirements.in # appnope==0.1.3 # via ipython @@ -18,12 +18,17 @@ black[jupyter]==23.10.0 # via -r requirements.in build==1.0.3 # via pip-tools +certifi==2024.2.2 + # via requests cfgv==3.4.0 # via pre-commit +charset-normalizer==3.3.2 + # via requests click==8.1.7 # via # black # pip-tools + # stac-validator decorator==5.1.1 # via ipython distlib==0.3.7 @@ -34,6 +39,8 @@ filelock==3.12.4 # via virtualenv identify==2.5.30 # via pre-commit +idna==3.6 + # via requests iniconfig==2.0.0 # via pytest ipython==8.16.1 @@ -41,7 +48,9 @@ ipython==8.16.1 jedi==0.19.1 # via ipython jsonschema==4.19.1 - # via pystac + # via + # pystac + # stac-validator jsonschema-specifications==2023.7.1 # via jsonschema matplotlib-inline==0.1.6 @@ -95,6 +104,8 @@ referencing==0.30.2 # via # jsonschema # jsonschema-specifications +requests==2.31.0 + # via stac-validator rpds-py==0.10.6 # via # jsonschema @@ -105,6 +116,8 @@ six==1.16.0 # via # asttokens # python-dateutil +stac-validator==3.3.2 + # via -r requirements.in stack-data==0.6.3 # via ipython tokenize-rt==5.2.0 @@ -113,6 +126,8 @@ traitlets==5.11.2 # via # ipython # matplotlib-inline +urllib3==2.2.1 + # via requests virtualenv==20.24.5 # via pre-commit wcwidth==0.2.8