feat: show how hls provided metadata were updated and published to stac (#142)

* feat: show how hls provided metadata were updated and published to stac

* feat: add thumbnails to hls ej collections
anayeaye authored Jun 11, 2024
1 parent f6cd153 commit fb7bfff
Showing 4 changed files with 294 additions and 8 deletions.
@@ -33,10 +33,8 @@
},
"license": "MIT",
"stac_extensions": [
"https://stac-extensions.github.io/render/v1.0.0/schema.json",
"https://stac-extensions.github.io/item-assets/v1.0.0/schema.json"
"https://stac-extensions.github.io/render/v1.0.0/schema.json"
],
"item_assets": {},
"dashboard:is_periodic": false,
"dashboard:time_density": "day",
"stac_version": "1.0.0",
@@ -78,5 +76,13 @@
"processor"
]
}
]
],
"assets": {
"thumbnail": {
"title": "Thumbnail",
"href": "https://thumbnails.openveda.cloud/hls-events-ej--dataset-cover.png",
"type": "image/png",
"roles": ["thumbnail"]
}
}
}
@@ -33,10 +33,8 @@
},
"license": "MIT",
"stac_extensions": [
"https://stac-extensions.github.io/render/v1.0.0/schema.json",
"https://stac-extensions.github.io/item-assets/v1.0.0/schema.json"
"https://stac-extensions.github.io/render/v1.0.0/schema.json"
],
"item_assets": {},
"dashboard:is_periodic": false,
"dashboard:time_density": "day",
"stac_version": "1.0.0",
@@ -78,5 +76,13 @@
"processor"
]
}
]
],
"assets": {
"thumbnail": {
"title": "Thumbnail",
"href": "https://thumbnails.openveda.cloud/hls-events-ej--dataset-cover.png",
"type": "image/png",
"roles": ["thumbnail"]
}
}
}
@@ -0,0 +1,274 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "969528b3-e2db-462b-a5a0-9f6a469b643c",
"metadata": {},
"source": [
"# Publish reprocessed HLS items using provider generated metadata in s3"
]
},
{
"cell_type": "markdown",
"id": "7a5dd00e-6577-47ef-833f-d5ac07df78fb",
"metadata": {},
"source": [
"#### Assertions\n",
"- This notebook follows the [update-hrefs.ipynb](https://github.com/NASA-IMPACT/veda-data/blob/main/transformation-scripts/update-hrefs.ipynb) notebook which updates the provider metadata to use the s3 hrefs for the objects in veda-data-store\n",
"- Assumption: the collection metadata in ingestion-data/production/collections is stac version 1.0.0 and has already been published to the target STAC catalog\n",
"\n",
"#### Update the stac version and store objects in s3\n",
"- Search for all reprocessed item metadata in `s3://veda-data-store/<collection_id>`\n",
"- Update json to stac version 1.0.0, validate, and post back to s3\n",
"- Use target VEDA instance's ingest-api/ingestions endpoint to verify hrefs and publish item metadata to STAC"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1a9f8bbd-1b62-404f-8c88-416cfe538575",
"metadata": {},
"outputs": [],
"source": [
"import boto3\n",
"import json\n",
"import requests\n",
"from pystac import Item\n",
"\n",
"# Test\n",
"# TARGET_STAC_API_URL = \"https://test.openveda.cloud/api/stac\"\n",
"# TARGET_INGEST_API_URL = \"https://test.openveda.cloud/api/ingest\"\n",
"\n",
"# Prod\n",
"TARGET_STAC_API_URL = \"https://openveda.cloud/api/stac\"\n",
"TARGET_INGEST_API_URL = \"https://openveda.cloud/api/ingest\"\n",
"\n",
"TOKEN = \"SECRET\"\n",
"authorization_header = f\"Bearer {TOKEN}\"\n",
"headers = {\n",
" \"Authorization\": authorization_header,\n",
" \"content-type\": \"application/json\",\n",
" \"accept\": \"application/json\",\n",
"}\n",
"authme_url = f\"{TARGET_INGEST_API_URL}/auth/me\"\n",
"response = requests.get(authme_url, headers=headers)\n",
"response.reason"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0cdc5478-31bf-4c5f-a6c2-86141c6228bb",
"metadata": {},
"outputs": [],
"source": [
"AWS_ACCESS_KEY_ID = \"[CHANGE ME]\"\n",
"AWS_SECRET_ACCESS_KEY = \"[CHANGE ME]\"\n",
"AWS_SESSION_TOKEN = \"[CHANGE ME]\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a813b27-ba2b-463d-ac2b-8e2a083768f8",
"metadata": {},
"outputs": [],
"source": [
"s3_client = boto3.client(\n",
" \"s3\",\n",
" aws_access_key_id=AWS_ACCESS_KEY_ID,\n",
" aws_secret_access_key=AWS_SECRET_ACCESS_KEY,\n",
" aws_session_token=AWS_SESSION_TOKEN,\n",
")"
]
},
{
"cell_type": "markdown",
"id": "0e32dff3-0de2-448e-9c56-fbd12985fd02",
"metadata": {},
"source": [
"## Update the json in s3 to stac version 1.0.0\n",
"\n",
"These provided metadata are `stac_version` `1.0.0-beta.2` but we can make a minor modification to how the `stac_extensions` are provided get them up to the same stac version `1.0.0` used for the rest of the collections in our STAC catalog(s). \n",
"\n",
"\n",
"> **WARNING** this cell replaces an existing file in s3 instead of creating a new version, we are using it for a one time cleanup of a small known collection of invalid metadata that need to be corrected. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "58288478-e37e-4de8-bc37-19cbd8d044af",
"metadata": {},
"outputs": [],
"source": [
"bucket_name = \"veda-data-store\"\n",
"collection_ids = [\"hlsl30-002-ej-reprocessed\", \"hlss30-002-ej-reprocessed\"]\n",
"dry_run = True\n",
"verbose = True\n",
"\n",
"for collection_id in collection_ids:\n",
" s3_prefix = f\"{collection_id}/\"\n",
"\n",
" response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=s3_prefix)\n",
"\n",
" # Filter for the STAC metadata files\n",
" json_keys = [\n",
" obj[\"Key\"]\n",
" for obj in response[\"Contents\"]\n",
" if obj[\"Key\"].endswith(\"stac-ej-reprocessed.json\")\n",
" ]\n",
" print(f\"\\n{collection_id=} matched metadata for {len(json_keys)} items\")\n",
"\n",
" for key in json_keys:\n",
" # Backup the original version of this metadata\n",
" # deprecated_key = key + \".deprecated\"\n",
"\n",
" # if not dry_run:\n",
" # s3_client.copy_object(\n",
" # CopySource={'Bucket': bucket_name, 'Key': key},\n",
" # Bucket=bucket_name,\n",
" # Key=deprecated_key,\n",
" # )\n",
" # if verbose:\n",
" # print(f\"Copied {key} to {deprecated_key}\")\n",
"\n",
" # Get object to update the metadata\n",
" response = s3_client.get_object(Bucket=bucket_name, Key=key)\n",
"\n",
" item_dict = json.loads(response[\"Body\"].read().decode(\"utf-8\"))\n",
"\n",
" # Add correct collection link\n",
" links = [link for link in item_dict[\"links\"] if link[\"rel\"] != \"collection\"]\n",
" links.append(\n",
" {\"rel\": \"collection\", \"href\": collection_id, \"type\": \"application/json\"}\n",
" )\n",
" item_dict[\"links\"] = links\n",
"\n",
" # Update the stac version for these items from \"stac_version\": \"1.0.0-beta.2\" and touch up metadata to meet 1.0.0 spec\n",
" item_dict[\"stac_version\"] = \"1.0.0\"\n",
"\n",
" # Add full extension hrefs https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#stac_extensions\n",
" item_extensions = item_dict[\"stac_extensions\"]\n",
" stac_extensions = []\n",
" for ext in item_extensions:\n",
" if \"https://stac-extensions.github.io\" not in ext:\n",
" stac_extensions.append(\n",
" f\"https://stac-extensions.github.io/{ext}/v1.0.0/schema.json\"\n",
" )\n",
" else:\n",
" stac_extensions.append(ext)\n",
" item_dict[\"stac_extensions\"] = stac_extensions\n",
"\n",
" # Make sure the asset hrefs are pointed at the correct collection's prefix\n",
" item_assets = item_dict[\"assets\"]\n",
" # Previous location did not have data version number\n",
" old_prefix = collection_id.replace(\"-002-\", \"-\")\n",
" for asset_key in item_assets.keys():\n",
" new_href = item_assets[asset_key][\"href\"].replace(old_prefix, collection_id)\n",
" item_assets[asset_key][\"href\"] = new_href\n",
"\n",
" # Validate the updated item\n",
" item = Item.from_dict(item_dict)\n",
" try:\n",
" item.validate()\n",
" except Exception as e:\n",
" print(f\"Invalid {collection_id=} {item.id=}\")\n",
"\n",
" # Replace the s3 object with the updated metadata for stac version 1.0.0\n",
" if not dry_run:\n",
" s3_client.put_object(\n",
" Bucket=bucket_name, Key=key, Body=json.dumps(item_dict)\n",
" )\n",
" if verbose:\n",
" print(f\"Updated {key}\")"
]
},
{
"cell_type": "markdown",
"id": "0e8ea1ae-1a0b-4656-abf8-dc3f8085a3b5",
"metadata": {},
"source": [
"## Publish item records to STAC"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7aea08e4-fb73-48d5-be7c-c3b147f52f63",
"metadata": {},
"outputs": [],
"source": [
"collection_ids = [\"hlsl30-002-ej-reprocessed\", \"hlss30-002-ej-reprocessed\"]\n",
"dry_run = True\n",
"verbose = True\n",
"\n",
"for collection_id in collection_ids:\n",
" s3_prefix = f\"{collection_id}/\"\n",
"\n",
" response = s3_client.list_objects_v2(Bucket=bucket_name, Prefix=s3_prefix)\n",
"\n",
" # Filter for the STAC metadata files\n",
" json_keys = [\n",
" obj[\"Key\"]\n",
" for obj in response[\"Contents\"]\n",
" if obj[\"Key\"].endswith(\"stac-ej-reprocessed.json\")\n",
" ]\n",
" print(f\"\\n{collection_id=} matched metadata for {len(json_keys)} items\")\n",
"\n",
" for key in json_keys:\n",
" response = s3_client.get_object(Bucket=bucket_name, Key=key)\n",
"\n",
" item_dict = json.loads(response[\"Body\"].read().decode(\"utf-8\"))\n",
"\n",
" # Validate the item\n",
" item = Item.from_dict(item_dict)\n",
" try:\n",
" item.validate()\n",
" except Exception as e:\n",
" print(f\"invalid {collection_id=} {item.id=}\")\n",
"\n",
" # Publish to target STAC catalog\n",
" publish_url = f\"{TARGET_INGEST_API_URL}/ingestions\"\n",
" if not dry_run:\n",
" publish_response = requests.post(\n",
" publish_url, headers=headers, json=item_dict\n",
" )\n",
" if verbose:\n",
" print(\n",
" f\"POST {publish_url} {collection_id=}\\n{item_dict['id']=} {publish_response.reason=}\"\n",
" )\n",
" if not publish_response.reason == \"Created\":\n",
" print(\n",
" f\"POST {publish_url} {collection_id=}\\n{item_dict['id']=} {publish_response.reason=}\"\n",
" )\n",
" else:\n",
" if verbose:\n",
" print(\n",
" f\"POST {publish_url} {collection_id=}\\n{item_dict['id']=} {dry_run=}\"\n",
" )"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.8"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
