diff --git a/.github/scripts/build_index.py b/.github/scripts/build_index.py
new file mode 100644
index 0000000..e16f57c
--- /dev/null
+++ b/.github/scripts/build_index.py
@@ -0,0 +1,58 @@
+import argparse
+import json
+import os
+from pathlib import Path
+import yaml
+import jsonschema
+from datetime import datetime
+
+def read_yaml_files(base_path, folders):
+    yaml_files = []
+    for folder in folders:
+        folder_path = base_path.joinpath(folder)
+        for yaml_file in [*folder_path.glob('*.yaml'), *folder_path.glob('*.yml')]:  # catalog holds both .yaml and .yml files
+            with open(yaml_file, 'r', encoding='utf-8') as f:
+                yaml_files.append((folder, yaml.safe_load(f)))
+    return yaml_files
+
+def validate_yaml(yaml_content, schema_path):
+    with open(schema_path, 'r', encoding='utf-8') as f:
+        schema = json.load(f)
+    jsonschema.validate(instance=yaml_content, schema=schema)
+
+def build_index(yaml_files, schemas_path):
+    index = {'index_timestamp': str(datetime.now()), 'catalog': {}}
+    for folder, content in yaml_files:
+        schema_path = schemas_path.joinpath(f"{folder}.json")
+        try:
+            validate_yaml(content, schema_path)
+            if folder not in index['catalog']:
+                index['catalog'][folder] = []
+            index['catalog'][folder].append(content)
+        except jsonschema.exceptions.ValidationError as e:
+            print(f"Validation error in {folder}: {e}")
+    return index
+
+def main():
+    parser = argparse.ArgumentParser(description="Build and deploy index from YAML files.")
+    parser.add_argument("--build-branch", "-b", type=str, required=True, help="Path to the build branch.")
+    parser.add_argument("--deploy-branch", "-d", type=str, required=True, help="Path to the deploy branch.")
+    parser.add_argument("--folders-to-scan", "-f", type=str, nargs='+', default=["datasets"], help="List of folders to scan for YAML files.")
+    args = parser.parse_args()
+
+    build_path = Path(args.build_branch)
+    deploy_path = Path(args.deploy_branch)
+    schemas_path = build_path.joinpath('catalog/schemas')
+
+    yaml_files = read_yaml_files(build_path.joinpath('catalog'), args.folders_to_scan)
+    index = build_index(yaml_files, schemas_path)
+
+    deploy_path.mkdir(parents=True, exist_ok=True)
+    with open(deploy_path.joinpath('index.json'), 'w', encoding='utf-8') as f:
+        json.dump(index, f, indent=4)
+
+    with open(build_path.joinpath('src/index.json'), 'w', encoding='utf-8') as f:
+        json.dump(index, f, indent=4)
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/.github/workflows/build_index.yml b/.github/workflows/build_index.yml
new file mode 100644
index 0000000..58f2226
--- /dev/null
+++ b/.github/workflows/build_index.yml
@@ -0,0 +1,72 @@
+name: Build index
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+    paths:
+      - 'catalog/**.yaml'
+      - 'catalog/**.yml'
+
+permissions:
+  contents: write
+
+jobs:
+  build-index:
+    runs-on: ubuntu-latest
+    if: github.event.repository.fork == false
+
+    steps:
+      - name: Checkout main
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+          ref: 'main'
+          path: main
+
+      - name: Checkout gh-pages
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 2
+          ref: 'gh-pages'
+          path: gh-pages
+
+      - name: Setup python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install yaml & jsonschema
+        run: pip install pyyaml jsonschema
+
+      - name: Build index
+        run: python -B main/.github/scripts/build_index.py --build-branch main --deploy-branch gh-pages --folders-to-scan datasets
+
+      - name: Get last commit message - main
+        id: last-commit-message-main
+        run: echo "msg=$(git -C main log -1 --pretty=%s)" >> $GITHUB_OUTPUT
+
+      - name: Get last commit message - gh-pages
+        id: last-commit-message-gh-pages
+        run: echo "msg=$(git -C gh-pages log -1 --pretty=%s)" >> $GITHUB_OUTPUT
+
+      - name: Commit - main
+        uses: stefanzweifel/git-auto-commit-action@v5
+        with:
+          commit_message: ${{ steps.last-commit-message-main.outputs.msg }}
+          commit_options: '--amend --no-edit'
+          file_pattern: '*.json catalog/**.yaml'
+          push_options: '--force'
+          skip_fetch: true
+          repository: main
+
+      - name: Commit - gh-pages
+        uses:
stefanzweifel/git-auto-commit-action@v5 + with: + commit_message: ${{ steps.last-commit-message-gh-pages.outputs.msg }} + commit_options: '--amend --no-edit' + file_pattern: '*.json' + push_options: '--force' + skip_fetch: true + repository: gh-pages \ No newline at end of file diff --git a/README.md b/README.md index 2da6415..a866fe8 100644 --- a/README.md +++ b/README.md @@ -86,6 +86,13 @@ Making use of the last development in webassembly technologies Open issues are registered in the repository and taken by the community. + +## Development + +### How to test the github action build index? + +The github action can be tested by pulling both branches: + ## Develop by Develop by SDSC in the frame of the Open Research Data for the Sciences Hackathon in collaboration with the EPFL Open Science office. We thanks the pNeuma Team for the support during the development of this prototype. diff --git a/catalog/datasets/0001-pneuma-dataset.yaml b/catalog/datasets/0001-pneuma-dataset.yaml new file mode 100644 index 0000000..137005c --- /dev/null +++ b/catalog/datasets/0001-pneuma-dataset.yaml @@ -0,0 +1,20 @@ +schema_version: "1.0.0" +schema_type: "dataset" +dataset_id: "0001-pneuma-dataset" +title: "pNEUMA Dataset" +doi: "10.5281/zenodo.7426506" +version: "v1" +description: "pNEUMA is an open large-scale dataset of naturalistic trajectories of half a million vehicles that have been collected by a one-of-a-kind experiment by a swarm of drones in the congested downtown area of Athens, Greece." 
+created_at: "2022-12-08T16:26:11Z"
+updated_at: "2022-12-08T16:26:11Z"
+data_format: "zip"
+tags:
+  - "traffic"
+  - "drone"
+source: "Zenodo"
+authors:
+  name: "Kim, Sohyeong"
+  orcid: "0000-0000-0000-0000"
+license: "CC BY 4.0"
+access_url: "https://zenodo.org/record/7426506/"
+documentation_url: "https://zenodo.org/record/7426506"
diff --git a/catalog/datasets/0002-pneuma-vision-dataset.yml b/catalog/datasets/0002-pneuma-vision-dataset.yml
new file mode 100644
index 0000000..41b96d7
--- /dev/null
+++ b/catalog/datasets/0002-pneuma-vision-dataset.yml
@@ -0,0 +1,22 @@
+schema_version: "1.0.0"
+schema_type: "dataset"
+dataset_id: "0002-pneuma-vision-dataset"
+title: "pNEUMA vision dataset"
+doi: "10.5281/zenodo.10491409"
+version: "v1"
+description: "The pNEUMA dataset is the drone traffic imagery dataset that contains images of frame and vehicle annotations as positions. This dataset is the expansion of the pNEUMA, the urban trajectory dataset collected by swarms of drones in Athens. For more details about pNEUMA and pNEUMA Vision, please check our website at https://open-traffic.epfl.ch and github."
+created_at: "2023-08-30T15:12:56Z" +updated_at: "2023-08-30T15:12:56Z" +data_format: "csv" +tags: + - "traffic" + - "GNSS" + - "urban mobility" + - "traffic safety" +source: "Zenodo" +authors: + name: "Barmpounakis, Emmanouil" + orcid: "0000-0000-0000-0000" +license: "CC BY 4.0" +access_url: "https://zenodo.org/record/10491409" +documentation_url: "https://zenodo.org/record/10491409" diff --git a/catalog/schemas/dataset.json b/catalog/schemas/datasets.json similarity index 88% rename from catalog/schemas/dataset.json rename to catalog/schemas/datasets.json index 67df0c0..00e58bd 100644 --- a/catalog/schemas/dataset.json +++ b/catalog/schemas/datasets.json @@ -2,6 +2,14 @@ "version": "1.0.0", "type": "object", "properties": { + "schema_version": { + "type": "string", + "description": "The version of the dataset schema user" + }, + "schema_type": { + "type": "string", + "description": "Which of the available schemas are used" + }, "dataset_id": { "type": "string", "description": "A unique identifier for the dataset" @@ -14,6 +22,10 @@ "type": "string", "description": "The DOI of the dataset" }, + "version": { + "type": "string", + "description": "Version of the dataset" + }, "description": { "type": "string", "description": "A brief description of the dataset" @@ -86,7 +98,6 @@ "description", "created_at", "data_format", - "size_in_mb", "source", "authors", "license", diff --git a/catalog/templates/dataset_template.yaml b/catalog/templates/datasets_template.yaml similarity index 100% rename from catalog/templates/dataset_template.yaml rename to catalog/templates/datasets_template.yaml diff --git a/src/index.json b/src/index.json index 4ba8eb2..cb42375 100644 --- a/src/index.json +++ b/src/index.json @@ -1,92 +1,49 @@ { - "index_timestamp": "2024-08-30T12:00:00Z", - "catalog": { - "datasets": [ - { - "version": "1.0.0", - "dataset_id": "123e4567-e89b-12d3-a456-426614174000", - "title": "Global Climate Data 2023", - "doi": "10.1234/global-climate-2023", - 
"description": "This dataset contains global climate data for the year 2023, including temperature, precipitation, and atmospheric pressure readings from various regions around the world.", - "created_at": "2023-01-01T12:00:00Z", - "updated_at": "2023-06-15T12:00:00Z", - "data_format": "CSV", - "tags": ["climate", "temperature", "precipitation", "atmospheric pressure"], - "source": "National Meteorological Organization", - "authors": { - "name": "Dr. Jane Doe", - "orcid": "0000-0002-1825-0097" - }, - "license": "CC BY 4.0", - "access_url": "https://example.com/datasets/global-climate-2023", - "documentation_url": "https://example.com/datasets/global-climate-2023/documentation", - "access_endpoint": "https://api.example.com/climate/2023/data", - "documentation_endpoint": "https://api.example.com/climate/2023/docs" + "index_timestamp": "2024-08-30T12:00:00Z", + "catalog": { + "datasets": [ + { + "schema_version": "1.0.0", + "schema_type": "dataset", + "dataset_id": "0001-pneuma-dataset", + "title": "pNEUMA Dataset", + "doi": "10.5281/zenodo.7426506", + "version": "v1", + "description": "pNEUMA is an open large-scale dataset of naturalistic trajectories of half a million vehicles that have been collected by a one-of-a-kind experiment by a swarm of drones in the congested downtown area of Athens, Greece. 
", + "created_at": "2022-12-08T16:26:11Z", + "updated_at": "2022-12-08T16:26:11Z", + "data_format": "zip", + "tags": ["traffic", "drone"], + "source": "Zenodo", + "authors": { + "name": "Kim, Sohyeong", + "orcid": "0000-0000-0000-0000" }, - { - "version": "1.0.0", - "dataset_id": "223e4567-e89b-12d3-a456-426614174111", - "title": "Regional Climate Data 2022", - "doi": "10.1234/regional-climate-2022", - "description": "This dataset provides detailed climate data for specific regions for the year 2022.", - "created_at": "2022-01-01T12:00:00Z", - "updated_at": "2022-12-15T12:00:00Z", - "data_format": "JSON", - "tags": ["climate", "regional data", "temperature"], - "source": "Regional Weather Stations", - "authors": { - "name": "Dr. John Smith", - "orcid": "0000-0002-3456-7890" - }, - "license": "MIT", - "access_url": "https://example.com/datasets/regional-climate-2022", - "documentation_url": "https://example.com/datasets/regional-climate-2022/documentation", - "access_endpoint": "https://api.example.com/climate/2022/data", - "documentation_endpoint": "https://api.example.com/climate/2022/docs" + "license": "CC BY 4.0", + "access_url": "https://zenodo.org/record/7426506/", + "documentation_url": "https://zenodo.org/record/7426506" + }, + { + "schema_version": "1.0.0", + "schema_type": "dataset", + "dataset_id": "0002-pneuma-vision-dataset", + "title": "pNEUMA vision dataset", + "doi": "10.5281/zenodo.10491409", + "version": "v1", + "description": "The pNEUMA dataset is the drone traffic imagery dataset that contains images of frame and vehicle annotations as positions. This dataset is the expansion of the pNEUMA, the urban trajectory dataset collected by swarms of drones in Athens. For more details about pNEUMA and pNEUMA Vision, please check our website at https://open-traffic.epfl.ch and github. 
", + "created_at": "2023-08-30T15:12:56Z", + "updated_at": "2023-08-30T15:12:56Z", + "data_format": "csv", + "tags": ["traffic", "GNSS", "urban mobility", "traffic safety"], + "source": "Zenodo", + "authors": { + "name": "Barmpounakis, Emmanouil", + "orcid": "0000-0000-0000-0000" }, - { - "version": "1.0.0", - "dataset_id": "323e4567-e89b-12d3-a456-426614174222", - "title": "Global Economic Indicators 2023", - "doi": "10.1234/global-economic-2023", - "description": "This dataset includes global economic indicators such as GDP, inflation rates, and unemployment figures for the year 2023.", - "created_at": "2023-02-01T12:00:00Z", - "updated_at": "2023-07-01T12:00:00Z", - "data_format": "Parquet", - "tags": ["economy", "GDP", "inflation", "unemployment"], - "source": "World Economic Forum", - "authors": { - "name": "Dr. Emily Johnson", - "orcid": "0000-0003-1234-5678" - }, - "license": "CC BY-SA 4.0", - "access_url": "https://example.com/datasets/global-economic-2023", - "documentation_url": "https://example.com/datasets/global-economic-2023/documentation", - "access_endpoint": "https://api.example.com/economy/2023/data", - "documentation_endpoint": "https://api.example.com/economy/2023/docs" - }, - { - "version": "1.0.0", - "dataset_id": "423e4567-e89b-12d3-a456-426614174333", - "title": "Regional Economic Data 2022", - "doi": "10.1234/regional-economic-2022", - "description": "This dataset provides economic data for various regions in 2022, including GDP and unemployment rates.", - "created_at": "2022-03-01T12:00:00Z", - "updated_at": "2022-09-01T12:00:00Z", - "data_format": "CSV", - "tags": ["economy", "regional", "GDP", "unemployment"], - "source": "Regional Economic Offices", - "authors": { - "name": "Dr. 
Michael Lee", - "orcid": "0000-0004-5678-9101" - }, - "license": "Apache 2.0", - "access_url": "https://example.com/datasets/regional-economic-2022", - "documentation_url": "https://example.com/datasets/regional-economic-2022/documentation", - "access_endpoint": "https://api.example.com/economy/2022/data", - "documentation_endpoint": "https://api.example.com/economy/2022/docs" - } - ] - } + "license": "CC BY 4.0", + "access_url": "https://zenodo.org/record/10491409", + "documentation_url": "https://zenodo.org/record/10491409" + } + ] } - \ No newline at end of file +}