diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml index 4d06c71e..62187e3c 100644 --- a/.github/workflows/check.yml +++ b/.github/workflows/check.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.9", "3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} diff --git a/Makefile b/Makefile index 3cfd246f..ec339425 100644 --- a/Makefile +++ b/Makefile @@ -22,7 +22,7 @@ help: venv: $(VIRTUAL_ENV)/timestamp -$(VIRTUAL_ENV)/timestamp: pyproject.toml setup.cfg +$(VIRTUAL_ENV)/timestamp: pyproject.toml pip install --upgrade pip pip install -e ".[dev,docs]" ifneq ($(wildcard requirements/extra.txt),) @@ -31,15 +31,16 @@ endif touch $(VIRTUAL_ENV)/timestamp format: venv - isort $(PROJECTNAME) - docformatter -i -r $(PROJECTNAME) - black $(PROJECTNAME) + ruff check $(PROJECTNAME) tests --fix lint: venv - flake8 --show-source $(PROJECTNAME) tests + ruff check $(PROJECTNAME) tests test: venv - pytest -v + pytest -v -m "not slow" + +test-slow: venv + pytest -v -m "slow" docs: venv - cd docs && sphinx-apidoc -o source/ ../$(PROJECTNAME) && make html + cd docs && sphinx-apidoc -o source/ ../$(PROJECTNAME) && make clean html diff --git a/README.md b/README.md index c0dfe3ce..27792e7c 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,12 @@ +

+<p align="center">
+  <img src="docs/source/_static/GeoGrapher.png" alt="GeoGrapher Logo">
+</p>
+
+<p align="center">
+  <img src="https://img.shields.io/badge/license-Hippocratic-lightgrey" alt="Hippocratic License">
+</p>
+
+ # GeoGrapher *GeoGrapher* is a Python library for building remote sensing @@ -11,7 +20,7 @@ utility functions. # Installation This package has two external dependencies: -- Python 3.8 or newer. +- Python 3.9 or newer. - The geopandas and rasterio libraries might depend on GDAL base C libraries. See [https://geopandas.org/en/stable/getting_started/install.html#dependencies](https://geopandas.org/en/stable/getting_started/install.html#dependencies) and [https://pypi.org/project/rasterio/](https://pypi.org/project/rasterio/) diff --git a/docs/source/_static/GeoGrapher.png b/docs/source/_static/GeoGrapher.png new file mode 100644 index 00000000..0c808741 Binary files /dev/null and b/docs/source/_static/GeoGrapher.png differ diff --git a/docs/source/_templates/custom-module-template.rst b/docs/source/_templates/custom-module-template.rst index ef2c09a5..c3e01c7d 100644 --- a/docs/source/_templates/custom-module-template.rst +++ b/docs/source/_templates/custom-module-template.rst @@ -63,4 +63,4 @@ {{ item }} {%- endfor %} {% endif %} -{% endblock %} \ No newline at end of file +{% endblock %} diff --git a/docs/source/advanced_cutting.rst b/docs/source/advanced_cutting.rst index c3a534e3..c469dcc7 100644 --- a/docs/source/advanced_cutting.rst +++ b/docs/source/advanced_cutting.rst @@ -245,7 +245,7 @@ You can write a custom ``RasterFilterPredicate`` to do this:: target_assoc: Connector, new_raster_dict: dict, source_assoc: Connector, - cut_rasters: List[str], + cut_rasters: list[str], ) -> bool: local_timestamp: str = rasters.loc[raster_name, 'local_timestamp'] diff --git a/docs/source/basic_cutting.rst b/docs/source/basic_cutting.rst index 8f412919..8133f07a 100644 --- a/docs/source/basic_cutting.rst +++ b/docs/source/basic_cutting.rst @@ -58,7 +58,7 @@ cutouts around vector features from 10980 × 10980 Sentinel-2 tiles):: source_data_dir=, target_data_dir=, name= - new_raster_size: Optional[RasterSize] + new_raster_size: RasterSize | None new_raster_size=512, target_raster_count=2, mode: "random") diff --git a/docs/source/cluster_rasters.rst b/docs/source/cluster_rasters.rst index ac9e6ebd..c04dee0e 100644 --- a/docs/source/cluster_rasters.rst +++ b/docs/source/cluster_rasters.rst @@ -18,7 +18,7 @@ train/validation split to avoid data leakage use the .. code-block:: from geographer.utils.cluster_rasters import get_raster_clusters - clusters : List[Set[str]] = get_raster_clusters( + clusters : list[Set[str]] = get_raster_clusters( connector=connector, clusters_defined_by='rasters_that_share_vectors', preclustering_method='y then x-axis' diff --git a/docs/source/conf.py b/docs/source/conf.py index 4a7983cc..202ec0f6 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -13,17 +13,17 @@ import os import sys -sys.path.insert(0, os.path.abspath('./geographer')) +sys.path.insert(0, os.path.abspath("./geographer")) # -- Project information ----------------------------------------------------- -project = 'GeoGrapher' -copyright = 'Open Source TBD' -author = 'Rustam Antia' +project = "GeoGrapher" +copyright = "Open Source TBD" +author = "Rustam Antia" # The full version, including alpha/beta/rc tags -release = '0.1.0' +release = "0.1.0" # -- General configuration --------------------------------------------------- @@ -32,19 +32,19 @@ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [
-    'nbsphinx',
-    'nbsphinx_link',
-    'sphinx.ext.autodoc',
-    'sphinx.ext.autosummary',
-    'sphinx_autodoc_typehints',
-    'sphinxcontrib.autodoc_pydantic',
-    'sphinx.ext.napoleon',
-    'sphinx.ext.todo',
-    'sphinx.ext.viewcode',
+    "nbsphinx",
+    "nbsphinx_link",
+    "sphinx.ext.autodoc",
+    "sphinx.ext.autosummary",
+    "sphinx_autodoc_typehints",
+    "sphinxcontrib.autodoc_pydantic",
+    "sphinx.ext.napoleon",
+    "sphinx.ext.todo",
+    "sphinx.ext.viewcode",
]

# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
+templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
@@ -56,12 +56,12 @@
# The theme to use for HTML and HTML Help pages.  See the documentation for
# a list of builtin themes.
#
-html_theme = 'alabaster'
+html_theme = 'furo'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = []
+html_static_path = ["_static"]

# HTML settings
html_theme_options = {
@@ -73,9 +73,11 @@

# autodoc settings
autodoc_default_options = {
-    'inherited-members': 'pydantic.BaseModel,BaseModel',
+    "inherited-members": "pydantic.BaseModel,BaseModel",
}

+autodoc_typehints = "description"  # or "signature"
+
# autodoc_pydantic settings
autodoc_pydantic_config_members = False
autodoc_pydantic_model_show_config_member = False
@@ -83,12 +85,11 @@
autodoc_pydantic_model_show_validator_summary = False
autodoc_pydantic_model_show_validator_members = False
autodoc_pydantic_model_show_field_summary = False
-autodoc_pydantic_model_hide_paramlist = False # change?
+autodoc_pydantic_model_hide_paramlist = False  # change?
autodoc_pydantic_model_signature_prefix = "class"
autodoc_pydantic_model_show_json = False
autodoc_pydantic_settings_show_json = False

-
# todo settings
todo_include_todos = True

@@ -106,4 +107,4 @@
napoleon_use_rtype = True
napoleon_preprocess_types = False
napoleon_type_aliases = None
-napoleon_attr_annotations = True
\ No newline at end of file
+napoleon_attr_annotations = True
diff --git a/docs/source/downloaders.rst b/docs/source/downloaders.rst
index 99c714a6..9b9fef2d 100644
--- a/docs/source/downloaders.rst
+++ b/docs/source/downloaders.rst
@@ -3,12 +3,35 @@ Downloading rasters

To download rasters for vector features use ``RasterDownloaderForVectors``.

-By plugging in different ``DownloaderForSingleVector`` and ``Processor``
-components it can interface with different sources of remote sensing rasters.
-Currently, it can interface with the Copernicus Open Access Hub for Sentinel-2
-rasters, and JAXA for ALOS DEM (digital elevation model) data, and can easily
-be extended to other data sources by writing custom
-``DownloaderForSingleSingleVector`` and ``Processor`` classes.
+A ``RasterDownloaderForVectors`` requires components that implement the
+abstract base classes ``DownloaderForSingleVector`` and ``Processor``.
+
+ - ``DownloaderForSingleVector`` defines how to search for and download
+   a raster for a single vector from a provider.
+ - ``Processor`` defines how to process the downloaded file into a GeoTiff
+   raster (a sketch of custom implementations is shown below).
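+
+A minimal sketch of the two components, using hypothetical ``MyProvider*``
+names; the method signatures below are those of the abstract base classes
+``RasterDownloaderForSingleVector`` and ``RasterDownloadProcessor`` in
+``geographer.downloaders``::
+
+    from __future__ import annotations
+
+    from pathlib import Path
+    from typing import Any
+
+    from shapely.geometry import Polygon
+
+    from geographer.downloaders.base_download_processor import (
+        RasterDownloadProcessor,
+    )
+    from geographer.downloaders.base_downloader_for_single_vector import (
+        RasterDownloaderForSingleVector,
+    )
+
+
+    class MyProviderDownloaderForSingleVector(RasterDownloaderForSingleVector):
+        """Hypothetical downloader for a custom provider."""
+
+        def download(
+            self,
+            vector_name: str | int,
+            vector_geom: Polygon,
+            download_dir: Path,
+            previously_downloaded_rasters_set: set[str | int],
+            **params: Any,
+        ) -> dict[str, Any]:
+            # Search the provider for rasters covering vector_geom, skip any
+            # in previously_downloaded_rasters_set, download the rest to
+            # download_dir, and return a dict with a 'list_raster_info_dicts'
+            # entry describing the downloads.
+            raise NotImplementedError
+
+
+    class MyProviderProcessor(RasterDownloadProcessor):
+        """Hypothetical processor turning raw downloads into GeoTiffs."""
+
+        def process(
+            self,
+            raster_name: str,
+            download_dir: Path,
+            rasters_dir: Path,
+            return_bounds_in_crs_epsg_code: int,
+            **params: Any,
+        ) -> dict[str, Any]:
+            # Convert the raw download in download_dir into a GeoTiff in
+            # rasters_dir and return at least 'raster_name', 'geometry'
+            # (bounds in the requested CRS) and 'orig_crs_epsg_code'.
+            raise NotImplementedError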
+
+Available implementations
++++++++++++++++++++++++++
+
+Currently, there are two concrete implementations of ``DownloaderForSingleVector``:
+
+ - ``EodagDownloaderForSingleVector``: Based on the excellent
+   `eodag <https://eodag.readthedocs.io/en/stable/>`_ package, this implementation
+   supports downloading over 50 product types from more than 10 providers.
+ - ``JAXADownloaderForSingleVector``: Designed for downloading DEM
+   (digital elevation model) data from the
+   `JAXA ALOS <https://www.eorc.jaxa.jp/ALOS/en/index_e.htm>`_ mission.
+
+Additionally, there are two concrete implementations of the ``Processor`` class:
+
+ - ``Sentinel2SAFEProcessor``: Processes Level-2A Sentinel-2 SAFE files.
+ - ``JAXADownloadProcessor``: Processes JAXA DEM data.
+
+To use the ``RasterDownloaderForVectors`` for a new provider or product type
+you only need to write custom implementations of ``DownloaderForSingleVector``
+or ``Processor``.
+
+.. _EODAG: https://eodag.readthedocs.io/en/stable/
+.. _JAXA_ALOS: https://www.eorc.jaxa.jp/ALOS/en/index_e.htm

Example usage
+++++++++++++
@@ -19,54 +42,46 @@ Example usage:

    from geographer.downloaders import (
        RasterDownloaderForVectors,
-        SentinelDownloaderForSingleVector,
-        Sentinel2Processor
+        EodagDownloaderForSingleVector,
+        Sentinel2SAFEProcessor,
    )

-    downloader_for_single_vector=SentinelDownloaderForSingleVector()
-    download_processor=Sentinel2Processor()
+    download_processor = Sentinel2SAFEProcessor()
+    downloader_for_single_vector = EodagDownloaderForSingleVector()
    downloader = RasterDownloaderForVectors(
        downloader_for_single_vector=downloader_for_single_vector,
        download_processor=download_processor,
    )
+
+    # Parameters needed by the EodagDownloaderForSingleVector.download method
+    downloader_params = {
+        "search_kwargs": {  # Keyword arguments for the eodag search_all method
+            "provider": "cop_dataspace",  # Download from copernicus dataspace
+            "productType": "S2_MSI_L2A",  # Search for Sentinel-2 L2A products
+            "start": "2024-11-01",
+            "end": "2024-12-01",
+        },
+        "filter_online": True,  # Filter out products that are not online
+        "sort_by": ("cloudCover", "ASC"),  # Prioritize search results with less cloud cover
+        "suffix_to_remove": ".SAFE",  # Will strip .SAFE from the stem of the tif file names
+    }
+    # Parameters needed by the Sentinel2SAFEProcessor
+    processor_params = {
+        "resolution": 10,  # Extract all 10m resolution bands
+        "delete_safe": True,  # Delete the SAFE file after extracting a .tif file
+    }
+
    downloader.download(
        connector=my_connector,
        vector_names=optional_list_of_vector_names,
        target_raster_count=2,
-        producttype='L2A',
-        max_percent_cloud_coverage=10,
-        resolution=10,
-        date=('NOW-10DAYS', 'NOW'),
-        area_relation='Contains'
+        downloader_params=downloader_params,  # Only needed the first time downloader.download is called
+        processor_params=processor_params,  # Only needed the first time downloader.download is called
    )

The raster counts for all vector features are updated after every download, so
that unnecessary downloads and an imbalance in the dataset due to clustering of
nearby vector features are avoided.

-You can supply default values for dataset/data source specific ``download``
-arguments (e.g. ``producttype``, ``max_percent_cloud_coverage`` for the
-``SentinelDownloaderForSingleVector``) in the
-``RasterDownloaderForVectors``'s ``kwarg_defaults`` arguments dict,
-so that one doesn't have to pass them by hand to the ``download`` method,
-for example:
-
-.. code-block:: python
-
-    downloader = RasterDownloaderForVectors(
-        download_dir=<download_dir>,
-        downloader_for_single_vector=SentinelDownloaderForSingleVector(),
-        download_processor=Sentinel2Processor(),
-        kwarg_defaults={
-            'max_percent_cloud_coverage' = 10,
-            'producttype': L2A,
-            'resolution': 10,
-            'date': ('NOW-10DAYS', 'NOW'),
-            'area_relation': 'Contains'})
-    downloader.download(
-        connector=my_connector,
-        vector_names=optional_list_of_vector_names,
-        target_raster_count=2)
-
Data sources
++++++++++++

@@ -77,15 +92,22 @@ to GeoTiffs.

Sentinel-2
~~~~~~~~~~

-For *Sentinel-2* data, use the ``SentinelDownloaderForSingleVector``
-to download rasters from the Copernicus Open Access Hub and the ``Sentinel2Processor``.
+For *Sentinel-2* data, use the ``EodagDownloaderForSingleVector`` with
+``"productType": "S2_MSI_L2A"`` together with the ``Sentinel2SAFEProcessor`` as above.
+Tested with the ``cop_dataspace`` provider; expected to work with other providers
+whose ``archive_depth`` is 2 (``creodias``, ``onda``, ``sara``). If the
+``archive_depth`` differs, you will need to adapt the processor.
+Please submit the adapted ``RasterDownloadProcessor`` as a pull request :)

-Sentinel-1
-~~~~~~~~~~
+Sources/providers supported by `eodag`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ``EodagDownloaderForSingleVector`` will work with any sources/providers
+`supported by eodag <https://eodag.readthedocs.io/en/stable/getting_started_guide/providers.html>`_.
+For ``productType``s other than ``S2_MSI_L2A`` (with ``archive_depth`` 2), you
+will need to write a custom ``RasterDownloadProcessor``. Please submit your
+custom ``RasterDownloadProcessor`` as a pull request :)

-The ``SentinelDownloaderForSingleVector`` should work with slight modifications
-for downloading Sentinel-1 data from Copernicus Open Access Hub as well. Feel free to
-submit a pull request for this feature.
+.. _EODAG_PROVIDERS: https://eodag.readthedocs.io/en/stable/getting_started_guide/providers.html

JAXA DEM data
~~~~~~~~~~~~~
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 15bf138c..2c84fec2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,6 +1,11 @@
-######################################
-Welcome to GeoGrapher's documentation!
-######################################
+|
+
+.. image:: _static/GeoGrapher.png
+   :width: 80px
+   :align: center
+
+GeoGrapher Documentation
+========================

**GeoGrapher** is a Python library for building and handling remote sensing
computer vision datasets assembled from vector features and rasters.
@@ -11,7 +16,7 @@ provides highly general and customizable dataset cutting functionality as well
as other utility functions.

User guide
-==================
+----------

.. toctree::
   :maxdepth: 1

@@ -28,7 +33,7 @@ User guide
   glossary

API Reference
-=============
+-------------

.. toctree::
   :maxdepth: 1

   geographer

Indices and tables
-==================
+------------------

* :ref:`genindex`
* :ref:`modindex`
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 1ebea2d3..500780b0 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -4,7 +4,7 @@ Installation

This package has two external dependencies:

-- Python 3.8 or newer.
+- Python 3.9 or newer.
- The geopandas and rasterio libraries might depend on GDAL base C libraries.
See `here for the geopandas instructions <https://geopandas.org/en/stable/getting_started/install.html#dependencies>`_
diff --git a/geographer/add_drop_rasters_mixin.py b/geographer/add_drop_rasters_mixin.py
index d7499aa7..d8a5a6e7 100644
--- a/geographer/add_drop_rasters_mixin.py
+++ b/geographer/add_drop_rasters_mixin.py
@@ -3,7 +3,7 @@
from __future__ import annotations

import logging
-from typing import TYPE_CHECKING, Optional, Sequence
+from typing import TYPE_CHECKING, Sequence

import pandas as pd
from geopandas import GeoDataFrame
@@ -21,7 +21,7 @@
class AddDropRastersMixIn:
    """Mix-in that implements methods to add and drop rasters."""

    def add_to_rasters(
-        self, new_rasters: GeoDataFrame, label_maker: Optional[LabelMaker] = None
+        self, new_rasters: GeoDataFrame, label_maker: LabelMaker | None = None
    ):
        """Add rasters to connector's ``rasters`` attribute.
@@ -76,7 +76,6 @@ def add_to_rasters(
        # go through all new rasters...
        for raster_name in new_rasters.index:
-
            # add new raster vertex to the graph, add all connections
            # to existing rasters, and modify self.vectors 'raster_count' value
            raster_bounding_rectangle = new_rasters.loc[raster_name, "geometry"]
@@ -97,7 +96,7 @@
    def drop_rasters(
        self,
        raster_names: Sequence[str],
        remove_rasters_from_disk: bool = True,
-        label_maker: Optional[LabelMaker] = None,
+        label_maker: LabelMaker | None = None,
    ):
        """Drop rasters from ``rasters`` and from dataset.
diff --git a/geographer/add_drop_vectors_mixin.py b/geographer/add_drop_vectors_mixin.py
index ed395f0d..ecdd3b4a 100644
--- a/geographer/add_drop_vectors_mixin.py
+++ b/geographer/add_drop_vectors_mixin.py
@@ -3,7 +3,7 @@
from __future__ import annotations

import logging
-from typing import TYPE_CHECKING, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Sequence

import pandas as pd
from geopandas import GeoDataFrame
@@ -28,7 +28,7 @@ class AddDropVectorsMixIn(object):
    def add_to_vectors(
        self,
        new_vectors: GeoDataFrame,
-        label_maker: Optional[LabelMaker] = None,
+        label_maker: LabelMaker | None = None,
    ):
        """Add vector features to connector's ``vectors`` attribute.
@@ -91,7 +91,6 @@ def add_to_vectors(
        # For each new vector feature...
        for vector_name in new_vectors.index:
-
            # ... add a vertex for the new vector feature to the graph and add all
            # connections to existing rasters.
            self._add_vector_to_graph(vector_name, vectors=new_vectors)
@@ -114,8 +113,8 @@
    def drop_vectors(
        self,
-        vector_names: Sequence[Union[str, int]],
-        label_maker: Optional[LabelMaker] = None,
+        vector_names: Sequence[str | int],
+        label_maker: LabelMaker | None = None,
    ):
        """Drop vector features from connector's ``vectors`` attribute.
diff --git a/geographer/base_model_dict_conversion/base_model_dict_conversion_functional.py b/geographer/base_model_dict_conversion/base_model_dict_conversion_functional.py
index ddb436b4..2812b52b 100644
--- a/geographer/base_model_dict_conversion/base_model_dict_conversion_functional.py
+++ b/geographer/base_model_dict_conversion/base_model_dict_conversion_functional.py
@@ -7,7 +7,7 @@
from __future__ import annotations

from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any

from pydantic import BaseModel

@@ -15,7 +15,7 @@
def get_nested_base_model_dict(
-    base_model_obj_or_dict: Union[BaseModel, dict, Any]
+    base_model_obj_or_dict: BaseModel | dict | Any,
) -> dict:
    """Return nested dict for BaseModel or dict.
@@ -26,7 +26,7 @@ def get_nested_base_model_dict( dict_ = base_model_obj_or_dict dict_items = base_model_obj_or_dict.items() elif isinstance(base_model_obj_or_dict, BaseModel): - dict_ = base_model_obj_or_dict.dict() + dict_ = base_model_obj_or_dict.model_dump() dict_items = base_model_obj_or_dict dict_or_base_model_fields_dict = { @@ -80,14 +80,14 @@ def get_nested_base_model_dict( return result -def get_nested_dict(obj: Union[BaseModel, dict, Any]) -> Union[dict, Any]: +def get_nested_dict(obj: BaseModel | dict | Any) -> dict | Any: """Return nested dict if obj is a BaseModel or dict else return obj.""" def eval_nested_base_model_dict( - dict_or_field_value: Union[dict, Any], - constructor_symbol_table: Optional[dict[str, Any]] = None, -) -> Union[BaseModel, Any]: + dict_or_field_value: dict | Any, + constructor_symbol_table: dict[str, Any] | None = None, +) -> BaseModel | Any: """Evaluate nested BaseModel dict (or field contents). Args: @@ -172,13 +172,13 @@ def is_base_model_constructor_dict(dict_: dict) -> bool: def get_base_model_constructor( - dict_: dict, constructor_symbol_table: Optional[dict[str, Any]] = None + dict_: dict, constructor_symbol_table: dict[str, Any] | None = None ) -> BaseModel: """Return constructor corresponding to encoded BaseModel. Args: dict_ (dict): nested base model dict - constructor_symbol_table (Optional[dict[str, Any]], optional): optional symbol + constructor_symbol_table (dict[str, Any] | None, optional): optional symbol table of constructors. Defaults to None. Returns: diff --git a/geographer/base_model_dict_conversion/save_load_base_model_mixin.py b/geographer/base_model_dict_conversion/save_load_base_model_mixin.py index 829355c3..3d716c4e 100644 --- a/geographer/base_model_dict_conversion/save_load_base_model_mixin.py +++ b/geographer/base_model_dict_conversion/save_load_base_model_mixin.py @@ -8,7 +8,7 @@ from importlib import import_module from inspect import getmro, isabstract, isclass from pathlib import Path -from typing import Any, Optional, Union +from typing import Any from pydantic import BaseModel @@ -28,7 +28,7 @@ def save(self): """Save instance to file.""" pass - def _save(self, json_file_path: Union[str, Path]) -> None: + def _save(self, json_file_path: Path | str) -> None: """Save to json_file.""" # Use to implement save method with file_path determined by use case json_file_path = Path(json_file_path) @@ -42,8 +42,8 @@ def _save(self, json_file_path: Union[str, Path]) -> None: @classmethod def from_json_file( cls, - json_file_path: Union[Path, str], - constructor_symbol_table: Optional[dict[str, Any]] = None, + json_file_path: Path | str, + constructor_symbol_table: dict[str, Any] | None = None, ) -> Any: """Load and return saved BaseModel.""" if constructor_symbol_table is None: diff --git a/geographer/connector.py b/geographer/connector.py index 4c89549b..b77821d7 100644 --- a/geographer/connector.py +++ b/geographer/connector.py @@ -6,7 +6,7 @@ import logging from json.decoder import JSONDecodeError from pathlib import Path -from typing import Any, Literal, Optional, Sequence, Type, TypeVar, Union +from typing import Any, Literal, Sequence, Type, TypeVar import geopandas as gpd from geopandas import GeoDataFrame @@ -70,15 +70,15 @@ def __init__( load_from_disk: bool, # data dir - data_dir: Union[Path, str], + data_dir: Path | str, # args w/o default values - vectors: Optional[GeoDataFrame] = None, - rasters: Optional[GeoDataFrame] = None, + vectors: GeoDataFrame | None = None, + rasters: GeoDataFrame | None = None, # remaining 
non-path args w/ default values - task_vector_classes: Optional[Sequence[str]] = None, - background_class: Optional[str] = None, + task_vector_classes: Sequence[str] | None = None, + background_class: str | None = None, crs_epsg_code: int = STANDARD_CRS_EPSG_CODE, raster_count_col_name: str = "raster_count", @@ -196,7 +196,7 @@ def __getattr__(self, key: str) -> Any: @classmethod def from_data_dir( cls: Type[ConnectorType], - data_dir: Union[Path, str], + data_dir: Path | str, ) -> ConnectorType: """Initialize a connector from a data directory. @@ -217,19 +217,20 @@ def from_data_dir( try: attrs_path = Path(connector_dir) / \ INFERRED_PATH_ATTR_FILENAMES["attrs_path"] - with open(attrs_path, "r") as read_file: - kwargs = json.load(read_file) - except FileNotFoundError as exc: + with open(attrs_path, "r") as file: + kwargs = json.load(file) + except FileNotFoundError: log.exception( "Missing connector file %s found in %s", INFERRED_PATH_ATTR_FILENAMES['attrs_path'], connector_dir) - raise exc + raise except JSONDecodeError: log.exception( "The %s file in %s is corrupted!", INFERRED_PATH_ATTR_FILENAMES['attrs_path'], connector_dir) + raise new_connector = cls( load_from_disk=True, @@ -399,7 +400,7 @@ def save(self): def empty_connector_same_format( self, - data_dir: Union[Path, str] + data_dir: Path | str ) -> Connector: """Return an empty connector of the same format. @@ -528,7 +529,7 @@ def _load_df_from_disk(self, df_name: str) -> GeoDataFrame: def _init_set_paths( self, - data_dir: Union[Path, str], + data_dir: Path | str, ): """Set paths to raster/label data and connector component files. @@ -552,7 +553,7 @@ def _init_set_paths( @classmethod def _get_default_dirs_from_data_dir( cls, - data_dir: Union[Path, str] + data_dir: Path | str ) -> tuple[Path, Path, Path]: data_dir = Path(data_dir) @@ -587,7 +588,7 @@ def _check_no_non_task_vector_classes_are_task_classes( task_vector_classes: list[str], background_class: str, **kwargs ): - """TODO. + """Check non-task vector class and task classes are disjoint. Args: task_vector_classes: [description] diff --git a/geographer/converters/combine_remove_vector_classes.py b/geographer/converters/combine_remove_vector_classes.py index 17dd0c6b..1131f08f 100644 --- a/geographer/converters/combine_remove_vector_classes.py +++ b/geographer/converters/combine_remove_vector_classes.py @@ -8,7 +8,7 @@ import logging import shutil -from typing import List, Optional, Union +from typing import Optional, Union import pandas as pd from geopandas.geodataframe import GeoDataFrame @@ -31,10 +31,10 @@ class DSConverterCombineRemoveClasses(DSCreatorFromSource): removing vector feature classes. """ - classes: List[Union[str, List[str]]] = Field( + classes: list[Union[str, list[str]]] = Field( description="Classes to keep and combine. See docstring." ) - new_class_names: Optional[List[str]] = Field( + new_class_names: Optional[list[str]] = Field( default=None, description="Names of new classes" ) class_separator: str = Field( @@ -208,7 +208,6 @@ def _create_or_update(self) -> Connector: self.target_connector.add_to_rasters(df_of_rasters_to_add_to_target_dataset) if self.label_maker is not None: - # Determine labels to delete: # For each raster that already existed in the target dataset ... 
for ( @@ -260,7 +259,6 @@ def _get_new_class_names(self, classes: list[str]) -> list[str]: def _run_safety_checks( self, classes_to_keep: list[str], new_class_names: list[str] ): - if not set(classes_to_keep) <= set(self.source_connector.all_vector_classes): classes_not_in_source_dataset = set(classes_to_keep) - set( self.source_connector.all_vector_classes @@ -286,7 +284,7 @@ def _combine_or_remove_classes_from_vectors( self, label_type: str, vectors: GeoDataFrame, - classes: list[Union[str, list[str]]], + classes: list[str | list[str]], new_class_names: list[str], all_source_vector_classes: list[str], ) -> GeoDataFrame: @@ -377,6 +375,7 @@ def prob_of_class_names(classes: list[str]) -> list[str]: vectors = GeoDataFrame( pd.concat([vectors, temp_vectors], axis=1), # column axis crs=vectors.crs, + geometry="geometry", ) vectors.index.name = VECTOR_FEATURES_INDEX_NAME diff --git a/geographer/converters/label_type_soft_to_categorical.py b/geographer/converters/label_type_soft_to_categorical.py index e59fef12..9818c02f 100644 --- a/geographer/converters/label_type_soft_to_categorical.py +++ b/geographer/converters/label_type_soft_to_categorical.py @@ -41,7 +41,6 @@ def _update(self): self._create_or_update() def _create_or_update(self) -> Connector: - if self.source_assoc.label_type != "soft-categorical": raise ValueError( "Only works with label_type soft-categorical\n" diff --git a/geographer/converters/tif_to_npy.py b/geographer/converters/tif_to_npy.py index 854523c1..28d34d07 100644 --- a/geographer/converters/tif_to_npy.py +++ b/geographer/converters/tif_to_npy.py @@ -1,7 +1,5 @@ """Convert a dataset of GeoTiffs to NPYs.""" -from __future__ import annotations - import logging from typing import Literal @@ -36,7 +34,6 @@ def _update(self): self._create_or_update() def _create_or_update(self) -> None: - # need this later geoms_that_will_be_added_to_target_dataset = set( self.source_assoc.geoms_df.index @@ -84,7 +81,6 @@ def _create_or_update(self) -> None: ).is_file(): # ... convert the tif: Open the tif file ... 
                with rio.open(tif_dir / tif_raster_name) as src:
-
                    raster_bands = self._get_bands_for_raster(
                        self.bands,
                        tif_dir / tif_raster_name,
diff --git a/geographer/creator_from_source_dataset_base.py b/geographer/creator_from_source_dataset_base.py
index 00790e8f..12349e32 100644
--- a/geographer/creator_from_source_dataset_base.py
+++ b/geographer/creator_from_source_dataset_base.py
@@ -1,12 +1,17 @@
"""ABC for creating or updating a dataset from an existing source dataset."""

-from __future__ import annotations
-
from abc import ABC, abstractmethod
from pathlib import Path
-from typing import Dict, List, Optional
-
-from pydantic import BaseModel, Field, PrivateAttr
+from typing import Optional
+
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    PrivateAttr,
+    field_validator,
+    model_validator,
+)

from geographer.base_model_dict_conversion.save_load_base_model_mixin import (
    SaveAndLoadBaseModelMixIn,
@@ -21,6 +26,8 @@
class DSCreatorFromSource(ABC, SaveAndLoadBaseModelMixIn, BaseModel):
    """ABC for creating or updating a dataset from an existing one."""

+    model_config = ConfigDict(extra="allow", arbitrary_types_allowed=True)
+
    source_data_dir: Path
    target_data_dir: Path
    name: str = Field(
@@ -30,24 +37,39 @@
    _source_connector: Optional[Connector] = PrivateAttr(default=None)
    _target_connector: Optional[Connector] = PrivateAttr(default=None)

-    class Config:
-        """BaseModel Config."""
-
-        arbitrary_types_allowed = True
-        extra = "allow"
-        underscore_attrs_are_private = True
-
-    def __init__(self, **data):
-        """Initialize from data.
-
-        Args:
-            data: data
-        """
-        super().__init__(**data)
-        self._source_connector = None
-        self._target_connector = None
-        self._set_source_connector()
-        self._set_target_connector()
+    @field_validator("source_data_dir", mode="before")
+    @classmethod
+    def validate_source_data_dir(cls, value: Path) -> Path:
+        """Ensure source_data_dir is a valid path."""
+        # mode="before" may receive a plain str (e.g. when loading from a
+        # saved JSON file), so coerce to Path before checking.
+        value = Path(value)
+        if not value.is_dir():
+            raise ValueError(f"Invalid source_data_dir: {value}")
+        return value
+
+    @model_validator(mode="after")
+    def validate_connectors(self) -> "DSCreatorFromSource":
+        """Initialize and validate connectors."""
+        if self.source_data_dir:
+            self._source_connector = Connector.from_data_dir(self.source_data_dir)
+
+        if self.target_data_dir:
+            connector_file_paths_exist = [
+                (self.target_data_dir / DEFAULT_CONNECTOR_DIR_NAME / filename).is_file()
+                for filename in INFERRED_PATH_ATTR_FILENAMES.values()
+            ]
+
+            if all(connector_file_paths_exist):
+                self._target_connector = Connector.from_data_dir(self.target_data_dir)
+            elif not any(connector_file_paths_exist):
+                self._target_connector = (
+                    self._source_connector.empty_connector_same_format(
+                        self.target_data_dir
+                    )
+                )
+            else:
+                raise ValueError(
+                    "Corrupted target dataset: only some of the connector files exist."
+ ) + + return self @abstractmethod def _create(self, *args, **kwargs) -> Connector: @@ -79,21 +101,11 @@ def save(self): @property def source_connector(self): """Connector in source_data_dir.""" - if ( - self._source_connector is None - or self._source_connector.rasters_dir.parent != self.source_data_dir - ): - self._set_source_connector() return self._source_connector @property def target_connector(self): """Connector in target_data_dir.""" - if ( - self._target_connector is None - or self._target_connector.rasters_dir.parent != self.target_data_dir - ): - self._set_target_connector() return self._target_connector def _after_creating_or_updating(self): @@ -102,30 +114,6 @@ def _after_creating_or_updating(self): Can be used to e.g. save parameters to the target_connector. """ - def _set_source_connector(self): - """Set source connector.""" - self._source_connector = Connector.from_data_dir(self.source_data_dir) - - def _set_target_connector(self): - """Set target connector.""" - connector_file_paths_exist = [ - (self.target_data_dir / DEFAULT_CONNECTOR_DIR_NAME / filename).is_file() - for filename in INFERRED_PATH_ATTR_FILENAMES.values() - ] - - if all(connector_file_paths_exist): - target_connector = Connector.from_data_dir(self.target_data_dir) - elif not any(connector_file_paths_exist): - target_connector = self.source_connector.empty_connector_same_format( - self.target_data_dir - ) - else: - raise ValueError( - "Corrupted target dataset: only some of the connector files exist." - ) - - self._target_connector = target_connector - def _add_missing_vectors_to_target(self): """Add missing vector features from source dataset to target dataset. @@ -152,7 +140,7 @@ class DSCreatorFromSourceWithBands(DSCreatorFromSource, ABC): Includes a bands field. """ - bands: Optional[Dict[str, Optional[List[int]]]] = Field( + bands: Optional[dict[str, Optional[list[int]]]] = Field( default=None, title="Dict of band indices", description="keys: raster directory names, values: list of band indices " diff --git a/geographer/cutters/cut_iter_over_rasters.py b/geographer/cutters/cut_iter_over_rasters.py index 5991b8c5..b55ab4b8 100644 --- a/geographer/cutters/cut_iter_over_rasters.py +++ b/geographer/cutters/cut_iter_over_rasters.py @@ -4,10 +4,8 @@ datasets of GeoTiffs from existing ones by iterating over rasters. """ -from __future__ import annotations - import logging -from typing import List, Optional +from typing import Optional from geopandas import GeoDataFrame from pydantic import Field @@ -41,7 +39,7 @@ class DSCutterIterOverRasters(DSCreatorFromSourceWithBands): description="Optional label maker. If given, will be used to recompute labels\ when necessary. Defaults to None", ) - cut_rasters: List[str] = Field( + cut_rasters: list[str] = Field( default_factory=list, description=( "Names of cut rasters in source_data_dir. Usually not to be set by hand!" @@ -129,7 +127,6 @@ def _create_or_update(self) -> None: for raster_name in tqdm( self.source_connector.rasters.index, desc="Cutting dataset: " ): - # If filter condition is satisfied, (if not, don't do anything) ... if self.raster_filter_predicate( raster_name, @@ -138,7 +135,6 @@ def _create_or_update(self) -> None: source_connector=self.source_connector, cut_rasters=self.cut_rasters, ): - # ... 
cut the rasters (and their labels) and remember information # to be appended to self.target_connector rasters in return dict rasters_from_single_cut_dict = self.raster_cutter( @@ -169,7 +165,6 @@ def _create_or_update(self) -> None: for new_raster_name, raster_bounding_rectangle in zip( new_raster_names, raster_bounding_rectangles ): - self.cut_rasters.append(raster_name) # Update graph and modify vectors in self.target_connector @@ -181,7 +176,9 @@ def _create_or_update(self) -> None: # Extract accumulated information about the rasters we've # created in the target dataset into a dataframe... new_rasters = GeoDataFrame( - new_rasters_dict, crs=self.target_connector.rasters.crs + new_rasters_dict, + crs=self.target_connector.rasters.crs, + geometry="geometry", ) new_rasters.set_index(RASTER_IMGS_INDEX_NAME, inplace=True) diff --git a/geographer/cutters/cut_iter_over_vectors.py b/geographer/cutters/cut_iter_over_vectors.py index f71c2320..4dd675af 100644 --- a/geographer/cutters/cut_iter_over_vectors.py +++ b/geographer/cutters/cut_iter_over_vectors.py @@ -9,7 +9,7 @@ import logging from collections import defaultdict -from typing import Dict, List, Optional, Union +from typing import Optional from geopandas import GeoDataFrame from pydantic import Field @@ -61,7 +61,7 @@ class DSCutterIterOverVectors(DSCreatorFromSourceWithBands): description="Optional label maker. If given, will be used to recompute labels\ when necessary. Defaults to None", ) - cut_rasters: Dict[str, List[str]] = Field( + cut_rasters: dict[str, list[str]] = Field( default_factory=lambda: defaultdict(list), title="Cut rasters dictionary", description="Normally, should not be set by hand! Dict with vector features\ @@ -153,7 +153,6 @@ def _create_or_update(self) -> None: # For each vector feature ... for vector_name in tqdm(vectors_to_iterate_over, desc="Cutting dataset: "): - # ... if we want to create new rasters for it ... if self.vector_filter_predicate( vector_name=vector_name, @@ -161,7 +160,6 @@ def _create_or_update(self) -> None: new_rasters_dict=new_rasters_dict, source_connector=self.source_connector, ): - # ... remember it ... added_vectors += [vector_name] @@ -186,7 +184,6 @@ def _create_or_update(self) -> None: source_connector=self.source_connector, cut_rasters=self.cut_rasters, ): - # Cut each raster (and label) and remember the information to be # appended to self.target_connector rasters in return dict rasters_from_single_cut_dict = self.raster_cutter( @@ -222,7 +219,6 @@ def _create_or_update(self) -> None: for new_raster_name, raster_bounding_rectangle in zip( new_raster_names, raster_bounding_rectangles ): - # Update graph and modify vectors # in self.target_connector self.target_connector._add_raster_to_graph_modify_vectors( @@ -246,7 +242,9 @@ def _create_or_update(self) -> None: # Extract accumulated information about the rasters we've created in the target # dataset into a dataframe... new_rasters = GeoDataFrame( - new_rasters_dict, crs=self.target_connector.rasters.crs + new_rasters_dict, + crs=self.target_connector.rasters.crs, + geometry="geometry", ) new_rasters.set_index(RASTER_IMGS_INDEX_NAME, inplace=True) @@ -288,7 +286,7 @@ def _create_or_update(self) -> None: self.target_connector.save() def _filter_out_previously_cut_rasters( - self, vector_name: Union[str, int], src_rasters_containing_vector: set[str] + self, vector_name: str | int, src_rasters_containing_vector: set[str] ) -> list[str]: """Filter out previously cut rasters. 
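Both cutters above now pass ``geometry="geometry"`` when building the
``GeoDataFrame`` of new rasters. A minimal sketch of why (hypothetical data;
recent geopandas versions may no longer set the active geometry column
implicitly, so ``crs=`` needs an explicitly marked geometry column to attach
to):

```python
import geopandas as gpd
from shapely.geometry import box

# Accumulator shaped like new_rasters_dict in the cutters above.
new_rasters_dict = {
    "raster_name": ["tile_0.tif"],
    "geometry": [box(0.0, 0.0, 1.0, 1.0)],
}

# geometry="geometry" marks the active geometry column explicitly, so the
# crs argument is applied to it.
new_rasters = gpd.GeoDataFrame(new_rasters_dict, crs="EPSG:4326", geometry="geometry")
print(new_rasters.crs)  # EPSG:4326
```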
diff --git a/geographer/cutters/cut_rasters_around_every_vector.py b/geographer/cutters/cut_rasters_around_every_vector.py index d762e8db..140e0079 100644 --- a/geographer/cutters/cut_rasters_around_every_vector.py +++ b/geographer/cutters/cut_rasters_around_every_vector.py @@ -1,8 +1,10 @@ """Dataset cutter that cuts out rasters around vector features.""" +from __future__ import annotations + import logging from pathlib import Path -from typing import Literal, Optional, Union +from typing import Literal from geographer.cutters.cut_iter_over_vectors import DSCutterIterOverVectors from geographer.cutters.raster_selectors import RandomRasterSelector, RasterSelector @@ -19,15 +21,15 @@ def get_cutter_rasters_around_every_vector( - source_data_dir: Union[Path, str], - target_data_dir: Union[Path, str], + source_data_dir: Path | str, + target_data_dir: Path | str, name: str, mode: Literal["random", "centered", "variable"] = "random", - new_raster_size: Optional[RasterSize] = 512, - min_new_raster_size: Optional[RasterSize] = None, - scaling_factor: Optional[float] = None, + new_raster_size: RasterSize | None = 512, + min_new_raster_size: RasterSize | None = None, + scaling_factor: float | None = None, target_raster_count: int = 1, - bands: Optional[dict] = None, + bands: dict | None = None, random_seed: int = 10, ) -> DSCutterIterOverVectors: """Return dataset cutter that creates cutouts around vector features. diff --git a/geographer/cutters/raster_filter_predicates.py b/geographer/cutters/raster_filter_predicates.py index 9c78adef..e8b911d0 100644 --- a/geographer/cutters/raster_filter_predicates.py +++ b/geographer/cutters/raster_filter_predicates.py @@ -8,7 +8,6 @@ from abc import ABC, abstractmethod from collections.abc import Callable from pathlib import Path -from typing import Union from geopandas import GeoSeries from pandas import Series @@ -116,12 +115,12 @@ class RasterFilterRowCondition(RasterFilterPredicate): row_series_predicate: RowSeriesPredicate def __init__( - self, row_series_predicate: Callable[[Union[GeoSeries, Series]], bool] + self, row_series_predicate: Callable[[GeoSeries | Series], bool] ) -> None: """Initialize an instance of RasterFilterRowCondition. Args: - row_series_predicate (Callable[[Union[GeoSeries, Series]], bool]): + row_series_predicate: predicate to apply to the row corresponding to a raster (i.e. source_connector.rasters.loc[raster_name]) """ @@ -154,19 +153,20 @@ def __call__( result of aplying self.row_series_predicate to source_connector.rasters[raster_name] """ - row_series: Union[GeoSeries, Series] = source_connector.rasters.loc[raster_name] + row_series: GeoSeries | Series = source_connector.rasters.loc[raster_name] answer = self.row_series_predicate(row_series) return answer def wrap_function_as_RowSeriesPredicate( - fun: Callable[[Union[GeoSeries, Series]], bool] + fun: Callable[[GeoSeries | Series], bool], ) -> RowSeriesPredicate: """Wrap a function as a RowSeriesPredicate. Args: - fun (Callable[[Union[GeoSeries, Series]], bool]): + fun: + Function to wrap. Returns: RowSeriesPredicate. 
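As a usage note for ``RasterFilterRowCondition`` above: any plain function
taking a raster's row can serve as the predicate (the module's
``wrap_function_as_RowSeriesPredicate`` helper exists for this). The column
name below is hypothetical:

```python
from pandas import Series

from geographer.cutters.raster_filter_predicates import RasterFilterRowCondition

# Assumes (hypothetically) that the rasters GeoDataFrame has a
# 'cloud_cover' column.
def low_cloud_cover(row: Series) -> bool:
    return row["cloud_cover"] < 10.0

# Usable as the raster_filter_predicate of a DSCutterIterOverRasters.
predicate = RasterFilterRowCondition(low_cloud_cover)
```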
diff --git a/geographer/cutters/raster_selectors.py b/geographer/cutters/raster_selectors.py index dabccf3c..a2abe818 100644 --- a/geographer/cutters/raster_selectors.py +++ b/geographer/cutters/raster_selectors.py @@ -9,7 +9,7 @@ import random from abc import abstractmethod from pathlib import Path -from typing import Any, Union +from typing import Any from pydantic import BaseModel @@ -85,7 +85,7 @@ class RandomRasterSelector(RasterSelector): def __call__( self, - vector_name: Union[str, int], + vector_name: str | int, raster_names_list: list[str], target_connector: Connector, new_rasters_dict: dict, diff --git a/geographer/cutters/single_raster_cutter_around_vector.py b/geographer/cutters/single_raster_cutter_around_vector.py index 53563ea8..7c9b0cc2 100644 --- a/geographer/cutters/single_raster_cutter_around_vector.py +++ b/geographer/cutters/single_raster_cutter_around_vector.py @@ -6,7 +6,7 @@ import math import random from pathlib import Path -from typing import Any, Literal, Optional, Union +from typing import Any, Literal, Optional import rasterio as rio from affine import Affine @@ -45,9 +45,9 @@ class SingleRasterCutterAroundVector(SingleRasterCutter): def __init__( self, mode: str, - new_raster_size: Optional[RasterSize] = None, - scaling_factor: Optional[float] = 1.2, - min_new_raster_size: Optional[RasterSize] = None, + new_raster_size: RasterSize | None = None, + scaling_factor: float | None = 1.2, + min_new_raster_size: RasterSize | None = None, random_seed: int = 42, **kwargs, ) -> None: @@ -131,7 +131,7 @@ def _check_raster_size_type_and_value(self, raster_size: RasterSize): @staticmethod def _get_size_rows_cols( - raster_size: Union[int, tuple[int, int]] + raster_size: int | tuple[int, int], ) -> tuple[int, int]: if isinstance(raster_size, tuple): new_raster_size_rows = raster_size[0] @@ -147,7 +147,7 @@ def _get_windows_transforms_raster_names( source_raster_name: str, source_connector: Connector, target_connector: Connector, - new_rasters_dict: Optional[dict] = None, + new_rasters_dict: dict | None = None, **kwargs: Any, ) -> list[tuple[Window, Affine, str]]: """Return windwos, transforms, and names of new rasters. @@ -179,7 +179,6 @@ def _get_windows_transforms_raster_names( vector_geom = target_connector.vectors.loc[vector_name, "geometry"] with rio.open(source_raster_path) as src: - # transform vector feature from connector's crs to raster source crs transformed_vector_geom = transform_shapely_geometry( vector_geom, @@ -248,7 +247,6 @@ def _get_windows_transforms_raster_names( for raster_row in range(num_small_rasters_in_row_direction): for raster_col in range(num_small_rasters_in_col_direction): - # Define the square window with the calculated offsets. window = rio.windows.Window( col_off=col_off + new_raster_size_cols * raster_col, @@ -283,7 +281,6 @@ def _get_windows_transforms_raster_names( # append window if it intersects the vector feature if window_bounding_rectangle.intersects(transformed_vector_geom): - windows_transforms_raster_names_single_geom.append( (window, window_transform, new_raster_name) ) @@ -346,7 +343,6 @@ def _get_grid_row_col_offsets_num_windows_row_col_direction( # Choose row and col offset if self.mode == "random": - # ... choose row and col offsets randomly subject to constraint that the # grid of raster windows contains rectangular envelope of vector feature. row_off = random.randint( @@ -374,7 +370,6 @@ def _get_grid_row_col_offsets_num_windows_row_col_direction( ) elif self.mode in {"centered", "variable"}: - # ... 
to find the row, col offsets to center the vector feature ... # ...we first find the centroid of the vector feature in the raster crs ... @@ -392,7 +387,6 @@ def _get_grid_row_col_offsets_num_windows_row_col_direction( ) else: - raise ValueError(f"Unknown mode: {self.mode}") return ( diff --git a/geographer/cutters/single_raster_cutter_base.py b/geographer/cutters/single_raster_cutter_base.py index d7e835fd..97ed718e 100644 --- a/geographer/cutters/single_raster_cutter_base.py +++ b/geographer/cutters/single_raster_cutter_base.py @@ -5,7 +5,7 @@ import logging from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, Optional, Tuple, Union +from typing import Any, Tuple import rasterio as rio from affine import Affine @@ -30,8 +30,8 @@ def _get_windows_transforms_raster_names( self, source_raster_name: str, source_connector: Connector, - target_connector: Optional[Connector] = None, - new_rasters_dict: Optional[dict] = None, + target_connector: Connector | None = None, + new_rasters_dict: dict | None = None, **kwargs: Any, ) -> list[Tuple[Window, Affine, str]]: """Return windows, window transforms, and new rasters. @@ -57,9 +57,9 @@ def __call__( self, raster_name: str, source_connector: Connector, - target_connector: Optional[Connector] = None, - new_rasters_dict: Optional[dict] = None, - bands: Optional[dict[str, Optional[list[int]]]] = None, + target_connector: Connector | None = None, + new_rasters_dict: dict | None = None, + bands: dict[str, list[int] | None] | None = None, **kwargs: Any, ) -> dict: """Cut new rasters and return return_dict. @@ -117,7 +117,6 @@ def __call__( window_transform, new_raster_name, ) in windows_transforms_raster_names: - # Make new raster and label in target dataset ... raster_bounds_in_raster_crs, raster_crs = self._make_new_raster_and_label( new_raster_name=new_raster_name, @@ -203,7 +202,7 @@ def _make_new_raster_and_label( target_connector: Connector, window: Window, window_transform: Affine, - bands: Optional[dict[str, Optional[list[int]]]], + bands: dict[str, list[int] | None] | None, ) -> Tuple[Tuple[float, float, float, float], CRS]: """Make a new raster and label. @@ -222,7 +221,6 @@ def _make_new_raster_and_label( for count, (source_rasters_dir, target_rasters_dir) in enumerate( zip(source_connector.raster_data_dirs, target_connector.raster_data_dirs) ): - source_raster_path = source_rasters_dir / source_raster_name dst_raster_path = target_rasters_dir / new_raster_name @@ -257,8 +255,8 @@ def _make_new_raster_and_label( def _write_window_to_geotif( self, - src_raster_path: Union[Path, str], - dst_raster_path: Union[Path, str], + src_raster_path: Path | str, + dst_raster_path: Path | str, raster_bands: list[int], window: Window, window_transform: Affine, @@ -277,7 +275,6 @@ def _write_window_to_geotif( """ # Open source ... with rio.open(src_raster_path) as src: - # and destination ... Path(dst_raster_path).parent.mkdir(exist_ok=True, parents=True) with rio.open( @@ -291,10 +288,8 @@ def _write_window_to_geotif( crs=src.crs, transform=window_transform, ) as dst: - # ... and go through the bands. for target_band, source_band in enumerate(raster_bands, start=1): - # Read window for that band from source ... 
new_raster_band_raster = src.read(source_band, window=window) diff --git a/geographer/cutters/single_raster_cutter_bbox.py b/geographer/cutters/single_raster_cutter_bbox.py index 729ea8c7..2fe60619 100644 --- a/geographer/cutters/single_raster_cutter_bbox.py +++ b/geographer/cutters/single_raster_cutter_bbox.py @@ -4,7 +4,7 @@ import logging from pathlib import Path -from typing import Any, Optional, Union +from typing import Any import geopandas as gpd import rasterio as rio @@ -21,7 +21,7 @@ def _correct_window_offset( - offset: Union[int, float], size: Union[int, float], new_size: int + offset: int | float, size: int | float, new_size: int ) -> int: center = offset + size / 2 return int(center - new_size / 2) @@ -47,7 +47,7 @@ def __init__(self, **data) -> None: bbox_geojson_path: path to geojson file containing the bboxes """ super().__init__(**data) - self._bboxes_df = gpd.read_file(self.bbox_geojson_path, driver="GeoJSON") + self._bboxes_df = gpd.read_file(self.bbox_geojson_path) @field_validator("bbox_geojson_path") def path_points_to_geojson(cls, value: Path): @@ -106,11 +106,10 @@ def _get_windows_transforms_raster_names( self, source_raster_name: str, source_connector: Connector, - target_connector: Optional[Connector] = None, - new_rasters_dict: Optional[dict] = None, + target_connector: Connector | None = None, + new_rasters_dict: dict | None = None, **kwargs: Any, ) -> list[str]: - source_raster_path = source_connector.rasters_dir / source_raster_name with rio.open(source_raster_path) as src: diff --git a/geographer/cutters/single_raster_cutter_grid.py b/geographer/cutters/single_raster_cutter_grid.py index 4ff73048..736a564a 100644 --- a/geographer/cutters/single_raster_cutter_grid.py +++ b/geographer/cutters/single_raster_cutter_grid.py @@ -4,7 +4,7 @@ import logging from pathlib import Path -from typing import Any, Optional +from typing import Any import rasterio as rio from affine import Affine @@ -71,15 +71,13 @@ def _get_windows_transforms_raster_names( self, source_raster_name: str, source_connector: Connector, - target_connector: Optional[Connector] = None, - new_rasters_dict: Optional[dict] = None, + target_connector: Connector | None = None, + new_rasters_dict: dict | None = None, **kwargs: Any, ) -> list[tuple[Window, Affine, str]]: - source_raster_path = source_connector.rasters_dir / source_raster_name with rio.open(source_raster_path) as src: - if not src.height % self.new_raster_size_rows == 0: logger.warning( "number of rows in source raster not divisible by " @@ -96,7 +94,6 @@ def _get_windows_transforms_raster_names( # Iterate through grid ... for i in range(src.width // self.new_raster_size_cols): for j in range(src.height // self.new_raster_size_rows): - # ... remember windows, ... 
window = Window( i * self.new_raster_size_cols, diff --git a/geographer/cutters/type_aliases.py b/geographer/cutters/type_aliases.py index 1e584b3d..370748ed 100644 --- a/geographer/cutters/type_aliases.py +++ b/geographer/cutters/type_aliases.py @@ -1,9 +1,5 @@ """Type aliases.""" -from __future__ import annotations +from typing import Tuple, Union # noqa: I001,I005 -from typing import Optional, Tuple, Union # noqa: I001,I005 - -# Tuple instead of tuple because pydantic needs old-style -# type declarations for python 3.8 -RasterSize = Optional[Union[int, Tuple[int, int]]] +RasterSize = Union[int, Tuple[int, int]] diff --git a/geographer/cutters/vector_filter_predicates.py b/geographer/cutters/vector_filter_predicates.py index 2c72731a..441f855a 100644 --- a/geographer/cutters/vector_filter_predicates.py +++ b/geographer/cutters/vector_filter_predicates.py @@ -4,7 +4,7 @@ import collections from abc import abstractmethod -from typing import Any, Literal, Union +from typing import Any, Literal from geopandas import GeoSeries from pandas import Series @@ -25,7 +25,7 @@ class VectorFilterPredicate(BaseModel, collections.abc.Callable): @abstractmethod def __call__( self, - vector_name: Union[str, int], + vector_name: str | int, target_connector: Connector, new_rasters_dict: dict, source_connector: Connector, @@ -77,7 +77,7 @@ class IsVectorMissingRasters(VectorFilterPredicate): def __call__( self, - vector_name: Union[str, int], + vector_name: str | int, target_connector: Connector, new_rasters_dict: dict, source_connector: Connector, @@ -114,7 +114,7 @@ class AlwaysTrue(VectorFilterPredicate): def __call__( self, - vector_name: Union[str, int], + vector_name: str | int, target_connector: Connector, new_rasters_dict: dict, source_connector: Connector, @@ -134,7 +134,7 @@ class OnlyThisVector(VectorFilterPredicate): is equal to this_vector_name. """ - def __init__(self, this_vector_name: Union[str, int]) -> None: + def __init__(self, this_vector_name: str | int) -> None: """Initialize OnlyThisVector. Args: @@ -145,7 +145,7 @@ def __init__(self, this_vector_name: Union[str, int]) -> None: def __call__( self, - vector_name: Union[str, int], + vector_name: str | int, target_connector: Connector, new_rasters_dict: dict, source_connector: Connector, @@ -165,17 +165,18 @@ class FilterVectorByRowCondition(VectorFilterPredicate): def __init__( self, row_series_predicate: collections.abc.Callable[ - [Union[GeoSeries, Series]], bool + [GeoSeries | Series], bool ], mode: Literal["source", "target"], ) -> None: """Initialize FilterVectorByRowCondition. Args: - row_series_predicate (Callable[Union[[GeoSeries, Series]], bool]): + row_series_predicate: predicate to apply to the row corresponding to a vector feature in vectors in source_connector or target_connector. - mode (str) : Which GeoDataFrame the predicate should be applied to. + mode: + Which GeoDataFrame the predicate should be applied to. 
                One of 'source' or 'target'
        """
        super().__init__()
@@ -189,7 +190,7 @@

    def __call__(
        self,
-        vector_name: Union[str, int],
+        vector_name: str | int,
        target_connector: Connector,
        new_rasters_dict: dict,
        source_connector: Connector,
@@ -202,7 +203,7 @@
            connector = source_connector

        vectors = connector.vectors
-        row_series: Union[GeoSeries, Series] = vectors.loc[vector_name]
+        row_series: GeoSeries | Series = vectors.loc[vector_name]
        answer = self.row_series_predicate(row_series)

        return answer
diff --git a/geographer/downloaders/__init__.py b/geographer/downloaders/__init__.py
index 30c7e486..2daf61dc 100644
--- a/geographer/downloaders/__init__.py
+++ b/geographer/downloaders/__init__.py
@@ -1,9 +1,9 @@
from geographer.downloaders.downloader_for_vectors import RasterDownloaderForVectors
+from geographer.downloaders.eodag_downloader_for_single_vector import (
+    EodagDownloaderForSingleVector,
+)
from geographer.downloaders.jaxa_download_processor import JAXADownloadProcessor
from geographer.downloaders.jaxa_downloader_for_single_vector import (
    JAXADownloaderForSingleVector,
)
-from geographer.downloaders.sentinel2_download_processor import Sentinel2Processor
-from geographer.downloaders.sentinel2_downloader_for_single_vector import (
-    SentinelDownloaderForSingleVector,
-)
+from geographer.downloaders.sentinel2_download_processor import Sentinel2SAFEProcessor
diff --git a/geographer/downloaders/base_download_processor.py b/geographer/downloaders/base_download_processor.py
index 8aa108ce..775e8d77 100644
--- a/geographer/downloaders/base_download_processor.py
+++ b/geographer/downloaders/base_download_processor.py
@@ -2,12 +2,15 @@

from __future__ import annotations

+import logging
from abc import ABC, abstractmethod
from pathlib import Path
-from typing import Any, Literal, Union
+from typing import Any, Literal

from pydantic import BaseModel

+log = logging.getLogger(__name__)
+

class RasterDownloadProcessor(ABC, BaseModel):
    """Base class for download processors."""
@@ -19,20 +22,27 @@
    def process(
        self,
        raster_name: str,
        download_dir: Path,
        rasters_dir: Path,
        return_bounds_in_crs_epsg_code: int,
-        **kwargs: Any,
+        **params: Any,
    ) -> dict[
-        Union[Literal["raster_name", "geometry", "orig_crs_epsg_code"], str], Any
+        Literal["raster_name", "geometry", "orig_crs_epsg_code"] | str, Any
    ]:
        """Process a single download.

        Args:
-            raster_name: name of raster
-            download_dir: directory containing download
-            rasters_dir: directory to place processed raster in
-            crs_epsg_code: EPSG code of crs raster bounds should be returned in
-            kwargs: other keyword arguments
+            raster_name:
+                Name of raster
+            download_dir:
+                Directory containing download
+            rasters_dir:
+                Directory to place processed raster in
+            return_bounds_in_crs_epsg_code:
+                EPSG code of the CRS the raster bounds should be returned in
+            params:
+                Additional keyword arguments. Corresponds to the processor_params
+                argument of the RasterDownloaderForVectors.download method.

        Returns:
-            return_dict: Contains information about the downloaded product.
+            return_dict:
+                Contains information about the downloaded product.
                Keys should include: 'raster_name', 'geometry', 'orig_crs_epsg_code'.
""" diff --git a/geographer/downloaders/base_downloader_for_single_vector.py b/geographer/downloaders/base_downloader_for_single_vector.py index 46650457..d70c3d83 100644 --- a/geographer/downloaders/base_downloader_for_single_vector.py +++ b/geographer/downloaders/base_downloader_for_single_vector.py @@ -2,13 +2,16 @@ from __future__ import annotations +import logging from abc import ABC, abstractmethod from pathlib import Path -from typing import Any, Literal, Union +from typing import Any, Literal from pydantic import BaseModel from shapely.geometry import Polygon +log = logging.getLogger(__name__) + class RasterDownloaderForSingleVector(ABC, BaseModel): """Base class for downloaders for a single vector feature.""" @@ -16,21 +19,26 @@ class RasterDownloaderForSingleVector(ABC, BaseModel): @abstractmethod def download( self, - vector_name: Union[int, str], + vector_name: str | int, vector_geom: Polygon, download_dir: Path, - previously_downloaded_rasters_set: set[Union[str, int]], - **kwargs, - ) -> dict[Union[Literal["raster_name", "raster_processed?"], str], Any]: + previously_downloaded_rasters_set: set[str | int], + **params: Any, + ) -> dict[Literal["raster_name", "raster_processed?"] | str, Any]: """Download (a series of) raster(s) for a single vector feature. Args: - vector_name: name of vector feature - vector_geom: geometry of vector feature - download_dir: directory to download to - previously_downloaded_rasters_set: set of (names of) - previously downloaded rasters - kwargs: other keyword arguments + vector_name: + Name of vector feature + vector_geom: + Geometry of vector feature + download_dir: + Directory in which raw downloads are placed + previously_downloaded_rasters_set: + Set of (names of) previously downloaded rasters + params: + Additional keyword arguments. Corresponds to the downloader_params + argument of the RasterDownloaderForVectors.download method. 
Returns: Dict with a key 'list_raster_info_dicts': The corresponding value is a diff --git a/geographer/downloaders/downloader_for_vectors.py b/geographer/downloaders/downloader_for_vectors.py index ed453039..a7b91c0d 100644 --- a/geographer/downloaders/downloader_for_vectors.py +++ b/geographer/downloaders/downloader_for_vectors.py @@ -5,12 +5,12 @@ import logging import random import shutil -from collections import Counter, defaultdict +from collections import Counter from pathlib import Path -from typing import Dict, Optional, Union +from typing import Any, Union from geopandas import GeoDataFrame -from pydantic import BaseModel, Field +from pydantic import BaseModel from shapely.ops import unary_union from tqdm.auto import tqdm @@ -29,8 +29,6 @@ ) from geographer.utils.utils import concat_gdfs -DEFAULT_TEMP_DOWNLOAD_DIR_NAME = "temp_download_dir" - log = logging.getLogger(__name__) log.setLevel(logging.WARNING) @@ -40,65 +38,70 @@ class RasterDownloaderForVectors(BaseModel, SaveAndLoadBaseModelMixIn): downloader_for_single_vector: RasterDownloaderForSingleVector download_processor: RasterDownloadProcessor - kwarg_defaults: Dict = Field(default_factory=dict) + temp_dir_relative_path: Union[Path, str] = "temp_download_dir" def download( self, - connector: Union[Path, str, Connector], - vector_names: Optional[Union[str, int, list[int], list[str]]] = None, + connector: Path | str | Connector, + vector_names: str | int | list[int] | list[str] | None = None, target_raster_count: int = 1, filter_out_vectors_contained_in_union_of_intersecting_rasters: bool = False, shuffle: bool = True, - **kwargs, + downloader_params: dict[str, Any] | None = None, + processor_params: dict[str, Any] | None = None, ): """Download a targeted number of rasters per vector feature. - Sequentially consider the vector features for which the raster count (number of - rasters fully containing a given vector feature) is less than - num_target_rasters_per_vector rasters in the connector's internal vectors - or the optional vectors argument (if given), for each such vector - feature attempt to download num_target_rasters_per_vector - raster_count rasters - fully containing the vector feature (or several rasters jointly containing the - vector feature), and integrate the new raster(s) into the dataset/connector. - Integrates rasters downloaded for a vector feature into the dataset/connector - immediately after downloading them and before downloading rasters for the next - vector feature. In particular, the raster count is updated immediately after - each download. + For each vector feature with fewer than `target_raster_count` + rasters fully containing it, this function attempts to download + additional rasters to meet the target. The new rasters are integrated + into the dataset/connector immediately after downloading, updating + the raster count for the vector feature before proceeding to the + next feature. Warning: - The targeted number of downloads is determined by target_raster_count - and a vector features raster_count. Since the raster_count is the number of - rasters in the dataset fully containing a vector feature for "large" - vector features (polygons) the raster_count will always remain zero and - every call of the download_rasters method that includes this vector feature - will download target_raster_count rasters (or raster series). - To avoid this, you can use the - filter_out_vectors_contained_in_union_of_intersecting_rasters argument. 
+            The target number of downloads depends on `target_raster_count`
+            and the current `raster_count` (number of rasters fully containing
+            the vector feature). For vector features (e.g., polygons) too large to
+            be fully contained in any raster, the `raster_count` will remain zero,
+            and every call to this method will attempt to download `target_raster_count`
+            rasters (or raster series). To avoid this, use the
+            `filter_out_vectors_contained_in_union_of_intersecting_rasters` argument.
 
         Args:
-            vector_names: Optional vector_name or list of vector_names to download
+            vector_names:
+                Optional vector_name or list of vector_names to download
                 rasters for. Defaults to None, i.e. consider all vector features in
                 connector.vectors.
-            downloader: One of 'sentinel2' or 'jaxa'. Defaults, if possible, to
+            downloader:
+                One of 'sentinel2' or 'jaxa'. Defaults, if possible, to
                 previously used downloader.
-            target_raster_count: target for number of rasters per vector feature in
+            target_raster_count:
+                Target for number of rasters per vector feature in
                 the dataset after downloading. The actual number of rasters for
                 each vector feature P that fully contain it could be lower if
                 there are not enough rasters available or higher if after
                 downloading num_target_rasters_per_vector rasters for P P is also
                 contained in rasters downloaded for other vector features.
-            filter_out_vector vectors_contained_in_union_of_intersecting_rasters:
+            filter_out_vectors_contained_in_union_of_intersecting_rasters:
                 Useful when dealing with 'large' vector features. Defaults to False.
-            shuffle: Whether to shuffle order of vector features for which rasters
+            shuffle:
+                Whether to shuffle order of vector features for which rasters
                 will be downloaded. Might in practice prevent an uneven
                 distribution of the raster count for repeated downloads.
                 Defaults to True.
-            kwargs: optional additional keyword arguments passed to
-                downloader_for_single_vector and download_processor.
-                Defaults to self.kwarg_defaults.
-
-        Note:
-            Any kwargs given will be saved to self.default_kwargs and become default
-            values.
+            downloader_params:
+                (Optional) keyword arguments to pass to
+                downloader_for_single_vector.download. Corresponds to ``**params`` of
+                the download method of the abstract base class
+                RasterDownloaderForSingleVector. In particular, the keywords
+                vector_name, vector_geom, download_dir, and
+                previously_downloaded_rasters_set corresponding to the other
+                arguments are not allowed.
+            processor_params:
+                Optional additional keyword arguments passed to
+                download_processor.process as ``**params``. In particular, the keywords
+                raster_name, download_dir, rasters_dir, and
+                return_bounds_in_crs_epsg_code are not allowed.
 
         Returns:
             None
@@ -115,12 +118,12 @@ def download(
             jointly cover the polygon then these 20 disjoint sets will all be
             downloaded.
         """
-        self.kwarg_defaults.update(kwargs)
-
+        downloader_params = downloader_params or {}
+        processor_params = processor_params or {}
         if not isinstance(connector, Connector):
             connector = Connector.from_data_dir(connector)
         connector.rasters_dir.mkdir(parents=True, exist_ok=True)
-        temp_download_dir = connector.data_dir / DEFAULT_TEMP_DOWNLOAD_DIR_NAME
+        temp_download_dir = connector.data_dir / self.temp_dir_relative_path
         temp_download_dir.mkdir(parents=True, exist_ok=True)
 
         vectors_for_which_to_download = self._get_vectors_for_which_to_download(
@@ -139,7 +142,7 @@ def download(
 
         # Dict to keep track of rasters we've downloaded.
We'll append this to # connector.rasters as a (geo)dataframe later - new_rasters_dict = defaultdict(list) + new_raster_dicts_list = [] pbar = tqdm( enumerate( @@ -150,7 +153,6 @@ def download( ) ) for count, (vector_name, vector_geom) in pbar: - # vector_geom = connector.vectors.loc[vector_name, 'geometry'] pbar.set_description( @@ -181,11 +183,9 @@ def download( continue while num_raster_series_to_download > 0: - # Try downloading a raster series and save returned dict (of dicts) # containing information for vectors, connector.rasters... try: - # DEBUG INFO log.debug( "attempting to download raster for vector feature. %s", @@ -200,7 +200,7 @@ def download( vector_geom=vector_geom, download_dir=temp_download_dir, previously_downloaded_rasters_set=previously_downloaded_rasters_set, # noqa: E501 - **self.kwarg_defaults, + **downloader_params, ) # WHY DOES THIS NOT WORK? @@ -211,7 +211,6 @@ def download( # ... unless either no rasters could be found ... except NoRastersForVectorFoundError as exc: - # ... in which case we save it in connector.vectors, ... connector.vectors.loc[vector_name, "download_exception"] = repr(exc) @@ -223,14 +222,12 @@ def download( # ... or a download error occured, ... except RasterDownloadError as exc: - connector.vectors.loc[vector_name, "download_exception"] = repr(exc) log.warning(exc, exc_info=True) # ... or downloader_for_single_vector tried downloading a previously # downloaded raster. except RasterAlreadyExistsError: - log.exception( "downloader_for_single_vector tried " "downloading a previously downloaded raster!" @@ -238,7 +235,6 @@ def download( # If the download_method call was successful ... else: - # ... we first extract the information to be appended to # connector.rasters. list_raster_info_dicts = return_dict["list_raster_info_dicts"] @@ -256,7 +252,6 @@ def download( break # ... else ... else: - self._run_safety_checks_on_downloaded_rasters( previously_downloaded_rasters_set, vector_name, @@ -265,7 +260,6 @@ def download( # For each download ... for raster_info_dict in list_raster_info_dicts: - # ... process it to a raster ... raster_name = raster_info_dict["raster_name"] single_raster_processed_return_dict = ( @@ -274,7 +268,7 @@ def download( temp_download_dir, connector.rasters_dir, connector.crs_epsg_code, - **self.kwarg_defaults, + **processor_params, ) ) @@ -295,16 +289,14 @@ def download( # Finally, remember we downloaded the raster. previously_downloaded_rasters_set.add(raster_name) - # update new_rasters_dict - for raster_info_dict in list_raster_info_dicts: - for key in raster_info_dict: - new_rasters_dict[key].append(raster_info_dict[key]) + # update new_raster_dicts_list + new_raster_dicts_list += list_raster_info_dicts num_raster_series_to_download -= 1 - if len(new_rasters_dict) > 0: + if len(new_raster_dicts_list) > 0: new_rasters = self._get_new_rasters( - new_rasters_dict, connector.crs_epsg_code + new_raster_dicts_list, connector.crs_epsg_code ) connector.rasters = concat_gdfs([connector.rasters, new_rasters]) connector.save() @@ -313,7 +305,7 @@ def download( if not list(temp_download_dir.iterdir()): shutil.rmtree(temp_download_dir) - def save(self, file_path: Union[Path, str]): + def save(self, file_path: Path | str): """Save downloader. 
By convention, the downloader should be saved to the connector @@ -323,8 +315,8 @@ def save(self, file_path: Union[Path, str]): @staticmethod def _run_safety_checks_on_downloaded_rasters( - previously_downloaded_rasters_set: set[Union[str, int]], - vector_name: Union[str, int], + previously_downloaded_rasters_set: set[str | int], + vector_name: str | int, list_raster_info_dicts: list[dict], ): """Check no rasters have been downloaded more than once. @@ -386,12 +378,11 @@ def _run_safety_checks_on_downloaded_rasters( def _get_vectors_for_which_to_download( self, - vector_names: Union[str, int, list[int], list[str]], + vector_names: str | int | list[int] | list[str], target_raster_count: int, connector: Connector, filter_out_vectors_contained_in_union_of_intersecting_rasters: bool, - ) -> list[Union[int, str]]: - + ) -> list[int | str]: if vector_names is None: vectors_for_which_to_download = list( connector.vectors.loc[ @@ -429,7 +420,7 @@ def _get_vectors_for_which_to_download( def _filter_out_vectors_with_null_geometry( self, - vector_names: Union[str, int, list[int], list[str]], + vector_names: str | int | list[int] | list[str], connector: Connector, ) -> None: vectors_w_null_geometry_mask = ( @@ -453,7 +444,7 @@ def _filter_out_vectors_with_null_geometry( def _filter_out_vectors_contained_in_union_of_intersecting_rasters( self, - vector_names: Union[str, int, list[int], list[str]], + vector_names: str | int | list[int] | list[str], connector: Connector, ) -> None: vector_names = [ @@ -469,11 +460,12 @@ def _filter_out_vectors_contained_in_union_of_intersecting_rasters( def _get_new_rasters( self, - new_rasters_dict: dict, + new_raster_dicts_list: list[dict[str, Any]], rasters_crs_epsg_code: int, ) -> GeoDataFrame: - """Build and return rasters of new rasters from new_rasters_dict.""" - new_rasters = GeoDataFrame(new_rasters_dict) + """Build and return new rasters gdf from new_raster_dicts_list.""" + new_rasters = GeoDataFrame.from_records(new_raster_dicts_list) + new_rasters.set_geometry("geometry", inplace=True) new_rasters.set_crs(epsg=rasters_crs_epsg_code, inplace=True) new_rasters.set_index("raster_name", inplace=True) new_rasters = new_rasters.convert_dtypes( diff --git a/geographer/downloaders/eodag_downloader_for_single_vector.py b/geographer/downloaders/eodag_downloader_for_single_vector.py new file mode 100644 index 00000000..b01ecc30 --- /dev/null +++ b/geographer/downloaders/eodag_downloader_for_single_vector.py @@ -0,0 +1,398 @@ +"""SingleRasterDownloader for all providers supported by eodag. + +In particular, this downloader can be used to obtain Sentinel-2 L2A +data. +""" + +from __future__ import annotations + +import logging +from datetime import date, datetime +from pathlib import Path +from typing import Any, Literal + +import eodag +import pandas as pd +import shapely +from eodag import EODataAccessGateway +from eodag.api.search_result import SearchResult +from eodag.utils import sanitize +from pydantic import Field, PrivateAttr +from shapely.geometry import Polygon + +from geographer.downloaders.base_downloader_for_single_vector import ( + RasterDownloaderForSingleVector, +) +from geographer.errors import NoRastersForVectorFoundError +from geographer.global_constants import DUMMY_VALUE + +log = logging.getLogger(__name__) + + +class SearchParams(dict): + """Parameters for the `search_all` method of an EODataAccessGateway. 
+
+    Note:
+        The `geom` parameter of the EODataAccessGateway.search_all method is
+        omitted, because its value is determined by a `geographer` argument.
+
+        See https://eodag.readthedocs.io/en/latest/api_reference/core.html#eodag.api.core.EODataAccessGateway.search_all. # noqa
+        for more details on most of the arguments below.
+
+    This dictionary may include the following keys:
+    - `start` (str | None):
+        Start sensing time in ISO 8601 format (e.g. “1990-11-26”,
+        “1990-11-26T14:30:10.153Z”, “1990-11-26T14:30:10+02:00”, …).
+        If no time offset is given, the time is assumed to be given in UTC.
+    - `end` (str | None):
+        End sensing time in ISO 8601 format (e.g. “1990-11-26”,
+        “1990-11-26T14:30:10.153Z”, “1990-11-26T14:30:10+02:00”, …).
+        If no time offset is given, the time is assumed to be given in UTC.
+    - `provider` (str | None):
+        The provider to be used. If set, search fallback will be disabled.
+        If not set, the configured preferred provider will be used at first
+        before trying others until finding results. See
+        https://eodag.readthedocs.io/en/stable/_modules/eodag/api/core.html#EODataAccessGateway.search. # noqa
+    - `items_per_page` (int | None):
+        Number of items to retrieve per page.
+    - `locations` (dict[str, str] | None):
+        Location filtering by name using locations configuration
+        {"<location_name>"="<attr_regex>"}. For example, {"country"="PA."}
+        will use the geometry of the features having the property ISO3 starting
+        with 'PA' such as Panama and Pakistan in the shapefile configured with
+        name=country and attr=ISO3.
+    - In addition, the dictionary may contain any other keys (except ``geom``)
+      compatible with the provider.
+    """
+
+    pass
+
+
+class DownloadParams(dict):
+    """Parameters for the `download` method of an EOProduct.
+
+    Refer to the EOProduct documentation for more details:
+    https://eodag.readthedocs.io/en/stable/api_reference/eoproduct.html
+
+    Some parameters of the EOProduct.download method should not be used:
+    - `product`: Omitted because the value is determined by `geographer`.
+    - `progress_callback`: Omitted because its values cannot easily
+      be JSON serialized.
+    - `extract`: Omitted because geographer requires the value of this
+      kwarg to be True.
+    - `output_dir`: Omitted because the value is determined by `geographer`.
+    - `asset`: Omitted because it does not make sense for a downloader
+      for a single vector.
+    - `output_extension`: Omitted for simplicity's sake.
+
+    This dictionary may include any of the following keys:
+    - `wait` (int): The wait time in minutes between two download attempts.
+    - `timeout` (int): The max time in minutes to retry downloading before stopping.
+    - `dl_url_params` (dict[str, str]): Additional URL parameters to pass to
+      the download URL.
+    - `delete_archive` (bool): Whether to delete the downloaded archives
+      after extraction.
+    """
+
+    pass
+
+
+FORBIDDEN_DOWNLOAD_KWARGS_KEYS = [
+    "product",
+    "progress_callback",
+    "extract",
+    "output_dir",
+    "asset",
+]
+
+
+ASC_OR_DESC = Literal["ASC", "DESC"]
+ASC_OR_DESC_VALUES = ["ASC", "DESC"]
+
+
+class EodagDownloaderForSingleVector(RasterDownloaderForSingleVector):
+    """Downloader for providers supported by eodag.
+
+    Refer to the eodag documentation at
+    https://eodag.readthedocs.io/en/stable/
+    for more details on eodag.
+    """
+
+    eodag_kwargs: dict[str, Any] = Field(
+        default_factory=dict,
+        description=(
+            "Optional kwargs defining an EODataAccessGateway instance. "
" + "Possible keys are 'user_conf_file_path' to define a Path to " + "the user configuration file and locations_conf_path to define " + "a Path to the locations configuration file. " + "See https://eodag.readthedocs.io/en/stable/api_reference/core.html#eodag.api.core.EODataAccessGateway." # noqa + ), + ) + + eodag_setup_logging_kwargs: dict[str, Any] = Field( + default_factory=lambda: dict(verbose=1), + description=( + "Kwargs to be passed to eodag.utils.logging.setup_logging " + "to set up eodag logging. See " + "https://eodag.readthedocs.io/en/stable/api_reference/utils.html#eodag.utils.logging.setup_logging" # noqa + ), + ) + + # Note that eodag as is not defined as a field. + # This is so the pydantic fields are json serializable. + _eodag: EODataAccessGateway = PrivateAttr() + + def model_post_init(self, __context): + """Perform additional initialization.""" + eodag.setup_logging(**self.eodag_setup_logging_kwargs) + + self._eodag = EODataAccessGateway(**self.eodag_kwargs) + + @property + def eodag(self) -> EODataAccessGateway: + """Get eodag.""" + return self._eodag + + def download( # type: ignore + self, + vector_name: str | int, + vector_geom: Polygon, + download_dir: Path, + previously_downloaded_rasters_set: set[str], + *, # downloader_params of RasterDownloaderForVectors.download start below + search_kwargs: SearchParams | None = None, + download_kwargs: DownloadParams | None = None, + properties_to_save: list[str] | None = None, + filter_property: dict[str, Any] | list[dict[str, Any]] | None = None, + filter_online: bool = True, + sort_by: str | tuple[str, ASC_OR_DESC] | None = None, + suffix_to_remove: str | None = None, + ) -> dict: + """Download a raster for a vector feature using eodag. + + Download a raster fully containing the vector feature, + returns a dict in the format needed by the associator. + + Note: + The start, end, provider, items_per_page, and locations arguments correspond + to kwargs of EODataAccessGateway.search_all (though the provider kwarg is only + documented for the EODataAccessGateway.search). The descriptions are adapted + from the official eodag documentation at + https://eodag.readthedocs.io/en/latest/api_reference/core.html#eodag.api.core.EODataAccessGateway. + + Args: + vector_name: + name of vector feature + vector_geom: + Geometry of vector feature + download_dir: + Directory Sentinel-2 products will be downloaded to. + previously_downloaded_rasters_set: + Set of already downloaded products. + search_kwargs: + Keyword arguments for the `search_all` method of an EODataAccessGateway, + excluding "geom". Refer to the docstring of SearchParams for more details. + download_kwargs: + Keyword arguments for the download` method of an EOProduct, excluding + certain keys. Refer to the docstring of DownloadParams for more details. + properties_to_save: + List of property keys to extract and save from an EOProduct's + properties dictionary. Values that cannot be stored in a + GeoDataFrame will be replaced with the string "__DUMMY_VALUE__". + filter_property: + Kwargs or list of kwargs defining criteria according to which products + should be filtered. These correspond exactly to kwargs for the + EODataAccessGateway.filter_property method. Refer to + https://eodag.readthedocs.io/en/stable/plugins_reference/generated/eodag.plugins.crunch.filter_property.FilterProperty.html#eodag.plugins.crunch.filter_property.FilterProperty # noqa + for more details. + filter_online: + Whether to filter the results to include only products that are online. 
+            sort_by:
+                (Optional) A string or tuple like ("key", "ASC"|"DESC") by which to sort the results.
+                If a string is provided, it will be interpreted as ("key", "ASC").
+            suffix_to_remove:
+                (Optional) A suffix to strip from the downloaded EOProduct's file name.
+                The resulting .tif raster will use the modified file name (if applicable)
+                with ".tif" appended.
+
+        Returns:
+            A dictionary containing information about the rasters.
+            ({'list_raster_info_dicts': [raster_info_dict]})
+
+        Raises:
+            ValueError: Raised if download_kwargs contains a forbidden key
+                or sort_by is invalid.
+            NoRastersForVectorFoundError: Raised if no downloadable rasters
+                could be found for the vector feature.
+        """
+        search_kwargs = search_kwargs or {}
+        download_kwargs = download_kwargs or {}
+        properties_to_save = properties_to_save or []
+        filter_property = filter_property or {}
+        if isinstance(filter_property, dict):
+            filter_property = [filter_property]
+        sort_by = sort_by or []
+        if isinstance(sort_by, (str, tuple)):
+            sort_by = [sort_by]
+        sort_by = [
+            (entry, "ASC") if isinstance(entry, str) else entry for entry in sort_by
+        ]
+
+        self._validate_download_args(download_kwargs=download_kwargs, sort_by=sort_by)
+
+        search_criteria = search_kwargs | {
+            "geom": vector_geom,
+        }
+
+        result: SearchResult = self.eodag.search_all(**search_criteria)
+
+        # Only keep results that contain the geometry.
+        # (The filter_* methods return a new SearchResult.)
+        result = result.filter_overlap(geometry=vector_geom, contains=True)
+
+        for filter_kwargs in filter_property:
+            result = result.filter_property(**filter_kwargs)
+
+        if filter_online:
+            result = result.filter_online()
+
+        if len(result) == 0:
+            raise NoRastersForVectorFoundError(
+                f"No rasters for vector feature {vector_name} found with "
+                f"search criteria {search_criteria}!"
+            )
+
+        if sort_by:
+            # Currently only support sorting by a single key.
+            # In the future, we may implement hierarchical
+            # sorting by multiple keys.
+            key, asc_or_desc = sort_by[0]
+            if asc_or_desc == "ASC":
+                reverse = False
+            elif asc_or_desc == "DESC":
+                reverse = True
+            else:
+                raise ValueError(
+                    f"sort_by is {sort_by[0]}, second tuple entry must be "
+                    f"one of 'ASC' or 'DESC'"
+                )
+            result = SearchResult(
+                products=sorted(
+                    result, key=lambda product: product.properties[key], reverse=reverse
+                ),
+                number_matched=result.number_matched,
+                errors=result.errors,
+            )
+
+        # Return dicts with values to be collected in calling associator.
+        raster_info_dict = {}
+
+        for eo_product in result:
+
+            # For the next couple of lines we are essentially following
+            # the _prepare_download method of the
+            # eodag.plugins.download.base.Download class to extract
+            # the name of the extracted product.
+            sanitized_title = sanitize(eo_product.properties["title"])
+            if sanitized_title == eo_product.properties["title"]:
+                collision_avoidance_suffix = ""
+            else:
+                collision_avoidance_suffix = "-" + sanitize(eo_product.properties["id"])
+            extracted_product_file_name = sanitized_title + collision_avoidance_suffix
+
+            if suffix_to_remove is not None:
+                raster_name = (
+                    extracted_product_file_name.removesuffix(suffix_to_remove) + ".tif"
+                )
+            else:
+                raster_name = extracted_product_file_name + ".tif"
+
+            if raster_name not in previously_downloaded_rasters_set:
+
+                download_params = download_kwargs | dict(
+                    product=eo_product,
+                    output_dir=download_dir,
+                    extract=True,
+                )
+
+                try:
+                    location = self.eodag.download(**download_params)
+
+                    location_name = Path(location).name
+                    if location_name != extracted_product_file_name:
+                        msg = (
+                            "The name of the downloaded file (%s) does not "
eodag must have " + "changed the way they determine the file name. " + "Unfortunately, `geographer` relies on being able " + "to determine the name of the extracted file " + "without downloading the product. The " + "`EodagDownloaderForSingleVector` will have to be " + "updated to work with the new naming convention of" + "eodag. Sorry!" + ) + log.error(msg, location_name, extracted_product_file_name) + raise RuntimeError( + msg % (msg, location_name, extracted_product_file_name) + ) + + # And assemble the information to be updated + # in the returned raster_info_dict: + properties_to_save_dict = {} + for key in properties_to_save: + if key in eo_product.properties: + val = eo_product.properties.get(key) + definitely_accepted_types = ( + str, + int, + float, + type(None), + date, + datetime, + shapely.geometry.base.BaseGeometry, + ) + if not isinstance(val, definitely_accepted_types): + try: + pd.Series([val]) + except (TypeError, ValueError): + val = DUMMY_VALUE + properties_to_save_dict[key] = val + raster_info_dict.update(properties_to_save_dict) + raster_info_dict["raster_name"] = raster_name + raster_info_dict["raster_processed?"] = False + + return {"list_raster_info_dicts": [raster_info_dict]} + + except Exception as exc: + log.warning( + "Failed to download, extract, or process %s: %s", + eo_product, + str(exc), + ) + + raise NoRastersForVectorFoundError( + f"All rasters for {vector_name} failed to download." + ) + + def _validate_download_args(self, download_kwargs: DownloadParams, sort_by: list): + """Validate download arguments.""" + for key in download_kwargs: + if key in FORBIDDEN_DOWNLOAD_KWARGS_KEYS: + msg = "The key '%s' is forbidden and cannot be used." + log.error(msg, key) + raise ValueError(msg % key) + + for key, asc_or_desc in sort_by: + if asc_or_desc not in ASC_OR_DESC_VALUES: + msg = ( + "Found %s as second entry of a sort_by pair. " + "Must be one of 'ASC' or 'DESC'!" + ) + log.error(msg, asc_or_desc) + raise ValueError(msg % asc_or_desc) + + if len(sort_by) > 1: + msg = ( + "At the moment sorting is only supported for a single key at a time. " + "The length of the sort_by list must be at most 1." + ) + log.error(msg) + raise ValueError(msg) diff --git a/geographer/downloaders/jaxa_download_processor.py b/geographer/downloaders/jaxa_download_processor.py index c2b52290..17a5ce71 100644 --- a/geographer/downloaders/jaxa_download_processor.py +++ b/geographer/downloaders/jaxa_download_processor.py @@ -1,7 +1,5 @@ """RasterDownloadProcessor for JAXA downloads.""" -from __future__ import annotations - import logging import shutil from pathlib import Path @@ -25,7 +23,6 @@ def process( download_dir: Path, rasters_dir: Path, return_bounds_in_crs_epsg_code: int, - **kwargs, ) -> dict: """Process a downloaded JAXA file. 
diff --git a/geographer/downloaders/jaxa_downloader_for_single_vector.py b/geographer/downloaders/jaxa_downloader_for_single_vector.py index 434dbb41..45de48bf 100644 --- a/geographer/downloaders/jaxa_downloader_for_single_vector.py +++ b/geographer/downloaders/jaxa_downloader_for_single_vector.py @@ -25,7 +25,7 @@ from contextlib import closing from datetime import datetime from pathlib import Path -from typing import Any, Literal, Optional, Union +from typing import Any, Literal import numpy as np from shapely.geometry.base import BaseGeometry @@ -50,14 +50,14 @@ class JAXADownloaderForSingleVector(RasterDownloaderForSingleVector): def download( self, - vector_name: Union[int, str], + vector_name: str | int, vector_geom: BaseGeometry, download_dir: Path, - previously_downloaded_rasters_set: set[Union[str, int]], + previously_downloaded_rasters_set: set[str | int], + *, # downloader_params of RasterDownloaderForVectors.download start below data_version: str = None, download_mode: str = None, - **kwargs, - ) -> dict[Union[Literal["raster_name", "raster_processed?"], str], Any]: + ) -> dict[Literal["raster_name", "raster_processed?"] | str, Any]: """Download JAXA DEM data for a vector feature. Download DEM data from jaxa.jp's ftp-server for a given vector @@ -83,7 +83,6 @@ def download( Defaults if possible to whichever choice you made last time. download_mode: One of 'bboxvertices', 'bboxgrid'. Defaults if possible to whichever choice you made last time. - **kwargs: other kwargs, ignored. Returns: dict of dicts according to the connector convention @@ -101,9 +100,7 @@ def download( jaxa_file_and_folder_names = set() if download_mode == "bboxvertices": - for x, y in vector_geom.envelope.exterior.coords: - jaxa_folder_name = "{}/".format( self._obtain_jaxa_index(x // 5 * 5, y // 5 * 5) ) @@ -112,7 +109,6 @@ def download( jaxa_file_and_folder_names |= {(jaxa_file_name, jaxa_folder_name)} elif download_mode == "bboxgrid": - minx, miny, maxx, maxy = vector_geom.envelope.exterior.bounds deltax = math.ceil(maxx - minx) @@ -120,7 +116,6 @@ def download( for countx in range(deltax + 1): for county in range(deltay + 1): - x = minx + countx y = miny + county @@ -139,7 +134,6 @@ def download( ) # to collect information per downloaded file for connector for jaxa_file_name, jaxa_folder_name in jaxa_file_and_folder_names: - # Skip download if file has already been downloaded ... if jaxa_file_name[:-7] + "_DSM.tif" in previously_downloaded_rasters_set: # in this case skip download, don't store in list_raster_info_dicts @@ -215,8 +209,8 @@ def download( def _obtain_jaxa_index( self, - x: Optional[float] = None, - y: Optional[float] = None, + x: float | None = None, + y: float | None = None, nx: int = 3, ny: int = 3, ): diff --git a/geographer/downloaders/sentinel2_download_processor.py b/geographer/downloaders/sentinel2_download_processor.py index c39b4d22..e534743b 100644 --- a/geographer/downloaders/sentinel2_download_processor.py +++ b/geographer/downloaders/sentinel2_download_processor.py @@ -3,19 +3,26 @@ Should be easily extendable to Sentinel-1. 
""" -from __future__ import annotations - -import os +import logging +import shutil from pathlib import Path -from zipfile import ZipFile from geographer.downloaders.base_download_processor import RasterDownloadProcessor -from geographer.downloaders.sentinel2_safe_unpacking import safe_to_geotif_L2A +from geographer.downloaders.sentinel2_safe_unpacking import ( + NO_DATA_VAL, + safe_to_geotif_L2A, +) from geographer.utils.utils import transform_shapely_geometry +log = logging.getLogger(__name__) + -class Sentinel2Processor(RasterDownloadProcessor): - """Processes downloads of Sentinel-2 products from Copernicus Sci-hub.""" +# TODO Test with the 'creodias', 'onda', and 'sara' providers +# TODO (archive_depth 2). +# TODO Use provider's archive_depth to extend to archive_depth not +# TODO equal to 2 i.e. 'planetary_computer' (archive_depth 1). +class Sentinel2SAFEProcessor(RasterDownloadProcessor): + """Processes downloads of L2A Sentinel-2 SAFE files.""" def process( self, @@ -23,8 +30,11 @@ def process( download_dir: Path, rasters_dir: Path, return_bounds_in_crs_epsg_code: int, + *, # processor_params of RasterDownloaderForVectors.download start below resolution: int, - **kwargs, + delete_safe: bool, # TODO better name, uniformly usable for all processors? + file_suffix: str = ".SAFE", + nodata_val: int = NO_DATA_VAL, ) -> dict: """Process Sentinel-2 download. @@ -33,32 +43,77 @@ def process( GeoTiff raster in the right directory, and return information about the raster in a dict. + Warning: + Tested with the `cop_dataspace` eodag provider. It should also work with + 'creodias', 'onda', and 'sara', which have an `archive_depth` of 2. + For providers with a different `archive_depth`, the processor may need + adjustments to locate the SAFE file correctly based on the raster name. + Args: - raster_name: The name of the raster. - in_dir: The directory containing the zip file. - out_dir: The directory to save the - convert_to_crs_epsg: The EPSG code to use to create the raster bounds - property. # TODO: this name might not be appropriate as it - suggests that the raster geometries will be converted into that crs. - resolution: resolution. + raster_name: + The name of the raster. + download_dir: + The dir containing the SAFE file to be processed. + rasters_dir: + The dir in which the .tif output file should be placed. + return_bounds_in_crs_epsg_code: + The EPSG of the CRS in which the bounds of the raster + should be returned. + resolution: + The desired resolution of the output tif file. + delete_safe: + Whether to delete the SAFE file after extracting the tif file. + file_suffix: + Possible suffix by which the stem of the raster_name and the + downloaded SAFE file to be processed differ. If used together + with the `EodagDownloaderForSingleVector` for the 'cop_dataspace' + provider and the `RasterDownloaderForVectors` and the + `downloader_params` parameter dict of the + `RasterDownloaderForVectors.download` method contains + a `"suffix_to_remove: ".SAFE"` pair then the default value of + ".SAFE" for the file_suffix will result in nicer tif names, + e.g. S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20231208T031743.tif + instead of S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20231208T031743.SAFE.tif. # noqa + nodata_val: + The nodata value to fill. Defaults to 0. Returns: return_dict: Contains information about the downloaded product. 
""" - filename_no_extension = Path(raster_name).stem - zip_filename = filename_no_extension + ".zip" - safe_path = download_dir / f"safe_files/{filename_no_extension}.SAFE" - zip_path = download_dir / zip_filename - - # extract zip to SAFE - with ZipFile(zip_path) as zip_ref: - zip_ref.extractall(download_dir / Path("safe_files/")) - os.remove(zip_path) - # convert SAFE to GeoTiff + log.info("Processing %s to a .tif file. This might take a while..") + + safe_path = download_dir / raster_name.removesuffix(".tif") + safe_path_with_suffix = safe_path.with_suffix(file_suffix) + + if safe_path.exists() and (not safe_path_with_suffix.exists()): + pass # Use safe_path + elif safe_path_with_suffix.exists() and (not safe_path.exists()): + safe_path = safe_path_with_suffix + elif safe_path.exists() and safe_path_with_suffix.exists(): + msg = ( + "Both %s and %s exist in %s.\n" + "Unable to resolve ambiguity in which file/dir to process." + ) + log.error(msg, safe_path.name, safe_path_with_suffix.name, safe_path.parent) + raise RuntimeError( + msg % (safe_path.name, safe_path_with_suffix.name, safe_path.parent) + ) + elif (not safe_path.exists()) and (not safe_path_with_suffix.exists()): + msg = "Can't find SAFE file in expected location(s): %s" + log.error(msg, safe_path) + raise RuntimeError(msg % safe_path) + conversion_dict = safe_to_geotif_L2A( - safe_root=Path(safe_path), resolution=resolution, outdir=rasters_dir + safe_root=safe_path, + resolution=resolution, + outdir=rasters_dir, + nodata_val=nodata_val, ) + if delete_safe: + log.info("Deleting SAFE file: %s", safe_path) + shutil.rmtree(safe_path, ignore_errors=True) + orig_crs_epsg_code = int(conversion_dict["crs_epsg_code"]) raster_bounding_rectangle_orig_crs = conversion_dict[ "raster_bounding_rectangle" diff --git a/geographer/downloaders/sentinel2_downloader_for_single_vector.py b/geographer/downloaders/sentinel2_downloader_for_single_vector.py deleted file mode 100644 index 46c53966..00000000 --- a/geographer/downloaders/sentinel2_downloader_for_single_vector.py +++ /dev/null @@ -1,233 +0,0 @@ -"""SingleRasterDownloader for Sentinel-2 rasters from Copernicus Sci-hub. - -Should be easily extendable to Sentinel-1. -""" - -from __future__ import annotations - -import configparser -import logging -from pathlib import Path -from typing import Any, Union -from zipfile import ZipFile - -from sentinelsat import SentinelAPI -from sentinelsat.exceptions import ServerError, UnauthorizedError -from shapely import wkt -from shapely.geometry import Polygon - -from geographer.downloaders.base_downloader_for_single_vector import ( - RasterDownloaderForSingleVector, -) -from geographer.errors import NoRastersForVectorFoundError - -# logger -log = logging.getLogger(__name__) - - -class SentinelDownloaderForSingleVector(RasterDownloaderForSingleVector): - """Downloader for Sentinel-2 rasters. - - Requires environment variables sentinelAPIusername and - sentinelAPIpassword to set up the sentinel API. Assumes rasters has - columns 'geometry', 'timestamp', 'orig_crs_epsg_code', and - 'raster_processed?'. Subclass/modify if you need other columns. - - See - https://sentinelsat.readthedocs.io/en/latest/api_reference.html - for details on args passed to the API (e.g. date). 
- """ - - def download( # type: ignore - self, - vector_name: Union[str, int], - vector_geom: Polygon, - download_dir: Path, - previously_downloaded_rasters_set: set[str], - producttype: str, - resolution: int, - max_percent_cloud_coverage: int, - date: Any, - area_relation: str, - credentials: Union[tuple[str, str], Path, str], - **kwargs, - ) -> dict: - """Download a S-2 raster for a vector feature. - - Download a sentinel-2 raster fully containing the vector feature, - returns a dict in the format needed by the associator. - - Note: - If not given, the username and password for the Copernicus Sentinel-2 - OpenAPI will be read from an s2_copernicus_credentials.ini in - self.associator_dir. - - Args: - vector_name: name of vector feature - vector_geom: geometry of vector feature - download_dir: Directory Sentinel-2 products will be downloaded to. - previously_downloaded_rasters_set: Set of already downloaded products. - producttype: One of 'L1C'/'S2MSI1C' or 'L2A'/'S2MSI2A' - resolution: One of 10, 20, or 60. - max_percent_cloud_coverage: Integer between 0 and 100. - date: See https://sentinelsat.readthedocs.io/en/latest/api_reference.html - area_relation : See - https://sentinelsat.readthedocs.io/en/latest/api_reference.html - credentials: Tuple of username and password or - Path or str to ini file containing API credentials. - - Returns: - A dictionary containing information about the rasters. - ({'list_raster_info_dicts': [raster_info_dict]}) - - Raises: - ValueError: Raised if an unkknown product type is given. - NoRastersForPolygonFoundError: Raised if no downloadable rasters with cloud - coverage less than or equal to max_percent_cloud_coverage could be found - for the vector feature. - """ - self._check_args_are_valid(producttype, resolution, max_percent_cloud_coverage) - - # Determine missing args for the sentinel query. - rectangle_wkt: str = wkt.dumps(vector_geom.envelope) - producttype = self._get_longform_producttype(producttype) - - api = self._get_api(credentials) - - try: - - # Query, remember results - products = api.query( - area=rectangle_wkt, - date=date, - area_relation=area_relation, - producttype=producttype, - cloudcoverpercentage=(0, max_percent_cloud_coverage), - ) - - products = {k: v for k, v in products.items() if api.is_online(k)} - - except (UnauthorizedError, ServerError) as exc: - log.exception(str(exc)) - raise - - # If we couldn't find anything, remember that, so we can deal with it later. - if len(products) == 0: - raise NoRastersForVectorFoundError( - f"No rasters for vector feature {vector_name} found with " - f"cloud coverage less than or equal to {max_percent_cloud_coverage}!" - ) - - # Return dicts with values to be collected in calling associator. - raster_info_dict = {} - - # If the query was succesful, ... - products_list = list(products.keys()) - products_list = sorted( - products_list, key=lambda x: products[x]["cloudcoverpercentage"] - ) - # ... iterate over the products ordered by cloud coverage - for product_id in products_list: - - product_metadata = api.get_product_odata(product_id, full=True) - - try: - # (this key might have to be 'filename' - # (minus the .SAFE at the end) for L1C products?) - raster_name = product_metadata["title"] + ".tif" - except Exception as exc: - raise Exception( - "Couldn't get the filename. Are you trying to download L1C " - "products? Try changing the key for the products dict in the " - "line of code above this..." 
- ) from exc - - if raster_name not in previously_downloaded_rasters_set: - try: - api.download(product_id, directory_path=download_dir) - zip_path = download_dir / (product_metadata["title"] + ".zip") - with ZipFile(zip_path) as zip_ref: - assert zip_ref.testzip() is None - - # And assemble the information to be updated - # in the returned raster_info_dict: - raster_info_dict["raster_name"] = raster_name - raster_info_dict["raster_processed?"] = False - raster_info_dict["timestamp"] = product_metadata["Date"].strftime( - "%Y-%m-%d-%H:%M:%S" - ) - - return {"list_raster_info_dicts": [raster_info_dict]} - except Exception as exc: - log.warning( - "Failed to download or unzip %s: %s", - product_metadata["title"], - str(exc), - ) - - raise NoRastersForVectorFoundError( - f"All rasters for {vector_name} failed to download." - ) - - def _get_longform_producttype(self, producttype: str): - """Return producttype in longform as needed by the sentinel API.""" - if producttype in {"L2A", "S2MSI2A"}: - producttype = "S2MSI2A" - elif producttype in {"L1C", "S2MSI1C"}: - producttype = "S2MSI1C" - else: - raise ValueError(f"Unknown producttype: {producttype}") - - return producttype - - @staticmethod - def _check_args_are_valid( - producttype: str, - resolution: int, - max_percent_cloud_coverage: int, - ): - """Run some safety checks on the arg values.""" - if resolution not in {10, 20, 60}: - raise ValueError(f"Unknown resolution: {resolution}") - if max_percent_cloud_coverage < 0 or max_percent_cloud_coverage > 100: - raise ValueError( - f"Unknown max_percent_cloud_coverage: {max_percent_cloud_coverage}" - ) - if producttype not in {"L1C", "S2MSI1C", "L2A", "S2MSI2A"}: - raise ValueError(f"Unknown producttype: {producttype}") - - def _get_api(self, credentials: Union[tuple[str, str], Path, str]): - # Get username and password to set up the sentinel API ... - if ( - isinstance(credentials, tuple) - and len(credentials) == 2 - and all(isinstance(cred, str) for cred in credentials) - ): - username, password = credentials - elif isinstance(credentials, (str, Path)): - try: - config = configparser.ConfigParser() - if not credentials.is_file(): - raise FileNotFoundError( - "Can't find .ini file containing username and password in " - f"{credentials}" - ) - config.read(credentials) - username = config["login"]["username"] - password = config["login"]["password"] - except KeyError as exc: - log.error( - "Missing entry in 'sentinel_scihub.ini' file. " - "Need API credentials. %s", - exc, - ) - else: - raise TypeError( - "Need username and password or config_path to .ini file " - "containing username and password" - ) - - # ... and instantiate the API. 
- api = SentinelAPI(username, password) - - return api diff --git a/geographer/downloaders/sentinel2_safe_unpacking.py b/geographer/downloaders/sentinel2_safe_unpacking.py index 05618fb4..df31f562 100644 --- a/geographer/downloaders/sentinel2_safe_unpacking.py +++ b/geographer/downloaders/sentinel2_safe_unpacking.py @@ -6,7 +6,6 @@ import os from collections import OrderedDict from pathlib import Path -from typing import Union import geopandas as gpd import numpy as np @@ -15,6 +14,7 @@ from rasterio.errors import RasterioIOError from scipy.ndimage import zoom from shapely.geometry import box +from tqdm.auto import tqdm from geographer.utils.utils import create_logger @@ -25,52 +25,88 @@ def safe_to_geotif_L2A( safe_root: Path, - resolution: Union[str, int], + resolution: str | int, upsample_lower_resolution: bool = True, outdir: Path = None, TCI: bool = True, requested_jp2_masks: list[str] = ["CLDPRB", "SNWPRB"], requested_gml_mask: list[tuple[str, str]] = [("CLOUDS", "B00")], + nodata_val: int = NO_DATA_VAL, ) -> dict: - """Convert a L2A-level .SAFE file to geotif. + """Convert a L2A-level Sentinel-2 .SAFE file to a GeoTIFF. - Convert a .SAFE file with L2A sentinel-2 data to geotif and return a - dict with the crs epsg code and a shapely polygon defined by the raster - bounds. + The GeoTIFF contains raster bands derived from the .SAFE file, including: + - True color composite (TCI) bands if requested. + - JP2 masks (e.g., cloud or snow masks) at the desired resolution. + - Additional GML masks if available. Warning: - The L2A band structure changed in October 2021, new products do not contain gml - masks anymore. In this + Sentinel-2 L2A products dated later than October 2021 + no longer include GML masks. - ..note:: + Note: + + - The GeoTIFF bands are ordered as follows: + + 1. **True Color Composite (TCI)** (optional): + Red, Green, Blue (if ``TCI=True``). + 2. **Spectral Bands**: JP2 data bands at the target resolution, + optionally including upsampled lower-resolution bands + if ``upsample_lower_resolution=True``. + 3. **JP2 Masks**: Added in the order specified by ``requested_jp2_masks`` + (e.g., ``"CLDPRB"``, ``"SNWPRB"``). Masks are limited to a maximum + resolution of 20m. + 4. **GML Masks**: Rasterized from ``requested_gml_mask``, with + empty bands added for missing masks. - - band structure of final geotif: - if TCI: 1-3 TCI RGB - else sorted(jps2_masks and bands (either only desired resolution or - additionally upsampled)), gml_mask_order - jp2_masks are only available up to a resolution of 20 m, so for 10m the 20m mask ist taken - - SNWPRB for snow masks + - ``"SNWPRB"`` for snow masks + Args: - safe_root: is the safe folder root - resolution: the desired resolution - upsample_lower_resolution: Whether to include lower resolution bands and - upsample them - TCI: whether to load the true color raster - requested_jp2_masks: jp2 mask to load - requested_gml_mask: gml masks to load ([0] mask name as string, [1] band for - which to get the mask) + safe_root: + Path to the root directory of the .SAFE file. + resolution: + Desired resolution for the GeoTIFF (10, 20, or 60 meters). + upsample_lower_resolution: + If True, includes lower-resolution bands + and upsamples them to match the target resolution. Defaults to True. + outdir: + Directory where the GeoTIFF will be saved. If None, saves the + file in the parent directory of `safe_root`. Defaults to None. + TCI: + Whether to include true color raster bands (TCI). + Defaults to True. 
+ requested_jp2_masks: + List of JP2 masks to include in the output. + Defaults to ["CLDPRB", "SNWPRB"]. + requested_gml_mask: List of GML masks to include. Each tuple contains + the mask name (e.g., "CLOUDS") and the associated band (e.g., "B00"). + Defaults to [("CLOUDS", "B00")]. + nodata_val: + Value to use for no-data areas in the GeoTIFF. Defaults to 0. Returns: - dict containing tif crs and bounding rectangle + dict: A dictionary containing: + - `crs_epsg_code` (int): + The EPSG code of the CRS. + - `raster_bounding_rectangle` (shapely.geometry.Polygon): + The bounding rectangle of the output GeoTIFF. + + Raises: + AssertionError: + If `resolution` is not one of the supported values (10, 20, 60). + RasterioIOError: + If there are issues reading or processing the JP2/GML files. """ # assert resolution is within available assert resolution in [10, 20, 60, "10", "20", "60"] # define output file + raster_name = safe_root.stem out_file_parent_dir = outdir if (outdir and outdir.is_dir()) else safe_root.parent - outfile = out_file_parent_dir / (safe_root.stem + "_TEMP.tif") + outfile = out_file_parent_dir / (raster_name + "_TEMP.tif") granule_dir = safe_root / "GRANULE" masks_dir = granule_dir / "{}/QI_DATA/".format(os.listdir(granule_dir)[0]) @@ -110,7 +146,6 @@ def safe_to_geotif_L2A( # include lower resolution bands if upsample_lower_resolution: - for higher_res in filter(lambda res: res > int(resolution), [10, 20, 60]): jp2_higher_res_path = granule_dir / "{}/IMG_DATA/R{}m/".format( os.listdir(granule_dir)[0], higher_res @@ -180,69 +215,75 @@ def safe_to_geotif_L2A( transform=out_default_reader.transform, dtype=out_default_reader.dtypes[0], ) as dst: + dst.nodata = nodata_val + + with tqdm(total=count, desc=f"Extracting tif from {raster_name}.SAFE.") as pbar: + + # write gml masks + for idx, (gml_name, gml_path) in enumerate(gml_mask_paths_dict.items()): + try: + if not gml_path.is_file(): + raise FileNotFoundError( + f"Can't find GML mask {gml_name} in expected location " + f"{gml_path.relative_to(safe_root)}" + ) + shapes = gpd.read_file(gml_path)["geometry"].values + mask, _, _ = rasterio.mask.raster_geometry_mask( + out_default_reader, shapes, crop=False, invert=True + ) + # in case mask is empty or does not exist: + except (ValueError, AssertionError, RasterioIOError, FileNotFoundError): + log.info( + "Using all zero band for gml mask %s for %s", + gml_name, + safe_root.name, + ) + mask = np.full( + shape=out_default_reader.read(1).shape, + fill_value=0.0, + dtype=np.uint16, + ) + + band_idx = len(bands_dict) + 3 * TCI + idx + 1 + tif_band_names[band_idx] = "_".join(gml_name) + + dst.write(mask.astype(np.uint16), band_idx) + pbar.update(1) + + # write jp2 bands + for idx, (band_name, (dst_reader, res)) in enumerate(bands_dict.items()): + if res != int(resolution): + assert res % int(resolution) == 0 + factor = res // int(resolution) + + raster = dst_reader.read(1) + raster = zoom(raster, factor, order=3) + + assert raster.shape == (10980, 10980) + + else: + raster = dst_reader.read(1) + + if not dst_reader.dtypes[0] == out_default_reader.dtypes[0]: + raster = (raster * (65535.0 / 255.0)).astype(np.uint16) + + band_idx = 3 * TCI + idx + 1 + tif_band_names[band_idx] = band_name + dst.write(raster, band_idx) + dst_reader.close() + pbar.update(1) - dst.nodata = NO_DATA_VAL - - # write gml masks - for idx, (gml_name, gml_path) in enumerate(gml_mask_paths_dict.items()): - - try: - assert gml_path.is_file() - shapes = gpd.read_file(gml_path)["geometry"].values - mask, _, _ = 
rasterio.mask.raster_geometry_mask( - out_default_reader, shapes, crop=False, invert=True - ) - # in case mask is empty or does not exist: - except (ValueError, AssertionError, RasterioIOError): - log.info( - "Using all zero band for gml mask %s for %s", - gml_name, - safe_root.name, - ) - mask = np.full( - shape=out_default_reader.read(1).shape, - fill_value=0.0, - dtype=np.uint16, - ) - - band_idx = len(bands_dict) + 3 * TCI + idx + 1 - tif_band_names[band_idx] = "_".join(gml_name) - - dst.write(mask.astype(np.uint16), band_idx) - - # write jp2 bands - for idx, (band_name, (dst_reader, res)) in enumerate(bands_dict.items()): - - if res != int(resolution): - - assert res % int(resolution) == 0 - factor = res // int(resolution) - - raster = dst_reader.read(1) - raster = zoom(raster, factor, order=3) - - assert raster.shape == (10980, 10980) - - else: - raster = dst_reader.read(1) - - if not dst_reader.dtypes[0] == out_default_reader.dtypes[0]: - raster = (raster * (65535.0 / 255.0)).astype(np.uint16) - - band_idx = 3 * TCI + idx + 1 - tif_band_names[band_idx] = band_name - dst.write(raster, band_idx) - dst_reader.close() - - # write tci - if TCI: - for i in range(3): - - band_idx = i + 1 - raster = (tci_band.read(band_idx) * (65535.0 / 255.0)).astype(np.uint16) - tif_band_names[band_idx] = f"tci_{band_idx}" + # write tci + if TCI: + for i in range(3): + band_idx = i + 1 + raster = (tci_band.read(band_idx) * (65535.0 / 255.0)).astype( + np.uint16 + ) + tif_band_names[band_idx] = f"tci_{band_idx}" - dst.write(raster, band_idx) + dst.write(raster, band_idx) + pbar.update(1) # add tags and descriptions for band_idx, name in tif_band_names.items(): @@ -252,7 +293,7 @@ def safe_to_geotif_L2A( crs_epsg_code = dst.crs.to_epsg() raster_bounding_rectangle = box(*dst.bounds) - outfile.rename(out_file_parent_dir / (safe_root.stem + ".tif")) + outfile.rename(out_file_parent_dir / (raster_name + ".tif")) return { "crs_epsg_code": crs_epsg_code, diff --git a/geographer/errors.py b/geographer/errors.py index 8916fada..b896f508 100644 --- a/geographer/errors.py +++ b/geographer/errors.py @@ -1,25 +1,25 @@ """Custom Error classes.""" -class Error(Exception): - """Base class for exceptions in the connector class.""" +class GeoGrapherError(Exception): + """Base class for exceptions.""" pass -class RasterAlreadyExistsError(Error): +class RasterAlreadyExistsError(GeoGrapherError): """Raster already exists in dataset.""" pass -class NoRastersForVectorFoundError(Error): +class NoRastersForVectorFoundError(GeoGrapherError): """No rasters found or none could be downloaded.""" pass -class RasterDownloadError(Error): +class RasterDownloadError(GeoGrapherError): """Error occurs while downloading raster.""" pass diff --git a/geographer/global_constants.py b/geographer/global_constants.py index b196c93c..c7955d6e 100644 --- a/geographer/global_constants.py +++ b/geographer/global_constants.py @@ -6,6 +6,8 @@ DATA_DIR_SUBDIRS = [ Path("rasters"), Path("labels"), -] # for sentinel-2, also Path("safe_files") +] RASTER_IMGS_INDEX_NAME = "raster_name" VECTOR_FEATURES_INDEX_NAME = "vector_name" + +DUMMY_VALUE = "__DUMMY_VALUE__" diff --git a/geographer/graph/bipartite_graph.py b/geographer/graph/bipartite_graph.py index a886ad95..09b01dd1 100644 --- a/geographer/graph/bipartite_graph.py +++ b/geographer/graph/bipartite_graph.py @@ -38,7 +38,7 @@ import logging from json import JSONDecodeError from pathlib import Path -from typing import Any, Optional +from typing import Any from geographer.graph.bipartite_graph_class import 
BipartiteGraphClass from geographer.graph.type_aliases import VertexColor, VertexName @@ -87,8 +87,8 @@ class BipartiteGraph(BipartiteGraphClass): def __init__( self, - graph_dict: Optional[dict] = None, - file_path: Optional[Path] = None, + graph_dict: dict | None = None, + file_path: Path | None = None, red: VertexColor = None, black: VertexColor = None, directed: bool = False, @@ -112,11 +112,11 @@ def __init__( directed: If True the graph is directed, defaults to False. """ if file_path is not None: - self.file_path: Optional[Path] = file_path + self.file_path: Path | None = file_path self.directed = directed try: - with open(file_path, "r") as read_file: - self._graph_dict = json.load(read_file) + with open(file_path, "r") as file: + self._graph_dict = json.load(file) except FileNotFoundError: log.exception("Graph dict file %s not found", file_path) except JSONDecodeError: @@ -178,7 +178,7 @@ def vertices_opposite( self, vertex_name: VertexName, vertex_color: VertexColor, - edge_data: Optional[Any] = None, + edge_data: Any | None = None, ) -> list[VertexColor]: """Return list of adjacent vertices. @@ -223,7 +223,7 @@ def exists_edge( from_vertex: VertexName, from_vertex_color: VertexColor, to_vertex: VertexName, - edge_data: Optional[Any] = None, + edge_data: Any | None = None, ) -> bool: """Return True if the edge is in the graph, False otherwise.""" if edge_data is None: @@ -332,14 +332,12 @@ def delete_vertex( force_delete_with_edges: """ if not self.exists_vertex(vertex_name, vertex_color): - log.info( "delete_vertex: nothing to do, vertex %s does not exist.", vertex_name ) # if force_delete_with_edges=False check if vertex has outgoing adjacent edges elif self.directed: - log.error( "Sorry, delete_vertex is not implemented for directed graphs. " "I was too lazy to code up the complication of checking " @@ -357,7 +355,6 @@ def delete_vertex( not force_delete_with_edges and list(self.vertices_opposite(vertex_name, vertex_color)) != [] ): - raise Exception( f"delete_vertex: vertex {vertex_name} of color {vertex_color} has " "edges. Set force_delete_with_edges=True to delete anyway " @@ -365,7 +362,6 @@ def delete_vertex( ) else: - # thinking of an undirected graph as a directed graph where for each edge # there is an opposite edge, we first take out the edges _ending_ in # vertex, i.e. the opposite edges to the outgoing ones at vertex. @@ -404,7 +400,7 @@ def delete_edge( opposite_color = self._opposite_color(from_vertex_color) self._graph_dict[opposite_color][to_vertex].pop(from_vertex) - def save_to_file(self, file_path: Optional[Path] = None): + def save_to_file(self, file_path: Path | None = None): """Save graph (i.e. graph_dict) to disk as json file. 
Args: diff --git a/geographer/graph/bipartite_graph_class.py b/geographer/graph/bipartite_graph_class.py index 340e3b68..0314b5ab 100644 --- a/geographer/graph/bipartite_graph_class.py +++ b/geographer/graph/bipartite_graph_class.py @@ -4,7 +4,7 @@ from abc import ABC from pathlib import Path -from typing import Any, Optional +from typing import Any from geographer.graph.type_aliases import VertexColor, VertexName @@ -36,7 +36,7 @@ def vertices_opposite( self, vertex_name: VertexName, vertex_color: VertexColor, - edge_data: Optional[Any] = None, + edge_data: Any | None = None, ) -> list[VertexColor]: """Return list of adjacent vertices.""" raise NotImplementedError @@ -54,7 +54,7 @@ def exists_edge( from_vertex: VertexName, from_vertex_color: VertexColor, to_vertex: VertexName, - edge_data: Optional[Any], + edge_data: Any | None, ) -> bool: """Return True if the edge is in the graph, False otherwise.""" raise NotImplementedError @@ -101,7 +101,7 @@ def delete_edge( """Delete edge from graph.""" raise NotImplementedError - def save_to_file(self, file_path: Optional[Path] = None): + def save_to_file(self, file_path: Path | None = None): """Save graph to file.""" raise NotImplementedError diff --git a/geographer/graph/bipartite_graph_mixin.py b/geographer/graph/bipartite_graph_mixin.py index 2f56d6c9..d59ed5bb 100644 --- a/geographer/graph/bipartite_graph_mixin.py +++ b/geographer/graph/bipartite_graph_mixin.py @@ -4,7 +4,7 @@ import logging from pathlib import Path -from typing import TYPE_CHECKING, Literal, Optional, Union +from typing import TYPE_CHECKING, Literal from geopandas import GeoDataFrame from shapely.geometry.base import BaseGeometry @@ -53,7 +53,7 @@ def rectangle_bounding_raster(self, raster_name: str) -> BaseGeometry: return self.rasters.loc[raster_name, "geometry"] def vectors_intersecting_raster( - self, raster_name: Union[str, list[str]] + self, raster_name: str | list[str] ) -> list[str]: """Return vector features intersecting one or (any of) several rasters. @@ -83,7 +83,7 @@ def vectors_intersecting_raster( def rasters_intersecting_vector( self, - vector_name: Union[str, list[str]], + vector_name: str | list[str], mode: Literal["names", "paths"] = "names", ) -> list[str]: """Return rasters intersecting several vector feature(s). @@ -125,7 +125,7 @@ def rasters_intersecting_vector( return answer def vectors_contained_in_raster( - self, raster_name: Union[str, list[str]] + self, raster_name: str | list[str] ) -> list[str]: """Return vector features fully containing a given raster. @@ -160,7 +160,7 @@ def vectors_contained_in_raster( def rasters_containing_vector( self, - vector_name: Union[str, list[str]], + vector_name: str | list[str], mode: Literal["names", "paths"] = "names", ) -> list[str]: """Return rasters in which a given vector feature is fully contained. @@ -259,10 +259,10 @@ def _connect_raster_to_vector( self, raster_name: str, vector_name: str, - contains_or_intersects: Optional[str] = None, - vectors: Optional[GeoDataFrame] = None, - raster_bounding_rectangle: Optional[BaseGeometry] = None, - graph: Optional[BipartiteGraphClass] = None, + contains_or_intersects: str | None = None, + vectors: GeoDataFrame | None = None, + raster_bounding_rectangle: BaseGeometry | None = None, + graph: BipartiteGraphClass | None = None, do_safety_check: bool = True, ): """Connect a raster to a vector feature in the graph. 
@@ -301,7 +301,6 @@ def _connect_raster_to_vector( # get containment relation if not given if contains_or_intersects is None: - vector_geom = vectors.loc[vector_name, "geometry"] non_empty_intersection = vector_geom.intersects(raster_bounding_rectangle) @@ -338,7 +337,7 @@ def _connect_raster_to_vector( vectors.loc[vector_name, self.raster_count_col_name] += 1 def _add_vector_to_graph( - self, vector_name: str, vectors: Optional[GeoDataFrame] = None + self, vector_name: str, vectors: GeoDataFrame | None = None ): """Connect a vector feature all intersecting rasters. @@ -397,9 +396,9 @@ def _add_vector_to_graph( def _add_raster_to_graph_modify_vectors( self, raster_name: str, - raster_bounding_rectangle: Optional[BaseGeometry] = None, - vectors: Optional[GeoDataFrame] = None, - graph: Optional[BipartiteGraphClass] = None, + raster_bounding_rectangle: BaseGeometry | None = None, + vectors: GeoDataFrame | None = None, + graph: BipartiteGraphClass | None = None, ): """Add raster to graph and modify vector features. diff --git a/geographer/img_polygon_associator_TODO b/geographer/img_polygon_associator_TODO deleted file mode 100644 index 64fea973..00000000 --- a/geographer/img_polygon_associator_TODO +++ /dev/null @@ -1,27 +0,0 @@ -[Rustam: I wrote this for myself, might not be comprehensible to anyone else.] - -TODO: README.MD etc. -TODO: SHOULD I COMBINE THE DOWNLOAD AND PROCESSING FUNCTIONS? -TODO: THE __make_geotif_label__ FUNCTION ASSUMES THERE IS A "TYPE" COLUMN GIVING THE SEGMENTATION TYPE OF A POLYGON IN POLYGONS_DF, AND THAT LABELS ARE CATEGORICAL. THIS WILL NOT COVER EVERY USE CASE. HOW BEST TO MAKE THIS MODULAR/ADAPTABLE? SHOULD I TAKE OUT THE __make_geotif_label__ FUNCTION AND PASS IT AS A PARAMETER WHEN CONSTRUCTING A CLASS INSTANCE? AGAIN< WE CAN TAKE A FUNCTIONAL OR OBJECT ORIENTED APPROACH. -TODO: add failsafe/assert statements that coordinates have been converted correctly by comparing raster_bounding_rectangle with box from metadata... why does this not work? -TODO: type checking not yet implemented -TODO: make sure column and index types are what they should be. - Not sure how best to deal with dtype being 'object' or 'O' when it should be str?? -TODO: Check we add a vertex in the graph whenever we add a polygon or an raster! - and take out 'have_raster?' and 'have_raster_downloaded?' values in self.polygons_df when we delete a polygon vertex - more generally, treat the rows of polygons as "glued" to the polygon vertices, and similarly for the rasters - there should be no public (i.e. non dunder) methods which break the abstraction barrier - by creating an associator that - - how to make precise what I mean? whenever we drop or add a polygon or raster or - change the polygon geometries appropriate connections (edges in the graph) should be added as well. - Or: the following invariants should be maintained by all methods: - - rows in self.polygons_df correspond to (bijectively) polygon vertices, all polygons are connected - (i.e. the polygon vertices are) to all rasters (i.e. raster vertices) they should be connected to, - and if there exists a raster fully containing a polygon that should be reflected in that - polygon's entry in the "have_raster?" column (what about have_raster_downloaded?) - - rows in self.rasters correspond (bijectively) to raster vertices, and all rasters are connected to all - polygons they should be connected to. - Another way of thinking aout this: One should never be able to manipulate the graph from outside the associator. 
One can only add or delete rasters, and the connections should be taken care of automatically. -TODO: index names of associator dataframes can be set (is this still true?). this should be reflected in the documentation -TODO: be consistent with capitalization, punctuation (full stops after descriptions?) diff --git a/geographer/label_makers/label_maker_base.py b/geographer/label_makers/label_maker_base.py index 24885600..b3ae5f70 100644 --- a/geographer/label_makers/label_maker_base.py +++ b/geographer/label_makers/label_maker_base.py @@ -8,7 +8,7 @@ import logging from abc import ABC, abstractmethod -from typing import TYPE_CHECKING, Optional +from typing import TYPE_CHECKING from pydantic import BaseModel @@ -34,7 +34,7 @@ class LabelMaker(ABC, BaseModel, SaveAndLoadBaseModelMixIn): def make_labels( self, connector: Connector, - raster_names: Optional[list[str]] = None, + raster_names: list[str] | None = None, ): """Create segmentation labels. @@ -47,7 +47,7 @@ def make_labels( def delete_labels( self, connector: Connector, - raster_names: Optional[list[str]] = None, + raster_names: list[str] | None = None, ): """Delete (pixel) labels from the connector's labels_dir. @@ -59,7 +59,7 @@ def delete_labels( def recompute_labels( self, connector: Connector, - raster_names: Optional[list[str]] = None, + raster_names: list[str] | None = None, ): """Recompute labels. diff --git a/geographer/label_makers/seg_label_maker_base.py b/geographer/label_makers/seg_label_maker_base.py index add9bf5e..56571d3e 100644 --- a/geographer/label_makers/seg_label_maker_base.py +++ b/geographer/label_makers/seg_label_maker_base.py @@ -4,7 +4,6 @@ import logging from abc import abstractmethod -from typing import Optional from pydantic import BaseModel, Field from tqdm.auto import tqdm @@ -56,7 +55,7 @@ def _run_safety_checks(self, connector: Connector): def make_labels( self, connector: Connector, - raster_names: Optional[list[str]] = None, + raster_names: list[str] | None = None, ): """Create segmentation labels. @@ -101,7 +100,7 @@ def make_labels( def delete_labels( self, connector: Connector, - raster_names: Optional[list[str]] = None, + raster_names: list[str] | None = None, ): """Delete (pixel) labels from the connector's labels_dir. @@ -136,7 +135,6 @@ def _compare_existing_rasters_to_rasters(connector: Connector): # ... then if the set of rasters is a strict subset # of the rasters in rasters ... if existing_rasters < set(connector.rasters.index): - # ... log a warning log.warning( "There are rasters in connector.rasters that " @@ -146,7 +144,6 @@ def _compare_existing_rasters_to_rasters(connector: Connector): # ... and if it is not a subset, ... if not existing_rasters <= set(connector.rasters.index): - # ... log an warning message = ( "Warning! There are rasters in the dataset's rasters " diff --git a/geographer/label_makers/seg_label_maker_categorical.py b/geographer/label_makers/seg_label_maker_categorical.py index c9b87b54..0e6fd277 100644 --- a/geographer/label_makers/seg_label_maker_categorical.py +++ b/geographer/label_makers/seg_label_maker_categorical.py @@ -1,7 +1,5 @@ """Label maker for categorical segmentation labels.""" -from __future__ import annotations - import logging import numpy as np @@ -48,7 +46,6 @@ def _make_label_for_raster(self, connector: Connector, raster_name: str): # If the raster does not exist ... if not raster_path.is_file(): - # ... log error to file. 
log.error( "SegLabelMakerCategorical: input raster %s does not exist!", raster_path @@ -56,16 +53,13 @@ def _make_label_for_raster(self, connector: Connector, raster_name: str): # Else, if the label already exists ... elif label_path.is_file(): - # ... log error to file. log.error("SegLabelMakerCategorical: label %s already exists!", label_path) # Else, ... else: - # ...open the raster, ... with rio.open(raster_path) as src: - profile = src.profile profile.update({"count": 1, "dtype": rio.uint8}) @@ -78,7 +72,6 @@ def _make_label_for_raster(self, connector: Connector, raster_name: str): # nbits=1, **profile, ) as dst: - # ... create an empty band of zeros (background class) ... label = np.zeros((src.height, src.width), dtype=np.uint8) @@ -86,7 +79,6 @@ def _make_label_for_raster(self, connector: Connector, raster_name: str): shapes = [] # pairs of geometries and values to burn in for count, seg_class in enumerate(segmentation_classes, start=1): - # To do that, first find (the df of) the geometries # intersecting the raster ... vectors_intersecting_raster: GeoDataFrame = ( diff --git a/geographer/label_makers/seg_label_maker_soft_categorical.py b/geographer/label_makers/seg_label_maker_soft_categorical.py index d39398d4..6a13bdfa 100644 --- a/geographer/label_makers/seg_label_maker_soft_categorical.py +++ b/geographer/label_makers/seg_label_maker_soft_categorical.py @@ -54,7 +54,6 @@ def _make_label_for_raster( # If the raster does not exist ... if not raster_path.is_file(): - # ... log error to file. log.error( "_make_geotif_label_soft_categorical: input raster %s does not exist!", @@ -63,7 +62,6 @@ def _make_label_for_raster( # Else, if the label already exists ... elif label_path.is_file(): - # ... log error to file. log.error( "_make_geotif_label_soft_categorical: label %s already exists!", @@ -72,19 +70,16 @@ def _make_label_for_raster( # Else, ... else: - label_bands_count = self._get_label_bands_count(connector) # ...open the raster, ... with rio.open(raster_path) as src: - # Create profile for the label. profile = src.profile profile.update({"count": label_bands_count, "dtype": rio.float32}) # Open the label ... with rio.open(label_path, "w+", **profile) as dst: - # ... and create one band in the label for each segmentation class. # (if an implicit background band is to be included, @@ -94,7 +89,6 @@ def _make_label_for_raster( for count, seg_class in enumerate( connector.task_vector_classes, start=start_band ): - # To do that, first find (the df of) # the geoms intersecting the raster ... vectors_intersecting_raster_df = connector.vectors.loc[ @@ -149,7 +143,6 @@ def _make_label_for_raster( # If the background is not included in the segmentation classes ... if self.add_background_band: - # ... add background band. non_background_band_indices = list( @@ -172,16 +165,13 @@ def _make_label_for_raster( dst.write(background_band, 1) def _get_label_bands_count(self, connector: Connector) -> bool: - # If the background is not included in the segmentation classes (default) ... if self.add_background_band: - # ... add a band for the implicit background segmentation class, ... label_bands_count = 1 + len(connector.task_vector_classes) # ... if the background *is* included, ... elif not self.add_background_band: - # ... don't. 
label_bands_count = len(connector.task_vector_classes) diff --git a/geographer/raster_bands_getter_mixin.py b/geographer/raster_bands_getter_mixin.py index c00a15df..9956c138 100644 --- a/geographer/raster_bands_getter_mixin.py +++ b/geographer/raster_bands_getter_mixin.py @@ -3,7 +3,6 @@ from __future__ import annotations from pathlib import Path -from typing import Optional import rasterio as rio @@ -13,7 +12,7 @@ class RasterBandsGetterMixIn: def _get_bands_for_raster( self, - bands: Optional[dict[str, Optional[list[int]]]], + bands: dict[str, list[int] | None] | None, source_raster_path: Path, ) -> list[int]: """Return bands indices to be used in the target raster. diff --git a/geographer/testing/graph_df_compatibility.py b/geographer/testing/graph_df_compatibility.py index da7aa31f..48c9b2d8 100644 --- a/geographer/testing/graph_df_compatibility.py +++ b/geographer/testing/graph_df_compatibility.py @@ -54,11 +54,9 @@ def check_graph_vertices_counts(connector: Connector): answer = True for set_description, set_difference in set_descriptions_and_differences: - num_elements_in_difference = len(set_difference) if num_elements_in_difference != 0: - answer = False are_or_is = "are" if num_elements_in_difference > 1 else "is" @@ -83,7 +81,6 @@ def check_graph_vertices_counts(connector: Connector): ) if not counts_correct.all(): - return_df = pd.concat( [connector.vectors[connector.raster_count_col_name], raster_count_edges], axis=1, diff --git a/geographer/testing/mock_download.py b/geographer/testing/mock_download.py index 2f40d83f..ad548328 100644 --- a/geographer/testing/mock_download.py +++ b/geographer/testing/mock_download.py @@ -7,9 +7,9 @@ import random from pathlib import Path -from typing import Any, Dict, Literal, Union +from typing import Any, Literal -from pydantic import Field +from pydantic import ConfigDict, Field from shapely.geometry import Polygon from geographer.connector import Connector @@ -31,12 +31,9 @@ class MockDownloadProcessor(RasterDownloadProcessor): downloaded. """ - source_connector: Connector = Field(exclude=True) - - class Config: - """BaseModel Config.""" + model_config = ConfigDict(arbitrary_types_allowed=True) - arbitrary_types_allowed = True + source_connector: Connector = Field(exclude=True) def process( self, @@ -45,8 +42,8 @@ def process( rasters_dir: Path, return_bounds_in_crs_epsg_code: int, **kwargs: Any, - ) -> Dict[ - Union[Literal["raster_name", "geometry", "orig_crs_epsg_code"], str], Any + ) -> dict[ + Literal["raster_name", "geometry", "orig_crs_epsg_code"] | str, Any ]: """Process "downloaded" file, i.e. does nothing. @@ -70,23 +67,20 @@ class MockDownloaderForSingleVector(RasterDownloaderForSingleVector): source directory. No actual raster data is copied. """ + model_config = ConfigDict(arbitrary_types_allowed=True) + source_connector: Connector = Field(exclude=True) probability_of_download_error: float = 0.1 probability_raster_already_downloaded: float = 0.1 - class Config: - """BaseModel Config.""" - - arbitrary_types_allowed = True - def download( self, - vector_name: Union[int, str], + vector_name: int | str, vector_geom: Polygon, download_dir: Path, - previously_downloaded_rasters_set: set[Union[str, int]], + previously_downloaded_rasters_set: set[str | int], **kwargs, - ) -> dict[Union[Literal["raster_name", "raster_processed?"], str], Any]: + ) -> dict[Literal["raster_name", "raster_processed?"] | str, Any]: """Mock download an raster. 
Mock download an raster fully containing a vector feature or
@@ -131,7 +125,6 @@ def download(
         # If there isn't such an raster ...
         if rasters_containing_vector == []:
-
             # ... inform the calling download_missing_rasters_for_vectors
             # by raising an error.
             raise NoRastersForVectorFoundError(
@@ -142,14 +135,12 @@ def download(
         # Else, there is an raster in the source dataset
         # containing the vector feature.
         else:
-
             # With some probability the API answers our query with
             # an raster that has already been downloaded...
             if (
                 rasters_containing_vector
                 and random.random() < self.probability_raster_already_downloaded
             ):
-
                 # ... in which case we raise an error.
                 raise RasterAlreadyExistsError(
                     "random.random() was less than "
@@ -165,13 +156,11 @@ def download(
             ]
 
             if remaining_rasters:
-
                 # ... choose one to 'download'.
                 raster_name = random.choice(remaining_rasters)
 
                 # With some probabibility ...
                 if random.random() < self.probability_of_download_error:
-
                     # ... an error occurs when downloading,
                     # so we raise an RasterDownloadError.
                     raise RasterDownloadError(
@@ -191,7 +180,6 @@ def download(
                 }
 
             else:
-
                 raise NoRastersForVectorFoundError(
                     "No new rasters containing vector feature "
                     f"{vector_name} found in source dataset"
diff --git a/geographer/utils/__init__.py b/geographer/utils/__init__.py
index 44c46e67..cf3a72a3 100644
--- a/geographer/utils/__init__.py
+++ b/geographer/utils/__init__.py
@@ -1,8 +1,9 @@
 """Utils for handling remote sensing datasets.
 
-convert_connector_dataset_tif2npy converts a dataset of GeoTiffs to
-.npys. rasters_from_tif_dir generates an rasters GeoDataFrame from a
-directory of GeoTiffs.
+- convert_connector_dataset_tif2npy converts a dataset of GeoTiffs to
+  .npys.
+- rasters_from_tif_dir generates a rasters GeoDataFrame from a
+  directory of GeoTiffs.
 """
 
 from geographer.utils.rasters_from_tif_dir import (
diff --git a/geographer/utils/cluster_rasters.py b/geographer/utils/cluster_rasters.py
index 3f543d06..d7ff085d 100644
--- a/geographer/utils/cluster_rasters.py
+++ b/geographer/utils/cluster_rasters.py
@@ -9,7 +9,7 @@
 import itertools
 from pathlib import Path
-from typing import Any, Literal, Optional, Tuple, Union
+from typing import Any, Literal, Tuple
 
 import networkx as nx
 import pandas as pd
@@ -21,15 +21,13 @@
 def get_raster_clusters(
-    connector: Union[Connector, Path, str],
+    connector: Connector | Path | str,
     clusters_defined_by: Literal[
         "rasters_that_share_vectors",
         "rasters_that_share_vectors_or_overlap",
     ],
-    raster_names: Optional[list[str]] = None,
-    preclustering_method: Optional[
-        Literal["x then y-axis", "y then x-axis", "x-axis", "y-axis"]
-    ] = "y then x-axis",  # TODO!!!!!!!!!!
+    raster_names: list[str] | None = None,
+    preclustering_method: Literal[
+        "x then y-axis", "y then x-axis", "x-axis", "y-axis"
+    ] | None = "y then x-axis",  # TODO!!!!!!!!!!
 ) -> list[set[str]]:
     """Return clusters of raster.
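For orientation, a minimal sketch of how the updated signature above is called; the data directory path is illustrative and not part of this change:

    from geographer.utils.cluster_rasters import get_raster_clusters

    # connector accepts a Connector instance or a str/Path pointing at a
    # data directory containing a connector.
    clusters: list[set[str]] = get_raster_clusters(
        connector="path/to/data_dir",  # illustrative path
        clusters_defined_by="rasters_that_share_vectors_or_overlap",
        preclustering_method="y then x-axis",  # the default
    )
    # Each element of the returned list is a set of raster names forming
    # one cluster.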
@@ -37,7 +35,7 @@ def get_raster_clusters( connector: connector or path or str to data dir containing connector clusters_defined_by: relation between rasters defining clusters raster_names: optional list of raster names - preclustering_method (Optional[ str]): optional preclustering method to speed + preclustering_method: optional preclustering method to speed up clustering Returns: @@ -56,12 +54,10 @@ def get_raster_clusters( raster_names = connector.rasters.index.tolist() if preclustering_method is None: - preclusters = [set(raster_names)] singletons, non_singletons = [], preclusters elif preclustering_method in {"x-axis", "y-axis"}: - axis = preclustering_method[0] # 'x' or 'y' geoms = _get_preclustering_geoms(connector=connector, raster_names=raster_names) @@ -69,7 +65,6 @@ def get_raster_clusters( singletons, non_singletons = _separate_non_singletons(preclusters) elif preclustering_method in {"x then y-axis", "y then x-axis"}: - first_axis = preclustering_method[0] second_axis = "y" if first_axis == "x" else "x" @@ -115,13 +110,10 @@ def _refine_preclustering_along_second_axis( singletons, preclusters_along_2nd_axis = [], [] for precluster in preclusters: - if len(precluster) == 1: - singletons.append(precluster) else: - precluster_geoms = _get_preclustering_geoms( connector=connector, raster_names=list(precluster) ) @@ -140,7 +132,6 @@ def _refine_preclustering_along_second_axis( def _get_preclustering_geoms( connector: Connector, raster_names: list[str] ) -> GeoDataFrame: - # raster geoms rasters = deepcopy_gdf(connector.rasters[["geometry"]].loc[raster_names]) rasters["name"] = rasters.index @@ -172,7 +163,9 @@ def _get_preclustering_geoms( assert set(vectors["name"]) & set(rasters["name"]) == set() # combine geoms - geoms = GeoDataFrame(pd.concat([rasters, vectors]), crs=rasters.crs) + geoms = GeoDataFrame( + pd.concat([rasters, vectors]), crs=rasters.crs, geometry="geometry" + ) # don't need ? 
geoms = deepcopy_gdf(geoms) @@ -185,6 +178,7 @@ def _get_preclustering_geoms( geoms = GeoDataFrame( pd.concat([geoms, geoms.geometry.bounds], axis=1), # column axis crs=geoms.crs, + geometry="geometry", ) return geoms @@ -193,7 +187,6 @@ def _get_preclustering_geoms( def _separate_non_singletons( preclusters: list[set[Any]], ) -> tuple[list[set[Any]], list[set[Any]]]: - singletons, non_singletions = [], [] for precluster in preclusters: if len(precluster) == 1: @@ -208,7 +201,6 @@ def _separate_non_singletons( def _pre_cluster_along_axis( geoms: GeoDataFrame, axis: Literal["x", "y"] ) -> list[set[str]]: - if axis not in {"x", "y"}: raise ValueError("axis arg should be one of 'x', 'y'.") @@ -231,7 +223,6 @@ def _pre_cluster_along_axis( raster_clusters_along_axis = [] while interval_endpoints != []: - rightmost_endpoint = interval_endpoints.pop() assert rightmost_endpoint["type"] == "max" @@ -295,11 +286,9 @@ def _are_connected_by_an_edge( other_raster_bbox = connector.rasters.loc[another_raster].geometry if clusters_defined_by == "rasters_that_overlap": - connected = raster_bbox.intersects(other_raster_bbox) elif clusters_defined_by == "rasters_that_share_vectors": - vectors_in_raster = set(connector.vectors_intersecting_raster(raster)) vectors_in_other_raster = set( connector.vectors_intersecting_raster(another_raster) @@ -308,7 +297,6 @@ def _are_connected_by_an_edge( connected = vectors_in_raster & vectors_in_other_raster != set() elif clusters_defined_by == "rasters_that_share_vectors_or_overlap": - connected_bc_rasters_overlap = _are_connected_by_an_edge( raster, another_raster, "rasters_that_overlap", connector ) @@ -319,7 +307,6 @@ def _are_connected_by_an_edge( connected = connected_bc_rasters_overlap or connected_bc_of_shared_polygons else: - raise ValueError(f"Unknown clusters_defined_by arg: {clusters_defined_by}") return connected diff --git a/geographer/utils/connector_utils.py b/geographer/utils/connector_utils.py index a3616a74..d3f95589 100644 --- a/geographer/utils/connector_utils.py +++ b/geographer/utils/connector_utils.py @@ -1,7 +1,5 @@ """Utilites used in the Connector class.""" -from __future__ import annotations - import logging import pandas as pd @@ -46,7 +44,9 @@ def empty_gdf( }, } - new_empty_gdf = GeoDataFrame(new_empty_gdf_dict, crs=f"EPSG:{crs_epsg_code}") + new_empty_gdf = GeoDataFrame( + new_empty_gdf_dict, crs=f"EPSG:{crs_epsg_code}", geometry="geometry" + ) new_empty_gdf.set_index(index_name, inplace=True) return new_empty_gdf @@ -123,7 +123,6 @@ def _check_df_cols_agree( ): """Log if column names don't agree.""" if set(df.columns) != set(self_df.columns) and len(self_df) > 0: - df1_cols_not_in_df2 = set(df.columns) - set(self_df.columns) df2_cols_not_in_df1 = set(self_df.columns) - set(df.columns) diff --git a/geographer/utils/merge_datasets.py b/geographer/utils/merge_datasets.py index 2355848b..a445c807 100644 --- a/geographer/utils/merge_datasets.py +++ b/geographer/utils/merge_datasets.py @@ -1,9 +1,10 @@ """Utility functions for merging datasets.""" +from __future__ import annotations + import os import shutil from pathlib import Path -from typing import Union from tqdm.auto import tqdm @@ -11,8 +12,8 @@ def merge_datasets( - source_data_dir: Union[Path, str], - target_data_dir: Union[Path, str], + source_data_dir: Path | str, + target_data_dir: Path | str, delete_source: bool = True, ) -> None: """Merge datasets. 
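A minimal usage sketch for the signature above; the directory paths are illustrative:

    from pathlib import Path

    from geographer.utils.merge_datasets import merge_datasets

    # str and Path are now interchangeable for both directory arguments;
    # delete_source defaults to True and removes the source dataset
    # after it has been merged into the target.
    merge_datasets(
        source_data_dir=Path("datasets/source"),  # illustrative
        target_data_dir="datasets/target",  # a plain str works as well
        delete_source=False,
    )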
@@ -46,7 +47,7 @@ def merge_datasets(
 
 # TODO rewrite using pathlib
-def merge_dirs(root_src_dir: Union[Path, str], root_dst_dir: Union[Path, str]) -> None:
+def merge_dirs(root_src_dir: Path | str, root_dst_dir: Path | str) -> None:
     """Recursively merge two folders including subfolders.
 
     (Shamelessly copied from stackoverflow)
diff --git a/geographer/utils/rasters_from_tif_dir.py b/geographer/utils/rasters_from_tif_dir.py
index aea8763a..8400ec1c 100644
--- a/geographer/utils/rasters_from_tif_dir.py
+++ b/geographer/utils/rasters_from_tif_dir.py
@@ -4,7 +4,7 @@
 import pathlib
 from pathlib import Path
-from typing import Callable, Optional, Union
+from typing import Callable
 
 import rasterio as rio
 from geopandas import GeoDataFrame
@@ -19,8 +19,6 @@ def default_read_in_raster_for_raster_df_function(
 ) -> tuple[int, Polygon]:
     """Read in crs and bbox defining a GeoTIFF raster.
 
-    ..note::
-
     Args:
         raster_path: location of the raster
 
     Returns:
         tuple: crs code of the raster, bounding rectangle of the raster
     """
     if raster_path.suffix in [".tif", ".tiff"]:
-
         # ... open them in rasterio ...
         with rio.open(raster_path, "r") as src:
-
             # ... extract information ...
             orig_crs_epsg_code = src.crs.to_epsg()
@@ -45,9 +41,9 @@
 def rasters_from_rasters_dir(
-    rasters_dir: Union[pathlib.Path, str],
-    rasters_crs_epsg_code: Optional[int] = None,
-    raster_names: Optional[list[str]] = None,
+    rasters_dir: pathlib.Path | str,
+    rasters_crs_epsg_code: int | None = None,
+    raster_names: list[str] | None = None,
     rasters_datatype: str = "tif",
     read_in_raster_for_raster_df_function: Callable[
         [Path], tuple[int, Polygon]
@@ -91,14 +87,13 @@
     # dict to keep track of information about the rasters that
     # we will make the rasters from.
-    new_rasters_dict: dict[str, Union[str, GEOMS_UNION, int]] = {
+    new_rasters_dict: dict[str, str | GEOMS_UNION | int] = {
         index_or_col_name: []
         for index_or_col_name in {"raster_name", "geometry", "orig_crs_epsg_code"}
     }
 
     # for all rasters in dir ...
     for raster_path in tqdm(raster_paths, desc="building rasters"):
-
         (
             orig_crs_epsg_code,
             raster_bounding_rectangle_orig_crs,
@@ -125,7 +120,7 @@
         new_rasters_dict[key].append(raster_info_dict[key])
 
     # ... and create a rasters GeoDataFrame from new_rasters_dict:
-    new_rasters = GeoDataFrame(new_rasters_dict)
+    new_rasters = GeoDataFrame(new_rasters_dict, geometry="geometry")
     new_rasters.set_crs(epsg=rasters_crs_epsg_code, inplace=True)
     new_rasters.set_index("raster_name", inplace=True)
diff --git a/geographer/utils/utils.py b/geographer/utils/utils.py
index a3c7f321..f02a1b8e 100644
--- a/geographer/utils/utils.py
+++ b/geographer/utils/utils.py
@@ -66,7 +66,9 @@ def create_logger(app_name: str, level: int = logging.INFO) -> logging.Logger:
     WARNING. One needs to additionally set the console handler level to
     the desired level, which is done by this function.
 
-    ..note:: Function might be adapted for more specialized usage in the future
+    .. note::
+
+        Function might be adapted for more specialized usage in the future
 
     Args:
         app_name: Name of the logger. Will appear in the console output
@@ -133,7 +135,7 @@ def transform_shapely_geometry(
     return transformed_geometry
 
 
-def round_shapely_geometry(geometry: GEOMS_UNION, ndigits=1) -> Union[Polygon, Point]:
+def round_shapely_geometry(geometry: GEOMS_UNION, ndigits=1) -> Polygon | Point:
     """Round the coordinates of a shapely geometry.
Round the coordinates of a shapely geometry (e.g. Polygon or Point). @@ -153,7 +155,10 @@ def round_shapely_geometry(geometry: GEOMS_UNION, ndigits=1) -> Union[Polygon, P def deepcopy_gdf(gdf: GeoDataFrame) -> GeoDataFrame: """Return deepcopy of GeoDataFrame.""" gdf_copy = GeoDataFrame( - columns=gdf.columns, data=copy.deepcopy(gdf.values), crs=gdf.crs + columns=gdf.columns, + data=copy.deepcopy(gdf.values), + crs=gdf.crs, + geometry=gdf.geometry.name, ) gdf_copy = gdf_copy.astype(gdf.dtypes) gdf_copy.set_index(gdf.index, inplace=True) @@ -169,12 +174,19 @@ def concat_gdfs(objs: list[GeoDataFrame], **kwargs: Any) -> GeoDataFrame: list. """ for obj in objs: - if isinstance(obj, GeoDataFrame) and obj.crs != objs[0].crs: - raise ValueError("all geodataframes should have the same crs") + if isinstance(obj, GeoDataFrame): + if obj.crs != objs[0].crs: + raise ValueError("All geodataframes should have the same CRS") + if obj.geometry.name != objs[0].geometry.name: + raise ValueError( + "All geodataframes should have the same geometry column!" + ) elif not isinstance(obj, GeoDataFrame): raise ValueError("all objs should be GeoDataFrames") - concatenated_gdf = GeoDataFrame(pd.concat(objs, **kwargs), crs=objs[0].crs) + concatenated_gdf = GeoDataFrame( + pd.concat(objs, **kwargs), crs=objs[0].crs, geometry=objs[0].geometry.name + ) concatenated_gdf.index.name = objs[0].index.name return concatenated_gdf @@ -186,7 +198,7 @@ def map_dict_values(fun: Callable, dict_arg: dict) -> dict: def create_kml_all_geodataframes( - data_dir: Union[Path, str], out_path: Union[Path, str] + data_dir: Path | str, out_path: Path | str ) -> None: """Create KML file from a dataset's rasters and vectors. @@ -200,12 +212,8 @@ def create_kml_all_geodataframes( rasters_path = data_dir / "connector/rasters.geojson" vectors_path = data_dir / "connector/vectors.geojson" - rasters = gpd.read_file(rasters_path, driver="GeoJSON")[ - ["geometry", RASTER_IMGS_INDEX_NAME] - ] - vectors = gpd.read_file(vectors_path, driver="GeoJSON")[ - ["geometry", VECTOR_FEATURES_INDEX_NAME] - ] + rasters = gpd.read_file(rasters_path)[["geometry", RASTER_IMGS_INDEX_NAME]] + vectors = gpd.read_file(vectors_path)[["geometry", VECTOR_FEATURES_INDEX_NAME]] rasters["Description"] = "raster" rasters["Name"] = rasters[RASTER_IMGS_INDEX_NAME] diff --git a/notebooks/blogpost.ipynb b/notebooks/blogpost.ipynb index 4d75a59f..7e1aee5e 100644 --- a/notebooks/blogpost.ipynb +++ b/notebooks/blogpost.ipynb @@ -14,21 +14,24 @@ "First, we import geographer, as well as some other imports we will need." ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install geographer matplotlib" + ] + }, { "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/rustam/dida/GeoGrapher/geographer-env/lib/python3.10/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ + "from datetime import date, timedelta\n", + "import os\n", + "\n", "import geographer as gg\n", "import geopandas as gpd\n", "from pathlib import Path" @@ -43,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -70,7 +73,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -110,55 +113,55 @@ " Munich Olympiastadion\n", " Munich, Germany\n", " stadium\n", - " POLYGON Z ((11.54677 48.17472 0.00000, 11.5446...\n", + " POLYGON Z ((11.54677 48.17472 0, 11.54469 48.1...\n", " \n", " \n", " Munich Track and Field Stadium1\n", " Munich, Germany\n", " stadium\n", - " POLYGON Z ((11.54382 48.17279 0.00000, 11.5438...\n", + " POLYGON Z ((11.54382 48.17279 0, 11.5438 48.17...\n", " \n", " \n", " Munich Olympia Track and Field2\n", " Munich, Germany\n", " stadium\n", - " POLYGON Z ((11.54686 48.17892 0.00000, 11.5468...\n", + " POLYGON Z ((11.54686 48.17892 0, 11.54685 48.1...\n", " \n", " \n", " Munich Staedtisches Stadion Dantestr\n", " Munich, Germany\n", " stadium\n", - " POLYGON Z ((11.52913 48.16874 0.00000, 11.5291...\n", + " POLYGON Z ((11.52913 48.16874 0, 11.5291 48.16...\n", " \n", " \n", " Vasil Levski National Stadium\n", " Sofia, Bulgaria\n", " stadium\n", - " POLYGON Z ((23.33410 42.68813 0.00000, 23.3340...\n", + " POLYGON Z ((23.3341 42.68813 0, 23.33408 42.68...\n", " \n", " \n", " Bulgarian Army Stadium\n", " Sofia, Bulgaria\n", " stadium\n", - " POLYGON Z ((23.34065 42.68492 0.00000, 23.3406...\n", + " POLYGON Z ((23.34065 42.68492 0, 23.34062 42.6...\n", " \n", " \n", " Arena Sofia\n", " Sofia, Bulgaria\n", " stadium\n", - " POLYGON Z ((23.34018 42.68318 0.00000, 23.3401...\n", + " POLYGON Z ((23.34018 42.68318 0, 23.34018 42.6...\n", " \n", " \n", " Jingu Baseball Stadium\n", " Tokyo, Japan\n", " stadium\n", - " POLYGON Z ((139.71597 35.67490 0.00000, 139.71...\n", + " POLYGON Z ((139.71597 35.6749 0, 139.71599 35....\n", " \n", " \n", " Japan National Stadium\n", " Tokyo, Japan\n", " stadium\n", - " POLYGON Z ((139.71482 35.67644 0.00000, 139.71...\n", + " POLYGON Z ((139.71482 35.67644 0, 139.71484 35...\n", " \n", " \n", "\n", @@ -179,18 +182,18 @@ "\n", " geometry \n", "vector_name \n", - "Munich Olympiastadion POLYGON Z ((11.54677 48.17472 0.00000, 11.5446... \n", - "Munich Track and Field Stadium1 POLYGON Z ((11.54382 48.17279 0.00000, 11.5438... \n", - "Munich Olympia Track and Field2 POLYGON Z ((11.54686 48.17892 0.00000, 11.5468... \n", - "Munich Staedtisches Stadion Dantestr POLYGON Z ((11.52913 48.16874 0.00000, 11.5291... \n", - "Vasil Levski National Stadium POLYGON Z ((23.33410 42.68813 0.00000, 23.3340... \n", - "Bulgarian Army Stadium POLYGON Z ((23.34065 42.68492 0.00000, 23.3406... \n", - "Arena Sofia POLYGON Z ((23.34018 42.68318 0.00000, 23.3401... \n", - "Jingu Baseball Stadium POLYGON Z ((139.71597 35.67490 0.00000, 139.71... \n", - "Japan National Stadium POLYGON Z ((139.71482 35.67644 0.00000, 139.71... " + "Munich Olympiastadion POLYGON Z ((11.54677 48.17472 0, 11.54469 48.1... \n", + "Munich Track and Field Stadium1 POLYGON Z ((11.54382 48.17279 0, 11.5438 48.17... \n", + "Munich Olympia Track and Field2 POLYGON Z ((11.54686 48.17892 0, 11.54685 48.1... \n", + "Munich Staedtisches Stadion Dantestr POLYGON Z ((11.52913 48.16874 0, 11.5291 48.16... 
\n", + "Vasil Levski National Stadium POLYGON Z ((23.3341 42.68813 0, 23.33408 42.68... \n", + "Bulgarian Army Stadium POLYGON Z ((23.34065 42.68492 0, 23.34062 42.6... \n", + "Arena Sofia POLYGON Z ((23.34018 42.68318 0, 23.34018 42.6... \n", + "Jingu Baseball Stadium POLYGON Z ((139.71597 35.6749 0, 139.71599 35.... \n", + "Japan National Stadium POLYGON Z ((139.71482 35.67644 0, 139.71484 35... " ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -210,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -226,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -266,63 +269,63 @@ " \n", " \n", " Munich Olympiastadion\n", - " POLYGON Z ((11.54677 48.17472 0.00000, 11.5446...\n", + " POLYGON Z ((11.54677 48.17472 0, 11.54469 48.1...\n", " 0\n", " Munich, Germany\n", " stadium\n", " \n", " \n", " Munich Track and Field Stadium1\n", - " POLYGON Z ((11.54382 48.17279 0.00000, 11.5438...\n", + " POLYGON Z ((11.54382 48.17279 0, 11.5438 48.17...\n", " 0\n", " Munich, Germany\n", " stadium\n", " \n", " \n", " Munich Olympia Track and Field2\n", - " POLYGON Z ((11.54686 48.17892 0.00000, 11.5468...\n", + " POLYGON Z ((11.54686 48.17892 0, 11.54685 48.1...\n", " 0\n", " Munich, Germany\n", " stadium\n", " \n", " \n", " Munich Staedtisches Stadion Dantestr\n", - " POLYGON Z ((11.52913 48.16874 0.00000, 11.5291...\n", + " POLYGON Z ((11.52913 48.16874 0, 11.5291 48.16...\n", " 0\n", " Munich, Germany\n", " stadium\n", " \n", " \n", " Vasil Levski National Stadium\n", - " POLYGON Z ((23.33410 42.68813 0.00000, 23.3340...\n", + " POLYGON Z ((23.3341 42.68813 0, 23.33408 42.68...\n", " 0\n", " Sofia, Bulgaria\n", " stadium\n", " \n", " \n", " Bulgarian Army Stadium\n", - " POLYGON Z ((23.34065 42.68492 0.00000, 23.3406...\n", + " POLYGON Z ((23.34065 42.68492 0, 23.34062 42.6...\n", " 0\n", " Sofia, Bulgaria\n", " stadium\n", " \n", " \n", " Arena Sofia\n", - " POLYGON Z ((23.34018 42.68318 0.00000, 23.3401...\n", + " POLYGON Z ((23.34018 42.68318 0, 23.34018 42.6...\n", " 0\n", " Sofia, Bulgaria\n", " stadium\n", " \n", " \n", " Jingu Baseball Stadium\n", - " POLYGON Z ((139.71597 35.67490 0.00000, 139.71...\n", + " POLYGON Z ((139.71597 35.6749 0, 139.71599 35....\n", " 0\n", " Tokyo, Japan\n", " stadium\n", " \n", " \n", " Japan National Stadium\n", - " POLYGON Z ((139.71482 35.67644 0.00000, 139.71...\n", + " POLYGON Z ((139.71482 35.67644 0, 139.71484 35...\n", " 0\n", " Tokyo, Japan\n", " stadium\n", @@ -334,15 +337,15 @@ "text/plain": [ " geometry \\\n", "vector_name \n", - "Munich Olympiastadion POLYGON Z ((11.54677 48.17472 0.00000, 11.5446... \n", - "Munich Track and Field Stadium1 POLYGON Z ((11.54382 48.17279 0.00000, 11.5438... \n", - "Munich Olympia Track and Field2 POLYGON Z ((11.54686 48.17892 0.00000, 11.5468... \n", - "Munich Staedtisches Stadion Dantestr POLYGON Z ((11.52913 48.16874 0.00000, 11.5291... \n", - "Vasil Levski National Stadium POLYGON Z ((23.33410 42.68813 0.00000, 23.3340... \n", - "Bulgarian Army Stadium POLYGON Z ((23.34065 42.68492 0.00000, 23.3406... \n", - "Arena Sofia POLYGON Z ((23.34018 42.68318 0.00000, 23.3401... \n", - "Jingu Baseball Stadium POLYGON Z ((139.71597 35.67490 0.00000, 139.71... \n", - "Japan National Stadium POLYGON Z ((139.71482 35.67644 0.00000, 139.71... \n", + "Munich Olympiastadion POLYGON Z ((11.54677 48.17472 0, 11.54469 48.1... 
\n", + "Munich Track and Field Stadium1 POLYGON Z ((11.54382 48.17279 0, 11.5438 48.17... \n", + "Munich Olympia Track and Field2 POLYGON Z ((11.54686 48.17892 0, 11.54685 48.1... \n", + "Munich Staedtisches Stadion Dantestr POLYGON Z ((11.52913 48.16874 0, 11.5291 48.16... \n", + "Vasil Levski National Stadium POLYGON Z ((23.3341 42.68813 0, 23.33408 42.68... \n", + "Bulgarian Army Stadium POLYGON Z ((23.34065 42.68492 0, 23.34062 42.6... \n", + "Arena Sofia POLYGON Z ((23.34018 42.68318 0, 23.34018 42.6... \n", + "Jingu Baseball Stadium POLYGON Z ((139.71597 35.6749 0, 139.71599 35.... \n", + "Japan National Stadium POLYGON Z ((139.71482 35.67644 0, 139.71484 35... \n", "\n", " raster_count location type \n", "vector_name \n", @@ -357,7 +360,7 @@ "Japan National Stadium 0 Tokyo, Japan stadium " ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -372,7 +375,17 @@ "source": [ "# 3. Downloading Rasters for the Vector Data\n", "\n", - "To download rasters for the stadiums, we define a downloader:" + "To download rasters for the stadiums, we will use a downloader based on [eodag](https://eodag.readthedocs.io/en/stable/index.html). We will download rasters from the [copernicus dataspace](https://dataspace.copernicus.eu/). If you do not yet have a copernicus dataspace account, you can create one [here](https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/auth?client_id=cdse-public&redirect_uri=https%3A%2F%2Fdataspace.copernicus.eu%2Fbrowser%2F&response_type=code&scope=openid). To use eodag, eodag will need the username and password of your copernicus dataspace account. One can set these in a config file, but here we will use environment variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# os.environ[\"EODAG__COP_DATASPACE__AUTH__CREDENTIALS__USERNAME\"] = \"PLEASE_CHANGE_ME\"\n", + "# os.environ[\"EODAG__COP_DATASPACE__AUTH__CREDENTIALS__PASSWORD\"] = \"PLEASE_CHANGE_ME\"" ] }, { @@ -383,12 +396,12 @@ "source": [ "from geographer.downloaders import (\n", " RasterDownloaderForVectors,\n", - " SentinelDownloaderForSingleVector,\n", - " Sentinel2Processor,\n", + " EodagDownloaderForSingleVector,\n", + " Sentinel2SAFEProcessor,\n", ")\n", "\n", - "downloader_for_single_vector = SentinelDownloaderForSingleVector()\n", - "download_processor = Sentinel2Processor()\n", + "downloader_for_single_vector = EodagDownloaderForSingleVector()\n", + "download_processor = Sentinel2SAFEProcessor()\n", "\n", "downloader = RasterDownloaderForVectors(\n", " downloader_for_single_vector=downloader_for_single_vector,\n", @@ -400,52 +413,157 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To use the Copernicus SciHub API we need to a username and password. You can sign up for an account [here](https://scihub.copernicus.eu/dhus/#/self-registration). The password and username will be assumed to be stored in a .ini file." + "To download rasters and add them to our dataset we then run the following command. Downloading the large SAFE files and processing the SAFEs to GeoTiffs can take quite a while." 
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, - "outputs": [], - "source": [ - "credentials_ini_path = Path(\"copernicus_scihub_credentials.ini\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f4f8ae6a2ac54ba1a114d900cf054cea", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0it [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c9e9281a68b946fca16a467ae22836a9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "71391febd6f64f15a15b28d68addb279", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Extracting tif from S2A_MSIL2A_20241108T092201_N0511_R093_T34TFN_20241108T123352.SAFE.: 0%| | 0/21 …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-21 22:56:22,769 - geographer.downloaders.sentinel2_safe_unpacking - INFO - Using all zero band for gml mask ('CLOUDS', 'B00') for S2A_MSIL2A_20241108T092201_N0511_R093_T34TFN_20241108T123352.SAFE\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "5cdb756986724631a325c59db8288b6c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1841eba3e6b34cf599fc94983371bd3a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Extracting tif from S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20240731T125141.SAFE.: 0%| | 0/21 …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-21 23:01:13,644 - geographer.downloaders.sentinel2_safe_unpacking - INFO - Using all zero band for gml mask ('CLOUDS', 'B00') for S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20240731T125141.SAFE\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d0451faca6314052ab509d29b57bcd7c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "0.00B [00:00, ?B/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c300b0ab0bd44174a0ae9d145f88b3e3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Extracting tif from S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20231208T031743.SAFE.: 0%| | 0/21 …" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-11-21 23:06:47,516 - geographer.downloaders.sentinel2_safe_unpacking - INFO - Using all zero band for gml mask ('CLOUDS', 'B00') for S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20231208T031743.SAFE\n" + ] + } + ], "source": [ - "The contents of the ini file should look as follows:\n", + "# Here, we define the parameters needed by the EodagDownloaderForSingleVector.download method\n", + "downloader_params = {\n", + " \"search_kwargs\": { # Keyword arguments for the eodag search_all method\n", + " \"provider\": \"cop_dataspace\", # Download from 
copernicus dataspace \n", + " \"productType\": \"S2_MSI_L2A\", # Search for Sentinel-2 L2A products\n", + " \"start\": (date.today() - timedelta(days=364)).strftime(\"%Y-%m-%d\"), # one year ago\n", + " \"end\": date.today().strftime(\"%Y-%m-%d\"), # today\n", + " },\n", + " \"filter_online\": True, # Filter out products that are not online\n", + " \"sort_by\": (\"cloudCover\", \"ASC\"), # Sort products by percentage of cloud cover in ascending order\n", + " \"suffix_to_remove\": \".SAFE\" # Will strip .SAFE from the stem of the tif file names\n", + "}\n", + "# Here, we define the parameters needed by the Sentinel2SAFEProcessor\n", + "processor_params = {\n", + " \"resolution\": 10, # Extract all 10m resolution bands\n", + " \"delete_safe\": True, # Delete the SAFE file after extracting a .tif file\n", + "} \n", "\n", - "```\n", - "[login]\n", - "username = your_username\n", - "password = your_password\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To download rasters and add them to our dataset we then run the following command." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ "downloader.download(\n", " connector=connector,\n", - " credentials=credentials_ini_path, # could also directly supply (username, password) tuple\n", - " producttype=\"L2A\",\n", - " max_percent_cloud_coverage=10,\n", - " resolution=10, # resolution of extracted GeoTiff\n", - " date=(\"NOW-364DAYS\", \"NOW\"),\n", - " area_relation=\"Contains\",\n", + " downloader_params=downloader_params,\n", + " processor_params=processor_params,\n", ")" ] }, @@ -458,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -482,72 +600,61 @@ " \n", " \n", " \n", + " geometry\n", " raster_processed?\n", - " timestamp\n", " orig_crs_epsg_code\n", - " geometry\n", " \n", " \n", " raster_name\n", " \n", " \n", " \n", - " \n", " \n", " \n", " \n", " \n", - " S2A_MSIL2A_20220627T100611_N0400_R022_T32UPU_20220627T162810.tif\n", + " S2A_MSIL2A_20241108T092201_N0511_R093_T34TFN_20241108T123352.tif\n", + " POLYGON ((23.54663 42.33578, 23.58754 43.32358...\n", " True\n", - " 2022-06-27-10:06:11\n", - " 32632\n", - " POLYGON ((11.79809 47.73104, 11.85244 48.71769...\n", + " 32634\n", " \n", " \n", - " S2A_MSIL2A_20220412T012701_N0400_R074_T54SUE_20220412T042315.tif\n", + " S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20240731T125141.tif\n", + " POLYGON ((11.79809 47.73104, 11.85244 48.71769...\n", " True\n", - " 2022-04-12-01:27:01\n", - " 32654\n", - " POLYGON ((140.00972 35.15084, 139.99743 36.140...\n", + " 32632\n", " \n", " \n", - " S2A_MSIL2A_20220722T092041_N0400_R093_T34TFN_20220722T134859.tif\n", + " S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20231208T031743.tif\n", + " POLYGON ((140.00972 35.15084, 139.99743 36.140...\n", " True\n", - " 2022-07-22-09:20:41\n", - " 32634\n", - " POLYGON ((23.54663 42.33578, 23.58754 43.32358...\n", + " 32654\n", " \n", " \n", "\n", "" ], "text/plain": [ + " geometry \\\n", + "raster_name \n", + "S2A_MSIL2A_20241108T092201_N0511_R093_T34TFN_20... POLYGON ((23.54663 42.33578, 23.58754 43.32358... \n", + "S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20... POLYGON ((11.79809 47.73104, 11.85244 48.71769... \n", + "S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20... POLYGON ((140.00972 35.15084, 139.99743 36.140... \n", + "\n", " raster_processed? \\\n", "raster_name \n", - "S2A_MSIL2A_20220627T100611_N0400_R022_T32UPU_20... 
True \n", - "S2A_MSIL2A_20220412T012701_N0400_R074_T54SUE_20... True \n", - "S2A_MSIL2A_20220722T092041_N0400_R093_T34TFN_20... True \n", + "S2A_MSIL2A_20241108T092201_N0511_R093_T34TFN_20... True \n", + "S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20... True \n", + "S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20... True \n", "\n", - " timestamp \\\n", - "raster_name \n", - "S2A_MSIL2A_20220627T100611_N0400_R022_T32UPU_20... 2022-06-27-10:06:11 \n", - "S2A_MSIL2A_20220412T012701_N0400_R074_T54SUE_20... 2022-04-12-01:27:01 \n", - "S2A_MSIL2A_20220722T092041_N0400_R093_T34TFN_20... 2022-07-22-09:20:41 \n", - "\n", - " orig_crs_epsg_code \\\n", - "raster_name \n", - "S2A_MSIL2A_20220627T100611_N0400_R022_T32UPU_20... 32632 \n", - "S2A_MSIL2A_20220412T012701_N0400_R074_T54SUE_20... 32654 \n", - "S2A_MSIL2A_20220722T092041_N0400_R093_T34TFN_20... 32634 \n", - "\n", - " geometry \n", - "raster_name \n", - "S2A_MSIL2A_20220627T100611_N0400_R022_T32UPU_20... POLYGON ((11.79809 47.73104, 11.85244 48.71769... \n", - "S2A_MSIL2A_20220412T012701_N0400_R074_T54SUE_20... POLYGON ((140.00972 35.15084, 139.99743 36.140... \n", - "S2A_MSIL2A_20220722T092041_N0400_R093_T34TFN_20... POLYGON ((23.54663 42.33578, 23.58754 43.32358... " + " orig_crs_epsg_code \n", + "raster_name \n", + "S2A_MSIL2A_20241108T092201_N0511_R093_T34TFN_20... 32634 \n", + "S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20... 32632 \n", + "S2B_MSIL2A_20231208T013039_N0509_R074_T54SUE_20... 32654 " ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -565,7 +672,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -573,10 +680,10 @@ "output_type": "stream", "text": [ "rasters containing Munich Olympiastadion:\n", - "['S2A_MSIL2A_20220627T100611_N0400_R022_T32UPU_20220627T162810.tif'] \n", + "['S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20240731T125141.tif'] \n", "\n", - "vector features (stadiums) intersecting S2A_MSIL2A_20220627T100611_N0400_R022_T32UPU_20220627T162810.tif:\n", - "['Munich Track and Field Stadium1', 'Munich Olympiastadion', 'Munich Olympia Track and Field2', 'Munich Staedtisches Stadion Dantestr']\n" + "vector features (stadiums) intersecting S2B_MSIL2A_20240731T100559_N0511_R022_T32UPU_20240731T125141.tif:\n", + "['Munich Olympiastadion', 'Munich Staedtisches Stadion Dantestr', 'Munich Track and Field Stadium1', 'Munich Olympia Track and Field2']\n" ] } ], @@ -601,15 +708,22 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "Cutting dataset: 100%|██████████| 9/9 [00:01<00:00, 7.56it/s]\n" - ] + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "685cb0130efd43458007ba344c45453a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Cutting dataset: 0%| | 0/9 [00:00" ] @@ -692,14 +813,19 @@ "import rioxarray\n", "from matplotlib import pyplot as plt\n", "\n", - "raster_paths = target_connector.rasters_containing_vector(\"Munich Olympiastadion\", mode=\"paths\")\n", + "raster_paths = target_connector.rasters_containing_vector(\n", + " \"Munich Olympiastadion\", mode=\"paths\"\n", + ")\n", "raster_path = raster_paths[0]\n", "label_path = target_connector.labels_dir / raster_path.name\n", "\n", - "raster = rioxarray.open_rasterio(raster_path).sel(band=[1,2,3]).values.transpose(1, 2, 0) / 65535\n", + "raster = (\n", + " 
rioxarray.open_rasterio(raster_path).sel(band=[1, 2, 3]).values.transpose(1, 2, 0)\n", + " / 65535\n", + ")\n", "label = rioxarray.open_rasterio(label_path).values.transpose(1, 2, 0) / 255\n", "\n", - "fig, ax = plt.subplots(1,2, figsize=(13, 13))\n", + "fig, ax = plt.subplots(1, 2, figsize=(13, 13))\n", "\n", "ax[0].imshow(raster)\n", "ax[0].axis(\"off\")\n", @@ -725,7 +851,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.10.12" }, "orig_nbformat": 4, "vscode": { diff --git a/notebooks/tutorial_nb_basics.ipynb b/notebooks/tutorial_nb_basics.ipynb index e52602d1..e763efdb 100644 --- a/notebooks/tutorial_nb_basics.ipynb +++ b/notebooks/tutorial_nb_basics.ipynb @@ -31,9 +31,13 @@ "metadata": {}, "outputs": [], "source": [ + "from pathlib import Path\n", + "\n", + "from datetime import date, timedelta\n", + "\n", "import geographer as gg\n", "import geopandas as gpd\n", - "from pathlib import Path" + "\n" ] }, { @@ -620,7 +624,7 @@ "source": [ "## 3. Downloading rasters for the vector data\n", "\n", - "To download rasters for the stadiums, we use the `RasterDownloaderForVectors`. This class needs to be passed a `DownloaderForSingleVector` to interface with the particular data source for our rasters, and a `RasterDownloadProcessor` to process the downloaded files. In this example, we would like to download Sentinel-2, so we choose the `SentinelDownloaderForSingleVector` to interface with [Copernicus Open Access Hub](https://scihub.copernicus.eu/) and the Sentinel2Processor to process the downloaded zipped .SAFE files to GeoTiff files (see [here](https://sentinels.copernicus.eu/web/sentinel/user-guides/sentinel-2-msi/data-formats) for an explanation of the Sentinel-2 data format). The GeoTiff format is a georeferenced version for remote sensing raster data of the Tiff format for normal rasters.\n", + "To download rasters for the stadiums, we use the `RasterDownloaderForVectors`. This class needs to be passed a `DownloaderForSingleVector` to interface with the data provider for our rasters, and a `RasterDownloadProcessor` to process the downloaded files. In this example, we will use the `EodagDownloaderForSingleVector` which uses [eodag](https://eodag.readthedocs.io/en/stable/) as a backend giving easy access to more than 10 providers and more than 50 different product types. We will use it to download Sentinel-2 from the [Copernicus Dataspace](https://dataspace.copernicus.eu/). To process the downloaded SAFE files (see [here](https://sentiwiki.copernicus.eu/web/s2-products) for an explanation of the Sentinel-2 data format) into GeoTiffs we use the `Sentinel2SAFEProcessor`. 
The GeoTiff format is a georeferenced version of the Tiff image format, used for remote sensing raster data.\n",
    "\n",
    "Here, we define the downloader:"
   ]
  },
@@ -633,13 +637,12 @@
   "source": [
    "from geographer.downloaders import (\n",
    "    RasterDownloaderForVectors,\n",
-    "    SentinelDownloaderForSingleVector,\n",
-    "    Sentinel2Processor,\n",
+    "    EodagDownloaderForSingleVector,\n",
+    "    Sentinel2SAFEProcessor,\n",
    ")\n",
    "\n",
-    "downloader_for_single_vector = SentinelDownloaderForSingleVector()\n",
-    "download_processor = Sentinel2Processor()\n",
-    "\n",
+    "download_processor = Sentinel2SAFEProcessor()\n",
+    "downloader_for_single_vector = EodagDownloaderForSingleVector()\n",
    "downloader = RasterDownloaderForVectors(\n",
    "    downloader_for_single_vector=downloader_for_single_vector,\n",
    "    download_processor=download_processor,\n",
@@ -650,16 +653,39 @@
  "cell_type": "markdown",
  "metadata": {},
  "source": [
-    "To use the Copernicus SciHub API we need to a username and password. You can sign up for an account [here](https://scihub.copernicus.eu/dhus/#/self-registration). The password and username will be assumed to be stored in a .ini file. The format of the file should be as follows."
+    "If you do not yet have a Copernicus Dataspace account, you can create one [here](https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/auth?client_id=cdse-public&redirect_uri=https%3A%2F%2Fdataspace.copernicus.eu%2Fbrowser%2F&response_type=code&scope=openid). eodag needs the username and password of your Copernicus Dataspace account. One can set these in a config file, but here we will use environment variables:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
-    "credentials_ini_path = DATA_DIR / \"copernicus_scihub_credentials.ini\""
+    "# os.environ[\"EODAG__COP_DATASPACE__AUTH__CREDENTIALS__USERNAME\"] = \"PLEASE_CHANGE_ME\"\n",
+    "# os.environ[\"EODAG__COP_DATASPACE__AUTH__CREDENTIALS__PASSWORD\"] = \"PLEASE_CHANGE_ME\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The `downloader_for_single_vector` has an `eodag` attribute, which is an `EODataAccessGateway`. We can use it, for example, to check that it is configured to access the Copernicus Dataspace provider:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert \"cop_dataspace\" in downloader_for_single_vector.eodag.available_providers(\n",
+    "    product_type=\"S2_MSI_L2A\"\n",
+    ")"
   ]
  },
 {
@@ -1121,15 +1147,29 @@
   }
  ],
  "source": [
+    "# Here, we define the parameters needed by the EodagDownloaderForSingleVector.download method\n",
+    "downloader_params = {\n",
+    "    \"search_kwargs\": { # Keyword arguments for the eodag search_all method\n",
+    "        \"provider\": \"cop_dataspace\", # Download from the Copernicus Dataspace\n",
+    "        \"productType\": \"S2_MSI_L2A\", # Search for Sentinel-2 L2A products\n",
+    "        \"start\": (date.today() - timedelta(days=364)).strftime(\"%Y-%m-%d\"), # one year ago\n",
+    "        \"end\": date.today().strftime(\"%Y-%m-%d\"), # today\n",
+    "    },\n",
+    "    \"filter_online\": True, # Filter out products that are not online\n",
+    "    \"sort_by\": (\"cloudCover\", \"ASC\"), # Sort products by percentage of cloud cover in ascending order\n",
+    "    \"suffix_to_remove\": \".SAFE\" # Will strip .SAFE from the stem of the tif file names\n",
+    "}\n",
+    "# Here, we define the parameters needed by the Sentinel2SAFEProcessor\n",
+    "processor_params = {\n",
+    "    \"resolution\": 10, # Extract all 10m resolution bands\n",
+    "    \"delete_safe\": True, # Delete the SAFE file after extracting a .tif file\n",
+    "}\n",
+    "\n",
 "downloader.download(\n",
 "    connector=connector,\n",
-    "    target_raster_count=2, # optional, defaults to 1. See explanation below.\n",
-    "    credentials=credentials_ini_path, # could also directly supply (username, password) tuple\n",
-    "    producttype=\"L2A\",\n",
-    "    max_percent_cloud_coverage=10,\n",
-    "    resolution=10, # resolution of extracted GeoTiff\n",
-    "    date=(\"NOW-364DAYS\", \"NOW\"),\n",
-    "    area_relation=\"Contains\",\n",
+    "    target_raster_count=2, # optional, defaults to 1. Aim for 2 rasters covering each stadium. See below for further explanation.\n",
+    "    downloader_params=downloader_params,\n",
+    "    processor_params=processor_params,\n",
 ")"
   ]
  },
diff --git a/notebooks/tutorial_nb_cut_label_cluster.ipynb b/notebooks/tutorial_nb_cut_label_cluster.ipynb
index d0e76b59..352e85d9 100644
--- a/notebooks/tutorial_nb_cut_label_cluster.ipynb
+++ b/notebooks/tutorial_nb_cut_label_cluster.ipynb
@@ -20,6 +20,15 @@
 "First, we import geographer, as well as some other imports we will need."
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install geographer matplotlib" + ] + }, { "cell_type": "code", "execution_count": 20, @@ -550,14 +559,19 @@ } ], "source": [ - "raster_paths = target_connector.rasters_containing_vector(\"Munich Olympiastadion\", mode=\"paths\")\n", + "raster_paths = target_connector.rasters_containing_vector(\n", + " \"Munich Olympiastadion\", mode=\"paths\"\n", + ")\n", "raster_path = raster_paths[0]\n", "label_path = target_connector.labels_dir / raster_path.name\n", "\n", - "raster = rioxarray.open_rasterio(raster_path).sel(band=[1,2,3]).values.transpose(1, 2, 0) / 65535\n", + "raster = (\n", + " rioxarray.open_rasterio(raster_path).sel(band=[1, 2, 3]).values.transpose(1, 2, 0)\n", + " / 65535\n", + ")\n", "label = rioxarray.open_rasterio(label_path).values.transpose(1, 2, 0) / 255\n", "\n", - "fig, ax = plt.subplots(1,2, figsize=(13, 13))\n", + "fig, ax = plt.subplots(1, 2, figsize=(13, 13))\n", "\n", "ax[0].imshow(raster)\n", "ax[0].axis(\"off\")\n", @@ -1039,7 +1053,7 @@ "from geographer.cutters import DSCutterIterOverVectors\n", "\n", "cutter_json_path = TARGET_DATA_DIR2 / \"connector\" / f\"{TRUNCATED_CUTTER_NAME}.json\"\n", - "cutter = DSCutterIterOverVectors.from_json_file(cutter_json_path)\n" + "cutter = DSCutterIterOverVectors.from_json_file(cutter_json_path)" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 8b94f82f..70fb85c2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,22 +1,81 @@ -[tool.black] -target-version = ['py37', 'py38', 'py39'] +[project] +name = "geographer" +version = "1.0.0" +description = "Build object-centric remote sensing computer vision datasets" +readme = "README.md" +license = {file = "LICENSE"} +authors = [ + {name = "Rustam Antia", email = "rustam.antia@gmail.com"} +] +keywords = ["remote sensing", "computer vision", "satellite imagery", "GIS", "machine learning"] +classifiers = [ + "License :: Other/Proprietary License", + "Programming Language :: Python :: 3" +] +requires-python = ">=3.9" +dependencies = [ + "eodag", + "eval_type_backport", + "fiona", + "geojson", + "geopandas", + "GitPython", + "ipywidgets", + "networkx", + "numpy", + "packaging", + "pandas", + "pydantic >= 2.0", + "pyproj", + "rasterio", + "requests", + "rioxarray", + "rtree", + "scipy", + "sentinelsat", + "Shapely", + "tqdm", + "urllib3" +] -[tool.isort] -atomic = true -profile = "black" -skip_gitignore = true - -[tool.mypy] -plugins = ["pydantic.mypy"] -show_error_codes = true -disallow_untyped_defs = false -disallow_incomplete_defs = false -ignore_missing_imports = true -warn_unused_ignores = false -warn_return_any = false -warn_unreachable = false -strict_optional=true +[project.optional-dependencies] +dev = [ + "ruff==0.7.4", + "build", + "docformatter", + "ipykernel", + "pytest" +] +docs = [ + "furo", + "autodoc_pydantic", + "docutils", + "pandoc", + "Sphinx", + "sphinx-autodoc-typehints", + "nbsphinx", + "nbsphinx-link" +] [build-system] -requires = ["setuptools"] +requires = ["setuptools", "wheel", "build"] build-backend = "setuptools.build_meta" + +[tool.setuptools] +packages = {find = {include = ["geographer*"], exclude = ["docs*", "tests*", "notebooks*"]}} + +[tool.pytest.ini_options] +markers = [ + "slow: marks tests as slow (deselect with '-m \"not slow\"')" +] + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +extend-ignore = ["E266"] # allow multiple leading '#' for block comments +per-file-ignores = { "__init__.py" = ["F401", "D104"] } + 
+[tool.ruff.lint.isort] +combine-as-imports = true +force-sort-within-sections = true diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 5412d2c6..00000000 --- a/setup.cfg +++ /dev/null @@ -1,105 +0,0 @@ -[metadata] -name = geographer -version = 0.1 -author = Rustam Antia -author_email = rustam.antia@gmail.com -description = Build objec-centric remote sensing computer vision datasets -long_description = file: README.md -long_description_content_type = text/markdown -keywords = remote sensing, computer vision, satellite imagery, GIS, machine learning -license = Apache-2.0 -classifiers = - License :: OSI Approved :: Apache Software License - Programming Language :: Python :: 3 - -[options] -python_requires = >= 3.8 -packages = find: -zip_safe = False -include_package_data = True -install_requires = - fiona - geojson - geopandas - GitPython - ipywidgets - matplotlib - networkx - numpy - packaging - pandas - pydantic - pygeos - pyproj - rasterio - requests - rioxarray - rtree - scipy - sentinelsat - Shapely - tqdm - urllib3 - -[options.extras_require] -dev = - black - docformatter - flake8 - flake8-black - flake8-docstrings - flake8-isort - isort - mypy - pytest -docs = - alabaster - autodoc_pydantic - docutils - pandoc - Sphinx - sphinx-autodoc-typehints - nbsphinx - nbsphinx-link - -[options.package_data] -geographer = data/schema.json, *.txt -* = README.md - -[flake8] -max-line-length = 88 -# allow multiple leading '#' for block comment -extend-ignore = E266 -per-file-ignores = __init__.py:F401,D104 - -[mypy] - -[mypy-geopandas.*] -ignore_missing_imports = True - -[mypy-rasterio.*] -ignore_missing_imports = True - -[mypy-scipy.*] -ignore_missing_imports = True - -[mypy-networkx.*] -ignore_missing_imports = True - -[mypy-affine.*] -ignore_missing_imports = True - -[mypy-pandas.*] -ignore_missing_imports = True - -[mypy-sentinelsat.*] -ignore_missing_imports = True - -[mypy-tqdm.*] -ignore_missing_imports = True - -[mypy-shapely.*] -ignore_missing_imports = True - -[mypy-fiona.*] -ignore_missing_imports = True diff --git a/tests/cluster_rasters_test.py b/tests/cluster_rasters_test.py index 4c0b2125..f8c4aef7 100644 --- a/tests/cluster_rasters_test.py +++ b/tests/cluster_rasters_test.py @@ -25,11 +25,10 @@ def test_cluster_rasters(): # Create empty connector data_dir = Path("/whatever/") connector = Connector.from_scratch(data_dir=data_dir) - """ Create vectors """ - new_vectors = gpd.GeoDataFrame() + new_vectors = gpd.GeoDataFrame(geometry=gpd.GeoSeries([])) new_vectors.rename_axis(VECTOR_FEATURES_INDEX_NAME, inplace=True) # polygon names and geometries @@ -45,11 +44,10 @@ def test_cluster_rasters(): new_vectors = new_vectors.set_crs(epsg=STANDARD_CRS_EPSG_CODE) connector.add_to_vectors(new_vectors) - """ Create rasters """ - new_rasters = gpd.GeoDataFrame() + new_rasters = gpd.GeoDataFrame(geometry=gpd.GeoSeries([])) new_rasters.rename_axis(RASTER_IMGS_INDEX_NAME, inplace=True) # geometries (raster bounding rectangles) @@ -85,7 +83,6 @@ def test_cluster_rasters(): new_rasters = new_rasters.set_crs(epsg=STANDARD_CRS_EPSG_CODE) connector.add_to_rasters(new_rasters) - """ Test clusters defined by 'rasters_that_share_vectors_or_overlap' """ @@ -98,7 +95,6 @@ def test_cluster_rasters(): frozenset({"raster3", "raster1", "raster2"}), frozenset({"raster4", "raster6", "raster5"}), } - """ Test clusters defined by 'rasters_that_share_vectors' """ diff --git a/tests/conftest.py b/tests/conftest.py index 4aeab935..5d093c49 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 
+1,6 @@ """Pytest fixtures.""" import shutil -from pathlib import Path import pytest from utils import create_dummy_rasters, delete_dummy_rasters, get_test_dir @@ -12,10 +11,11 @@ @pytest.fixture(scope="session") -def dummy_cut_source_data_dir() -> Path: +def dummy_cut_source_data_dir(): """Return cut source data dir containing dummy data. - Dummy rasters are created before the pytest session starts and removed afterwards. + Dummy rasters are created before the pytest session starts and + removed afterwards. """ data_dir = get_test_dir() / CUT_SOURCE_DATA_DIR_NAME connector = Connector.from_data_dir(data_dir=data_dir) diff --git a/tests/connector_test.py b/tests/connector_test.py index 48b99eb1..f7dde623 100644 --- a/tests/connector_test.py +++ b/tests/connector_test.py @@ -37,12 +37,11 @@ def test_connector(): connector = Connector.from_scratch( data_dir=data_dir, task_vector_classes=TASK_FEATURE_CLASSES ) - """ Toy vectors """ # create empty GeoDataFrame with the right index name - new_vectors = gpd.GeoDataFrame() + new_vectors = gpd.GeoDataFrame(geometry=gpd.GeoSeries([])) new_vectors.rename_axis(VECTOR_FEATURES_INDEX_NAME, inplace=True) # polygon names and geometries @@ -62,9 +61,7 @@ def test_connector(): # set crs new_vectors = new_vectors.set_crs(epsg=STANDARD_CRS_EPSG_CODE) - """ - Test add_to_vectors - """ + """Test add_to_vectors.""" # add vectors connector.add_to_vectors(new_vectors) @@ -76,12 +73,10 @@ def test_connector(): new_vectors, ) assert check_graph_vertices_counts(connector) - """ - Toy rasters - """ + """Toy rasters.""" # empty GeoDataFrame with right index name - new_rasters = gpd.GeoDataFrame() + new_rasters = gpd.GeoDataFrame(geometry=gpd.GeoSeries([])) new_rasters.rename_axis(RASTER_IMGS_INDEX_NAME, inplace=True) # geometries (raster bounding rectangles) @@ -102,9 +97,7 @@ def test_connector(): # set crs new_rasters = new_rasters.set_crs(epsg=STANDARD_CRS_EPSG_CODE) - """ - Test add_to_rasters - """ + """Test add_to_rasters.""" connector.add_to_rasters(new_rasters) assert connector._graph._graph_dict == { @@ -119,12 +112,10 @@ def test_connector(): }, } assert check_graph_vertices_counts(connector) - """ - Test have_raster_for_vector, rectangle_bounding_raster, + """Test have_raster_for_vector, rectangle_bounding_raster, polygons_intersecting_raster, polygons_contained_in_raster, - rasters_containing_vector, values of 'have_raster?' - column in connector.vectors. - """ + rasters_containing_vector, values of 'have_raster?' column in + connector.vectors.""" assert (connector.rectangle_bounding_raster("raster1")).equals( box(-0.5, -0.5, 6, 6) ) @@ -138,11 +129,9 @@ def test_connector(): "polygon1", "polygon2", } - """ - Add more rasters - """ + """Add more rasters.""" # empty GeoDataFrame with right index name - new_rasters2 = gpd.GeoDataFrame() + new_rasters2 = gpd.GeoDataFrame(geometry=gpd.GeoSeries([])) new_rasters2.rename_axis(RASTER_IMGS_INDEX_NAME, inplace=True) # the new_rasters2 geometries will be the raster bounding rectangles here: @@ -195,9 +184,7 @@ def test_connector(): }, } assert check_graph_vertices_counts(connector) - """ - Drop vector feature - """ + """Drop vector feature.""" connector.drop_vectors("polygon3") # test containment/intersection relations, i.e. 
graph structure @@ -226,12 +213,10 @@ def test_connector(): }, } assert check_graph_vertices_counts(connector) - """ - Add more vector features - """ + """Add more vector features.""" # create empty GeoDataFrame with the right index name - new_vectors2 = gpd.GeoDataFrame() + new_vectors2 = gpd.GeoDataFrame(geometry=gpd.GeoSeries([])) new_vectors2.rename_axis(VECTOR_FEATURES_INDEX_NAME, inplace=True) # polygon names and geometries @@ -288,9 +273,7 @@ def test_connector(): # assert we have no duplicate entries assert len(connector.rasters) == 4 assert len(connector.vectors) == 4 - """ - Test drop_rasters - """ + """Test drop_rasters.""" connector.drop_rasters(["raster2", "raster3"]) assert len(connector.rasters) == 2 @@ -313,9 +296,7 @@ def test_connector(): } assert check_graph_vertices_counts(connector) - """ - Test drop_vectors - """ + """Test drop_vectors.""" connector.drop_vectors(["polygon1", "polygon3"]) assert len(connector.vectors) == 2 diff --git a/tests/cut_rasters_around_every_vector_test.py b/tests/cut_rasters_around_every_vector_test.py index 4d818aa9..5dba4e4b 100644 --- a/tests/cut_rasters_around_every_vector_test.py +++ b/tests/cut_rasters_around_every_vector_test.py @@ -12,7 +12,6 @@ """ import shutil -from typing import List from shapely.geometry import Polygon from shapely.ops import unary_union @@ -121,7 +120,7 @@ def test_rasters_around_every_vector(dummy_cut_source_data_dir): tempelhofer_feld: Polygon = target_connector.vectors.loc[ "berlin_tempelhofer_feld" ].geometry - rasters_intersecting_tempelhofer_feld: List[str] = ( + rasters_intersecting_tempelhofer_feld: list[str] = ( target_connector.rasters_intersecting_vector("berlin_tempelhofer_feld") ) bboxes = target_connector.rasters.geometry.loc[ diff --git a/tests/data/cut_source/connector/rasters.geojson b/tests/data/cut_source/connector/rasters.geojson index 06db1810..1a5a7c45 100644 --- a/tests/data/cut_source/connector/rasters.geojson +++ b/tests/data/cut_source/connector/rasters.geojson @@ -1,5 +1,6 @@ { "type": "FeatureCollection", +"name": "rasters", "crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } }, "features": [ { "type": "Feature", "properties": { "raster_name": "S2A_MSIL2A_20220309T100841_N0400_R022_T32UQD_20220309T121849.tif", "raster_processed?": true, "timestamp": "2022-03-09-10:08:41", "orig_crs_epsg_code": 32632 }, "geometry": { "type": "Polygon", "coordinates": [ [ [ 13.531032170806659, 52.175484718678582 ], [ 13.634183017872079, 53.159421967147352 ], [ 11.994640922835734, 53.211992871403481 ], [ 11.927820031382028, 52.226225390811273 ], [ 13.531032170806659, 52.175484718678582 ] ] ] } } diff --git a/tests/data/cut_source/connector/vectors.geojson b/tests/data/cut_source/connector/vectors.geojson index 22ca2506..b5596b90 100644 --- a/tests/data/cut_source/connector/vectors.geojson +++ b/tests/data/cut_source/connector/vectors.geojson @@ -1,5 +1,6 @@ { "type": "FeatureCollection", +"name": "vectors", "crs": { "type": "name", "properties": { "name": "urn:ogc:def:crs:OGC:1.3:CRS84" } }, "features": [ { "type": "Feature", "properties": { "vector_name": "berlin_brandenburg_gate", "raster_count": 1, "Description": "", "type": "object", "prob_of_class_object": 1.0 }, "geometry": { "type": "Polygon", "coordinates": [ [ [ 13.377565438910141, 52.516568397149548, 0.0 ], [ 13.37767670931829, 52.5159632173402, 0.0 ], [ 13.37811966376411, 52.515999764239929, 0.0 ], [ 13.37810339593884, 52.516119741283603, 0.0 ], [ 13.37786452595477, 52.516109878538877, 0.0 ], [ 13.377804199934889, 
52.5164569450953, 0.0 ], [ 13.37804096941727, 52.516481108895917, 0.0 ], [ 13.378024559886381, 52.516596731533298, 0.0 ], [ 13.377565438910141, 52.516568397149548, 0.0 ] ] ] } }, diff --git a/tests/mock_download_test.py b/tests/mock_download_test.py index c36b92aa..88853e0f 100644 --- a/tests/mock_download_test.py +++ b/tests/mock_download_test.py @@ -1,5 +1,4 @@ -""" -Test RasterDownloadProcessor using mock downloader. +"""Test RasterDownloadProcessor using mock downloader. Virtually 'downloads' (no files operations are actually done) from a dataset of rasters in a source directory. @@ -36,9 +35,7 @@ def test_mock_download(): data_dir=data_dir, task_vector_classes=["object"] ) - vectors = gpd.read_file( - test_dir / "geographer_download_test.geojson", driver="GeoJSON" - ) + vectors = gpd.read_file(test_dir / "geographer_download_test.geojson") vectors.set_index("name", inplace=True) connector.add_to_vectors(vectors) diff --git a/tests/save_load_base_model_test.py b/tests/save_load_base_model_test.py index c3d0042a..320e2aea 100644 --- a/tests/save_load_base_model_test.py +++ b/tests/save_load_base_model_test.py @@ -1,63 +1,84 @@ -"""Test saving/loading nested BaseModels. - -TODO: split up into smaller units (more simple nestings etc) -""" +"""Test saving/loading nested BaseModels.""" from pathlib import Path import git +from pydantic import BaseModel -from geographer.downloaders.downloader_for_vectors import RasterDownloaderForVectors -from geographer.downloaders.sentinel2_download_processor import Sentinel2Processor -from geographer.downloaders.sentinel2_downloader_for_single_vector import ( - SentinelDownloaderForSingleVector, +from geographer.base_model_dict_conversion.save_load_base_model_mixin import ( + SaveAndLoadBaseModelMixIn, ) +class InnermostBaseModel(BaseModel): + """Innermost BaseModel.""" + + int_value: int + str_value: str + + +class NestedBaseModel(BaseModel): + """Nested BaseModel.""" + + dict_value: dict + innermost_base_model: InnermostBaseModel + + +class OutermostBaseModel(BaseModel, SaveAndLoadBaseModelMixIn): + """Outermost BaseModel.""" + + nested_base_model: NestedBaseModel + + json_path: Path + + def save(self): + """Save the model.""" + self._save(self.json_path) + + def test_save_load_nested_base_model(): """Test saving and loading nested BaseModel.""" - # get repo working tree directory repo = git.Repo(".", search_parent_directories=True) repo_root = Path(repo.working_tree_dir) - download_test_data_dir = repo_root / "tests/data/temp/download_s2_test" - - # define nested BaseModel - s2_download_processor = Sentinel2Processor() - s2_downloader_for_single_vector = SentinelDownloaderForSingleVector() - s2_downloader = RasterDownloaderForVectors( - download_dir=download_test_data_dir / "download", - downloader_for_single_vector=s2_downloader_for_single_vector, - download_processor=s2_download_processor, - kwarg_defaults={ # further nesting: dictionary - "producttype": "L2A", - "resolution": 10, - "max_percent_cloud_coverage": 10, - "date": ("NOW-364DAYS", "NOW"), - "area_relation": "Contains", - "credentials_ini_path": download_test_data_dir / "credentials.ini", - # keys must be strings in the following dict, see - # https://stackoverflow.com/questions/1450957/pythons-json-module-converts-int-dictionary-keys-to-strings # noqa: E501 - "additional_nested_dictionary": { - "1": 2, - "3": 4, + temp_dir = repo_root / "tests/data/temp/" + outermost_base_model_json_path = temp_dir / "outermost_base_model.json" + + # Define a nested model + outermost_base_model = 
OutermostBaseModel( + nested_base_model=NestedBaseModel( + dict_value={ + "a": 1, + "b": { + "c": None, + }, }, - "some_list": [1, 2, 3, 4], - }, + innermost_base_model=InnermostBaseModel( + int_value=2, + str_value="str_value", + ), + ), + json_path=outermost_base_model_json_path, ) """ - Test save and load Sentinel-2 Downloader + Test saving and loading a nested BaseModel """ # save - s2_downloader_json_path = download_test_data_dir / "connector/s2_downloader.json" - s2_downloader.save(s2_downloader_json_path) + outermost_base_model.save() # load - s2_downloader_from_json = RasterDownloaderForVectors.from_json_file( - s2_downloader_json_path, + outermost_base_model_from_json = OutermostBaseModel.from_json_file( + outermost_base_model_json_path, + constructor_symbol_table={ + "InnermostBaseModel": InnermostBaseModel, + "NestedBaseModel": NestedBaseModel, + "OutermostBaseModel": OutermostBaseModel, + }, ) # make sure saving and loading again doesn't change anything - assert s2_downloader_from_json == s2_downloader + assert ( + outermost_base_model_from_json.model_dump() == outermost_base_model.model_dump() + ) if __name__ == "__main__": diff --git a/tests/download_s2_test_manually.py b/tests/test_eodag_s2_download.py similarity index 55% rename from tests/download_s2_test_manually.py rename to tests/test_eodag_s2_download.py index 2bc0f61e..a16f6d54 100644 --- a/tests/download_s2_test_manually.py +++ b/tests/test_eodag_s2_download.py @@ -1,27 +1,29 @@ -""" -Manually triggered test of Sentinel-2 downloader. +"""Manually triggered test of Sentinel-2 downloader. Run by hand to test downloading Sentinel-2 data. Marked as slow and deselected from the default test run: downloading Sentinel-2 rasters is slow and uses a lot of disk space. Needs Copernicus Data Space credentials configured for eodag. 
- -TODO: write test for download_mode 'bboxgrid' using large polygon """ import shutil +from datetime import date, timedelta import geopandas as gpd +import pytest from utils import get_test_dir from geographer import Connector from geographer.downloaders.downloader_for_vectors import RasterDownloaderForVectors -from geographer.downloaders.sentinel2_download_processor import Sentinel2Processor -from geographer.downloaders.sentinel2_downloader_for_single_vector import ( - SentinelDownloaderForSingleVector, +from geographer.downloaders.eodag_downloader_for_single_vector import ( + EodagDownloaderForSingleVector, ) +from geographer.downloaders.sentinel2_download_processor import Sentinel2SAFEProcessor +# To run just this test, execute +# pytest -v -s tests/test_eodag_s2_download.py::test_s2_download +@pytest.mark.slow def test_s2_download(): """Test downloading Sentinel-2 data.""" # noqa: D202 @@ -31,49 +33,61 @@ def test_s2_download(): test_dir = get_test_dir() data_dir = test_dir / "temp/download_s2_test" - credentials_ini_path = test_dir / "download_s2_test_credentials.ini" - assert ( - credentials_ini_path.is_file() - ), f"Need credentials in {credentials_ini_path} to test sentinel download" - vectors = gpd.read_file( - test_dir / "geographer_download_test.geojson", driver="GeoJSON" - ) + vectors = gpd.read_file(test_dir / "geographer_download_test.geojson") vectors.set_index("name", inplace=True) connector = Connector.from_scratch( data_dir=data_dir, task_vector_classes=["object"] ) connector.add_to_vectors(vectors) - """ Test RasterDownloaderForVectors for Sentinel-2 data """ - s2_download_processor = Sentinel2Processor() - s2_downloader_for_single_vector = SentinelDownloaderForSingleVector() - s2_downloader = RasterDownloaderForVectors( - downloader_for_single_vector=s2_downloader_for_single_vector, - download_processor=s2_download_processor, - kwarg_defaults={ - "producttype": "L2A", - "resolution": 10, - "max_percent_cloud_coverage": 10, - "date": ("NOW-364DAYS", "NOW"), - "area_relation": "Contains", - "credentials": credentials_ini_path, - }, + product_type = "S2_MSI_L2A" + download_processor = Sentinel2SAFEProcessor() + downloader_for_single_vector = EodagDownloaderForSingleVector() + downloader = RasterDownloaderForVectors( + downloader_for_single_vector=downloader_for_single_vector, + download_processor=download_processor, ) + if "cop_dataspace" not in downloader_for_single_vector.eodag.available_providers( + product_type=product_type + ): + raise RuntimeError( + "'cop_dataspace' needs to be available as a provider. " + "Probably the username and/or password are missing." + ) """ Download Sentinel-2 rasters """ - s2_downloader.download(connector=connector) + downloader_params = { + "search_kwargs": { + "provider": "cop_dataspace", + "productType": product_type, + "start": (date.today() - timedelta(days=364)).strftime("%Y-%m-%d"), + "end": date.today().strftime("%Y-%m-%d"), + }, + "filter_online": True, + "sort_by": ("cloudCover", "ASC"), + "suffix_to_remove": ".SAFE", + } + processor_params = { + "resolution": 10, + "delete_safe": True, + } + downloader.download( + connector=connector, + downloader_params=downloader_params, + processor_params=processor_params, + ) # The vectors contain # - 2 objects in Berlin (Reichstag and Brandenburg gate) # that are very close to each other # - 2 objects in Lisbon (Castelo de Sao Jorge and # Praca Do Comercio) that are very close to each other. 
- # Thus the s2_downloader should have downloaded two rasters, + # Thus the downloader should have downloaded two rasters, # one for Berlin and one for Lisbon, each containing two objects. # Berlin @@ -89,7 +103,6 @@ assert connector.rasters_containing_vector( "lisbon_castelo_de_sao_jorge" ) == connector.rasters_containing_vector("lisbon_praca_do_comercio") - """ Clean up: delete downloads """ diff --git a/tests/download_jaxa_test_manually.py b/tests/test_jaxa_download.py similarity index 88% rename from tests/download_jaxa_test_manually.py rename to tests/test_jaxa_download.py index d5ee5d76..8eed66e2 100644 --- a/tests/download_jaxa_test_manually.py +++ b/tests/test_jaxa_download.py @@ -1,5 +1,4 @@ -""" -Manually triggered test of JAXA downloader. +"""Manually triggered test of JAXA downloader. Run by hand to test downloading JAXA data. Marked as slow and deselected from the default test run: downloading JAXA rasters is slightly slow. @@ -8,6 +7,7 @@ import shutil import geopandas as gpd +import pytest from utils import get_test_dir from geographer import Connector @@ -18,6 +18,9 @@ ) +# To run just this test, execute +# pytest -v -s tests/test_jaxa_download.py::test_jaxa_download +@pytest.mark.slow def test_jaxa_download(): """Test downloading JAXA data.""" # noqa: D202 @@ -27,16 +30,13 @@ test_dir = get_test_dir() data_dir = test_dir / "temp/download_jaxa_test" - vectors = gpd.read_file( - test_dir / "geographer_download_test.geojson", driver="GeoJSON" - ) + vectors = gpd.read_file(test_dir / "geographer_download_test.geojson") vectors.set_index("name", inplace=True) connector = Connector.from_scratch( data_dir=data_dir, task_vector_classes=["object"] ) connector.add_to_vectors(vectors) - """ Test RasterDownloaderForVectors for JAXA data """ @@ -45,16 +45,17 @@ jaxa_downloader = RasterDownloaderForVectors( downloader_for_single_vector=jaxa_downloader_for_single_vector, download_processor=jaxa_download_processor, - kwarg_defaults={ - "data_version": "1804", - "download_mode": "bboxvertices", - }, ) - """ Download JAXA rasters """ - jaxa_downloader.download(connector=connector) + jaxa_downloader.download( + connector=connector, + downloader_params={ + "data_version": "1804", + "download_mode": "bboxvertices", + }, + ) # The vectors contain # - 2 objects in Berlin (Reichstag and Brandenburg gate) # that are very close to each other @@ -76,7 +77,6 @@ assert connector.rasters_containing_vector("lisbon_castelo_de_sao_jorge") == ( connector.rasters_containing_vector("lisbon_praca_do_comercio") ) - """ Clean up: delete downloads """ diff --git a/tests/utils.py b/tests/utils.py index 6e021630..78ca2ce0 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -4,7 +4,6 @@ import shutil from pathlib import Path -from typing import Optional, Union import git import numpy as np @@ -22,14 +21,14 @@ def get_test_dir(): def create_dummy_rasters( - data_dir: Union[Path, str], + data_dir: Path | str, raster_size: int, - raster_names: Optional[list[str]] = None, + raster_names: list[str] | None = None, ) -> None: """Create dummy rasters. - Create dummy rasters for a dataset from the connector's - rasters geodataframe. + Create dummy rasters for a dataset from the connector's rasters + geodataframe. 
""" connector = Connector.from_data_dir(data_dir) connector.rasters_dir.mkdir(parents=True, exist_ok=True) @@ -45,7 +44,6 @@ def create_dummy_rasters( ), desc="Creating dummy rasters", ): - raster_array = np.stack( [np.ones((raster_size, raster_size), dtype=np.uint8) * n for n in range(3)] ) @@ -71,7 +69,7 @@ def create_dummy_rasters( dst.write(raster_array[idx, :, :], idx + 1) -def delete_dummy_rasters(data_dir: Union[Path, str]) -> None: - """Delete dummy raster data (rasters and segmentation labels) from dataset.""" +def delete_dummy_rasters(data_dir: Path | str) -> None: + """Delete dummy raster data from dataset.""" shutil.rmtree(data_dir / "rasters", ignore_errors=True) shutil.rmtree(data_dir / "labels", ignore_errors=True)