From 628ece4ef1237f984ba72bb5789d638841074671 Mon Sep 17 00:00:00 2001
From: Scott Staniewicz
Date: Mon, 11 Dec 2023 12:12:28 -0500
Subject: [PATCH] Add stitched geometry creation (#12)

* add static layer download and geometry creation
* add more types, fix pre-commits
* add tests and fix failing
* add tests and fix failing
* spacing
* try to fix pytest warning, io is optional
* install rasterio too
* try reverse
* try this version
* install deps correctly
* add another warning ignore, gdal 3.4
* gdal min is 3.5
* missing future import
* use `lru_cache` for py38
* add changelog, bump to py39
* remove `cast`
* back to `@cache`
---
 .github/workflows/ci.yml     |  12 +-
 CHANGELOG.md                 |  22 ++
 LICENSE                      |  36 ++-
 environment-geo.yml          |   6 +-
 environment.yml              |   2 +-
 pyproject.toml               |  12 +-
 src/opera_utils/__init__.py  |   1 -
 src/opera_utils/_dates.py    |  16 +-
 src/opera_utils/_io.py       | 122 ++++++++
 src/opera_utils/_utils.py    | 155 ++++++++++
 src/opera_utils/constants.py |  15 +
 src/opera_utils/geometry.py  | 166 +++++++++++
 src/opera_utils/stitching.py | 533 +++++++++++++++++++++++++++++++++++
 tests/conftest.py            |   5 -
 tests/test_io.py             |  91 ++++++
 tests/test_utils.py          |  58 ++++
 16 files changed, 1218 insertions(+), 34 deletions(-)
 create mode 100644 CHANGELOG.md
 create mode 100644 src/opera_utils/_utils.py
 create mode 100644 src/opera_utils/geometry.py
 create mode 100644 src/opera_utils/stitching.py
 create mode 100644 tests/test_io.py
 create mode 100644 tests/test_utils.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index e5bbf13..4136a78 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -16,13 +16,17 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest]
         deps:
-          # Note: for now we're manually adding tophu deps,
-          # will change once it's on conda forge
           - label: Latest
-            spec: ""
+            spec: >-
+              rasterio
+              gdal
+              asf_search
           - label: Minimum
             spec: >-
-              python=3.8
+              python=3.9
+              rasterio=1.3
+              gdal=3.5
+              asf_search=6.7.2
       fail-fast: false
     name: ${{ matrix.os }} • ${{ matrix.deps.label }}
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..bf4498a
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,22 @@
+# [Unreleased](https://github.com/opera-adt/opera-utils/compare/v0.1.4...main)
+
+**Added**
+- Basic raster metadata convenience functions as `_io.get_raster_*`
+- Ability to download CSLC static layers and stitch them into line-of-sight rasters
+
+
+**Requirements**
+The minimum Python version is 3.9.
+
+- click>=7.0
+- h5py>=1.10
+- numpy>=1.20
+- pooch>=1.7
+- pyproj>=3.3
+- shapely>=1.8
+- typing_extensions>=4
+
+For other interactions with geospatial rasters (e.g. reading metadata, creating geometries from static layers):
+- asf_search>=6.7.2
+- gdal>=3.5
+- rasterio>=1.3
diff --git a/LICENSE b/LICENSE
index bc1ef5d..10af7c5 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,36 @@
+This software is licensed under either the BSD-3-Clause or Apache-2.0 license. Users may
+choose which license to apply. The terms of each license are reproduced below.
+
+---------------------------------------------------------------------------------------
+
+Copyright (c) 2023 California Institute of Technology ("Caltech"). U.S. Government
+sponsorship acknowledged.
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are
+permitted provided that the following conditions are met:
+
+* Redistributions of source code must retain the above copyright notice, this list of
+  conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright notice, this list of + conditions and the following disclaimer in the documentation and/or other materials + provided with the distribution. +* Neither the name of Caltech nor its operating division, the Jet Propulsion Laboratory, + nor the names of its contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL +THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT +OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--------------------------------------------------------------------------------------- Apache License Version 2.0, January 2004 @@ -187,7 +220,8 @@ same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2023 Scott Staniewicz + Copyright 2023 California Institute of Technology (“Caltech”). + U.S. Government sponsorship acknowledged. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/environment-geo.yml b/environment-geo.yml index 3c070f7..7d7d26b 100644 --- a/environment-geo.yml +++ b/environment-geo.yml @@ -1,6 +1,6 @@ channels: - conda-forge dependencies: - - pyogrio>=0.5 - - gdal>=3.3 - - geopandas-base>=0.12 + - asf_search>=6.7 + - gdal>=3.5 + - rasterio>=1.3 diff --git a/environment.yml b/environment.yml index fbb0d31..a2f9f30 100644 --- a/environment.yml +++ b/environment.yml @@ -2,7 +2,7 @@ name: opera-utils-env channels: - conda-forge dependencies: - - python>=3.8 + - python>=3.9 - pip>=21.3 # https://pip.pypa.io/en/stable/reference/build-system/pyproject-toml/#editable-installation - git # for pip install, due to setuptools_scm - click>=7.0 diff --git a/pyproject.toml b/pyproject.toml index d26c16b..bb267cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,13 +9,12 @@ authors = [ ] description = "Miscellaneous utilities for working with OPERA data products" readme = { file = "README.md", content-type = "text/markdown" } -requires-python = ">=3.8" +requires-python = ">=3.9" classifiers = [ "Intended Audience :: Developers", "Intended Audience :: Science/Research", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -71,7 +70,7 @@ preview = true [tool.isort] profile = "black" -known_first_party = [" opera_utils"] +known_first_party = ["opera_utils"] [tool.mypy] python_version = "3.10" @@ -84,4 +83,9 @@ ignore = "D100,D102,D104,D105,D106,D107,D203,D204,D213,D413" [tool.pytest.ini_options] doctest_optionflags = "NORMALIZE_WHITESPACE NUMBER" addopts = " --cov=opera_utils --doctest-modules --randomly-seed=1234 --ignore=scripts --ignore=docs" -filterwarnings = ["error"] +filterwarnings = 
[
+    "error",
+    "ignore:h5py is running against HDF5.*:UserWarning",
+    # https://github.com/dateutil/dateutil/pull/1285 will be released, but not yet
+    "ignore:datetime.datetime.utcfromtimestamp.*:DeprecationWarning",
+]
diff --git a/src/opera_utils/__init__.py b/src/opera_utils/__init__.py
index 7fe9603..89f23bb 100644
--- a/src/opera_utils/__init__.py
+++ b/src/opera_utils/__init__.py
@@ -4,7 +4,6 @@
 from __future__ import annotations
 
 from ._dates import *
-from ._io import *
 from ._version import version as __version__
 from .bursts import *
 from .burst_frame_db import *
diff --git a/src/opera_utils/_dates.py b/src/opera_utils/_dates.py
index 6e0c5b4..70e29e0 100644
--- a/src/opera_utils/_dates.py
+++ b/src/opera_utils/_dates.py
@@ -4,10 +4,10 @@
 import itertools
 import re
 from collections import defaultdict
-from pathlib import Path
 from typing import Iterable, overload
 
 from ._types import DateOrDatetime, Filename, PathLikeT
+from ._utils import _get_path_from_gdal_str
 
 __all__ = [
     "get_dates",
@@ -145,20 +145,6 @@ def _parse_date(datestr: str, fmt: str = DATE_FORMAT) -> datetime.datetime:
     return datetime.datetime.strptime(datestr, fmt)
 
 
-def _get_path_from_gdal_str(name: Filename) -> Path:
-    s = str(name)
-    if s.upper().startswith("DERIVED_SUBDATASET"):
-        # like DERIVED_SUBDATASET:AMPLITUDE:slc_filepath.tif
-        p = s.split(":")[-1].strip('"').strip("'")
-    elif ":" in s and (s.upper().startswith("NETCDF") or s.upper().startswith("HDF")):
-        # like NETCDF:"slc_filepath.nc":subdataset
-        p = s.split(":")[1].strip('"').strip("'")
-    else:
-        # Whole thing is the path
-        p = str(name)
-    return Path(p)
-
-
 def _date_format_to_regex(date_format: str) -> re.Pattern:
     r"""Convert a python date format string to a regular expression.
diff --git a/src/opera_utils/_io.py b/src/opera_utils/_io.py
index e69de29..3a67b7d 100644
--- a/src/opera_utils/_io.py
+++ b/src/opera_utils/_io.py
@@ -0,0 +1,122 @@
+from __future__ import annotations
+
+from typing import Any
+
+import numpy as np
+import rasterio as rio
+from affine import Affine
+from pyproj import CRS
+
+from ._types import Bbox, PathOrStr
+
+
+def get_raster_nodata(filename: PathOrStr, band: int = 1) -> float | None:
+    """Get the nodata value from a file.
+
+    Parameters
+    ----------
+    filename : PathOrStr
+        Path to the file to load.
+    band : int, optional
+        Band to get the nodata value for, by default 1.
+
+    Returns
+    -------
+    float or None
+        Nodata value, or None if not found.
+    """
+    nodatas = _get_dataset_attr(filename, "nodatavals")
+    return nodatas[band - 1]
+
+
+def get_raster_crs(filename: PathOrStr) -> CRS:
+    """Get the CRS from a file.
+
+    Parameters
+    ----------
+    filename : PathOrStr
+        Path to the file to load.
+
+    Returns
+    -------
+    CRS
+        pyproj CRS for `filename`.
+    """
+    return _get_dataset_attr(filename, "crs")
+
+
+def get_raster_transform(filename: PathOrStr) -> Affine:
+    """Get the rasterio `Affine` transform from a file.
+
+    Parameters
+    ----------
+    filename : PathOrStr
+        Path to the file to load.
+
+    Returns
+    -------
+    Affine
+        Two dimensional affine transform for 2D linear mapping.
+    """
+    return _get_dataset_attr(filename, "transform")
+
+
+def get_raster_gt(filename: PathOrStr) -> list[float]:
+    """Get the GDAL geotransform from a file.
+
+    Parameters
+    ----------
+    filename : PathOrStr
+        Path to the file to load.
+
+    Returns
+    -------
+    list[float]
+        6 floats representing a GDAL Geotransform.
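+
+    Notes
+    -----
+    A GDAL geotransform is a reordering of the `Affine` coefficients into
+    ``(left_x, dx, row_rotation, top_y, col_rotation, dy)``. A hypothetical
+    30 m UTM grid illustrates the reordering:
+
+    >>> from affine import Affine
+    >>> Affine(30.0, 0.0, 500000.0, 0.0, -30.0, 3700000.0).to_gdal()
+    (500000.0, 30.0, 0.0, 3700000.0, 0.0, -30.0)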
+ """ + return get_raster_transform(filename).to_gdal() + + +def get_raster_dtype(filename: PathOrStr, band: int = 1) -> np.dtype: + """Get the numpy data type from a raster file. + + Parameters + ---------- + filename : PathOrStr + Path to the file to load. + band : int, optional + Band to get nodata value for, by default 1. + + Returns + ------- + np.dtype + Data type. + """ + dtype_per_band = _get_dataset_attr(filename, "dtypes") + return np.dtype(dtype_per_band[band - 1]) + + +def get_raster_driver(filename: PathOrStr) -> str: + """Get the GDAL driver `ShortName` from a file. + + Parameters + ---------- + filename : PathOrStr + Path to the file to load. + + Returns + ------- + str + Driver name. + """ + return _get_dataset_attr(filename, "driver") + + +def get_raster_bounds(filename: PathOrStr) -> Bbox: + """Get the (left, bottom, right, top) bounds of the image.""" + return _get_dataset_attr(filename, "bounds") + + +def _get_dataset_attr(filename: PathOrStr, attr_name: str) -> Any: + with rio.open(filename) as src: + return getattr(src, attr_name) diff --git a/src/opera_utils/_utils.py b/src/opera_utils/_utils.py new file mode 100644 index 0000000..015e462 --- /dev/null +++ b/src/opera_utils/_utils.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import os +import shutil +import tempfile +from contextlib import contextmanager +from pathlib import Path +from typing import Generator + +import numpy as np +from numpy.typing import DTypeLike + +from ._types import PathOrStr + +__all__ = [ + "format_nc_filename", + "scratch_directory", +] + + +def format_nc_filename(filename: PathOrStr, ds_name: str | None = None) -> str: + """Format an HDF5/NetCDF filename with dataset for reading using GDAL. + + If `filename` is already formatted, or if `filename` is not an HDF5/NetCDF + file (based on the file extension), it is returned unchanged. + + Parameters + ---------- + filename : str or PathLike + Filename to format. + ds_name : str, optional + Dataset name to use. If not provided for a .h5 or .nc file, an error is raised. + + Returns + ------- + str + Formatted filename like + NETCDF:"filename.nc":"//ds_name" + + Raises + ------ + ValueError + If `ds_name` is not provided for a .h5 or .nc file. + """ + # If we've already formatted the filename, return it + if str(filename).startswith("NETCDF:") or str(filename).startswith("HDF5:"): + return str(filename) + + if not (os.fspath(filename).endswith(".nc") or os.fspath(filename).endswith(".h5")): + return os.fspath(filename) + + # Now we're definitely dealing with an HDF5/NetCDF file + if ds_name is None: + raise ValueError("Must provide dataset name for HDF5/NetCDF files") + + return f'NETCDF:"{filename}":"//{ds_name.lstrip("/")}"' + + +def _get_path_from_gdal_str(name: PathOrStr) -> Path: + s = str(name) + if s.upper().startswith("DERIVED_SUBDATASET"): + # like DERIVED_SUBDATASET:AMPLITUDE:slc_filepath.tif + p = s.split(":")[-1].strip('"').strip("'") + elif ":" in s and (s.upper().startswith("NETCDF") or s.upper().startswith("HDF")): + # like NETCDF:"slc_filepath.nc":subdataset + p = s.split(":")[1].strip('"').strip("'") + else: + # Whole thing is the path + p = str(name) + return Path(p) + + +def numpy_to_gdal_type(np_dtype: DTypeLike) -> int: + """Convert numpy dtype to gdal type. + + Parameters + ---------- + np_dtype : DTypeLike + Numpy dtype to convert. + + Returns + ------- + int + GDAL type code corresponding to `np_dtype`. 
+ + Raises + ------ + TypeError + If `np_dtype` is not a numpy dtype, or if the provided dtype is not + supported by GDAL (for example, `np.dtype('>i4')`) + """ + from osgeo import gdal_array, gdalconst + + np_dtype = np.dtype(np_dtype) + + if np.issubdtype(bool, np_dtype): + return gdalconst.GDT_Byte + gdal_code = gdal_array.NumericTypeCodeToGDALTypeCode(np_dtype) + if gdal_code is None: + raise TypeError(f"dtype {np_dtype} not supported by GDAL.") + return gdal_code + + +def gdal_to_numpy_type(gdal_type: str | int) -> np.dtype: + """Convert gdal type to numpy type.""" + from osgeo import gdal, gdal_array + + if isinstance(gdal_type, str): + gdal_type = gdal.GetDataTypeByName(gdal_type) + return np.dtype(gdal_array.GDALTypeCodeToNumericTypeCode(gdal_type)) + + +@contextmanager +def scratch_directory( + dir_: PathOrStr | None = None, *, delete: bool = True +) -> Generator[Path, None, None]: + """Context manager that creates a (possibly temporary) file system directory. + + If `dir_` is a path-like object, a directory will be created at the specified + file system path if it did not already exist. Otherwise, if `dir_` is None, a + temporary directory will instead be created as though by + ``tempfile.TemporaryDirectory()``. + + The directory may be automatically removed from the file system upon exiting the + context manager. + + Parameters + ---------- + dir_ : PathOrStr or None, optional + Scratch directory path. If None, a temporary directory will be created. Defaults + to None. + delete : bool, optional + If True and `dir_` didn't exist, the directory and its contents are + recursively removed from the file system upon exiting the context manager. + Note that if `dir_` already exists, this option is ignored. + Defaults to True. + + Yields + ------ + pathlib.Path + Scratch directory path. If `delete` was True, the directory will be removed from + the file system upon exiting the context manager scope. 
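+
+    Examples
+    --------
+    With no arguments, a temporary directory is created and removed on exit:
+
+    >>> with scratch_directory() as d:
+    ...     _ = (d / "notes.txt").write_text("scratch")
+    >>> d.exists()
+    False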
+ """ + if dir_ is None: + scratchdir = Path(tempfile.mkdtemp()) + dir_already_existed = False + else: + scratchdir = Path(dir_) + dir_already_existed = scratchdir.exists() + scratchdir.mkdir(parents=True, exist_ok=True) + + yield scratchdir + + if delete and not dir_already_existed: + shutil.rmtree(scratchdir) diff --git a/src/opera_utils/constants.py b/src/opera_utils/constants.py index 468baf7..8ebfc15 100644 --- a/src/opera_utils/constants.py +++ b/src/opera_utils/constants.py @@ -22,3 +22,18 @@ r"[tT](?P\d{3})[-_](?P\d{6})[-_](?Piw[1-3])", re.IGNORECASE, ) + +DEFAULT_TIFF_OPTIONS = ( + "COMPRESS=DEFLATE", + "ZLEVEL=4", + "TILED=YES", + "BLOCKXSIZE=128", + "BLOCKYSIZE=128", +) +EXTRA_COMPRESSED_TIFF_OPTIONS = ( + *DEFAULT_TIFF_OPTIONS, + # Note: we're dropping mantissa bits before we do not + # need prevision for LOS rasters (or incidence) + "DISCARD_LSB=6", + "PREDICTOR=2", +) diff --git a/src/opera_utils/geometry.py b/src/opera_utils/geometry.py new file mode 100644 index 0000000..a56c248 --- /dev/null +++ b/src/opera_utils/geometry.py @@ -0,0 +1,166 @@ +from __future__ import annotations + +import logging +import netrc +from enum import Enum +from functools import cache +from pathlib import Path +from typing import Literal, Mapping, Sequence + +import asf_search as asf + +from opera_utils import get_burst_ids_for_frame, stitching +from opera_utils._types import PathOrStr +from opera_utils._utils import format_nc_filename, scratch_directory + +from .constants import EXTRA_COMPRESSED_TIFF_OPTIONS + +logger = logging.getLogger(__name__) + + +class Layer(Enum): + """Names of available datasets in CSLC static layers HDF5 files.""" + + LOS_EAST = "los_east" + LOS_NORTH = "los_north" + LAYOVER_SHADOW_MASK = "layover_shadow_mask" + LOCAL_INCIDENCE_ANGLE = "local_incidence_angle" + + +# Layover shadow mask. 0=no layover, no shadow; 1=shadow; 2=layover; 3=shadow and layover. +DEFAULT_LAYERS = list(Layer)[:3] # Skip the local incidence, much less compressible +DEFAULT_STRIDES = {"x": 6, "y": 3} +LAYER_TO_NODATA = { + Layer.LOS_EAST: 0, + Layer.LOS_NORTH: 0, + Layer.LOCAL_INCIDENCE_ANGLE: 0, + # layover_shadow_mask is Int8 with 127 meaning nodata + Layer.LAYOVER_SHADOW_MASK: 127, +} + + +def create_geometry_files( + *, + frame_id: int | None = None, + burst_ids: Sequence[str] | None = None, + output_dir: PathOrStr = Path("."), + download_dir: PathOrStr | None = None, + save_hdf5_files: bool = False, + layers: Sequence[Layer | str] = DEFAULT_LAYERS, + strides: Mapping[str, int] = DEFAULT_STRIDES, + max_download_jobs: int = 3, +) -> list[Path]: + """Create merged geometry files for a frame of list of burst IDs. + + Parameters + ---------- + frame_id : int | None, optional + DISP frame ID to create, by default None + burst_ids : Sequence[str] | None, optional + Alternative to `frame_id`, manually specify CSLC burst IDs. 
+    output_dir : PathOrStr, optional
+        Directory to store the output geotiffs, by default Path(".")
+    download_dir : PathOrStr | None, optional
+        Directory to save the downloaded static layer HDF5 files.
+        If None, uses an `hdf5/` subdirectory of `output_dir`, by default None
+    save_hdf5_files : bool, optional
+        If True, keep the downloaded HDF5 files after stitching, by default False
+    layers : Sequence[Layer | str], optional
+        Which static layer datasets to stitch, by default DEFAULT_LAYERS
+    strides : Mapping[str, int], optional
+        Decimation factors in x and y applied while stitching,
+        by default DEFAULT_STRIDES
+    max_download_jobs : int, optional
+        Number of parallel download processes, by default 3
+
+    Returns
+    -------
+    list[Path]
+        Paths to the stitched geotiffs, one per layer.
+
+    Raises
+    ------
+    ValueError
+        If neither `frame_id` nor `burst_ids` is provided.
+    """
+    if frame_id is not None:
+        burst_ids = get_burst_ids_for_frame(frame_id=frame_id)
+
+    if not burst_ids:
+        raise ValueError("Must provide frame_id or burst_ids")
+
+    output_path = Path(output_dir)
+    output_path.mkdir(exist_ok=True, parents=True)
+    if download_dir is None:
+        download_dir = output_path / "hdf5"
+    output_files: list[Path] = []
+
+    with scratch_directory(download_dir, delete=not save_hdf5_files) as sd:
+        local_hdf5_files = _download_static_layers(
+            burst_ids=burst_ids, output_dir=sd, max_jobs=max_download_jobs
+        )
+        for layer in layers:
+            layer_enum = Layer(layer)
+            name = layer_enum.value
+            gdal_strings = [
+                format_nc_filename(f, ds_name=f"data/{name}") for f in local_hdf5_files
+            ]
+            nodata = LAYER_TO_NODATA[layer_enum]
+            cur_outfile = output_path / f"{name}.tif"
+            output_files.append(cur_outfile)
+            logger.info(f"Merging images for {name}")
+            stitching.merge_images(
+                file_list=gdal_strings,
+                outfile=cur_outfile,
+                strides=strides,
+                driver="GTiff",
+                options=EXTRA_COMPRESSED_TIFF_OPTIONS,
+                resample_alg="nearest",
+                in_nodata=nodata,
+                out_nodata=nodata,
+            )
+
+    return output_files
+
+
+@cache
+def _search(burst_ids: Sequence[str]) -> asf.ASFSearchResults:
+    return asf.search(operaBurstID=list(burst_ids), processingLevel="CSLC-STATIC")
+
+
+def _get_urls(
+    results: asf.ASFSearchResults,
+    type_: Literal["https", "s3"] = "https",
+) -> list[str]:
+    if type_ == "https":
+        return [r.properties["url"] for r in results]
+    elif type_ == "s3":
+        # TODO: go through .umm, find the s3 url
+        raise NotImplementedError()
+    else:
+        raise ValueError(f"type_ must be 'https' or 's3'. Got {type_}")
+    # r.umm
+    # 'RelatedUrls': [...
+    #     {'URL': 's3://asf-cumulus-prod-opera-products/OPERA_L2_CSLC
+    #      'Type': 'GET DATA VIA DIRECT ACCESS',
+    #      'Description': 'This link provides direct download access vi
+    #      'Format': 'HDF5'},
+
+
+def _download_static_layers(
+    burst_ids: Sequence[str], output_dir: Path, max_jobs: int = 3
+) -> list[Path]:
+    # Make a tuple so it can be hashed by the cached `_search`
+    results = _search(burst_ids=tuple(burst_ids))
+    session = _get_auth_session()
+    urls = _get_urls(results)
+    asf.download_urls(urls=urls, path=output_dir, session=session, processes=max_jobs)
+    return [output_dir / r.properties["fileName"] for r in results]
+
+
+def _get_auth_session() -> asf.ASFSession:
+    host = "urs.earthdata.nasa.gov"
+
+    auth = netrc.netrc().authenticators(host)
+    if auth is None:
+        raise ValueError(f"No .netrc entry found for {host}")
+    username, _, password = auth
+    return asf.ASFSession().auth_with_creds(username, password)
diff --git a/src/opera_utils/stitching.py b/src/opera_utils/stitching.py
new file mode 100644
index 0000000..00e9b15
--- /dev/null
+++ b/src/opera_utils/stitching.py
@@ -0,0 +1,533 @@
+"""stitching.py: utilities for merging geographic rasters."""
+from __future__ import annotations
+
+import logging
+import math
+import subprocess
+import tempfile
+from os import fspath
+from pathlib import Path
+from typing import Iterable, Mapping, Optional, Sequence
+
+import numpy as np
+from numpy.typing import DTypeLike
+from osgeo import gdal, osr
+from pyproj import Transformer
+
+from opera_utils import _io
+from opera_utils._types import Bbox, PathOrStr
+from opera_utils._utils import _get_path_from_gdal_str, numpy_to_gdal_type
+
+from .constants import DEFAULT_TIFF_OPTIONS
+
+logger = logging.getLogger(__name__)
+
+
+def merge_images(
+    file_list: Sequence[PathOrStr],
+    outfile: PathOrStr,
+    target_aligned_pixels: bool = True,
+    out_bounds: Optional[Bbox] = None,
+    out_bounds_epsg: Optional[int] = None,
+    strides: Mapping[str, int] = {"x": 1, "y": 1},
+    driver: str = "GTiff",
+    out_nodata: Optional[float] = 0,
+    out_dtype: Optional[DTypeLike] = None,
+    in_nodata: Optional[float] = None,
+    resample_alg: str = "nearest",
+    overwrite: bool = False,
+    options: Optional[Sequence[str]] = DEFAULT_TIFF_OPTIONS,
+    create_only: bool = False,
+) -> None:
+    """Combine multiple rasters covering the same area into one image.
+
+    Parameters
+    ----------
+    file_list : list[PathOrStr]
+        List of raster filenames.
+    outfile : PathOrStr
+        Path to the output file.
+    target_aligned_pixels : bool
+        If True, adjust the output image bounds so that pixel coordinates
+        are integer multiples of the pixel size, matching the ``-tap``
+        option of GDAL utilities.
+        Default is True.
+    out_bounds : Optional[Bbox]
+        If provided, forces the output image bounds to
+        (left, bottom, right, top).
+        Otherwise, computed from the union of all input images.
+    out_bounds_epsg : Optional[int]
+        EPSG code for `out_bounds`.
+        If not provided, assumed to match the projections of `file_list`.
+    strides : dict[str, int]
+        Subsample factor: {"x": x strides, "y": y strides}.
+    driver : str
+        GDAL driver to use for the output file. Default is GTiff.
+    out_nodata : Optional[float]
+        Nodata value to use for the output file. Default is 0.
+    out_dtype : Optional[DTypeLike]
+        Output data type. Default is None, which uses the data type
+        of the first image in the list.
+    in_nodata : Optional[float]
+        Override the files' `nodata` and use `in_nodata` during merging.
+    resample_alg : str, default="nearest"
+        Method for GDAL to use for reprojection.
+        Default is nearest-neighbor.
+    overwrite : bool
+        Overwrite existing files. Default is False.
+    options : Optional[Sequence[str]]
+        Driver-specific creation options passed to GDAL.
+        Default is `DEFAULT_TIFF_OPTIONS`.
+    create_only : bool
+        If True, creates an empty output file and does not write data.
+        Default is False.
+    """
+    if Path(outfile).exists():
+        if not overwrite:
+            logger.info(f"{outfile} already exists, skipping")
+            return
+        else:
+            logger.info(f"Overwrite=True: removing {outfile}")
+            Path(outfile).unlink()
+
+    if len(file_list) == 1:
+        logger.info("Only one image, no stitching needed")
+        logger.info(f"Copying {file_list[0]} to {outfile} and zeroing nodata values.")
+        _copy_set_nodata(
+            file_list[0],
+            outfile=outfile,
+            driver=driver,
+            creation_options=options,
+            out_nodata=out_nodata or 0,
+        )
+        return
+
+    # Make sure all the files are in the same projection.
+    projection = _get_mode_projection(file_list)
+    # If not, warp them to the most common projection using VRT files in a tempdir
+    temp_dir = tempfile.TemporaryDirectory()
+
+    if strides is not None and strides["x"] > 1 and strides["y"] > 1:
+        file_list = get_downsampled_vrts(
+            file_list,
+            strides=strides,
+            dirname=Path(temp_dir.name),
+        )
+
+    warped_file_list = warp_to_projection(
+        file_list,
+        dirname=Path(temp_dir.name),
+        projection=projection,
+        resample_alg=resample_alg,
+    )
+    # Compute the output array shape. We guarantee it will cover the output
+    # bounds completely.
+    bounds, combined_nodata = get_combined_bounds_nodata(  # type: ignore
+        *warped_file_list,
+        target_aligned_pixels=target_aligned_pixels,
+        out_bounds=out_bounds,
+        out_bounds_epsg=out_bounds_epsg,
+        strides=strides,
+    )
+    (xmin, ymin, xmax, ymax) = bounds
+
+    # Write out the file list for gdal_merge using the --optfile flag
+    optfile = Path(temp_dir.name) / "file_list.txt"
+    optfile.write_text("\n".join(map(str, warped_file_list)))
+    args = [
+        "gdal_merge.py",
+        "-o",
+        outfile,
+        "--optfile",
+        optfile,
+        "-of",
+        driver,
+        "-ul_lr",
+        xmin,
+        ymax,
+        xmax,
+        ymin,
+    ]
+    if out_nodata is not None:
+        args.extend(["-a_nodata", str(out_nodata)])
+    if in_nodata is not None or combined_nodata is not None:
+        ndv = str(in_nodata) if in_nodata is not None else str(combined_nodata)
+        args.extend(["-n", ndv])  # type: ignore
+    if out_dtype is not None:
+        out_gdal_dtype = gdal.GetDataTypeName(numpy_to_gdal_type(out_dtype))
+        args.extend(["-ot", out_gdal_dtype])
+    if target_aligned_pixels:
+        args.append("-tap")
+    if create_only:
+        args.append("-create")
+    if options is not None:
+        for option in options:
+            args.extend(["-co", option])
+
+    arg_list = [str(a) for a in args]
+    logger.info(f"Running {' '.join(arg_list)}")
+    subprocess.check_call(arg_list)
+
+    temp_dir.cleanup()
+
+
+def get_downsampled_vrts(
+    filenames: Sequence[PathOrStr],
+    strides: Mapping[str, int],
+    dirname: PathOrStr,
+) -> list[Path]:
+    """Create downsampled VRTs from a list of files.
+
+    Does not reproject; only uses `gdal_translate`.
+
+    Parameters
+    ----------
+    filenames : Sequence[PathOrStr]
+        List of filenames to downsample.
+    strides : dict[str, int]
+        Subsample factor: {"x": x strides, "y": y strides}.
+    dirname : PathOrStr
+        The directory to write the downsampled VRTs to.
+
+    Returns
+    -------
+    list[Path]
+        The downsampled VRT filenames.
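+
+    Examples
+    --------
+    Downsample two rasters by 6 in x and 3 in y (paths are illustrative)::
+
+        vrts = get_downsampled_vrts(
+            ["los_east_a.tif", "los_east_b.tif"],
+            strides={"x": 6, "y": 3},
+            dirname="scratch",
+        )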
+ """ + if not filenames: + return [] + warped_files = [] + res = _get_resolution(filenames) + for idx, fn in enumerate(filenames): + fn = Path(fn) + warped_fn = Path(dirname) / _get_temp_filename(fn, idx, "_downsampled") + logger.debug(f"Downsampling {fn} by {strides}") + warped_files.append(warped_fn) + gdal.Translate( + fspath(warped_fn), + fspath(fn), + format="VRT", # Just creates a file that will warp on the fly + resampleAlg="nearest", # nearest neighbor for resampling + xRes=res[0] * strides["x"], + yRes=res[1] * strides["y"], + ) + + return warped_files + + +def _get_temp_filename(fn: Path, idx: int, extra: str = ""): + base = _get_path_from_gdal_str(fn).stem + return f"{base}_{idx}{extra}.vrt" + + +def warp_to_projection( + filenames: Sequence[PathOrStr], + dirname: PathOrStr, + projection: str, + res: Optional[tuple[float, float]] = None, + resample_alg: str = "lanczos", +) -> list[Path]: + """Warp a list of files to `projection`. + + If the input file's projection matches `projection`, the same file is returned. + Otherwise, a new file is created in `dirname` with the same name as the input file, + but with '_warped' appended. + + Parameters + ---------- + filenames : Sequence[PathOrStr] + list of filenames to warp. + dirname : PathOrStr + The directory to write the warped files to. + projection : str + The desired projection, as a WKT string or 'EPSG:XXXX' string. + res : tuple[float, float] + The desired [x, y] resolution. + resample_alg : str, default="lanczos" + Method for gdal to use for reprojection. + Default is lanczos (sinc-kernel) + + Returns + ------- + list[PathOrStr] + The warped filenames. + """ + if projection is None: + projection = _get_mode_projection(filenames) + if res is None: + res = _get_resolution(filenames) + + warped_files = [] + for idx, fn in enumerate(filenames): + fn = Path(fn) + ds = gdal.Open(fspath(fn)) + proj_in = ds.GetProjection() + if proj_in == projection: + warped_files.append(fn) + continue + warped_fn = Path(dirname) / _get_temp_filename(fn, idx, "_warped") + warped_fn = Path(dirname) / f"{fn.stem}_{idx}_warped.vrt" + from_srs_name = ds.GetSpatialRef().GetName() + to_srs_name = osr.SpatialReference(projection).GetName() + logger.info( + f"Reprojecting {fn} from {from_srs_name} to match mode projection" + f" {to_srs_name}" + ) + warped_files.append(warped_fn) + gdal.Warp( + fspath(warped_fn), + fspath(fn), + format="VRT", # Just creates a file that will warp on the fly + dstSRS=projection, + resampleAlg=resample_alg, + targetAlignedPixels=True, # align in multiples of dx, dy + xRes=res[0], + yRes=res[1], + ) + + return warped_files + + +def _get_mode_projection(filenames: Iterable[PathOrStr]) -> str: + """Get the most common projection in the list.""" + projs = [gdal.Open(fspath(fn)).GetProjection() for fn in filenames] + return max(set(projs), key=projs.count) + + +def _get_resolution(filenames: Iterable[PathOrStr]) -> tuple[float, float]: + """Get the most common resolution in the list.""" + gts = [gdal.Open(fspath(fn)).GetGeoTransform() for fn in filenames] + res = [(dx, dy) for (_, dx, _, _, _, dy) in gts] + if len(set(res)) > 1: + raise ValueError(f"The input files have different resolutions: {res}. ") + return res[0] + + +def get_combined_bounds_nodata( + *filenames: PathOrStr, + target_aligned_pixels: bool = False, + out_bounds: Optional[Bbox] = None, + out_bounds_epsg: Optional[int] = None, + strides: Mapping[str, int] = {"x": 1, "y": 1}, +) -> tuple[Bbox, str | float | None]: + """Get the bounds and nodata of the combined image. 
+ + Parameters + ---------- + filenames : list[PathOrStr] + list of filenames to combine + target_aligned_pixels : bool + if True, adjust output image bounds so that pixel coordinates + are integer multiples of pixel size, matching the `-tap` GDAL option. + out_bounds: Optional[Bbox] + if provided, forces the output image bounds to + (left, bottom, right, top). + Otherwise, computes from the outside of all input images. + out_bounds_epsg: Optional[int] + The EPSG of `out_bounds`. If not provided, assumed to be the same + as the EPSG of all `*filenames`. + strides : dict[str, int] + subsample factor: {"x": x strides, "y": y strides} + + Returns + ------- + bounds : Bbox + (min_x, min_y, max_x, max_y) + nodata : float + Nodata value of the input files + + Raises + ------ + ValueError: + If the inputs files have different resolutions/projections/nodata values + """ + # scan input files + xs = [] + ys = [] + resolutions = set() + projs = set() + nodatas = set() + + # Check all files match in resolution/projection + for fn in filenames: + ds = gdal.Open(fspath(fn)) + left, bottom, right, top = _io.get_raster_bounds(fn) + gt = ds.GetGeoTransform() + dx, dy = gt[1], gt[5] + + resolutions.add((abs(dx), abs(dy))) # dy is negative for north-up + projs.add(ds.GetProjection()) + + xs.extend([left, right]) + ys.extend([bottom, top]) + + nd = _io.get_raster_nodata(fn) + # Need to stringify 'nan', or it is repeatedly added + nodatas.add(str(nd) if (nd is not None and np.isnan(nd)) else nd) + + if len(resolutions) > 1: + raise ValueError(f"The input files have different resolutions: {resolutions}. ") + if len(projs) > 1: + raise ValueError(f"The input files have different projections: {projs}. ") + if len(nodatas) > 1: + raise ValueError(f"The input files have different nodata values: {nodatas}. ") + res = (abs(dx) * strides["x"], abs(dy) * strides["y"]) + + if out_bounds is not None: + if out_bounds_epsg is not None: + dst_epsg = _io.get_raster_crs(filenames[0]).to_epsg() + bounds = _reproject_bounds(out_bounds, out_bounds_epsg, dst_epsg) + else: + bounds = out_bounds # type: ignore + else: + bounds = min(xs), min(ys), max(xs), max(ys) + + if target_aligned_pixels: + bounds = _align_bounds(bounds, res) + + return bounds, list(nodatas)[0] + + +def _align_bounds(bounds: Iterable[float], res: tuple[float, float]): + """Align boundary with an integer multiple of the resolution.""" + left, bottom, right, top = bounds + left = math.floor(left / res[0]) * res[0] + right = math.ceil(right / res[0]) * res[0] + bottom = math.floor(bottom / res[1]) * res[1] + top = math.ceil(top / res[1]) * res[1] + return (left, bottom, right, top) + + +def _reproject_bounds(bounds: Bbox, src_epsg: int, dst_epsg: int) -> Bbox: + t = Transformer.from_crs(src_epsg, dst_epsg, always_xy=True) + left, bottom, right, top = bounds + bbox: Bbox = (*t.transform(left, bottom), *t.transform(right, top)) # type: ignore + return bbox + + +def get_transformed_bounds(filename: PathOrStr, epsg_code: Optional[int] = None): + """Get the bounds of a raster, possibly in a different CRS. + + Parameters + ---------- + filename : str + Path to the raster file. + epsg_code : Optional[int] + EPSG code of the CRS to transform to. + If not provided, or the raster is already in the desired CRS, + the bounds will not be transformed. 
+ + Returns + ------- + tuple + The bounds of the raster as (left, bottom, right, top) + """ + bounds = _io.get_raster_bounds(filename) + if epsg_code is None: + return bounds + from_epsg = _io.get_raster_crs(filename=filename).to_epsg() + assert from_epsg is not None + if from_epsg == epsg_code: + return bounds + + return _reproject_bounds(bounds, from_epsg, epsg_code) + + +def _copy_set_nodata( + infile: PathOrStr, + outfile: Optional[PathOrStr] = None, + ext: Optional[str] = None, + in_band: int = 1, + out_nodata: float = 0, + driver="GTiff", + creation_options=DEFAULT_TIFF_OPTIONS, +): + """Make a copy of infile and replace NaNs/input nodata with `out_nodata`.""" + in_p = Path(infile) + if outfile is None: + if ext is None: + ext = in_p.suffix + out_dir = in_p.parent + outfile = out_dir / (in_p.stem + "_tmp" + ext) + + ds_in = gdal.Open(fspath(infile)) + drv = gdal.GetDriverByName(driver) + ds_out = drv.CreateCopy(fspath(outfile), ds_in, options=creation_options) + + bnd = ds_in.GetRasterBand(in_band) + nodata = bnd.GetNoDataValue() + arr = bnd.ReadAsArray() + # also make sure to replace NaNs, even if nodata is not set + mask = np.logical_or(np.isnan(arr), arr == nodata) + arr[mask] = out_nodata + + bnd1 = ds_out.GetRasterBand(1) + bnd1.WriteArray(arr) + bnd1.SetNoDataValue(out_nodata) + ds_out = bnd1 = None + + return outfile + + +def warp_to_match( + input_file: PathOrStr, + match_file: PathOrStr, + output_file: Optional[PathOrStr] = None, + resample_alg: str = "near", + output_format: Optional[str] = None, +) -> Path: + """Reproject `input_file` to align with the `match_file`. + + Uses the bounds, resolution, and CRS of `match_file`. + + Parameters + ---------- + input_file: PathOrStr + Path to the image to be reprojected. + match_file: PathOrStr + Path to the input image to serve as a reference for the reprojected image. + Uses the bounds, resolution, and CRS of this image. + output_file: PathOrStr + Path to the output, reprojected image. + If None, creates an in-memory warped VRT using the `/vsimem/` protocol. + resample_alg: str, optional, default = "near" + Resampling algorithm to be used during reprojection. + See https://gdal.org/programs/gdalwarp.html#cmdoption-gdalwarp-r for choices. + output_format: str, optional, default = None + Output format to be used for the output image. + If None, gdal will try to infer the format from the output file extension, or + (if the extension of `output_file` matches `input_file`) use the input driver. + + Returns + ------- + Path + Path to the output image. + Same as `output_file` if provided, otherwise a path to the in-memory VRT. 
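+
+    Examples
+    --------
+    Warp a raster onto the grid of a reference image (paths are illustrative)::
+
+        warped = warp_to_match("dem.tif", match_file="los_east.tif")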
+ """ + bounds = _io.get_raster_bounds(match_file) + crs_wkt = _io.get_raster_crs(match_file).to_wkt() + gt = _io.get_raster_gt(match_file) + resolution = (gt[1], gt[5]) + + if output_file is None: + output_file = f"/vsimem/warped_{Path(input_file).stem}.vrt" + logger.debug(f"Creating in-memory warped VRT: {output_file}") + + if output_format is None and Path(input_file).suffix == Path(output_file).suffix: + output_format = _io.get_raster_driver(input_file) + + options = gdal.WarpOptions( + dstSRS=crs_wkt, + format=output_format, + xRes=resolution[0], + yRes=resolution[1], + outputBounds=bounds, + outputBoundsSRS=crs_wkt, + resampleAlg=resample_alg, + ) + gdal.Warp( + fspath(output_file), + fspath(input_file), + options=options, + ) + + return Path(output_file) diff --git a/tests/conftest.py b/tests/conftest.py index d28896b..e69de29 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +0,0 @@ -import pytest - -pytestmark = pytest.mark.filterwarnings( - "ignore::UserWarning:h5py is running against HDF5.*" -) diff --git a/tests/test_io.py b/tests/test_io.py new file mode 100644 index 0000000..9994aa1 --- /dev/null +++ b/tests/test_io.py @@ -0,0 +1,91 @@ +import numpy as np +import pytest +from pyproj import CRS + +try: + import rasterio as rio + import rasterio.transform + + RASTERIO_MISSING = False +except ImportError: + RASTERIO_MISSING = True + + +if RASTERIO_MISSING: + pytest.skip(reason="Rasterio not installed") + +from opera_utils import _io + + +@pytest.fixture(scope="module") +def bounds(): + return (-104, 30, -103, 33) + + +@pytest.fixture(scope="module") +def crs(): + return CRS.from_epsg(32611) + + +@pytest.fixture(scope="module") +def shape(): + return 30, 20 + + +@pytest.fixture(scope="module") +def transform(bounds, shape): + height, width = shape + return rasterio.transform.from_bounds(*bounds, width=width, height=height) + + +@pytest.fixture(scope="module") +def raster_file(tmp_path_factory, crs, transform, shape): + height, width = shape + img = np.random.rand(height, width).astype("float32") + + filename = tmp_path_factory.mktemp("data") / "raster.tif" + with rio.open( + filename, + "w", + driver="GTiff", + width=width, + height=height, + dtype="float32", + count=1, + crs=crs, + transform=transform, + nodata=np.nan, + ) as src: + src.write(img, 1) + + return filename + + +def test_get_raster_bounds(raster_file, bounds): + assert _io.get_raster_bounds(raster_file) == bounds + + +def test_get_raster_crs(raster_file, crs): + assert _io.get_raster_crs(raster_file) == crs + + +def test_get_raster_nodata(raster_file): + assert np.isnan(_io.get_raster_nodata(raster_file)) + + +def test_get_raster_transform(raster_file, transform): + assert _io.get_raster_transform(raster_file) == transform + + +def test_get_raster_get(raster_file, transform): + assert _io.get_raster_gt(raster_file) == transform.to_gdal() + + +def test_get_raster_dtype( + raster_file, +): + assert _io.get_raster_dtype(raster_file) == np.float32 + + +def test_get_raster_driver(raster_file): + assert _io.get_raster_driver(raster_file) == "GTiff" diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..8cf3ee3 --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,58 @@ +from pathlib import Path + +import pytest + +from opera_utils._utils import format_nc_filename, scratch_directory + + +def test_scratch_directory(tmp_path): + test_dir = tmp_path / "test_dir" + with scratch_directory(test_dir) as d: + (d / "test1.txt").write_text("asf") + assert not Path(test_dir).exists() + + 
with scratch_directory(test_dir, delete=False) as d: + (d / "test1.txt").write_text("asdf") + assert Path(test_dir).exists() + assert Path(test_dir / "test1.txt").read_text() == "asdf" + + +def test_scratch_directory_already_exists(tmp_path): + test_dir = tmp_path / "test_dir" + test_dir.mkdir() + assert Path(test_dir).exists() + + with scratch_directory(test_dir, delete=True) as d: + (d / "test1.txt").write_text("asdf") + assert Path(test_dir).exists() + + with scratch_directory(test_dir, delete=False) as d: + (d / "test1.txt").write_text("asdf") + assert Path(test_dir).exists() + + +def test_format_nc_filename(): + expected = 'NETCDF:"/usr/19990101/20200303_20210101.nc":"//variable"' + assert ( + format_nc_filename("/usr/19990101/20200303_20210101.nc", "variable") == expected + ) + + # check on Path + assert ( + format_nc_filename(Path("/usr/19990101/20200303_20210101.nc"), "variable") + == expected + ) + + # check non-netcdf file + assert ( + format_nc_filename("/usr/19990101/20200303_20210101.tif") + == "/usr/19990101/20200303_20210101.tif" + ) + assert ( + format_nc_filename("/usr/19990101/20200303_20210101.int", "ignored") + == "/usr/19990101/20200303_20210101.int" + ) + + with pytest.raises(ValueError): + # Missing the subdataset name + format_nc_filename("/usr/19990101/20200303_20210101.nc")
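+
+    # Already-formatted GDAL strings should pass through unchanged
+    assert format_nc_filename(expected) == expected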