Skip to content

Commit

Permalink
Add group_by_date functions (#10)
Browse files Browse the repository at this point in the history
* add `group_by_date`

* rename index when loading geodataframes

* add `typing_extensions` and ignore transient h5py warning
  • Loading branch information
scottstanie authored Dec 8, 2023
1 parent 80fc976 commit 528a1e7
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 110 deletions.
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ dependencies:
- pooch>=1.7
- pyproj>=3.3
- shapely>=1.8
- typing_extensions>=4
74 changes: 63 additions & 11 deletions src/opera_utils/_dates.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
from __future__ import annotations

import datetime
import itertools
import re
from collections import defaultdict
from pathlib import Path
from typing import Iterable, overload

from ._types import Filename, PathLikeT
from ._types import DateOrDatetime, Filename, PathLikeT

__all__ = [
"get_dates",
"filter_by_date",
"group_by_date",
"DATE_FORMAT",
]

DATE_FORMAT = "%Y%m%d"
DATETIME_FORMAT = "%Y%m%dT%H%M%S"


def get_dates(filename: Filename, fmt: str = DATE_FORMAT) -> list[datetime.date]:
def get_dates(filename: Filename, fmt: str = DATE_FORMAT) -> list[datetime.datetime]:
"""Search for dates in the stem of `filename` matching `fmt`.
Excludes dates that are not in the stem of `filename` (in the directories).
Expand All @@ -31,15 +34,15 @@ def get_dates(filename: Filename, fmt: str = DATE_FORMAT) -> list[datetime.date]
Returns
-------
list[datetime.date]
list[datetime.datetime]
list of dates found in the stem of `filename` matching `fmt`.
Examples
--------
>>> get_dates("/path/to/20191231.slc.tif")
[datetime.date(2019, 12, 31)]
[datetime.datetime(2019, 12, 31, 0, 0)]
>>> get_dates("S1A_IW_SLC__1SDV_20191231T000000_20191231T000000_032123_03B8F1_1C1D.nc")
[datetime.date(2019, 12, 31), datetime.date(2019, 12, 31)]
[datetime.datetime(2019, 12, 31, 0, 0), datetime.datetime(2019, 12, 31, 0, 0)]
>>> get_dates("/not/a/date_named_file.tif")
[]
""" # noqa: E501
Expand All @@ -54,7 +57,7 @@ def get_dates(filename: Filename, fmt: str = DATE_FORMAT) -> list[datetime.date]
@overload
def filter_by_date(
files: Iterable[PathLikeT],
dates: Iterable[datetime.date],
dates: Iterable[DateOrDatetime],
fmt: str = DATE_FORMAT,
) -> list[PathLikeT]:
...
Expand All @@ -63,7 +66,7 @@ def filter_by_date(
@overload
def filter_by_date(
files: Iterable[str],
dates: Iterable[datetime.date],
dates: Iterable[DateOrDatetime],
fmt: str = DATE_FORMAT,
) -> list[str]:
...
Expand All @@ -80,6 +83,11 @@ def filter_by_date(files, dates, fmt=DATE_FORMAT):
Iterable of dates to filter by
fmt : str, optional
Format of date to search for. Default is %Y%m%d
Returns
-------
list[PathLikeT]
Items in `files`
"""
date_set = set(dates)
out = []
Expand All @@ -90,8 +98,51 @@ def filter_by_date(files, dates, fmt=DATE_FORMAT):
return out


def _parse_date(datestr: str, fmt: str = DATE_FORMAT) -> datetime.date:
return datetime.datetime.strptime(datestr, fmt).date()
def group_by_date(
    files: Iterable[PathLikeT], file_date_fmt: str = DATE_FORMAT
) -> dict[tuple[datetime.datetime, ...], list[PathLikeT]]:
    """Group files by the dates found in their filenames.

    Parameters
    ----------
    files : Iterable[PathLikeT]
        Files whose names contain dates. The input does not need to be
        sorted by date.
    file_date_fmt : str, optional
        Format of the date in the filename.
        Default is `DATE_FORMAT` ("%Y%m%d").

    Returns
    -------
    dict
        Each key is the tuple of dates that `get_dates` finds in a filename.
        Each value is the list of files sharing that tuple, in input order.
        Files with no date in their name are grouped under the empty tuple.
        E.g.:
        {
            (datetime.datetime(2017, 10, 13),): [Path(...), Path(...), ...],
            (datetime.datetime(2017, 10, 25),): [Path(...), Path(...), ...],
        }
    """
    # Use a `defaultdict` so we don't have to sort the files by date in
    # advance: each file is appended to its group, creating it when new.
    grouped_images: dict[tuple[datetime.datetime, ...], list[PathLikeT]] = defaultdict(
        list
    )
    for file in files:
        grouped_images[tuple(get_dates(file, fmt=file_date_fmt))].append(file)

    # Return a plain dict so that looking up a missing key raises `KeyError`
    # instead of silently inserting an empty group.
    return dict(grouped_images)


def _parse_date(datestr: str, fmt: str = DATE_FORMAT) -> datetime.datetime:
    """Parse `datestr` according to `fmt` and return the `datetime.datetime`."""
    parsed = datetime.datetime.strptime(datestr, fmt)
    return parsed


def _get_path_from_gdal_str(name: Filename) -> Path:
Expand All @@ -111,8 +162,6 @@ def _get_path_from_gdal_str(name: Filename) -> Path:
def _date_format_to_regex(date_format: str) -> re.Pattern:
r"""Convert a python date format string to a regular expression.
Useful for Year, month, date date formats.
Parameters
----------
date_format : str
Expand All @@ -139,6 +188,9 @@ def _date_format_to_regex(date_format: str) -> re.Pattern:
date_format = date_format.replace("%Y", r"\d{4}")
date_format = date_format.replace("%m", r"\d{2}")
date_format = date_format.replace("%d", r"\d{2}")
date_format = date_format.replace("%H", r"\d{2}")
date_format = date_format.replace("%M", r"\d{2}")
date_format = date_format.replace("%S", r"\d{2}")

# Return the resulting regular expression
return re.compile(date_format)
3 changes: 3 additions & 0 deletions src/opera_utils/_types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import datetime
import sys
from os import PathLike
from typing import TYPE_CHECKING, Tuple, TypeVar, Union
Expand Down Expand Up @@ -28,3 +29,5 @@
# Used for callable types
T = TypeVar("T")
P = ParamSpec("P")

DateOrDatetime = Union[datetime.datetime, datetime.date]
8 changes: 7 additions & 1 deletion src/opera_utils/burst_frame_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ def get_frame_geojson(
as_geodataframe=as_geodataframe,
columns=columns,
where=where,
index_name="frame_id",
)


Expand All @@ -81,6 +82,7 @@ def get_burst_id_geojson(
as_geodataframe=as_geodataframe,
columns=columns,
where=where,
index_name="burst_id_jpl",
)


Expand All @@ -96,6 +98,7 @@ def _get_geojson(
as_geodataframe: bool = False,
columns: Optional[Sequence[str]] = None,
where: Optional[str] = None,
index_name: Optional[str] = None,
) -> dict:
# https://gdal.org/user/ogr_sql_dialect.html#where
# https://pyogrio.readthedocs.io/en/latest/introduction.html#filter-records-by-attribute-value
Expand All @@ -104,7 +107,10 @@ def _get_geojson(

# import geopandas as gpd
# return gpd.read_file(f)
return read_dataframe(f, columns=columns, where=where)
gdf = read_dataframe(f, columns=columns, where=where, fid_as_index=True)
if index_name:
gdf.index.name = index_name
return gdf

return read_zipped_json(f)

Expand Down
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
import pytest

# Silence the transient h5py/HDF5 version-mismatch warning.
# Filter spec fields are `action:message:category:module:lineno`; the message
# regex must sit in the second field. Placing it after the category would make
# it a *module* pattern, which never matches this warning.
# NOTE(review): pytest documents `pytestmark` for test modules and classes;
# confirm it is honored when defined in conftest.py.
pytestmark = pytest.mark.filterwarnings(
    "ignore:h5py is running against HDF5.*:UserWarning"
)
100 changes: 56 additions & 44 deletions src/opera_utils/test_dates.py → tests/test_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,21 +6,22 @@

def test_date_format_to_regex():
# Test date format strings with different specifiers and delimiters
date_formats = ["%Y-%m-%d", "%Y/%m/%d", "%Y%m%d", "%d-%m-%Y", "%m/%d/%Y"]
matching_dates = [
("%Y-%m-%d", "2021-01-01"),
("%Y/%m/%d", "2022/02/02"),
("%Y%m%d", "20230103"),
("%d-%m-%Y", "01-04-2024"),
("%m/%d/%Y", "05/06/2025"),
"2021-01-01",
"2022/02/02",
"20230103",
"01-04-2024",
"05/06/2025",
"20210301",
]
for date_format, date in matching_dates:
for date_format, date in zip(date_formats, matching_dates):
pattern = _dates._date_format_to_regex(date_format)

# Test that the date matches the regular expression
assert pattern.match(date) is not None

# Test date formats that should not match the dates in "non_matching_dates"
date_formats = ["%Y-%m-%d", "%Y/%m/%d", "%Y%m%d", "%d-%m-%Y", "%m/%d/%Y"]
non_matching_dates = ["01-01-2021", "2022-02-03", "2022-03-04", "2022/05/06"]
for date, date_format in zip(non_matching_dates, date_formats):
pattern = _dates._date_format_to_regex(date_format)
Expand All @@ -29,25 +30,35 @@ def test_date_format_to_regex():
assert pattern.match(date) is None


def test_datetime_format_to_regex():
    """A Sentinel-1-like datetime string matches the regex built from its format."""
    sentinel_fmt = "%Y%m%dT%H%M%S"
    regex = _dates._date_format_to_regex(sentinel_fmt)

    # `match` returns None (falsy) when the string does not fit the pattern
    assert regex.match("20221204T005230")


def test_get_dates():
assert _dates.get_dates("20200303_20210101.int") == [
datetime.date(2020, 3, 3),
datetime.date(2021, 1, 1),
datetime.datetime(2020, 3, 3),
datetime.datetime(2021, 1, 1),
]

assert _dates.get_dates("20200303.slc")[0] == datetime.date(2020, 3, 3)
assert _dates.get_dates(Path("20200303.slc"))[0] == datetime.date(2020, 3, 3)
assert _dates.get_dates("20200303.slc")[0] == datetime.datetime(2020, 3, 3)
assert _dates.get_dates(Path("20200303.slc"))[0] == datetime.datetime(2020, 3, 3)
# Check that it's the filename, not the path
assert _dates.get_dates(Path("/usr/19990101/asdf20200303.tif"))[0] == datetime.date(
2020, 3, 3
)
assert _dates.get_dates("/usr/19990101/asdf20200303.tif")[0] == datetime.date(
assert _dates.get_dates(Path("/usr/19990101/asdf20200303.tif"))[
0
] == datetime.datetime(2020, 3, 3)
assert _dates.get_dates("/usr/19990101/asdf20200303.tif")[0] == datetime.datetime(
2020, 3, 3
)

assert _dates.get_dates("/usr/19990101/20200303_20210101.int") == [
datetime.date(2020, 3, 3),
datetime.date(2021, 1, 1),
datetime.datetime(2020, 3, 3),
datetime.datetime(2021, 1, 1),
]

assert _dates.get_dates("/usr/19990101/notadate.tif") == []
Expand All @@ -57,44 +68,45 @@ def test_get_dates_with_format():
# try other date formats
fmt = "%Y-%m-%d"
assert _dates.get_dates("2020-03-03_2021-01-01.int", fmt) == [
datetime.date(2020, 3, 3),
datetime.date(2021, 1, 1),
datetime.datetime(2020, 3, 3),
datetime.datetime(2021, 1, 1),
]

fmt = "%Y%m%dT%H%M%S"
# Check the OPERA name
fn = "OPERA_L2_CSLC-S1_T087-185678-IW2_20180210T232711Z_20230101T100506Z_S1A_VV_v1.0.h5"
assert _dates.get_dates(fn, fmt) == [
datetime.datetime(2018, 2, 10, 23, 27, 11),
datetime.datetime(2023, 1, 1, 10, 5, 6),
]

# Check the Sentinel name
fn = "S1A_IW_SLC__1SDV_20221204T005230_20221204T005257_046175_05873C_3B80.zip"
assert _dates.get_dates(fn, fmt) == [
datetime.datetime(2022, 12, 4, 0, 52, 30),
datetime.datetime(2022, 12, 4, 0, 52, 57),
]

# Check without a format using default
assert _dates.get_dates(fn) == [
datetime.datetime(2022, 12, 4, 0, 0, 0),
datetime.datetime(2022, 12, 4, 0, 0, 0),
]


def test_get_dates_with_gdal_string():
# Checks that it can parse 'NETCDF:"/path/to/file.nc":variable'
assert _dates.get_dates('NETCDF:"/usr/19990101/20200303_20210101.nc":variable') == [
datetime.date(2020, 3, 3),
datetime.date(2021, 1, 1),
datetime.datetime(2020, 3, 3),
datetime.datetime(2021, 1, 1),
]
assert _dates.get_dates(
'NETCDF:"/usr/19990101/20200303_20210101.nc":"//variable/2"'
) == [
datetime.date(2020, 3, 3),
datetime.date(2021, 1, 1),
datetime.datetime(2020, 3, 3),
datetime.datetime(2021, 1, 1),
]
# Check the derived dataset name too
assert _dates.get_dates(
'DERIVED_SUBDATASET:AMPLITUDE:"/usr/19990101/20200303_20210101.int"'
) == [datetime.date(2020, 3, 3), datetime.date(2021, 1, 1)]


def test_filter_by_date():
files = [
"slc_20200303.tif",
"slc_20190101.tif",
"slc_20210101.tif",
"slc_20180101.tif",
]
dates = [datetime.date(2018, 1, 1), datetime.date(2019, 1, 1)]
expected_files = ["slc_20190101.tif", "slc_20180101.tif"]

filtered_files = _dates.filter_by_date(files, dates)
assert filtered_files == expected_files

# Check that it works with Path objects
files = [Path(f) for f in files]
expected_files = [Path(f) for f in expected_files]
filtered_files = _dates.filter_by_date(files, dates)
assert filtered_files == expected_files
) == [datetime.datetime(2020, 3, 3), datetime.datetime(2021, 1, 1)]
Loading

0 comments on commit 528a1e7

Please sign in to comment.