
Commit e30f0d1

v0.3.3 repartitions output on parent cell
1 parent 0d84f56 · commit e30f0d1

File tree

- README.md
- pyproject.toml
- vector2dggs/h3.py

3 files changed: +110 −12 lines


README.md

Lines changed: 2 additions & 2 deletions

````diff
@@ -152,13 +152,13 @@ vector2dggs h3 -v DEBUG -id ogc_fid -r 9 -p 5 -t 4 --overwrite -tbl topo50_lake
   title={{vector2dggs}},
   author={Ardo, James and Law, Richard},
   url={https://github.com/manaakiwhenua/vector2dggs},
-  version={0.3.2},
+  version={0.3.3},
   date={2023-04-20}
 }
 ```
 
 APA/Harvard
 
-> Ardo, J., & Law, R. (2023). vector2dggs (0.3.2) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
+> Ardo, J., & Law, R. (2023). vector2dggs (0.3.3) [Computer software]. https://github.com/manaakiwhenua/vector2dggs
 
 [![manaakiwhenua-standards](https://github.com/manaakiwhenua/vector2dggs/workflows/manaakiwhenua-standards/badge.svg)](https://github.com/manaakiwhenua/manaakiwhenua-standards)
````

pyproject.toml

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "vector2dggs"
-version = "0.3.2"
+version = "0.3.3"
 description = "CLI DGGS indexer for vector geospatial data"
 authors = ["James Ardo <ardoj@landcareresearch.co.nz>"]
 maintainers = ["Richard Law <lawr@landcareresearch.co.nz>"]
```

vector2dggs/h3.py

Lines changed: 107 additions & 9 deletions
```diff
@@ -37,8 +37,34 @@
 ) # This is to filter out the polyfill warnings when rows failed to get indexed at a resolution, can be commented out to find missing rows
 
 
+DEFAULT_PARENT_OFFSET = 6
+
+
+class ParentResolutionException(Exception):
+    pass
+
+
+def _get_parent_res(parent_res: Union[None, int], resolution: int):
+    """
+    Uses a parent resolution,
+    OR,
+    given a target resolution, returns our recommended parent resolution.
+
+    Used for intermediate re-partitioning.
+    """
+    return (
+        int(parent_res)
+        if parent_res is not None
+        else max(MIN_H3, (resolution - DEFAULT_PARENT_OFFSET))
+    )
+
+
 def polyfill(
-    pq_in: Path, spatial_sort_col: str, resolution: int, output_directory: str
+    pq_in: Path,
+    spatial_sort_col: str,
+    resolution: int,
+    parent_res: Union[None, int],
+    output_directory: str,
 ) -> None:
     """
     Reads a geoparquet, performs H3 polyfilling,
```
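The default in `_get_parent_res` is six levels coarser than the target resolution, floored at H3's minimum. A minimal sketch of that arithmetic, assuming `MIN_H3 = 0` (the constant itself is defined elsewhere in the module):

```python
MIN_H3 = 0  # assumed: H3's coarsest resolution; defined elsewhere in h3.py
DEFAULT_PARENT_OFFSET = 6

def get_parent_res(parent_res, resolution):
    # An explicit parent resolution wins; otherwise step back six levels,
    # never going coarser than MIN_H3
    return (
        int(parent_res)
        if parent_res is not None
        else max(MIN_H3, resolution - DEFAULT_PARENT_OFFSET)
    )

print(get_parent_res(None, 9))  # 3  (9 - 6)
print(get_parent_res(None, 4))  # 0  (floored at MIN_H3)
print(get_parent_res("5", 9))   # 5  (explicit value passed through)
```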
```diff
@@ -52,6 +78,9 @@ def polyfill(
     )
     df = pd.DataFrame(df).drop(columns=["index", "geometry"])
     df.index.rename(f"h3_{resolution:02}", inplace=True)
+    parent_res: int = _get_parent_res(parent_res, resolution)
+    # Secondary (parent) H3 index, used later for partitioning
+    df = df.h3.h3_to_parent(parent_res).reset_index()
     df.to_parquet(
         PurePath(output_directory, pq_in.name),
         engine="auto",
```
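`polyfill` now attaches a secondary, coarser cell per row via h3pandas' `h3_to_parent` accessor. The underlying parent lookup can be illustrated with the plain `h3` package (the v3 API is assumed, matching the h3pandas stack of this era; the coordinates are an arbitrary example point):

```python
import h3  # assumes the h3 v3 API

resolution, parent_res = 9, 3
cell = h3.geo_to_h3(-41.3, 174.8, resolution)  # a res-9 cell
parent = h3.h3_to_parent(cell, parent_res)     # its res-3 ancestor
print(f"h3_{resolution:02} {cell} -> h3_{parent_res:02} {parent}")
```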
```diff
@@ -60,14 +89,59 @@
     return None
 
 
-def polyfill_star(args):
+def polyfill_star(args) -> None:
     return polyfill(*args)
 
 
+def _parent_partitioning(
+    input_dir: Path,
+    output_dir: Path,
+    resolution,
+    parent_res: Union[None, int],
+    **kwargs,
+) -> Path:
+    parent_res: int = _get_parent_res(parent_res, resolution)
+    with TqdmCallback(desc="Reading spatial partitions"):
+        # Set index as parent cell
+        ddf = dd.read_parquet(input_dir, engine="pyarrow").set_index(
+            f"h3_{parent_res:02}"
+        )
+    with TqdmCallback(desc="Counting parents"):
+        # Count parents, to get target number of partitions
+        uniqueh3 = sorted(list(ddf.index.unique().compute()))
+
+    LOGGER.debug(
+        "Repartitioning into %d partitions, based on parent cells",
+        len(uniqueh3) + 1,
+    )
+
+    with TqdmCallback(desc="Repartitioning"):
+        ddf = (
+            ddf.repartition(  # See "notes" on why divisions expects repetition of the last item https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.repartition.html
+                divisions=(uniqueh3 + [uniqueh3[-1]]), force=True
+            )
+            .reset_index()
+            .set_index(f"h3_{resolution:02}")
+            .drop(columns=[f"h3_{parent_res:02}"])
+            .to_parquet(
+                output_dir,
+                overwrite=kwargs.get("overwrite", False),
+                engine=kwargs.get("engine", "pyarrow"),
+                write_index=True,
+                # append=False,
+                name_function=lambda i: f"{uniqueh3[i]}.parquet",
+                compression=kwargs.get("compression", "ZSTD"),
+            )
+        )
+    LOGGER.debug("Parent cell repartitioning complete")
+    return output_dir
+
+
 def _index(
     input_file: Union[Path, str],
     output_directory: Union[Path, str],
     resolution: int,
+    parent_res: Union[None, int],
     keep_attributes: bool,
     npartitions: int,
     spatial_sorting: str,
```
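The crux of `_parent_partitioning` is Dask's divisions contract: n partitions are described by n + 1 sorted boundary values, and only the final division is closed on the right, hence appending a repeat of the last parent cell so that the last parent gets a partition of its own. A toy sketch of that contract, with short strings standing in for H3 cell ids:

```python
import pandas as pd
import dask.dataframe as dd

# Toy frame indexed by a sorted "parent cell" key
df = pd.DataFrame({"v": range(6)}, index=pd.Index(list("aabbcc"), name="parent"))
ddf = dd.from_pandas(df, npartitions=1)

parents = sorted(ddf.index.unique().compute())  # ['a', 'b', 'c']
# n+1 divisions give n partitions; repeating the last value makes the final
# division [c, c] its own partition (the last edge is inclusive)
ddf = ddf.repartition(divisions=parents + [parents[-1]], force=True)
print(ddf.npartitions)  # 3: one partition per parent cell
```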
```diff
@@ -78,6 +152,7 @@ def _index(
     con: Union[sqlalchemy.engine.Connection, sqlalchemy.engine.Engine] = None,
     table: str = None,
     geom_col: str = "geom",
+    overwrite: bool = False,
 ) -> Path:
     """
     Performs multi-threaded H3 polyfilling on (multi)polygons.
```
```diff
@@ -138,7 +213,7 @@ def _index(
         else f"{spatial_sorting}_distance"
     )
 
-    with tempfile.TemporaryDirectory() as tmpdir:
+    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir:
         with TqdmCallback():
             ddf.to_parquet(tmpdir, overwrite=True)
 
```
```diff
@@ -149,12 +224,19 @@ def _index(
         "H3 Indexing on spatial partitions by polyfill with H3 resolution: %d",
         resolution,
     )
-    with Pool(processes=processes) as pool:
-        args = [
-            (filepath, spatial_sort_col, resolution, output_directory)
-            for filepath in filepaths
-        ]
-        list(tqdm(pool.imap(polyfill_star, args), total=len(args)))
+    with tempfile.TemporaryDirectory(suffix=".parquet") as tmpdir2:
+        with Pool(processes=processes) as pool:
+            args = [
+                (filepath, spatial_sort_col, resolution, parent_res, tmpdir2)
+                for filepath in filepaths
+            ]
+            list(tqdm(pool.imap(polyfill_star, args), total=len(args)))
+
+        output_directory = _parent_partitioning(
+            tmpdir2, output_directory, resolution, parent_res, overwrite=overwrite
+        )
+
+    return output_directory
 
 
 @click.command(context_settings={"show_default": True})
```
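Indexing is now two-stage: workers polyfill into a throwaway staging directory, and only then is the result rewritten into its final parent-partitioned layout. The `polyfill_star` indirection exists because `Pool.imap` hands each worker a single object, so argument tuples must be unpacked by a wrapper. A self-contained sketch of that pattern (the `work` function and paths are hypothetical):

```python
from multiprocessing import Pool

def work(path: str, resolution: int, out_dir: str) -> str:
    # Hypothetical worker standing in for polyfill()
    return f"{out_dir}/{path}@{resolution}"

def work_star(args) -> str:
    # Pool.imap passes a single object per task; unpack the tuple here
    return work(*args)

if __name__ == "__main__":
    tasks = [(p, 9, "staging.parquet") for p in ("a.parquet", "b.parquet")]
    with Pool(processes=2) as pool:
        print(list(pool.imap(work_star, tasks)))
```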
```diff
@@ -169,6 +251,13 @@ def _index(
     help="H3 resolution to index",
     nargs=1,
 )
+@click.option(
+    "-pr",
+    "--parent_res",
+    required=False,
+    type=click.Choice(list(map(str, range(MIN_H3, MAX_H3 + 1)))),
+    help="H3 Parent resolution for the output partition. Defaults to resolution - 6",
+)
 @click.option(
     "-id",
     "--id_field",
```
```diff
@@ -253,6 +342,7 @@ def h3(
     vector_input: Union[str, Path],
     output_directory: Union[str, Path],
     resolution: str,
+    parent_res: str,
     id_field: str,
     keep_attributes: bool,
     partitions: int,
```
```diff
@@ -270,6 +360,12 @@ def h3(
     VECTOR_INPUT is the path to input vector geospatial data.
     OUTPUT_DIRECTORY should be a directory, not a file or database table, as it will instead be the write location for an Apache Parquet data store.
     """
+    if parent_res is not None and not int(parent_res) < int(resolution):
+        raise ParentResolutionException(
+            "Parent resolution ({pr}) must be less than target resolution ({r})".format(
+                pr=parent_res, r=resolution
+            )
+        )
     con: sqlalchemy.engine.Connection = None
     scheme: str = urlparse(vector_input).scheme
     if bool(scheme) and scheme != "file":
```
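The guard reads as a double negative: it rejects any explicit parent resolution that is not strictly less than (that is, not coarser than) the target. Worked through with example values:

```python
class ParentResolutionException(Exception):
    pass

def validate(parent_res, resolution) -> None:
    # Rejects parent_res >= resolution; None (use the default) always passes
    if parent_res is not None and not int(parent_res) < int(resolution):
        raise ParentResolutionException(
            f"Parent resolution ({parent_res}) must be less than target resolution ({resolution})"
        )

validate("4", "9")   # ok: 4 < 9
validate(None, "9")  # ok: default parent resolution will be derived
validate("9", "9")   # raises ParentResolutionException
```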
```diff
@@ -305,6 +401,7 @@ def h3(
         vector_input,
         output_directory,
         int(resolution),
+        parent_res,
         keep_attributes,
         partitions,
         spatial_sorting,
```
```diff
@@ -315,4 +412,5 @@ def h3(
         con=con,
         table=table,
         geom_col=geom_col,
+        overwrite=overwrite,
     )
```
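Net effect of the release: the final Parquet store is written with one file per parent cell (via `name_function`), with rows indexed by the target-resolution cell. A hypothetical consumption sketch (the store path and cell id are illustrative only):

```python
import dask.dataframe as dd

# Hypothetical layout: output_directory/<parent_cell>.parquet, one per parent
store = "output_directory"  # illustrative path
whole = dd.read_parquet(store, engine="pyarrow")            # the whole store
one = dd.read_parquet(f"{store}/83bb50fffffffff.parquet")   # a single parent cell
print(whole.index.name)  # e.g. h3_09: indexed by the target-resolution cell
```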
