Skip to content

Commit f7c3aa5

Browse files
authored
feat(r): Optimize conversion from sfc to ArrowArray (#76)
* first stab * first go * maybe fix * points * maybe better preallocation * tidy * nanoarrow compat * coverage * use wk_meta * better
1 parent 23beab8 commit f7c3aa5

File tree

5 files changed

+317
-10
lines changed

5 files changed

+317
-10
lines changed

r/geoarrow/NAMESPACE

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33
S3method(as_geoarrow_array,character)
44
S3method(as_geoarrow_array,default)
55
S3method(as_geoarrow_array,nanoarrow_array)
6+
S3method(as_geoarrow_array,sfc)
67
S3method(as_geoarrow_array,wk_wkb)
78
S3method(as_geoarrow_array,wk_wkt)
89
S3method(as_geoarrow_array,wk_xy)
910
S3method(as_geoarrow_array_stream,default)
1011
S3method(as_geoarrow_array_stream,nanoarrow_array_stream)
12+
S3method(as_nanoarrow_array,sfc)
1113
S3method(infer_geoarrow_schema,default)
1214
S3method(infer_geoarrow_schema,nanoarrow_array)
1315
S3method(infer_geoarrow_schema,nanoarrow_array_stream)
@@ -26,5 +28,6 @@ export(na_extension_large_wkb)
2628
export(na_extension_large_wkt)
2729
export(na_extension_wkb)
2830
export(na_extension_wkt)
31+
importFrom(nanoarrow,as_nanoarrow_array)
2932
importFrom(nanoarrow,infer_nanoarrow_schema)
3033
useDynLib(geoarrow, .registration = TRUE)

r/geoarrow/R/sf-compat.R

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,34 @@ infer_nanoarrow_schema.sfc <- function(x, ...) {
55
infer_geoarrow_schema(x)
66
}
77

8-
# Eventually we can add specializations for as_geoarrow_array() based on
9-
# st_as_grob(), which is very fast and generates lengths + a column-major
10-
# matrix full of buffers we can provide views into.
8+
#' @export
9+
as_geoarrow_array.sfc <- function(x, ..., schema = NULL) {
  # A caller-supplied output schema is handled by the default method
  if (!is.null(schema)) {
    return(NextMethod())
  }

  vector_meta <- wk::wk_vector_meta(x)

  # M values are handled by the default method: the optimized path can deal
  # with mixed XY and XYZ but not with mixed XYZ/XYZM/XYM
  if (vector_meta$has_m) {
    return(NextMethod())
  }

  # Only geometry type codes 1 through 6 take the optimized path
  if (!(vector_meta$geometry_type %in% 1:6)) {
    return(NextMethod())
  }

  # Optimized path: the C routine fills a freshly allocated array in place,
  # after which the inferred schema is attached to it
  schema <- infer_geoarrow_schema(x)
  out_array <- nanoarrow::nanoarrow_allocate_array()
  .Call(geoarrow_c_as_nanoarrow_array_sfc, x, schema, out_array)
  nanoarrow::nanoarrow_array_set_schema(out_array, schema)
  out_array
}
33+
34+
#' @importFrom nanoarrow as_nanoarrow_array
35+
#' @export
36+
as_nanoarrow_array.sfc <- function(x, ..., schema = NULL) {
  # Conversion of sfc objects is delegated to as_geoarrow_array(), forwarding
  # every argument unchanged
  geoarrow_array <- as_geoarrow_array(x, ..., schema = schema)
  geoarrow_array
}

r/geoarrow/src/r-init.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ SEXP geoarrow_c_kernel(SEXP kernel_name_sexp, SEXP arg_types_sexp, SEXP options_
1313
SEXP schema_out_xptr);
1414
SEXP geoarrow_c_kernel_push(SEXP kernel_xptr, SEXP args_sexp, SEXP array_out_xptr);
1515
SEXP geoarrow_c_kernel_finish(SEXP kernel_xptr, SEXP array_out_xptr);
16+
SEXP geoarrow_c_as_nanoarrow_array_sfc(SEXP sfc, SEXP schema_xptr, SEXP array_xptr);
1617

1718
static const R_CallMethodDef CallEntries[] = {
1819
{"geoarrow_c_handle_stream", (DL_FUNC)&geoarrow_c_handle_stream, 2},
@@ -23,6 +24,7 @@ static const R_CallMethodDef CallEntries[] = {
2324
{"geoarrow_c_kernel", (DL_FUNC)&geoarrow_c_kernel, 4},
2425
{"geoarrow_c_kernel_push", (DL_FUNC)&geoarrow_c_kernel_push, 3},
2526
{"geoarrow_c_kernel_finish", (DL_FUNC)&geoarrow_c_kernel_finish, 2},
27+
{"geoarrow_c_as_nanoarrow_array_sfc", (DL_FUNC)&geoarrow_c_as_nanoarrow_array_sfc, 3},
2628
{NULL, NULL, 0}};
2729

2830
void R_init_geoarrow(DllInfo* dll) {

r/geoarrow/src/r-sf-compat.c

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
#define R_NO_REMAP
2+
#include <R.h>
3+
#include <Rinternals.h>
4+
5+
#include "geoarrow.h"
6+
#include "nanoarrow.h"
7+
8+
static inline int builder_append_sfg(SEXP item, struct GeoArrowBuilder* builder,
9+
int level, int32_t* current_offsets) {
10+
switch (TYPEOF(item)) {
11+
// Level of nesting
12+
case VECSXP: {
13+
if (level >= builder->view.n_offsets) {
14+
Rf_error("Unexpected level of nesting whilst buliding ArrowArray from sfc");
15+
}
16+
17+
int32_t n = Rf_length(item);
18+
current_offsets[level] += n;
19+
NANOARROW_RETURN_NOT_OK(
20+
GeoArrowBuilderOffsetAppend(builder, level, current_offsets + level, 1));
21+
for (int32_t i = 0; i < n; i++) {
22+
builder_append_sfg(VECTOR_ELT(item, i), builder, level + 1, current_offsets);
23+
}
24+
break;
25+
}
26+
27+
// Matrix containing a coordinate sequence
28+
case REALSXP: {
29+
int32_t n = Rf_nrows(item);
30+
current_offsets[level] += n;
31+
NANOARROW_RETURN_NOT_OK(
32+
GeoArrowBuilderOffsetAppend(builder, level, current_offsets + level, 1));
33+
34+
if (n == 0) {
35+
return GEOARROW_OK;
36+
}
37+
38+
int n_col = Rf_ncols(item);
39+
double* coords = REAL(item);
40+
struct GeoArrowBufferView view;
41+
view.data = (uint8_t*)coords;
42+
view.size_bytes = n * sizeof(double);
43+
44+
int first_coord_buffer = 1 + builder->view.n_offsets;
45+
for (int i = 0; i < n_col; i++) {
46+
// Omit dimensions in sfc but not in builder
47+
if (i >= builder->view.coords.n_values) {
48+
break;
49+
}
50+
51+
NANOARROW_RETURN_NOT_OK(
52+
GeoArrowBuilderAppendBuffer(builder, first_coord_buffer + i, view));
53+
view.data += view.size_bytes;
54+
}
55+
56+
// Fill dimensions in builder but not in sfc with nan
57+
for (int i = n_col; i < builder->view.coords.n_values; i++) {
58+
double nan_dbl = NAN;
59+
view.data = (uint8_t*)&nan_dbl;
60+
view.size_bytes = sizeof(double);
61+
NANOARROW_RETURN_NOT_OK(GeoArrowBuilderReserveBuffer(
62+
builder, first_coord_buffer + i, n * sizeof(double)));
63+
for (int j = 0; j < n; j++) {
64+
GeoArrowBuilderAppendBufferUnsafe(builder, first_coord_buffer + i, view);
65+
}
66+
}
67+
break;
68+
}
69+
70+
default:
71+
Rf_error("Unexpected element whilst building ArrowArray from sfc");
72+
}
73+
74+
return GEOARROW_OK;
75+
}
76+
77+
static inline int builder_append_sfc_point(SEXP sfc, struct GeoArrowBuilder* builder) {
78+
R_xlen_t n = Rf_xlength(sfc);
79+
80+
for (int i = 0; i < builder->view.coords.n_values; i++) {
81+
NANOARROW_RETURN_NOT_OK(GeoArrowBuilderCoordsReserve(builder, n));
82+
}
83+
84+
SEXP item_sexp;
85+
double* item;
86+
int coord_size;
87+
for (R_xlen_t i = 0; i < n; i++) {
88+
item_sexp = VECTOR_ELT(sfc, i);
89+
item = REAL(item_sexp);
90+
coord_size = Rf_length(item_sexp);
91+
for (int j = 0; j < coord_size; j++) {
92+
// Omit dimensions in sfc but not in builder
93+
if (j >= builder->view.coords.n_values) {
94+
break;
95+
}
96+
97+
builder->view.coords.values[j][i] = item[j];
98+
}
99+
100+
// Fill dimensions in builder but not in sfc with nan
101+
for (int j = coord_size; j < builder->view.coords.n_values; j++) {
102+
builder->view.coords.values[j][i] = NAN;
103+
}
104+
}
105+
106+
builder->view.coords.size_coords = n;
107+
builder->view.length = n;
108+
return GEOARROW_OK;
109+
}
110+
111+
static int builder_append_sfc(SEXP sfc, struct GeoArrowBuilder* builder) {
112+
if (Rf_inherits(sfc, "sfc_POINT")) {
113+
return builder_append_sfc_point(sfc, builder);
114+
}
115+
116+
R_xlen_t n = Rf_xlength(sfc);
117+
118+
// Append initial 0 to the offset buffers and reserve memory for their minimum
119+
// likely size (might be inaccurate for sfcs with a lot of empties).
120+
int32_t zero = 0;
121+
NANOARROW_RETURN_NOT_OK(GeoArrowBuilderOffsetReserve(builder, 0, n + 1));
122+
GeoArrowBuilderOffsetAppendUnsafe(builder, 0, &zero, 1);
123+
124+
for (int i = 1; i < builder->view.n_offsets; i++) {
125+
NANOARROW_RETURN_NOT_OK(GeoArrowBuilderOffsetReserve(builder, i, (n + 1) * 1.5));
126+
GeoArrowBuilderOffsetAppendUnsafe(builder, i, &zero, 1);
127+
}
128+
129+
// Keep track of current last value
130+
int32_t current_offsets[] = {0, 0, 0};
131+
132+
// Append elements
133+
for (R_xlen_t i = 0; i < n; i++) {
134+
SEXP item = VECTOR_ELT(sfc, i);
135+
NANOARROW_RETURN_NOT_OK(builder_append_sfg(item, builder, 0, current_offsets));
136+
}
137+
138+
builder->view.length = n;
139+
return GEOARROW_OK;
140+
}
141+
142+
static void finalize_builder_xptr(SEXP builder_xptr) {
143+
struct GeoArrowBuilder* builder =
144+
(struct GeoArrowBuilder*)R_ExternalPtrAddr(builder_xptr);
145+
if (builder != NULL && builder->private_data != NULL) {
146+
GeoArrowBuilderReset(builder);
147+
}
148+
149+
if (builder != NULL) {
150+
free(builder);
151+
}
152+
}
153+
154+
SEXP geoarrow_c_as_nanoarrow_array_sfc(SEXP sfc, SEXP schema_xptr, SEXP array_xptr) {
155+
struct ArrowSchema* schema = (struct ArrowSchema*)R_ExternalPtrAddr(schema_xptr);
156+
struct ArrowArray* array = (struct ArrowArray*)R_ExternalPtrAddr(array_xptr);
157+
158+
// Use external pointer finalizer to ensure builder is cleaned up
159+
struct GeoArrowBuilder* builder =
160+
(struct GeoArrowBuilder*)malloc(sizeof(struct GeoArrowBuilder));
161+
if (builder == NULL) {
162+
Rf_error("Failed to allocate for GeoArrowBuilder");
163+
}
164+
builder->private_data = NULL;
165+
SEXP builder_xptr = PROTECT(R_MakeExternalPtr(builder, R_NilValue, R_NilValue));
166+
R_RegisterCFinalizer(builder_xptr, &finalize_builder_xptr);
167+
168+
struct GeoArrowError error;
169+
error.message[0] = '\0';
170+
171+
// Initialize the builder
172+
int result = GeoArrowBuilderInitFromSchema(builder, schema, &error);
173+
if (result != GEOARROW_OK) {
174+
Rf_error("GeoArrowBuilderInitFromSchema() failed: %s", error.message);
175+
}
176+
177+
// Build the offset buffers from the various layers of nesting
178+
result = builder_append_sfc(sfc, builder);
179+
if (result != GEOARROW_OK) {
180+
Rf_error("builder_append_sfc() failed to allocate memory for offset buffers");
181+
}
182+
183+
// Build result
184+
result = GeoArrowBuilderFinish(builder, array, &error);
185+
if (result != GEOARROW_OK) {
186+
Rf_error("GeoArrowBuilderFinish() failed: %s", error.message);
187+
}
188+
189+
UNPROTECT(1);
190+
return R_NilValue;
191+
}

0 commit comments

Comments
 (0)