Commit 19dea56

Merge pull request rapidsai#17910 from rapidsai/branch-25.02
Forward-merge branch-25.02 into branch-25.04
2 parents 81c383c + 8b89ea0

File tree

2 files changed (+80, -29 lines)

cpp/src/io/json/read_json.cu

Lines changed: 37 additions & 28 deletions
@@ -165,23 +165,26 @@ std::size_t estimate_size_per_subchunk(std::size_t chunk_size)
 }
 
 /**
- * @brief Return the upper bound on the batch size for the JSON reader.
+ * @brief Return the batch size for the JSON reader.
  *
- * The datasources passed to the JSON reader are split into batches demarcated by byte range
- * offsets and read iteratively. The batch size is capped at INT_MAX bytes, which is the
- * default value returned by the function. This value can be overridden at runtime using the
- * environment variable LIBCUDF_JSON_BATCH_SIZE
+ * The datasources passed to the JSON reader are read iteratively in batches demarcated by byte
+ * range offsets. The tokenizer requires the JSON buffer read in each batch to be of size at most
+ * INT_MAX bytes.
+ * Since the byte range corresponding to a given batch can cause the last JSON line
+ * in the batch to be incomplete, the batch size returned by this function allows for an additional
+ * `max_subchunks_prealloced` subchunks to be allocated beyond the byte range offsets. Since the
+ * size of the subchunk depends on the size of the byte range, the batch size is variable and cannot
+ * be directly controlled by the user. As a workaround, the environment variable
+ * LIBCUDF_JSON_BATCH_SIZE can be used to set a fixed batch size at runtime.
  *
  * @return size in bytes
  */
-std::size_t get_batch_size_upper_bound()
+std::size_t get_batch_size(std::size_t chunk_size)
 {
-  auto const batch_size_str = std::getenv("LIBCUDF_JSON_BATCH_SIZE");
-  int64_t const batch_size  = batch_size_str != nullptr ? std::atol(batch_size_str) : 0L;
-  auto const batch_limit    = static_cast<int64_t>(std::numeric_limits<int32_t>::max());
-  auto const batch_size_upper_bound = static_cast<std::size_t>(
-    (batch_size > 0 && batch_size < batch_limit) ? batch_size : batch_limit);
-  return batch_size_upper_bound;
+  auto const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
+  auto const batch_limit = static_cast<std::size_t>(std::numeric_limits<int32_t>::max()) -
+                           (max_subchunks_prealloced * size_per_subchunk);
+  return std::min(batch_limit, getenv_or<std::size_t>("LIBCUDF_JSON_BATCH_SIZE", batch_limit));
 }
 
 /**
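To make the new sizing policy concrete, here is a minimal standalone sketch of the same computation (illustrative only: max_subchunks_prealloced is given an assumed value here, and a plain std::getenv stands in for cudf's getenv_or helper):

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <limits>

// Assumed value for illustration; the real constant is defined in read_json.cu.
constexpr std::size_t max_subchunks_prealloced = 3;

std::size_t sketch_get_batch_size(std::size_t size_per_subchunk)
{
  // Reserve headroom below INT_MAX so a batch that overshoots its byte range
  // by up to max_subchunks_prealloced subchunks still fits the tokenizer limit.
  auto const batch_limit = static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max()) -
                           max_subchunks_prealloced * size_per_subchunk;
  // LIBCUDF_JSON_BATCH_SIZE may shrink the batch size but never raise it past the limit.
  char const* env = std::getenv("LIBCUDF_JSON_BATCH_SIZE");
  std::size_t const requested = env != nullptr ? std::strtoull(env, nullptr, 10) : batch_limit;
  return std::min(batch_limit, requested);
}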
@@ -295,6 +298,10 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
     }
   }
 
+  auto const batch_limit = static_cast<size_t>(std::numeric_limits<int32_t>::max());
+  CUDF_EXPECTS(static_cast<size_t>(next_delim_pos - first_delim_pos - shift_for_nonzero_offset) <
+                 batch_limit,
+               "The size of the JSON buffer returned by every batch cannot exceed INT_MAX bytes");
   return datasource::owning_buffer<rmm::device_buffer>(
     std::move(buffer),
     reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
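A note on why this guard is needed even though batch sizes are already capped: the buffer handed back spans from the first delimiter at or after the requested offset to the first delimiter past the requested end, so an incomplete trailing record can stretch it beyond the nominal batch size. A back-of-the-envelope check with hypothetical numbers (variable names mirror the code, values are invented):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <limits>

int main()
{
  std::size_t const batch_limit     = std::numeric_limits<std::int32_t>::max();
  std::size_t const first_delim_pos = 0;                        // hypothetical: range starts on a delimiter
  std::size_t const range_size      = batch_limit - (1 << 20);  // batch size with ~1 MiB of headroom
  std::size_t const overshoot       = 512 << 10;                // hypothetical incomplete last line, 0.5 MiB
  std::size_t const next_delim_pos  = range_size + overshoot;
  // Mirrors the CUDF_EXPECTS above: the delivered span must stay under INT_MAX.
  assert(next_delim_pos - first_delim_pos < batch_limit);
  return 0;
}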
@@ -365,17 +372,11 @@ table_with_metadata read_json_impl(host_span<std::unique_ptr<datasource>> source
     reader_opts.is_enabled_lines() || total_source_size < std::numeric_limits<int32_t>::max(),
     "Parsing Regular JSON inputs of size greater than INT_MAX bytes is not supported");
 
-  std::size_t chunk_offset = reader_opts.get_byte_range_offset();
-  std::size_t chunk_size   = reader_opts.get_byte_range_size();
-  chunk_size = !chunk_size ? total_source_size - chunk_offset
-                           : std::min(chunk_size, total_source_size - chunk_offset);
-
-  std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
-  std::size_t const batch_size_upper_bound = get_batch_size_upper_bound();
-  std::size_t const batch_size =
-    batch_size_upper_bound < (max_subchunks_prealloced * size_per_subchunk)
-      ? batch_size_upper_bound
-      : batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk);
+  std::size_t chunk_offset = reader_opts.get_byte_range_offset();
+  std::size_t chunk_size   = reader_opts.get_byte_range_size();
+  chunk_size = !chunk_size ? total_source_size - chunk_offset
+                           : std::min(chunk_size, total_source_size - chunk_offset);
+  std::size_t const batch_size = get_batch_size(chunk_size);
 
   /*
    * Identify the position (zero-indexed) of starting source file from which to begin
@@ -490,11 +491,19 @@ table_with_metadata read_json_impl(host_span<std::unique_ptr<datasource>> source
   // Dispatch individual batches to read_batch and push the resulting table into
   // partial_tables array. Note that the reader options need to be updated for each
   // batch to adjust byte range offset and byte range size.
-  for (std::size_t i = 1; i < batch_offsets.size() - 1; i++) {
-    batched_reader_opts.set_byte_range_offset(batch_offsets[i]);
-    batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]);
-    partial_tables.emplace_back(
-      read_batch(sources, batched_reader_opts, stream, cudf::get_current_device_resource_ref()));
+  for (std::size_t batch_offset_pos = 1; batch_offset_pos < batch_offsets.size() - 1;
+       batch_offset_pos++) {
+    batched_reader_opts.set_byte_range_offset(batch_offsets[batch_offset_pos]);
+    batched_reader_opts.set_byte_range_size(batch_offsets[batch_offset_pos + 1] -
+                                            batch_offsets[batch_offset_pos]);
+    auto partial_table =
+      read_batch(sources, batched_reader_opts, stream, cudf::get_current_device_resource_ref());
+    if (partial_table.tbl->num_columns() == 0 && partial_table.tbl->num_rows() == 0) {
+      CUDF_EXPECTS(batch_offset_pos == batch_offsets.size() - 2,
+                   "Only the partial table generated by the last batch can be empty");
+      break;
+    }
+    partial_tables.emplace_back(std::move(partial_table));
   }
 
   auto expects_schema_equality =
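The early exit above relies on the reader's byte-range semantics: a JSON-lines record is parsed by the byte range in which it starts, so only the final batch, whose range may begin inside the last record, can legitimately come back empty. A standalone usage sketch of that contract (offsets hand-picked for a 16-byte input; this is not code from the patch):

#include <cudf/io/json.hpp>
#include <cstddef>
#include <string>

void read_in_two_ranges()
{
  // Two 8-byte records: the first starts at byte 0, the second at byte 8.
  std::string const data = "{\"a\":1}\n{\"a\":2}\n";
  auto range_opts = [&](std::size_t offset, std::size_t size) {
    return cudf::io::json_reader_options::builder(
             cudf::io::source_info{data.data(), data.size()})
      .lines(true)
      .byte_range_offset(offset)
      .byte_range_size(size)
      .build();
  };
  // Range [0, 10) contains both record starts, so both rows are parsed here.
  auto first = cudf::io::read_json(range_opts(0, 10));
  // Range [10, 16) begins mid-record and contains no record start: an empty result.
  auto second = cudf::io::read_json(range_opts(10, 6));
}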

cpp/tests/io/json/json_test.cpp

Lines changed: 43 additions & 1 deletion
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -3461,4 +3461,46 @@ TEST_F(JsonReaderTest, MismatchedBeginEndTokens)
   EXPECT_THROW(cudf::io::read_json(opts), cudf::logic_error);
 }
 
+/**
+ * @brief Base test fixture for JSON batched reader tests
+ */
+struct JsonBatchedReaderTest : public cudf::test::BaseFixture {
+ public:
+  void set_batch_size(size_t batch_size_upper_bound)
+  {
+    setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1);
+  }
+
+  ~JsonBatchedReaderTest() { unsetenv("LIBCUDF_JSON_BATCH_SIZE"); }
+};
+
+TEST_F(JsonBatchedReaderTest, EmptyLastBatch)
+{
+  std::string data = R"(
+{"a": "b"}
+{"a": "b"}
+{"a": "b"}
+{"a": "b"}
+)";
+  size_t size_of_last_batch = 5;
+  // This test constructs two batches by setting the batch size such that the last batch is an
+  // incomplete line. The JSON string corresponding to the first batch is
+  // '\n{"a": "b"}\n{"a": "b"}\n{"a": "b"}\n{"a": '
+  // The JSON string corresponding to the second batch is
+  // '"b"}\n'
+  this->set_batch_size(data.size() - size_of_last_batch);
+  auto opts =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+      .lines(true)
+      .build();
+  auto result = cudf::io::read_json(opts);
+
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 4);
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRING);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 cudf::test::strings_column_wrapper{{"b", "b", "b", "b"}});
+}
+
 CUDF_TEST_PROGRAM_MAIN()
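For readers double-checking the byte counts in EmptyLastBatch, a standalone sketch of the arithmetic (not test code; sizes taken from the comments above):

#include <cassert>
#include <cstddef>
#include <string>

int main()
{
  std::string const line = "{\"a\": \"b\"}\n";                 // 11 bytes per record
  std::string const data = "\n" + line + line + line + line;  // leading newline, 45 bytes total
  std::size_t const size_of_last_batch = 5;                   // the trailing '"b"}\n'
  assert(data.size() == 45);
  // The fixture therefore sets LIBCUDF_JSON_BATCH_SIZE to 40, ending the first
  // batch mid-record and leaving the remaining 5 bytes to the second batch.
  assert(data.size() - size_of_last_batch == 40);
  return 0;
}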
