Skip to content

Commit 64b6ee3

Browse files
authored
Fix writing of compressed ORC files with large stripe footers (rapidsai#17700)
In ORC, stripe footers can be compressed the same way as the data. This means that a compressed footer must be written in multiple blocks if it is larger than the maximum block size. This applies even if the footer bytes are actually stored uncompressed (in this case a flag in the block header is set to mark the block as uncompressed). Currently, the ORC writer does not take into account that the footer can be larger than the maximum block size, and writes the entire footer in a single block, which is not valid. The issue only affects compressed files; uncompressed files are not subject to this block-size limit on footers. This PR changes the way stripe footers are written to account for this case. The output is unchanged for files with small stripe footers. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - https://github.com/nvdbaranec URL: rapidsai#17700
1 parent 2b6cf07 commit 64b6ee3

File tree

2 files changed

+41
-8
lines changed

2 files changed

+41
-8
lines changed

cpp/src/io/orc/writer_impl.cu

Lines changed: 22 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2595,16 +2595,30 @@ void writer::impl::write_orc_data_to_sink(encoded_data const& enc_data,
25952595
: 0;
25962596
if (orc_table.column(i - 1).orc_kind() == TIMESTAMP) { sf.writerTimezone = "UTC"; }
25972597
}
2598-
ProtobufWriter pbw((_compression != compression_type::NONE) ? 3 : 0);
2598+
2599+
ProtobufWriter pbw;
25992600
pbw.write(sf);
2600-
stripe.footerLength = pbw.size();
2601-
if (_compression != compression_type::NONE) {
2602-
uint32_t uncomp_sf_len = (stripe.footerLength - 3) * 2 + 1;
2603-
pbw.buffer()[0] = static_cast<uint8_t>(uncomp_sf_len >> 0);
2604-
pbw.buffer()[1] = static_cast<uint8_t>(uncomp_sf_len >> 8);
2605-
pbw.buffer()[2] = static_cast<uint8_t>(uncomp_sf_len >> 16);
2601+
if (_compression == compression_type::NONE) {
2602+
_out_sink->host_write(pbw.data(), pbw.size());
2603+
stripe.footerLength = pbw.size();
2604+
} else {
2605+
std::size_t bytes_written = 0;
2606+
std::size_t written_sf_len = 0;
2607+
while (written_sf_len < pbw.size()) {
2608+
auto const block_size = std::min(_compression_blocksize, pbw.size() - written_sf_len);
2609+
auto const header_val = block_size * 2 + 1; // 1 means uncompressed
2610+
CUDF_EXPECTS(header_val >> 24 == 0, "Block length exceeds maximum size");
2611+
std::array const header{static_cast<uint8_t>(header_val >> 0),
2612+
static_cast<uint8_t>(header_val >> 8),
2613+
static_cast<uint8_t>(header_val >> 16)};
2614+
2615+
_out_sink->host_write(header.data(), header.size());
2616+
_out_sink->host_write(pbw.data() + written_sf_len, block_size);
2617+
written_sf_len += block_size;
2618+
bytes_written += header.size() + block_size;
2619+
}
2620+
stripe.footerLength = bytes_written;
26062621
}
2607-
_out_sink->host_write(pbw.data(), pbw.size());
26082622
}
26092623
for (auto const& task : write_tasks) {
26102624
task.wait();

cpp/tests/io/orc_test.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2196,4 +2196,23 @@ TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite)
21962196
EXPECT_EQ(out_buffer.size(), 0);
21972197
}
21982198

2199+
TEST_F(OrcWriterTest, MultipleBlocksInStripeFooter)
2200+
{
2201+
std::vector<std::string> vals_col(8, "a");
2202+
str_col col{vals_col.begin(), vals_col.end()};
2203+
cudf::column_view col_view = col;
2204+
table_view expected(std::vector<cudf::column_view>{6400, col_view});
2205+
2206+
std::vector<char> out_buffer;
2207+
cudf::io::orc_writer_options out_opts =
2208+
cudf::io::orc_writer_options::builder(cudf::io::sink_info{&out_buffer}, expected);
2209+
// Write with compression on (default)
2210+
cudf::io::write_orc(out_opts);
2211+
2212+
cudf::io::orc_reader_options in_opts = cudf::io::orc_reader_options::builder(
2213+
cudf::io::source_info{out_buffer.data(), out_buffer.size()});
2214+
auto result = cudf::io::read_orc(in_opts);
2215+
CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
2216+
}
2217+
21992218
CUDF_TEST_PROGRAM_MAIN()

0 commit comments

Comments
 (0)