Skip to content

Commit c867a32

Browse files
committed
[ntuple] fix schema serialization
When serializing the header and header extension we're currently assuming that the first time we serialize it we don't have a header extension. While this is true in most cases, it's not true when we are incrementally merging an existing RNTuple. With this fix we correctly handle this case by skipping all columns belonging to the extension header when serializing the regular header.
1 parent 1534704 commit c867a32

File tree

4 files changed

+107
-25
lines changed

4 files changed

+107
-25
lines changed

tree/ntuple/v7/inc/ROOT/RNTupleDescriptor.hxx

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -551,7 +551,7 @@ private:
551551
DescriptorId_t fFieldZeroId = kInvalidDescriptorId; ///< Set by the descriptor builder
552552

553553
std::uint64_t fNPhysicalColumns = 0; ///< Updated by the descriptor builder when columns are added
554-
std::uint64_t fOnDiskHeaderSize = 0; ///< Set by the descriptor builder when deserialized
554+
std::uint64_t fOnDiskHeaderSize = 0; ///< Set by the descriptor builder when deserialized
555555

556556
std::set<unsigned int> fFeatureFlags;
557557
std::unordered_map<DescriptorId_t, RFieldDescriptor> fFieldDescriptors;
@@ -566,8 +566,8 @@ private:
566566
std::uint64_t fOnDiskHeaderXxHash3 = 0; ///< Set by the descriptor builder when deserialized
567567
std::uint64_t fOnDiskFooterSize = 0; ///< Like fOnDiskHeaderSize, contains both cluster summaries and page locations
568568

569-
std::uint64_t fNEntries = 0; ///< Updated by the descriptor builder when the cluster groups are added
570-
std::uint64_t fNClusters = 0; ///< Updated by the descriptor builder when the cluster groups are added
569+
std::uint64_t fNEntries = 0; ///< Updated by the descriptor builder when the cluster groups are added
570+
std::uint64_t fNClusters = 0; ///< Updated by the descriptor builder when the cluster groups are added
571571

572572
/**
573573
* Once constructed by an RNTupleDescriptorBuilder, the descriptor is mostly immutable except for set of
@@ -1042,6 +1042,8 @@ public:
10421042
/// We cannot create this vector when building the fFields because at the time when AddExtendedField is called,
10431043
/// the field is not yet linked into the schema tree.
10441044
std::vector<DescriptorId_t> GetTopLevelFields(const RNTupleDescriptor &desc) const;
1045+
1046+
bool ContainsField(DescriptorId_t fieldId) const { return fFieldIdsLookup.find(fieldId) != fFieldIdsLookup.end(); }
10451047
};
10461048

10471049
namespace Internal {

tree/ntuple/v7/src/RNTupleSerialize.cxx

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -247,13 +247,21 @@ std::uint32_t SerializePhysicalColumn(const ROOT::Experimental::RColumnDescripto
247247
std::uint32_t SerializeColumnsOfFields(const ROOT::Experimental::RNTupleDescriptor &desc,
248248
std::span<const ROOT::Experimental::DescriptorId_t> fieldList,
249249
const ROOT::Experimental::Internal::RNTupleSerializer::RContext &context,
250-
void *buffer)
250+
void *buffer, bool forHeaderExtension)
251251
{
252252
auto base = reinterpret_cast<unsigned char *>(buffer);
253253
auto pos = base;
254254
void **where = (buffer == nullptr) ? &buffer : reinterpret_cast<void **>(&pos);
255255

256+
const auto *xHeader = !forHeaderExtension ? desc.GetHeaderExtension() : nullptr;
257+
256258
for (auto parentId : fieldList) {
259+
// If we're serializing the non-extended header and we already have a header extension (which may happen if
260+
// we load an RNTuple for incremental merging), we need to skip all the extended fields, as they need to be
261+
// written in the header extension, not in the regular header.
262+
if (xHeader && xHeader->ContainsField(parentId))
263+
continue;
264+
257265
for (const auto &c : desc.GetColumnIterable(parentId)) {
258266
if (c.IsAliasColumn())
259267
continue;
@@ -476,13 +484,18 @@ std::uint32_t SerializeAliasColumn(const ROOT::Experimental::RColumnDescriptor &
476484
std::uint32_t SerializeAliasColumnsOfFields(const ROOT::Experimental::RNTupleDescriptor &desc,
477485
std::span<const ROOT::Experimental::DescriptorId_t> fieldList,
478486
const ROOT::Experimental::Internal::RNTupleSerializer::RContext &context,
479-
void *buffer)
487+
void *buffer, bool forHeaderExtension)
480488
{
481489
auto base = reinterpret_cast<unsigned char *>(buffer);
482490
auto pos = base;
483491
void **where = (buffer == nullptr) ? &buffer : reinterpret_cast<void **>(&pos);
484492

493+
const auto *xHeader = !forHeaderExtension ? desc.GetHeaderExtension() : nullptr;
494+
485495
for (auto parentId : fieldList) {
496+
if (xHeader && xHeader->ContainsField(parentId))
497+
continue;
498+
486499
for (const auto &c : desc.GetColumnIterable(parentId)) {
487500
if (!c.IsAliasColumn())
488501
continue;
@@ -1246,8 +1259,6 @@ void ROOT::Experimental::Internal::RNTupleSerializer::RContext::MapSchema(const
12461259
};
12471260

12481261
R__ASSERT(desc.GetNFields() > 0); // we must have at least a zero field
1249-
if (!forHeaderExtension)
1250-
R__ASSERT(!desc.GetHeaderExtension());
12511262

12521263
std::vector<DescriptorId_t> fieldTrees;
12531264
if (!forHeaderExtension) {
@@ -1303,9 +1314,16 @@ std::uint32_t ROOT::Experimental::Internal::RNTupleSerializer::SerializeSchemaDe
13031314
}
13041315
}
13051316
} else {
1306-
nFields = desc.GetNFields() - 1;
1307-
nColumns = desc.GetNPhysicalColumns();
1308-
nAliasColumns = desc.GetNLogicalColumns() - desc.GetNPhysicalColumns();
1317+
if (auto xHeader = desc.GetHeaderExtension()) {
1318+
nFields = desc.GetNFields() - xHeader->GetNFields() - 1;
1319+
nColumns = desc.GetNPhysicalColumns() - xHeader->GetNPhysicalColumns();
1320+
nAliasColumns = desc.GetNLogicalColumns() - desc.GetNPhysicalColumns() -
1321+
(xHeader->GetNLogicalColumns() - xHeader->GetNPhysicalColumns());
1322+
} else {
1323+
nFields = desc.GetNFields() - 1;
1324+
nColumns = desc.GetNPhysicalColumns();
1325+
nAliasColumns = desc.GetNLogicalColumns() - desc.GetNPhysicalColumns();
1326+
}
13091327
}
13101328
const auto nExtraTypeInfos = desc.GetNExtraTypeInfos();
13111329
const auto &onDiskFields = context.GetOnDiskFieldList();
@@ -1320,7 +1338,7 @@ std::uint32_t ROOT::Experimental::Internal::RNTupleSerializer::SerializeSchemaDe
13201338

13211339
frame = pos;
13221340
pos += SerializeListFramePreamble(nColumns, *where);
1323-
pos += SerializeColumnsOfFields(desc, fieldList, context, *where);
1341+
pos += SerializeColumnsOfFields(desc, fieldList, context, *where, forHeaderExtension);
13241342
for (const auto &c : extraColumns) {
13251343
if (!c.get().IsAliasColumn()) {
13261344
pos += SerializePhysicalColumn(c.get(), context, *where);
@@ -1330,7 +1348,7 @@ std::uint32_t ROOT::Experimental::Internal::RNTupleSerializer::SerializeSchemaDe
13301348

13311349
frame = pos;
13321350
pos += SerializeListFramePreamble(nAliasColumns, *where);
1333-
pos += SerializeAliasColumnsOfFields(desc, fieldList, context, *where);
1351+
pos += SerializeAliasColumnsOfFields(desc, fieldList, context, *where, forHeaderExtension);
13341352
for (const auto &c : extraColumns) {
13351353
if (c.get().IsAliasColumn()) {
13361354
pos += SerializeAliasColumn(c.get(), context, *where);

tree/ntuple/v7/test/ntuple_merger.cxx

Lines changed: 73 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1606,9 +1606,15 @@ TEST(RNTupleMerger, MergeAsymmetric1TFileMerger)
16061606

16071607
TEST(RNTupleMerger, MergeIncrementalLMExt)
16081608
{
1609-
// Create the input files
1609+
// Create the input files:
1610+
// File 0: f_0: int
1611+
// File 1: f_0: int, f_1: float
1612+
// File 2: f_0: int, f_1: float, f_2: string
1613+
// File 3: f_0: int, f_1: float, f_2: string, f_3: int
1614+
// ...
1615+
// each file has 5 entries.
16101616
std::vector<FileRaii> inputFiles;
1611-
const auto nInputs = 10;
1617+
const auto nInputs = 12;
16121618
auto model = RNTupleModel::Create();
16131619
for (int fileIdx = 0; fileIdx < nInputs; ++fileIdx) {
16141620
auto &fileGuard =
@@ -1637,26 +1643,80 @@ TEST(RNTupleMerger, MergeIncrementalLMExt)
16371643
case 1: *entry.GetPtr<float>(fldName) = fileIdx + fillIdx + fieldIdx; break;
16381644
default: *entry.GetPtr<std::string>(fldName) = std::to_string(fileIdx + fillIdx + fieldIdx);
16391645
}
1640-
writer->Fill();
16411646
}
1647+
writer->Fill();
16421648
}
16431649
}
16441650

16451651
// Incrementally merge the inputs
16461652
FileRaii fileGuard("test_ntuple_merge_incr_lmext.root");
16471653
fileGuard.PreserveFile();
1648-
const auto compression = 0;//ROOT::RCompressionSetting::EDefaults::kUseGeneralPurpose;
1654+
const auto compression = 0;
1655+
1656+
{
1657+
TFileMerger merger(kFALSE, kFALSE);
1658+
merger.OutputFile(fileGuard.GetPath().c_str(), "RECREATE", compression);
1659+
merger.SetMergeOptions(TString("rntuple.MergingMode=Union"));
16491660

1650-
TFileMerger merger(kFALSE, kFALSE);
1651-
merger.OutputFile(fileGuard.GetPath().c_str(), "RECREATE", compression);
1652-
merger.SetMergeOptions(TString("rntuple.MergingMode=Union"));
1661+
for (int i = 0; i < nInputs; ++i) {
1662+
auto tfile = std::unique_ptr<TFile>(TFile::Open(inputFiles[i].GetPath().c_str(), "READ"));
1663+
merger.AddFile(tfile.get());
1664+
bool result =
1665+
merger.PartialMerge(TFileMerger::kIncremental | TFileMerger::kNonResetable | TFileMerger::kKeepCompression);
1666+
ASSERT_TRUE(result);
1667+
}
1668+
}
16531669

1654-
for (int i = 0; i < nInputs; ++i) {
1655-
auto tfile = std::unique_ptr<TFile>(TFile::Open(inputFiles[i].GetPath().c_str(), "READ"));
1670+
// Now verify that the output file contains all the expected data.
1671+
{
1672+
auto reader = RNTupleReader::Open("ntpl", fileGuard.GetPath());
1673+
const auto &desc = reader->GetDescriptor();
1674+
for (int i = 0; i < nInputs; ++i) {
1675+
const auto fieldId = desc.FindFieldId(std::string("f_") + std::to_string(i));
1676+
EXPECT_NE(fieldId, ROOT::Experimental::kInvalidDescriptorId);
1677+
const auto &fdesc = desc.GetFieldDescriptor(fieldId);
1678+
for (const auto &colId : fdesc.GetLogicalColumnIds()) {
1679+
const auto &cdesc = desc.GetColumnDescriptor(colId);
1680+
EXPECT_EQ(cdesc.GetFirstElementIndex(), (cdesc.GetIndex() == 0) * i * 5);
1681+
}
1682+
}
16561683

1657-
merger.AddFile(tfile.get());
1658-
bool result =
1659-
merger.PartialMerge(TFileMerger::kIncremental | TFileMerger::kNonResetable | TFileMerger::kKeepCompression);
1660-
ASSERT_TRUE(result);
1684+
RNTupleView<int> v_int[] = {
1685+
reader->GetView<int>("f_0"),
1686+
reader->GetView<int>("f_3"),
1687+
reader->GetView<int>("f_6"),
1688+
reader->GetView<int>("f_9"),
1689+
};
1690+
RNTupleView<float> v_float[] = {
1691+
reader->GetView<float>("f_1"),
1692+
reader->GetView<float>("f_4"),
1693+
reader->GetView<float>("f_7"),
1694+
reader->GetView<float>("f_10"),
1695+
};
1696+
RNTupleView<std::string> v_string[] = {
1697+
reader->GetView<std::string>("f_2"),
1698+
reader->GetView<std::string>("f_5"),
1699+
reader->GetView<std::string>("f_8"),
1700+
reader->GetView<std::string>("f_11"),
1701+
};
1702+
for (auto entryId : reader->GetEntryRange()) {
1703+
int fileIdx = entryId / 5;
1704+
int localEntryId = entryId % 5;
1705+
1706+
for (int i = 0; i < nInputs / 3; ++i) {
1707+
auto x0 = v_int[i](entryId);
1708+
int expected_x0 = (entryId >= 15u * i) * (fileIdx + localEntryId + i * 3);
1709+
EXPECT_EQ(x0, expected_x0);
1710+
1711+
auto x1 = v_float[i](entryId);
1712+
float expected_x1 = (entryId >= 5 + 15u * i) * (fileIdx + localEntryId + i * 3 + 1);
1713+
EXPECT_FLOAT_EQ(x1, expected_x1);
1714+
1715+
auto x2 = v_string[i](entryId);
1716+
std::string expected_x2 =
1717+
(entryId >= 10 + 15u * i) ? std::to_string(fileIdx + localEntryId + i * 3 + 2) : "";
1718+
EXPECT_EQ(x2, expected_x2);
1719+
}
1720+
}
16611721
}
16621722
}

tree/ntuple/v7/test/ntuple_test.hxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ using RPrepareVisitor = ROOT::Experimental::RPrepareVisitor;
114114
using RPrintSchemaVisitor = ROOT::Experimental::RPrintSchemaVisitor;
115115
using RRawFile = ROOT::Internal::RRawFile;
116116
using EContainerFormat = RNTupleFileWriter::EContainerFormat;
117+
template <typename T>
118+
using RNTupleView = ROOT::Experimental::RNTupleView<T>;
117119

118120
using ROOT::Experimental::Internal::MakeUninitArray;
119121

0 commit comments

Comments
 (0)