Skip to content

Commit 6373155

Browse files
author
Hannah Bast
committed
Merge remote-tracking branch 'origin/master' into updateMetadata
2 parents 7ab2815 + acb6633 commit 6373155

26 files changed

+341
-197
lines changed

src/engine/GroupBy.cpp

Lines changed: 101 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -330,7 +330,7 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) {
330330
if (useHashMapOptimization) {
331331
const auto* child = _subtree->getRootOperation()->getChildren().at(0);
332332
// Skip sorting
333-
subresult = child->getResult();
333+
subresult = child->getResult(true);
334334
// Update runtime information
335335
auto runTimeInfoChildren =
336336
child->getRootOperation()->getRuntimeInfoPointer();
@@ -366,13 +366,28 @@ ProtoResult GroupBy::computeResult(bool requestLaziness) {
366366
}
367367

368368
if (useHashMapOptimization) {
369-
auto localVocab = subresult->getCopyOfLocalVocab();
370-
IdTable idTable = CALL_FIXED_SIZE(
371-
groupByCols.size(), &GroupBy::computeGroupByForHashMapOptimization,
372-
this, metadataForUnsequentialData->aggregateAliases_,
373-
subresult->idTable(), groupByCols, &localVocab);
369+
// Helper lambda that calls `computeGroupByForHashMapOptimization` for the
370+
// given `subresults`.
371+
auto computeWithHashMap = [this, &metadataForUnsequentialData,
372+
&groupByCols](auto&& subresults) {
373+
auto doCompute = [&]<int NumCols> {
374+
return computeGroupByForHashMapOptimization<NumCols>(
375+
metadataForUnsequentialData->aggregateAliases_, AD_FWD(subresults),
376+
groupByCols);
377+
};
378+
return ad_utility::callFixedSize(groupByCols.size(), doCompute);
379+
};
374380

375-
return {std::move(idTable), resultSortedOn(), std::move(localVocab)};
381+
// Now call `computeWithHashMap` and return the result. It expects a range
382+
// of results, so if the result is fully materialized, we create an array
383+
// with a single element.
384+
if (subresult->isFullyMaterialized()) {
385+
return computeWithHashMap(
386+
std::array{std::pair{std::cref(subresult->idTable()),
387+
std::cref(subresult->localVocab())}});
388+
} else {
389+
return computeWithHashMap(std::move(subresult->idTables()));
390+
}
376391
}
377392

378393
size_t inWidth = _subtree->getResultWidth();
@@ -846,7 +861,7 @@ std::optional<IdTable> GroupBy::computeGroupByForJoinWithFullScan() const {
846861
const auto& index = getExecutionContext()->getIndex();
847862

848863
// TODO<joka921, C++23> Simplify the following pattern by using
849-
// `ql::views::chunkd_by` and implement a lazy version of this view for
864+
// `ql::views::chunk_by` and implement a lazy version of this view for
850865
// input iterators.
851866

852867
// Take care of duplicate values in the input.
@@ -1487,78 +1502,95 @@ static constexpr auto makeProcessGroupsVisitor =
14871502

14881503
// _____________________________________________________________________________
14891504
template <size_t NUM_GROUP_COLUMNS>
1490-
IdTable GroupBy::computeGroupByForHashMapOptimization(
1491-
std::vector<HashMapAliasInformation>& aggregateAliases,
1492-
const IdTable& subresult, const std::vector<size_t>& columnIndices,
1493-
LocalVocab* localVocab) const {
1494-
AD_CONTRACT_CHECK(columnIndices.size() == NUM_GROUP_COLUMNS ||
1495-
NUM_GROUP_COLUMNS == 0);
1496-
1497-
// Initialize aggregation data
1505+
Result GroupBy::computeGroupByForHashMapOptimization(
1506+
std::vector<HashMapAliasInformation>& aggregateAliases, auto subresults,
1507+
const std::vector<size_t>& columnIndices) const {
1508+
AD_CORRECTNESS_CHECK(columnIndices.size() == NUM_GROUP_COLUMNS ||
1509+
NUM_GROUP_COLUMNS == 0);
1510+
LocalVocab localVocab;
1511+
1512+
// Initialize the data for the aggregates of the GROUP BY operation.
14981513
HashMapAggregationData<NUM_GROUP_COLUMNS> aggregationData(
14991514
getExecutionContext()->getAllocator(), aggregateAliases,
15001515
columnIndices.size());
15011516

1502-
// Initialize evaluation context
1503-
sparqlExpression::EvaluationContext evaluationContext(
1504-
*getExecutionContext(), _subtree->getVariableColumns(), subresult,
1505-
getExecutionContext()->getAllocator(), *localVocab, cancellationHandle_,
1506-
deadline_);
1507-
1508-
evaluationContext._groupedVariables = ad_utility::HashSet<Variable>{
1509-
_groupByVariables.begin(), _groupByVariables.end()};
1510-
evaluationContext._isPartOfGroupBy = true;
1511-
1517+
// Process the input blocks (pairs of `IdTable` and `LocalVocab`) one after
1518+
// the other.
15121519
ad_utility::Timer lookupTimer{ad_utility::Timer::Stopped};
15131520
ad_utility::Timer aggregationTimer{ad_utility::Timer::Stopped};
1514-
for (size_t i = 0; i < subresult.size(); i += GROUP_BY_HASH_MAP_BLOCK_SIZE) {
1515-
checkCancellation();
1516-
1517-
evaluationContext._beginIndex = i;
1518-
evaluationContext._endIndex =
1519-
std::min(i + GROUP_BY_HASH_MAP_BLOCK_SIZE, subresult.size());
1520-
1521-
auto currentBlockSize = evaluationContext.size();
1522-
1523-
// Perform HashMap lookup once for all groups in current block
1524-
using U = HashMapAggregationData<NUM_GROUP_COLUMNS>::template ArrayOrVector<
1525-
std::span<const Id>>;
1526-
U groupValues;
1527-
resizeIfVector(groupValues, columnIndices.size());
1528-
1529-
// TODO<C++23> use views::enumerate
1530-
size_t j = 0;
1531-
for (auto& idx : columnIndices) {
1532-
groupValues[j] = subresult.getColumn(idx).subspan(
1533-
evaluationContext._beginIndex, currentBlockSize);
1534-
++j;
1535-
}
1536-
lookupTimer.cont();
1537-
auto hashEntries = aggregationData.getHashEntries(groupValues);
1538-
lookupTimer.stop();
1539-
1540-
aggregationTimer.cont();
1541-
for (auto& aggregateAlias : aggregateAliases) {
1542-
for (auto& aggregate : aggregateAlias.aggregateInfo_) {
1543-
sparqlExpression::ExpressionResult expressionResult =
1544-
GroupBy::evaluateChildExpressionOfAggregateFunction(
1545-
aggregate, evaluationContext);
1546-
1547-
auto& aggregationDataVariant =
1548-
aggregationData.getAggregationDataVariant(
1549-
aggregate.aggregateDataIndex_);
1550-
1551-
std::visit(makeProcessGroupsVisitor(currentBlockSize,
1552-
&evaluationContext, hashEntries),
1553-
std::move(expressionResult), aggregationDataVariant);
1521+
for (const auto& [inputTableRef, inputLocalVocabRef] : subresults) {
1522+
const IdTable& inputTable = inputTableRef;
1523+
const LocalVocab& inputLocalVocab = inputLocalVocabRef;
1524+
1525+
// Merge the local vocab of each input block.
1526+
//
1527+
// NOTE: If the input blocks have very similar or even identical non-empty
1528+
// local vocabs, no deduplication is performed.
1529+
localVocab.mergeWith(std::span{&inputLocalVocab, 1});
1530+
1531+
// Setup the `EvaluationContext` for this input block.
1532+
sparqlExpression::EvaluationContext evaluationContext(
1533+
*getExecutionContext(), _subtree->getVariableColumns(), inputTable,
1534+
getExecutionContext()->getAllocator(), localVocab, cancellationHandle_,
1535+
deadline_);
1536+
evaluationContext._groupedVariables = ad_utility::HashSet<Variable>{
1537+
_groupByVariables.begin(), _groupByVariables.end()};
1538+
evaluationContext._isPartOfGroupBy = true;
1539+
1540+
// Iterate over the rows of this input block. Process (up to)
1541+
// `GROUP_BY_HASH_MAP_BLOCK_SIZE` rows at a time.
1542+
for (size_t i = 0; i < inputTable.size();
1543+
i += GROUP_BY_HASH_MAP_BLOCK_SIZE) {
1544+
checkCancellation();
1545+
1546+
evaluationContext._beginIndex = i;
1547+
evaluationContext._endIndex =
1548+
std::min(i + GROUP_BY_HASH_MAP_BLOCK_SIZE, inputTable.size());
1549+
1550+
auto currentBlockSize = evaluationContext.size();
1551+
1552+
// Perform HashMap lookup once for all groups in current block
1553+
using U = HashMapAggregationData<
1554+
NUM_GROUP_COLUMNS>::template ArrayOrVector<std::span<const Id>>;
1555+
U groupValues;
1556+
resizeIfVector(groupValues, columnIndices.size());
1557+
1558+
// TODO<C++23> use views::enumerate
1559+
size_t j = 0;
1560+
for (auto& idx : columnIndices) {
1561+
groupValues[j] = inputTable.getColumn(idx).subspan(
1562+
evaluationContext._beginIndex, currentBlockSize);
1563+
++j;
1564+
}
1565+
lookupTimer.cont();
1566+
auto hashEntries = aggregationData.getHashEntries(groupValues);
1567+
lookupTimer.stop();
1568+
1569+
aggregationTimer.cont();
1570+
for (auto& aggregateAlias : aggregateAliases) {
1571+
for (auto& aggregate : aggregateAlias.aggregateInfo_) {
1572+
sparqlExpression::ExpressionResult expressionResult =
1573+
GroupBy::evaluateChildExpressionOfAggregateFunction(
1574+
aggregate, evaluationContext);
1575+
1576+
auto& aggregationDataVariant =
1577+
aggregationData.getAggregationDataVariant(
1578+
aggregate.aggregateDataIndex_);
1579+
1580+
std::visit(makeProcessGroupsVisitor(currentBlockSize,
1581+
&evaluationContext, hashEntries),
1582+
std::move(expressionResult), aggregationDataVariant);
1583+
}
15541584
}
1585+
aggregationTimer.stop();
15551586
}
1556-
aggregationTimer.stop();
15571587
}
1588+
15581589
runtimeInfo().addDetail("timeMapLookup", lookupTimer.msecs());
15591590
runtimeInfo().addDetail("timeAggregation", aggregationTimer.msecs());
1560-
1561-
return createResultFromHashMap(aggregationData, aggregateAliases, localVocab);
1591+
IdTable resultTable =
1592+
createResultFromHashMap(aggregationData, aggregateAliases, &localVocab);
1593+
return {std::move(resultTable), resultSortedOn(), std::move(localVocab)};
15621594
}
15631595

15641596
// _____________________________________________________________________________

src/engine/GroupBy.h

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,7 @@
1-
// Copyright 2018, University of Freiburg,
1+
// Copyright 2018 - 2024, University of Freiburg
22
// Chair of Algorithms and Data Structures.
3-
// Author:
4-
// 2018 Florian Kramer (florian.kramer@mail.uni-freiburg.de)
5-
// 2020- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de)
3+
// Authors: Florian Kramer [2018]
4+
// Johannes Kalmbach <kalmbach@cs.uni-freiburg.de>
65

76
#pragma once
87

@@ -316,10 +315,9 @@ class GroupBy : public Operation {
316315
// Create result IdTable by using a HashMap mapping groups to aggregation data
317316
// and subsequently calling `createResultFromHashMap`.
318317
template <size_t NUM_GROUP_COLUMNS>
319-
IdTable computeGroupByForHashMapOptimization(
320-
std::vector<HashMapAliasInformation>& aggregateAliases,
321-
const IdTable& subresult, const std::vector<size_t>& columnIndices,
322-
LocalVocab* localVocab) const;
318+
Result computeGroupByForHashMapOptimization(
319+
std::vector<HashMapAliasInformation>& aggregateAliases, auto subresults,
320+
const std::vector<size_t>& columnIndices) const;
323321

324322
using AggregationData =
325323
std::variant<AvgAggregationData, CountAggregationData, MinAggregationData,

src/engine/SpatialJoin.cpp

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -212,11 +212,16 @@ size_t SpatialJoin::getResultWidth() const {
212212

213213
// ____________________________________________________________________________
214214
size_t SpatialJoin::getCostEstimate() {
215-
if (childLeft_ && childRight_) {
216-
size_t inputEstimate =
217-
childLeft_->getSizeEstimate() * childRight_->getSizeEstimate();
215+
if (!childLeft_ || !childRight_) {
216+
return 1; // dummy return, as the class does not have its children yet
217+
}
218+
219+
size_t spatialJoinCostEst = [this]() {
220+
auto n = childLeft_->getSizeEstimate();
221+
auto m = childRight_->getSizeEstimate();
222+
218223
if (config_.algo_ == SpatialJoinAlgorithm::BASELINE) {
219-
return inputEstimate * inputEstimate;
224+
return n * m;
220225
} else {
221226
AD_CORRECTNESS_CHECK(
222227
config_.algo_ == SpatialJoinAlgorithm::S2_GEOMETRY ||
@@ -229,14 +234,14 @@ size_t SpatialJoin::getCostEstimate() {
229234
// for each item do a lookup on the index for the right table in O(log m).
230235
// Together we have O(n log(m) + m log(m)), because in general we can't
231236
// draw conclusions about the relation between the sizes of n and m.
232-
auto n = childLeft_->getSizeEstimate();
233-
auto m = childRight_->getSizeEstimate();
234-
auto logm = static_cast<size_t>(
235-
log(static_cast<double>(childRight_->getSizeEstimate())));
237+
auto logm = static_cast<size_t>(std::log(static_cast<double>(m)));
236238
return (n * logm) + (m * logm);
237239
}
238-
}
239-
return 1; // dummy return, as the class does not have its children yet
240+
}();
241+
242+
// The cost to compute the children needs to be taken into account.
243+
return spatialJoinCostEst + childLeft_->getCostEstimate() +
244+
childRight_->getCostEstimate();
240245
}
241246

242247
// ____________________________________________________________________________

src/global/Constants.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
1-
// Copyright 2023, University of Freiburg,
1+
// Copyright 2023 - 2025, University of Freiburg,
22
// Chair of Algorithms and Data Structures.
3-
//
4-
// Authors: Björn Buchhold <buchhold@gmail.com>
3+
// Authors: Björn Buchhold <buchhold@gmail.com> [2014 - 2017]
54
// Johannes Kalmbach <kalmbach@cs.uni-freiburg.de>
65
// Hannah Bast <bast@cs.uni-freiburg.de>
76

@@ -22,6 +21,7 @@ using namespace ad_utility::memory_literals;
2221
constexpr inline ad_utility::MemorySize DEFAULT_MEMORY_LIMIT_INDEX_BUILDING =
2322
5_GB;
2423
constexpr inline ad_utility::MemorySize STXXL_DISK_SIZE_INDEX_BUILDER = 1_GB;
24+
constexpr inline ad_utility::MemorySize DEFAULT_PARSER_BUFFER_SIZE = 10_MB;
2525

2626
constexpr inline ad_utility::MemorySize DEFAULT_MEM_FOR_QUERIES = 4_GB;
2727

src/index/ConstantsIndexBuilding.h

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,6 @@ constexpr inline size_t PARSER_BATCH_SIZE = 1'000'000;
2929
// streams faster.
3030
constexpr inline size_t PARSER_MIN_TRIPLES_AT_ONCE = 10'000;
3131

32-
// When reading from a file, Chunks of this size will
33-
// be fed to the parser at once (10 MiB).
34-
constinit inline std::atomic<size_t> FILE_BUFFER_SIZE = 10 * (1ul << 20);
35-
3632
constinit inline std::atomic<size_t> BUFFER_SIZE_JOIN_PATTERNS_WITH_OSP =
3733
50'000;
3834

src/index/Index.cpp

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -181,13 +181,23 @@ ad_utility::MemorySize& Index::memoryLimitIndexBuilding() {
181181
}
182182

183183
// ____________________________________________________________________________
184-
ad_utility::MemorySize& Index::blocksizePermutationsPerColumn() {
185-
return pimpl_->blocksizePermutationPerColumn();
184+
const ad_utility::MemorySize& Index::memoryLimitIndexBuilding() const {
185+
return std::as_const(*pimpl_).memoryLimitIndexBuilding();
186186
}
187187

188188
// ____________________________________________________________________________
189-
const ad_utility::MemorySize& Index::memoryLimitIndexBuilding() const {
190-
return std::as_const(*pimpl_).memoryLimitIndexBuilding();
189+
ad_utility::MemorySize& Index::parserBufferSize() {
190+
return pimpl_->parserBufferSize();
191+
}
192+
193+
// ____________________________________________________________________________
194+
const ad_utility::MemorySize& Index::parserBufferSize() const {
195+
return std::as_const(*pimpl_).parserBufferSize();
196+
}
197+
198+
// ____________________________________________________________________________
199+
ad_utility::MemorySize& Index::blocksizePermutationsPerColumn() {
200+
return pimpl_->blocksizePermutationPerColumn();
191201
}
192202

193203
// ____________________________________________________________________________

src/index/Index.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,9 @@ class Index {
195195
ad_utility::MemorySize& memoryLimitIndexBuilding();
196196
const ad_utility::MemorySize& memoryLimitIndexBuilding() const;
197197

198+
ad_utility::MemorySize& parserBufferSize();
199+
const ad_utility::MemorySize& parserBufferSize() const;
200+
198201
ad_utility::MemorySize& blocksizePermutationsPerColumn();
199202

200203
void setOnDiskBase(const std::string& onDiskBase);

src/index/IndexBuilderMain.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
// Copyright 2014, University of Freiburg,
1+
// Copyright 2014 - 2025 University of Freiburg
22
// Chair of Algorithms and Data Structures.
3-
// Author:
4-
// 2014-2017 Björn Buchhold (buchhold@informatik.uni-freiburg.de)
5-
// 2018- Johannes Kalmbach (kalmbach@informatik.uni-freiburg.de)
3+
// Authors: Björn Buchhold <buchhold@cs.uni-freiburg.de> [2014 - 2017]
4+
// Johannes Kalmbach <kalmbach@cs.uni-freiburg.de>
5+
// Hannah Bast <bast@cs.uni-freiburg.de>
66

77
#include <boost/program_options.hpp>
88
#include <cstdlib>
@@ -165,6 +165,7 @@ int main(int argc, char** argv) {
165165
bool onlyPsoAndPos = false;
166166
bool addWordsFromLiterals = false;
167167
std::optional<ad_utility::MemorySize> stxxlMemory;
168+
std::optional<ad_utility::MemorySize> parserBufferSize;
168169
optind = 1;
169170

170171
Index index{ad_utility::makeUnlimitedAllocator<Id>()};
@@ -228,6 +229,9 @@ int main(int argc, char** argv) {
228229
add("stxxl-memory,m", po::value(&stxxlMemory),
229230
"The amount of memory in to use for sorting during the index build. "
230231
"Decrease if the index builder runs out of memory.");
232+
add("parser-buffer-size,b", po::value(&parserBufferSize),
233+
"The size of the buffer used for parsing the input files. This must be "
234+
"large enough to hold a single input triple. Default: 10 MB.");
231235
add("keep-temporary-files,k", po::bool_switch(&keepTemporaryFiles),
232236
"Do not delete temporary files from index creation for debugging.");
233237

@@ -249,6 +253,9 @@ int main(int argc, char** argv) {
249253
if (stxxlMemory.has_value()) {
250254
index.memoryLimitIndexBuilding() = stxxlMemory.value();
251255
}
256+
if (parserBufferSize.has_value()) {
257+
index.parserBufferSize() = parserBufferSize.value();
258+
}
252259

253260
// If no text index name was specified, take the part of the wordsfile after
254261
// the last slash.

src/index/IndexImpl.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,10 +71,11 @@ IndexBuilderDataAsFirstPermutationSorter IndexImpl::createIdTriplesAndVocab(
7171
std::unique_ptr<RdfParserBase> IndexImpl::makeRdfParser(
7272
const std::vector<Index::InputFileSpecification>& files) const {
7373
auto makeRdfParserImpl =
74-
[&files]<int useCtre>() -> std::unique_ptr<RdfParserBase> {
74+
[this, &files]<int useCtre>() -> std::unique_ptr<RdfParserBase> {
7575
using TokenizerT =
7676
std::conditional_t<useCtre == 1, TokenizerCtre, Tokenizer>;
77-
return std::make_unique<RdfMultifileParser<TokenizerT>>(files);
77+
return std::make_unique<RdfMultifileParser<TokenizerT>>(
78+
files, this->parserBufferSize());
7879
};
7980

8081
// `callFixedSize` lifts runtime integers to compile time integers. We use it

0 commit comments

Comments
 (0)