Skip to content

Commit b75bf65

Browse files
duxiao1212 authored and facebook-github-bot committed
feat: Impl sort key for LocalShuffleWriter (#26547)
Summary: Impl sort key for LocalShuffleWriter Differential Revision: D86322593
1 parent 90bd292 commit b75bf65

File tree

5 files changed

+302
-76
lines changed

5 files changed

+302
-76
lines changed

presto-native-execution/presto_cpp/main/operators/LocalShuffle.cpp

Lines changed: 99 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
#include "presto_cpp/external/json/nlohmann/json.hpp"
1616
#include "presto_cpp/main/common/Configs.h"
1717

18+
#include <boost/range/algorithm/sort.hpp>
19+
1820
using namespace facebook::velox::exec;
1921
using namespace facebook::velox;
2022

@@ -39,15 +41,41 @@ inline std::string createShuffleFileName(
3941
id);
4042
}
4143

42-
// This file is used to indicate that the shuffle system is ready to be used for
43-
// reading (acts as a sync point between readers if needed). Mostly used for
44-
// test purposes.
45-
const static std::string kReadyForReadFilename = "readyForRead";
44+
inline size_t rowSize(size_t keySize, size_t dataSize) noexcept {
45+
return (kUint32Size * 2) + keySize + dataSize;
46+
}
47+
48+
inline void appendShuffleRow(
49+
char* FOLLY_NONNULL buffer,
50+
std::string_view sortKey,
51+
std::string_view data) {
52+
// Write memory layout: [keySize][dataSize][key][data]
53+
*(TRowSize*)(buffer) =
54+
folly::Endian::big(static_cast<TRowSize>(sortKey.size()));
55+
buffer += kUint32Size;
56+
57+
*(TRowSize*)(buffer) = folly::Endian::big(static_cast<TRowSize>(data.size()));
58+
buffer += kUint32Size;
59+
60+
if (!sortKey.empty()) {
61+
memcpy(buffer, sortKey.data(), sortKey.size());
62+
buffer += sortKey.size();
63+
}
64+
65+
memcpy(buffer, data.data(), data.size());
66+
}
67+
68+
// Appends the 'size' bytes starting at 'data' to 'file' and then closes the
// file. 'file' must not be null; this is enforced with VELOX_CHECK_NOT_NULL.
void writeBufferToFile(velox::WriteFile* file, const char* data, size_t size) {
  VELOX_CHECK_NOT_NULL(file);
  const std::string_view payload(data, size);
  file->append(payload);
  file->close();
}
73+
4674
} // namespace
4775

4876
LocalShuffleWriter::LocalShuffleWriter(
49-
const std::string& rootPath,
50-
const std::string& queryId,
77+
std::string rootPath,
78+
std::string queryId,
5179
uint32_t shuffleId,
5280
uint32_t numPartitions,
5381
uint64_t maxBytesPerPartition,
@@ -59,14 +87,53 @@ LocalShuffleWriter::LocalShuffleWriter(
5987
rootPath_(std::move(rootPath)),
6088
queryId_(std::move(queryId)),
6189
shuffleId_(shuffleId) {
62-
// Use resize/assign instead of resize(size, val).
63-
inProgressPartitions_.resize(numPartitions_);
6490
inProgressPartitions_.assign(numPartitions_, nullptr);
65-
inProgressSizes_.resize(numPartitions_);
6691
inProgressSizes_.assign(numPartitions_, 0);
92+
inProgressRowOffsets_.assign(numPartitions_, {});
6793
fileSystem_ = velox::filesystems::getFileSystem(rootPath_, nullptr);
6894
}
6995

96+
void LocalShuffleWriter::writeBlock(int32_t partition) {
97+
auto& buffer = inProgressPartitions_[partition];
98+
const auto bufferSize = inProgressSizes_[partition];
99+
if (!buffer || bufferSize == 0) {
100+
return;
101+
}
102+
103+
auto file = getNextOutputFile(partition);
104+
auto& offsets = inProgressRowOffsets_[partition];
105+
106+
const bool needsSorting = !offsets.empty() && offsets[0].keySize > 0;
107+
108+
if (needsSorting) {
109+
const char* sourceData = buffer->as<char>();
110+
sortRows(offsets, sourceData);
111+
112+
for (const auto& offset : offsets) {
113+
const char* rowStart = sourceData + offset.keyOffset - (kUint32Size * 2);
114+
const size_t rowLen = (kUint32Size * 2) + offset.keySize + offset.dataSize;
115+
file->append(std::string_view(rowStart, rowLen));
116+
}
117+
file->close();
118+
} else {
119+
writeBufferToFile(file.get(), buffer->as<char>(), bufferSize);
120+
}
121+
122+
// Reset for buffer reuse
123+
inProgressSizes_[partition] = 0;
124+
inProgressRowOffsets_[partition].clear();
125+
}
126+
127+
void LocalShuffleWriter::sortRows(
128+
std::vector<RowOffset>& offsets,
129+
const char* bufferData) {
130+
boost::range::sort(
131+
offsets, [bufferData](const RowOffset& lhs, const RowOffset& rhs) {
132+
return detail::compareKeys(
133+
lhs.getKey(bufferData), rhs.getKey(bufferData));
134+
});
135+
}
136+
70137
std::unique_ptr<velox::WriteFile> LocalShuffleWriter::getNextOutputFile(
71138
int32_t partition) {
72139
auto filename = nextAvailablePartitionFileName(rootPath_, partition);
@@ -92,47 +159,36 @@ std::string LocalShuffleWriter::nextAvailablePartitionFileName(
92159
return filename;
93160
}
94161

95-
void LocalShuffleWriter::storePartitionBlock(int32_t partition) {
96-
auto& buffer = inProgressPartitions_[partition];
97-
auto file = getNextOutputFile(partition);
98-
file->append(
99-
std::string_view(buffer->as<char>(), inProgressSizes_[partition]));
100-
file->close();
101-
inProgressPartitions_[partition].reset();
102-
inProgressSizes_[partition] = 0;
103-
}
104-
105162
void LocalShuffleWriter::collect(
106163
int32_t partition,
107-
std::string_view /* key */,
164+
std::string_view key,
108165
std::string_view data) {
109-
using TRowSize = uint32_t;
166+
VELOX_CHECK_LT(partition, numPartitions_);
110167

111-
auto& buffer = inProgressPartitions_[partition];
112-
const TRowSize rowSize = data.size();
113-
const auto size = sizeof(TRowSize) + rowSize;
114-
115-
// Check if there is enough space in the buffer.
116-
if ((buffer != nullptr) &&
117-
(inProgressSizes_[partition] + size >= buffer->capacity())) {
118-
storePartitionBlock(partition);
119-
// NOTE: the referenced 'buffer' will be reset in storePartitionBlock.
120-
}
168+
const auto size = rowSize(key.size(), data.size());
121169

122-
// Allocate buffer if needed.
123-
if (buffer == nullptr) {
124-
buffer = AlignedBuffer::allocate<char>(
125-
std::max((uint64_t)size, maxBytesPerPartition_), pool_);
126-
inProgressSizes_[partition] = 0;
127-
inProgressPartitions_[partition] = buffer;
170+
if (inProgressPartitions_[partition] == nullptr) {
171+
inProgressPartitions_[partition] =
172+
AlignedBuffer::allocate<char>(maxBytesPerPartition_, pool_, 0);
173+
}
174+
if (inProgressSizes_[partition] + size >= maxBytesPerPartition_) {
175+
writeBlock(partition);
128176
}
129177

130-
// Copy data.
131-
auto offset = inProgressSizes_[partition];
132-
auto rawBuffer = buffer->asMutable<char>() + offset;
178+
auto* rawBuffer = inProgressPartitions_[partition]->asMutable<char>();
179+
auto* writePos = rawBuffer + inProgressSizes_[partition];
133180

134-
*(TRowSize*)(rawBuffer) = folly::Endian::big(rowSize);
135-
::memcpy(rawBuffer + sizeof(TRowSize), data.data(), rowSize);
181+
appendShuffleRow(writePos, key, data);
182+
const auto currentOffset = inProgressSizes_[partition];
183+
const auto keyOffset = currentOffset + (kUint32Size * 2);
184+
const auto dataOffset = keyOffset + key.size();
185+
186+
inProgressRowOffsets_[partition].emplace_back(
187+
RowOffset{
188+
.keyOffset = keyOffset,
189+
.dataOffset = dataOffset,
190+
.keySize = static_cast<uint32_t>(key.size()),
191+
.dataSize = static_cast<uint32_t>(data.size())});
136192

137193
inProgressSizes_[partition] += size;
138194
}
@@ -144,11 +200,10 @@ void LocalShuffleWriter::noMoreData(bool success) {
144200
}
145201
for (auto i = 0; i < numPartitions_; ++i) {
146202
if (inProgressSizes_[i] > 0) {
147-
storePartitionBlock(i);
203+
writeBlock(i);
148204
}
149205
}
150206
}
151-
152207
LocalShuffleReader::LocalShuffleReader(
153208
const std::string& rootPath,
154209
const std::string& queryId,

presto-native-execution/presto_cpp/main/operators/LocalShuffle.h

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
*/
1414
#pragma once
1515

16+
#include "presto_cpp/main/operators/LocalShuffleUtils.h"
1617
#include "presto_cpp/main/operators/ShuffleInterface.h"
1718
#include "velox/buffer/Buffer.h"
1819
#include "velox/common/file/File.h"
@@ -72,17 +73,15 @@ struct LocalShuffleReadInfo {
7273
class LocalShuffleWriter : public ShuffleWriter {
7374
public:
7475
LocalShuffleWriter(
75-
const std::string& rootPath,
76-
const std::string& queryId,
76+
std::string rootPath,
77+
std::string queryId,
7778
uint32_t shuffleId,
7879
uint32_t numPartitions,
7980
uint64_t maxBytesPerPartition,
8081
velox::memory::MemoryPool* pool);
8182

82-
void collect(
83-
int32_t partition,
84-
std::string_view /* key */,
85-
std::string_view data) override;
83+
void collect(int32_t partition, std::string_view key, std::string_view data)
84+
override;
8685

8786
void noMoreData(bool success) override;
8887

@@ -97,7 +96,7 @@ class LocalShuffleWriter : public ShuffleWriter {
9796
std::unique_ptr<velox::WriteFile> getNextOutputFile(int32_t partition);
9897

9998
// Writes the in-progress block to the given partition.
100-
void storePartitionBlock(int32_t partition);
99+
void writeBlock(int32_t partition);
101100

102101
// Deletes all the files in the root directory.
103102
void cleanup();
@@ -121,6 +120,29 @@ class LocalShuffleWriter : public ShuffleWriter {
121120
std::vector<velox::BufferPtr> inProgressPartitions_;
122121
std::vector<size_t> inProgressSizes_;
123122
std::shared_ptr<velox::filesystems::FileSystem> fileSystem_;
123+
124+
// For sorted shuffle, track the key and data offsets for each row for
125+
// efficient sorting.
126+
struct RowOffset {
127+
size_t keyOffset;
128+
size_t dataOffset;
129+
uint32_t keySize;
130+
uint32_t dataSize;
131+
132+
inline std::string_view getKey(const char* FOLLY_NONNULL buffer) const {
133+
return {buffer + keyOffset, keySize};
134+
}
135+
136+
inline std::string_view getData(const char* FOLLY_NONNULL buffer) const {
137+
return {buffer + dataOffset, dataSize};
138+
}
139+
} __attribute__((aligned(32)));
140+
141+
std::vector<std::vector<RowOffset>> inProgressRowOffsets_;
142+
143+
void sortRows(
144+
std::vector<RowOffset>& offsets,
145+
const char* FOLLY_NONNULL buffer);
124146
};
125147

126148
class LocalShuffleReader : public ShuffleReader {
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
15+
#include "presto_cpp/main/operators/LocalShuffleUtils.h"

#include <algorithm>
#include <cstring>

#include "folly/lang/Bits.h"
#include "velox/common/base/Exceptions.h"
18+
19+
namespace facebook::presto::operators::detail {
20+
21+
bool compareKeys(std::string_view key1, std::string_view key2) noexcept {
  // std::string_view orders its characters as unsigned bytes and treats a
  // proper prefix as smaller, which is exactly a byte-wise lexicographical
  // "less than".
  return key1.compare(key2) < 0;
}
28+
29+
// Splits 'totalSize' bytes at 'buffer' into rows laid out as
// [keySize][dataSize][key][data], where both sizes are big-endian TRowSize
// values.
//
// The returned views point into 'buffer' and do not own their bytes. Parsing
// stops once fewer bytes than one pair of size prefixes remain; such trailing
// bytes are ignored. Fails a VELOX_CHECK when a row's key or data would extend
// past 'totalSize'.
ParsedShuffleData parseShuffleRows(const char* buffer, size_t totalSize) {
  ParsedShuffleData result;
  size_t offset = 0;

  // Rows are packed back to back, so a size prefix can sit at any byte offset.
  // Read it with memcpy instead of dereferencing a casted pointer, which would
  // be an unaligned, aliasing-violating load.
  const auto readSize = [buffer](size_t position) {
    TRowSize raw;
    std::memcpy(&raw, buffer + position, sizeof(raw));
    return folly::Endian::big(raw);
  };

  while (offset + static_cast<size_t>(2 * kUint32Size) <= totalSize) {
    // Read KeySize (big-endian)
    const TRowSize keySize = readSize(offset);
    offset += kUint32Size;

    // Read DataSize (big-endian)
    const TRowSize dataSize = readSize(offset);
    offset += kUint32Size;

    // Validate keySize
    if (keySize > 0) {
      VELOX_CHECK_LE(
          offset + keySize,
          totalSize,
          "Invalid row data, offset + keySize: {} exceeds totalSize: {}",
          offset + keySize,
          totalSize);
    }

    // Validate total size
    VELOX_CHECK_LE(
        offset + keySize + dataSize,
        totalSize,
        "Invalid row data, offset + keySize + dataSize: {} exceeds totalSize: {}",
        offset + keySize + dataSize,
        totalSize);

    // Extract key and data as string_views
    result.keys.emplace_back(buffer + offset, keySize);
    offset += keySize;

    result.data.emplace_back(buffer + offset, dataSize);
    offset += dataSize;
  }

  return result;
}
72+
73+
} // namespace facebook::presto::operators::detail
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Licensed under the Apache License, Version 2.0 (the "License");
3+
* you may not use this file except in compliance with the License.
4+
* You may obtain a copy of the License at
5+
*
6+
* http://www.apache.org/licenses/LICENSE-2.0
7+
*
8+
* Unless required by applicable law or agreed to in writing, software
9+
* distributed under the License is distributed on an "AS IS" BASIS,
10+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
* See the License for the specific language governing permissions and
12+
* limitations under the License.
13+
*/
14+
#pragma once
15+
16+
#include <cstdint>
17+
#include <string_view>
18+
#include <vector>
19+
20+
namespace facebook::presto::operators {
21+
22+
using TRowSize = uint32_t;
23+
constexpr uint32_t kUint32Size = static_cast<uint32_t>(sizeof(uint32_t));
24+
25+
namespace detail {
26+
27+
/// Parsed shuffle rows, split into two parallel vectors: one holding each
/// row's key and one holding each row's data.
struct ParsedShuffleData {
  /// Key views, one per parsed row.
  std::vector<std::string_view> keys;

  /// Data views, one per parsed row.
  std::vector<std::string_view> data;
};
32+
33+
/// Compares two keys lexicographically (byte-by-byte comparison)
34+
/// Returns true if key1 < key2
35+
bool compareKeys(std::string_view key1, std::string_view key2) noexcept;
36+
37+
/// Parses shuffle data in the format: [keySize][dataSize][key][data]
38+
/// This format is used when sort keys are present in the shuffle data
39+
ParsedShuffleData parseShuffleRows(const char* buffer, size_t totalSize);
40+
41+
} // namespace detail
42+
} // namespace facebook::presto::operators

0 commit comments

Comments
 (0)