-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathjpegls-filter.cpp
202 lines (157 loc) · 7.11 KB
/
jpegls-filter.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
#include "jpegls-filter.h"
#include <numeric>
#include <iostream>
#include "charls/charls.h"
using byte_array_t = std::vector<uint8_t>;
namespace {
/**
 * Description of one image handed to the JPEG-LS encoder: a span over the
 * sample data together with the geometry needed to interpret it.
 *
 * The member order matters: code in this file brace-initialises the struct
 * positionally as {buffer, typesize, width, height, channels}.
 */
template <typename T>
struct image_buffer_t {
    /** Span over the sample data of the image. */
    jpegls::span<T> buffer;
    /** Size of one sample in bytes. */
    size_t typesize{1};
    /** Image width in pixels. */
    size_t width{0};
    /** Image height, i.e. the number of rows of `width` pixels. */
    size_t height{0};
    /** Number of interleaved samples in a pixel. */
    uint32_t channels{1};
};
/**
 * Given one subchunk of data, compress it with JPEG-LS (CharLS) and return
 * the encoded data.
 *
 * @param raw    Sample data and geometry of the subchunk. `typesize * 8` is
 *               used as the bits per sample and `channels` as the component
 *               count.
 * @param lossy  Forwarded unchanged to `JlsParameters::allowedLossyError`.
 * @return The encoded bytes, trimmed to the size reported by the encoder. If
 *         the encoder reports an error, the message is written to std::cerr
 *         and an empty array is returned.
 */
template <typename T>
byte_array_t
encodeSubchunk(const image_buffer_t<T> raw, uint32_t lossy = 0) {
    // Destination buffer: the raw size plus a fixed 8 KiB of headroom.
    const auto reserved_size = raw.buffer.size_bytes();
    byte_array_t encoded(reserved_size + 8192);

    const auto params = [&]() -> JlsParameters {
        auto p = JlsParameters();
        p.width = raw.width;
        p.height = raw.height;
        p.bitsPerSample = raw.typesize * 8;
        p.components = raw.channels;
        p.allowedLossyError = lossy;
        return p;
    }();

    // Both outputs are zero-initialised: the encoder is not guaranteed to
    // write them on every failure path, and they are read below.
    size_t csize = 0;
    char err_msg[256] = {};

    const CharlsApiResultType ret = JpegLsEncode(
        encoded.data(), encoded.size(), &csize,
        raw.buffer.begin(),
        raw.buffer.size_bytes(), &params, err_msg);

    if (ret != CharlsApiResultType::OK) {
        std::cerr << "JPEG-LS error: " << err_msg << '\n';
        // Never hand back a partially written or garbage-sized buffer.
        encoded.clear();
        return encoded;
    }

    encoded.resize(csize);
    return encoded;
}
}
namespace jpegls {
/**
 * Compress a raw chunk as `c.subchunks` independent JPEG-LS images and pack
 * the results into a single buffer.
 *
 * The chunk is split row-wise. Every subchunk gets `c.lblocks` rows and the
 * first `c.remainder` subchunks get one extra row each.
 *
 * Output layout: `c.subchunks` uint32 entries holding the encoded size of
 * each subchunk, followed (starting at byte `c.header_size`) by the encoded
 * subchunks in block order.
 *
 * @param raw  Raw chunk. Its storage is reused for the output when the
 *             result fits; otherwise `raw.data` is passed to `realloc`, so
 *             it must be a pointer `realloc` accepts.
 * @param c    Subchunk layout, sample size and lossy setting.
 * @return Span over the packed output. If the buffer has to grow and the
 *         allocation fails, an error is written to std::cerr, the original
 *         buffer is left untouched and a zero-length span over it is
 *         returned.
 */
span<uint8_t>
encode(span<uint8_t> raw, const subchunk_config_t c) {
    std::vector<byte_array_t> local_out(c.subchunks);

    // For each sub-chunk of raw data, determine the byte range, image width and height.
    // Then, compress data.
#pragma omp parallel for schedule(guided)
    for (size_t block = 0; block < c.subchunks; block++) {
        const size_t width = c.length;

        // Let's say chunk height is 27 and there are 24 subchunks: the
        // remainder is 3. We distribute it by giving one additional row to
        // each of the first 3 subchunks.
        const size_t padded_height = c.lblocks + 1;
        const size_t height = (block < c.remainder) ? padded_height : c.lblocks;

        // Padded subchunks are preceded only by padded subchunks. The others
        // are preceded by all `remainder` padded ones plus the regular ones.
        const size_t rows_before =
            (block < c.remainder)
                ? padded_height * block
                : (padded_height * c.remainder + c.lblocks * (block - c.remainder));
        const size_t offset = c.typesize * width * rows_before;

        const image_buffer_t<const uint8_t> input{raw.subspan(offset, width * height * c.typesize),
                                                  c.typesize, width, height, 1};
        local_out[block] = encodeSubchunk(input, c.lossy);
    }

    // One pass yields both the payload offset of every subchunk and the total
    // compressed size. Accumulate in size_t regardless of header_size's type.
    std::vector<size_t> offsets(local_out.size());
    size_t compressed_size = c.header_size;
    for (size_t block = 0; block < local_out.size(); block++) {
        offsets[block] = compressed_size;
        compressed_size += local_out[block].size();
    }

    // Reuse the raw buffer when the result fits, otherwise grow it. All
    // encoded data lives in local_out by now, so overwriting raw is safe.
    span<uint8_t> out_buf;
    if (compressed_size <= raw.size) {
        out_buf = {raw.data, compressed_size};
    } else {
        auto* grown = static_cast<uint8_t*>(realloc(raw.data, compressed_size));
        if (grown == nullptr) {
            // realloc leaves the original block valid on failure; report it
            // as a zero-length result instead of writing through nullptr.
            std::cerr << "JPEG-LS error: failed to allocate " << compressed_size
                      << " bytes for the encoded chunk\n";
            out_buf = {raw.data, size_t{0}};
            return out_buf;
        }
        out_buf = {grown, compressed_size};
    }

    span<uint32_t> header{reinterpret_cast<uint32_t*>(out_buf.data), c.subchunks};

#pragma omp parallel for schedule(guided)
    for (size_t block = 0; block < c.subchunks; block++) {
        const auto& local_buf = local_out[block];
        // Write header
        header[block] = static_cast<uint32_t>(local_buf.size());
        // Write payload
        std::copy(local_buf.begin(), local_buf.end(), out_buf.begin() + offsets[block]);
    }

    return out_buf;
}
#ifdef H5JPEGLS_USE_ASYNC
/**
 * Build the task graph that compresses a raw chunk asynchronously.
 *
 * Four tasks are added to `taskflow` and chained serially:
 *   allocate -> compress (one index per subchunk) -> shrink -> gather.
 * The row partition and the output layout match `encode()`: every subchunk
 * gets `c.lblocks` rows, the first `c.remainder` subchunks get one extra row,
 * and the packed result is a uint32 size table followed by the payloads
 * starting at byte `c.header_size`.
 *
 * @param raw       Raw chunk to compress. The span is copied into the tasks.
 *                  NOTE(review): the underlying storage presumably has to
 *                  stay alive until the taskflow has run; confirm with callers.
 * @param c         Subchunk layout, sample size and lossy setting.
 * @param taskflow  Graph that receives the tasks.
 * @param encoded   Captured by reference by every task, so it must outlive
 *                  the execution of the taskflow. It holds the per-subchunk
 *                  cache while the tasks run and the packed byte array after
 *                  the gather task.
 * @return The allocate, compress and gather tasks (the shrink task is not
 *         returned) for finer-grained scheduling by the caller.
 */
std::array<tf::Task, 3>
encodeAsync(span<const uint8_t> raw, const subchunk_config_t c, tf::Taskflow& taskflow,
            encode_ctx_t& encoded) {
    constexpr size_t zero = 0;
    constexpr size_t one = 1;
    const auto n_subchunks = c.subchunks;

    auto allocate_task = taskflow.emplace([&, n_subchunks]() {
        // Allocate buffers of the subchunks.
        encoded = encode_cache_t{n_subchunks};
    });

    // For each sub-chunk of raw data, determine the byte range, image width and height.
    // Then, compress data.
    auto scatter_task =
        taskflow.for_each_index(zero, n_subchunks, one, [&, c, raw](const size_t block) {
            const size_t width = c.length;

            // Same partition as encode(): the first `remainder` subchunks
            // carry one extra row, so no row of the chunk is dropped.
            const size_t padded_height = c.lblocks + 1;
            const size_t height = (block < c.remainder) ? padded_height : c.lblocks;
            const size_t rows_before =
                (block < c.remainder)
                    ? padded_height * block
                    : (padded_height * c.remainder + c.lblocks * (block - c.remainder));
            const size_t offset = c.typesize * width * rows_before;

            const image_buffer_t<const uint8_t> input{
                raw.subspan(offset, width * height * c.typesize), c.typesize, width, height, 1};
            auto& local_out = std::get<encode_cache_t>(encoded).local_out.at(block);
            local_out = encodeSubchunk(input, c.lossy);
        });

    // Compute the total compressed size in bytes. We will shrink wrap the
    // compressed subchunks into one contiguous data layout.
    auto shrink_task = taskflow.emplace([&, c]() {
        auto& compressed_size = std::get<encode_cache_t>(encoded).compressed_size;
        const auto& local_out = std::get<encode_cache_t>(encoded).local_out;
        // Accumulate in size_t regardless of the type of header_size.
        compressed_size =
            std::accumulate(local_out.begin(), local_out.end(), static_cast<size_t>(c.header_size),
                            [](const size_t a, const auto& b) -> size_t { return a + b.size(); });
    });

    auto gather_task = taskflow.emplace([&, c]() {
        const size_t compressed_size = std::get<encode_cache_t>(encoded).compressed_size;
        const auto& local_out = std::get<encode_cache_t>(encoded).local_out;

        byte_array_t encoded_buf(compressed_size);
        span<uint32_t> header{reinterpret_cast<uint32_t*>(encoded_buf.data()), c.subchunks};

        // Payloads are laid out back to back after the header, so a running
        // offset replaces re-summing all previous sizes for every block.
        size_t offset = c.header_size;
        for (size_t block = 0; block < c.subchunks; block++) {
            // Read in place: the cache is discarded right after this loop.
            const auto& local_buf = local_out.at(block);
            // Write header
            header[block] = static_cast<uint32_t>(local_buf.size());
            // Write payload
            std::copy(local_buf.begin(), local_buf.end(), encoded_buf.begin() + offset);
            offset += local_buf.size();
        }

        // Move the aggregated data to the output buffer. This replaces the
        // cache held by `encoded`; `local_out` must not be used afterwards.
        encoded = std::move(encoded_buf);
    });

    // Now, label the tasks for debugging purpose.
    allocate_task.name("allocate");
    scatter_task.name("compress");
    shrink_task.name("shrink");
    gather_task.name("gather");

    // Schedule the tasks serially.
    taskflow.linearize({allocate_task, scatter_task, shrink_task, gather_task});

    // Return the tasks for a more fine grain task scheduling, e.g. concurrency limit.
    return {allocate_task, scatter_task, gather_task};
}
#endif
} // namespace jpegls