Skip to content

Commit

Permalink
refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugoberry committed Apr 1, 2024
1 parent e1052b7 commit c76657b
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 67 deletions.
184 changes: 117 additions & 67 deletions src/abf/AbfParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,117 +76,167 @@ void AbfParser::patch_header_of_compressed_buffer(std::vector<uint8_t> &compress

}

std::vector<uint8_t> AbfParser::get_sqlite(const std::string &path, const int trailing_chunks=15)
{
constexpr auto DataModelFileName = "DataModel";
constexpr uint32_t LocalFileType = 1027;

uint64_t virtual_directory_offset = 0;
uint32_t virtual_directory_size = 0;
uint64_t skip_offset = 0;
uint32_t block_index_iterator = 0;
uint32_t block_index = 0;
std::pair<uint64_t, uint64_t> AbfParser::initialize_zip_and_locate_datamodel(const std::string &path) {
// Initialize zip, locate DataModel, return datamodel_ofs and datamodel_size

constexpr auto DataModelFileName = "DataModel";
mz_zip_archive zip_archive;
uint64_t datamodel_ofs = 0;
uint64_t datamodel_size = 0;
memset(&zip_archive, 0, sizeof(zip_archive));

// Initialize the zip archive for reading
mz_bool status = mz_zip_reader_init_file(&zip_archive, path.c_str(), 0);
if (!status) {
throw std::runtime_error("Could not open pbix file");
throw std::runtime_error("Could not open zip file");
}
int file_index = mz_zip_reader_locate_file(&zip_archive, DataModelFileName, NULL, 0);

// Locate the DataModel file within the zip
int file_index = mz_zip_reader_locate_file(&zip_archive, "DataModel", nullptr, 0);
if (file_index < 0) {
throw std::runtime_error("DataModel not found in the pbix file.");
} else {
mz_zip_archive_file_stat file_stat;
if (mz_zip_reader_file_stat(&zip_archive, file_index, &file_stat)) {
datamodel_size= file_stat.m_comp_size;
datamodel_ofs = file_stat.m_local_header_ofs;

} else {
throw std::runtime_error("Could not retrieve information about DataModel.");
}
mz_zip_reader_end(&zip_archive); // Clean up before throwing
throw std::runtime_error("DataModel not found in the zip file.");
}
mz_zip_reader_end(&zip_archive);
std::ifstream entryStream(path, std::ios::binary);
if (!entryStream.is_open()) {
throw std::runtime_error("Could not open pbix file");

// Retrieve information about the DataModel file
mz_zip_archive_file_stat file_stat{};
if (!mz_zip_reader_file_stat(&zip_archive, file_index, &file_stat)) {
mz_zip_reader_end(&zip_archive); // Clean up before throwing
throw std::runtime_error("Could not retrieve information about DataModel.");
}
entryStream.seekg(datamodel_ofs+26);

// Clean up the zip reader as it's no longer needed after getting the info
mz_zip_reader_end(&zip_archive);

// Return the offset and compressed size of the DataModel file
return {file_stat.m_local_header_ofs, file_stat.m_comp_size};
}

/// Advances `datamodel_ofs` from the start of the DataModel entry's zip local
/// file header to the first byte of the entry's data.
///
/// @param entryStream    Binary stream over the archive; it is repositioned
///                       and read from.
/// @param datamodel_ofs  In: offset of the local file header. Out: that offset
///                       plus 30 plus the file-name and extra-field lengths.
/// @throws std::runtime_error if the two length fields cannot be read.
void AbfParser::read_compressed_datamodel_header(std::ifstream &entryStream, uint64_t &datamodel_ofs) {
    // Byte offset of the file-name-length field inside the local file header.
    constexpr uint16_t ZIP_LOCAL_FILE_HEADER_FIXED = 26;
    // Size of the fixed part of the local file header (name and extra field follow it).
    constexpr uint16_t ZIP_LOCAL_FILE_HEADER = 30;

    entryStream.seekg(datamodel_ofs + ZIP_LOCAL_FILE_HEADER_FIXED);

    // The two 16-bit fields are stored back to back.
    // NOTE(review): they are read raw into host integers, which assumes a
    // little-endian host - confirm for the supported targets.
    uint16_t filename_len = 0;
    uint16_t extra_len = 0;
    entryStream.read(reinterpret_cast<char *>(&filename_len), sizeof(filename_len));
    entryStream.read(reinterpret_cast<char *>(&extra_len), sizeof(extra_len));
    if (!entryStream) {
        throw std::runtime_error("Could not read the zip local file header of DataModel.");
    }

    // Apply the adjustment exactly once: fixed header + variable-length fields.
    datamodel_ofs += ZIP_LOCAL_FILE_HEADER + filename_len + extra_len;
}

/// Reads and decompresses the first block of the DataModel stream.
///
/// The block is expected 102 bytes after `datamodel_ofs` and is laid out as
/// a 32-bit uncompressed size, a 32-bit compressed size, then the compressed
/// bytes.
///
/// @param entryStream      Binary stream over the archive; it is left
///                         positioned right after the first block.
/// @param datamodel_ofs    Offset of the first byte of the DataModel data.
/// @param xpress9_wrapper  Decompressor supplied by the caller; this function
///                         only calls Decompress on it.
/// @return                 The decompressed bytes of the first block.
/// @throws std::runtime_error if the block cannot be read or the decompressed
///         size differs from the size recorded in the block header.
std::vector<uint8_t> AbfParser::decompress_initial_block(std::ifstream &entryStream, uint64_t datamodel_ofs, XPress9Wrapper &xpress9_wrapper) {
    // NOTE(review): presumably the length of the signature that precedes the
    // first block - confirm against the ABF format description.
    constexpr uint16_t ABF_XPRESS9_SIGNATURE = 102;

    // Seek to the size fields of the first block.
    entryStream.seekg(datamodel_ofs + ABF_XPRESS9_SIGNATURE, std::ios::beg);

    uint32_t uncompressed_size = 0;
    uint32_t compressed_size = 0;
    entryStream.read(reinterpret_cast<char*>(&uncompressed_size), sizeof(uint32_t));
    entryStream.read(reinterpret_cast<char*>(&compressed_size), sizeof(uint32_t));
    if (!entryStream) {
        throw std::runtime_error("Could not read the size header of the first block.");
    }

    // Allocate buffers for compressed and decompressed data.
    std::vector<uint8_t> decompressed_buffer(uncompressed_size);
    std::vector<uint8_t> compressed_buffer(compressed_size);

    entryStream.read(reinterpret_cast<char*>(compressed_buffer.data()), compressed_size);
    if (!entryStream) {
        throw std::runtime_error("Could not read the compressed data of the first block.");
    }

    const uint32_t decompressed_size = xpress9_wrapper.Decompress(compressed_buffer.data(), compressed_size, decompressed_buffer.data(), decompressed_buffer.size());

    // The decompressor must produce exactly the size recorded in the header.
    if (decompressed_size != uncompressed_size) {
        throw std::runtime_error("Mismatch in decompressed block size in first block.");
    }
    return decompressed_buffer;
}

std::tie(virtual_directory_offset, virtual_directory_size) = process_backup_log_header(all_decompressed_buffer);
/// Walks the blocks that follow the first one and decompresses the trailing
/// ones, skipping earlier blocks that are not needed.
///
/// Each block is a 32-bit uncompressed size, a 32-bit compressed size and the
/// compressed bytes. Iteration starts at the stream's current position and
/// stops once that position reaches `datamodel_ofs + datamodel_size`.
///
/// @param entryStream               Stream positioned at the first block to process.
/// @param datamodel_ofs             Offset of the first byte of the DataModel data.
/// @param datamodel_size            Compressed size of the DataModel entry.
/// @param xpress9_wrapper           Decompressor supplied by the caller.
/// @param virtual_directory_offset  Offset of the virtual directory in the decompressed stream.
/// @param virtual_directory_size    Size of the virtual directory.
/// @param trailing_blocks           Number of blocks before the end of the
///                                  virtual directory that are decompressed
///                                  rather than skipped.
/// @param skip_offset               Incremented by the uncompressed size of every skipped block.
/// @return  The decompressed bytes of all non-skipped blocks, concatenated in
///          stream order. May be shorter than expected if the stream ends
///          early; the caller is expected to validate the total length.
/// @throws std::runtime_error if a block's payload cannot be read or its
///         decompressed size differs from the size in its header.
std::vector<uint8_t> AbfParser::iterate_and_decompress_blocks(std::ifstream &entryStream, uint64_t datamodel_ofs, uint64_t datamodel_size, XPress9Wrapper &xpress9_wrapper, uint64_t virtual_directory_offset, int virtual_directory_size, const int trailing_blocks, uint64_t &skip_offset) {
    constexpr uint32_t BLOCK_SIZE = 0x200000;

    // Number of whole BLOCK_SIZE units up to the end of the virtual directory.
    // The casts spell out the conversions the mixed int/uint64_t arithmetic
    // performs implicitly; the resulting values are unchanged.
    const uint64_t total_blocks = (static_cast<uint64_t>(virtual_directory_size) + virtual_directory_offset) / BLOCK_SIZE;
    const uint64_t kept_tail = static_cast<uint64_t>(trailing_blocks);
    const uint64_t data_end = datamodel_ofs + datamodel_size;

    std::vector<uint8_t> all_decompressed_data;
    uint32_t block_index = 0;           // 1-based index of the block being processed
    uint32_t block_index_iterator = 0;  // state handed to patch_header_of_compressed_buffer

    while (true) {
        // tellg() yields -1 once the stream has failed; treat that as end of data.
        const std::streamoff position = entryStream.tellg();
        if (position < 0 || static_cast<uint64_t>(position) >= data_end) {
            break;
        }
        block_index++;

        // Read the block's size header.
        uint32_t uncompressed_size = 0;
        uint32_t compressed_size = 0;
        entryStream.read(reinterpret_cast<char*>(&uncompressed_size), sizeof(uncompressed_size));
        entryStream.read(reinterpret_cast<char*>(&compressed_size), sizeof(compressed_size));
        if (!entryStream) {
            // Stream ended inside a size header: stop instead of patching and
            // decompressing an empty buffer.
            break;
        }

        // Skip blocks that lie before the trailing window.
        if (total_blocks > kept_tail && block_index < (total_blocks - kept_tail)) {
            skip_offset += uncompressed_size;
            entryStream.seekg(compressed_size, std::ios::cur);
            continue;
        }

        // Allocate buffers for the compressed and decompressed data.
        std::vector<uint8_t> compressed_buffer(compressed_size);
        std::vector<uint8_t> decompressed_buffer(uncompressed_size);

        entryStream.read(reinterpret_cast<char*>(compressed_buffer.data()), compressed_size);
        if (!entryStream) {
            throw std::runtime_error("Unexpected end of file while reading a compressed block.");
        }

        // The block header is rewritten before the block is handed to the decompressor.
        patch_header_of_compressed_buffer(compressed_buffer, block_index_iterator);

        const uint32_t decompressed_size = xpress9_wrapper.Decompress(compressed_buffer.data(), compressed_size, decompressed_buffer.data(), decompressed_buffer.size());
        if (decompressed_size != uncompressed_size) {
            throw std::runtime_error("Decompression failed or resulted in unexpected size.");
        }

        // Append the decompressed block to the overall buffer.
        all_decompressed_data.insert(all_decompressed_data.end(), decompressed_buffer.begin(), decompressed_buffer.end());
    }

    return all_decompressed_data;
}

/// Extracts the SQLite buffer from the DataModel entry of the archive at `path`.
///
/// Steps: locate the DataModel entry, skip its zip local file header,
/// decompress the first block to obtain the virtual directory location,
/// decompress the remaining (non-skipped) blocks, then hand the combined
/// buffer to extract_sqlite_buffer.
///
/// @param path             Filesystem path of the archive.
/// @param trailing_blocks  Number of trailing blocks to decompress rather
///                         than skip (defaults to 15).
/// @return                 The bytes returned by extract_sqlite_buffer.
/// @throws std::runtime_error if the file cannot be opened, the decompressor
///         cannot be initialised, or the decompressed data does not reach the
///         end of the virtual directory; errors from the helpers propagate.
std::vector<uint8_t> AbfParser::get_sqlite(const std::string &path, const int trailing_blocks=15)
{
    // Initialize zip and locate DataModel.
    auto [datamodel_ofs, datamodel_size] = initialize_zip_and_locate_datamodel(path);

    // Open the archive as a raw byte stream.
    std::ifstream entryStream(path, std::ios::binary);
    if (!entryStream.is_open()) {
        throw std::runtime_error("Could not open pbix file for reading compressed DataModel header.");
    }

    // Move datamodel_ofs from the local file header to the entry's data.
    read_compressed_datamodel_header(entryStream, datamodel_ofs);

    XPress9Wrapper xpress9_wrapper;
    if (!xpress9_wrapper.Initialize())
    {
        throw std::runtime_error("Failed to initialize XPress9Wrapper");
    }

    // Decompress the initial block to get the virtual directory info.
    auto initial_decompressed_buffer = decompress_initial_block(entryStream, datamodel_ofs, xpress9_wrapper);

    // Process the backup log header to get the virtual directory offset and size.
    auto [virtual_directory_offset, virtual_directory_size] = process_backup_log_header(initial_decompressed_buffer);

    // Total uncompressed size of the blocks that were skipped.
    uint64_t skip_offset = 0;
    auto all_decompressed_buffer = iterate_and_decompress_blocks(entryStream, datamodel_ofs, datamodel_size, xpress9_wrapper, virtual_directory_offset, virtual_directory_size, trailing_blocks, skip_offset);

    // Prefix with the initial block so a single-block DataModel is still complete.
    all_decompressed_buffer.insert(all_decompressed_buffer.begin(), initial_decompressed_buffer.begin(), initial_decompressed_buffer.end());

    // The data seen (skipped + decompressed) must reach the end of the virtual directory.
    if (skip_offset + all_decompressed_buffer.size() < virtual_directory_offset + virtual_directory_size)
    {
        throw std::runtime_error("Could not parse the entire DataModel.");
    }

    // Finally, extract the SQLite buffer from the decompressed data.
    return extract_sqlite_buffer(all_decompressed_buffer, skip_offset, virtual_directory_offset, virtual_directory_size);
}
4 changes: 4 additions & 0 deletions src/abf/AbfParser.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ class AbfParser {
static std::vector<uint8_t> trim_buffer(const std::vector<uint8_t>& buffer);
static std::tuple<uint64_t,int> process_backup_log_header(const std::vector<uint8_t> &buffer);
static std::vector<uint8_t> extract_sqlite_buffer(const std::vector<uint8_t> &buffer, uint64_t skip_offset, uint64_t virtual_directory_offset, int virtual_directory_size);
// Opens `path` as a zip archive, finds the "DataModel" entry and returns
// {local header offset, compressed size}; throws std::runtime_error on failure.
static std::pair<uint64_t, uint64_t> initialize_zip_and_locate_datamodel(const std::string &path);
// Reads the file-name and extra-field lengths from the entry's zip local file
// header and advances `datamodel_ofs` past that header.
static void read_compressed_datamodel_header(std::ifstream &entryStream, uint64_t &datamodel_ofs);
// Reads the first block (two 32-bit sizes followed by the compressed bytes)
// located 102 bytes after `datamodel_ofs` and returns it decompressed; throws
// std::runtime_error if the decompressed size does not match the recorded size.
static std::vector<uint8_t> decompress_initial_block(std::ifstream &entryStream, uint64_t datamodel_ofs, XPress9Wrapper &xpress9_wrapper);
// Processes the blocks from the stream's current position up to
// `datamodel_ofs + datamodel_size`: skipped blocks add their uncompressed size
// to `skip_offset`, the others are decompressed and returned concatenated.
static std::vector<uint8_t> iterate_and_decompress_blocks(std::ifstream &entryStream, uint64_t datamodel_ofs, uint64_t datamodel_size, XPress9Wrapper &xpress9_wrapper, uint64_t virtual_directory_offset, int virtual_directory_size, const int trailing_blocks, uint64_t &skip_offset);
};

class Header {
Expand Down

0 comments on commit c76657b

Please sign in to comment.