Skip to content

Commit

Permalink
Docker, fix liblzma
Browse files Browse the repository at this point in the history
Could I have installed liblzma in docker and saved a few hours?

Yes. Yes I could have. But where's the fun in that? :D
  • Loading branch information
LunarWatcher committed Jul 23, 2024
1 parent a966f58 commit d86c478
Show file tree
Hide file tree
Showing 10 changed files with 114 additions and 17 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ compile_commands.json
/downloads/
/out/
*.7z
/.env

# Source: https://github.com/github/gitignore/blob/main/Python.gitignore {{{
# Byte-compiled / optimized / DLL files
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Stack Exchange data dump downloader and transformer

[![Data dump transformer build](https://github.com/LunarWatcher/se-data-dump-transformer/actions/workflows/transformer.yml/badge.svg)](https://github.com/LunarWatcher/se-data-dump-transformer/actions/workflows/transformer.yml)
[![Data dump transformer build](https://github.com/LunarWatcher/se-data-dump-transformer/actions/workflows/transformer.yml/badge.svg)](https://github.com/LunarWatcher/se-data-dump-transformer/actions/workflows/transformer.yml) [![Stackapps listing](https://img.shields.io/badge/StackApps%20listing-FF9900)](https://stackapps.com/q/10591/69829)

**NOTE:** This repo does not yet gather the data dump, as it has not yet been released in the new format. It currently contains the scaffolding required to deal with some of SE's bullshit, to make sure it can be quickly adapted to actually download the data dump parts when they become available.

Expand Down Expand Up @@ -106,9 +106,9 @@ cd transformer
mkdir build
cd build
# Option 1: debug:
cmake .. -DCMAKE_BUILD_Type=Debug
cmake .. -DCMAKE_BUILD_TYPE=Debug
# Option 2: release mode; strongly recommended for anything that needs the performance:
cmake .. -DCMAKE_BUILD_Type=Release
cmake .. -DCMAKE_BUILD_TYPE=Release
# ---
# Replace 8 with the number of cores/threads you have
cmake --build . -j 8
Expand Down
14 changes: 14 additions & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Compose definition for running the transformer in a container.
# Values can be supplied from the shell environment or from a local .env file.
services:
  transformer:
    build:
      # The build context is the repository root; the Dockerfile lives in
      # transformer/ and copies its sources with a transformer/ prefix.
      context: .
      dockerfile: transformer/Dockerfile
    environment:
      # Output format passed to the transformer's -t flag by the image's CMD.
      - SEDD_OUTPUT_TYPE=${SEDD_OUTPUT_TYPE:-json}
      # Fall back to "info" when the host variable is unset. Without a
      # default, compose substitutes an empty string, which overrides the
      # image's own SPDLOG_LEVEL default and emits an "unset variable" warning.
      - SPDLOG_LEVEL=${SPDLOG_LEVEL:-info}
    # Run as UID 1000 rather than root.
    # NOTE(review): presumably chosen so files written to ./out are owned by
    # the typical first host user; adjust if your UID differs.
    user: "1000"
    volumes:
      # Change these paths to use different source directories
      # for the container
      - ./downloads/:/app/downloads
      - ./out/:/app/out
34 changes: 27 additions & 7 deletions transformer/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,22 +1,32 @@
cmake_minimum_required(VERSION 3.10)
cmake_minimum_required(VERSION 3.24 FATAL_ERROR)

set(CMAKE_PROJECT_TOP_LEVEL_INCLUDES "${CMAKE_CURRENT_SOURCE_DIR}/dep_provider.cmake")
project(sedd-transformer)

set(CMAKE_POLICY_DEFAULT_CMP0074 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0144 NEW)
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)

set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

set (CMAKE_CXX_STANDARD 20)
set (CMAKE_POSITION_INDEPENDENT_CODE ON)
set (BUILD_SHARED_LIBS ON)

list (APPEND CMAKE_FIND_ROOT_PATH "${CMAKE_SOURCE_DIR}/cmake/")
message(STATUS ${CMAKE_FIND_ROOT_PATH})
set (ENABLE_TEST OFF CACHE STRING "" FORCE)

if (UNIX)
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=undefined")
endif()
#set (CMAKE_FIND_DEBUG_MODE TRUE)
find_package(LibLZMA REQUIRED)
#set (CMAKE_FIND_DEBUG_MODE FALSE)

include(FetchContent)

FetchContent_Declare(
archive
libarchive
GIT_REPOSITORY https://github.com/libarchive/libarchive
GIT_TAG v3.7.4
)
Expand All @@ -43,14 +53,24 @@ FetchContent_Declare(
GIT_REPOSITORY https://github.com/ibireme/yyjson
GIT_TAG 0.10.0
)

FetchContent_MakeAvailable(yyjson)
FetchContent_MakeAvailable(cli11)
FetchContent_MakeAvailable(spdlog)
FetchContent_MakeAvailable(stc)
FetchContent_MakeAvailable(archive)
FetchContent_MakeAvailable(pugixml)

FetchContent_GetProperties(libarchive)
if(NOT libarchive_POPULATED)
FetchContent_Populate(libarchive)

add_subdirectory(${libarchive_SOURCE_DIR} ${libarchive_BINARY_DIR})

endif()

if (NOT WIN32)
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fsanitize=undefined")
endif()

add_executable(sedd-transformer
src/Main.cpp

Expand Down
21 changes: 21 additions & 0 deletions transformer/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Image that builds and runs the sedd-transformer binary.
# The build context is the repository root (docker-compose.yml sets
# `context: .`), which is why every COPY source carries a transformer/ prefix.
FROM ubuntu:latest

WORKDIR /app

# Compiler toolchain, CMake, and git (the CMake build fetches its dependencies
# from git repositories at configure time). The apt package lists are removed
# in the same layer because nothing later in the build needs them.
RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y libssl-dev cmake gcc g++ git \
    && rm -rf /var/lib/apt/lists/*

# ARG values exist only while the image is being built, but CMD below expands
# ${SEDD_OUTPUT_TYPE} when the container starts. Promote the build argument to
# an environment variable so the default is still present at runtime;
# `--build-arg SEDD_OUTPUT_TYPE=...` and runtime `-e SEDD_OUTPUT_TYPE=...`
# both keep working.
ARG SEDD_OUTPUT_TYPE=json
ENV SEDD_OUTPUT_TYPE=${SEDD_OUTPUT_TYPE}
ENV SPDLOG_LEVEL=info

COPY transformer/src /app/src
COPY transformer/tests /app/tests
COPY transformer/CMakeLists.txt /app/CMakeLists.txt
COPY transformer/cmake /app/cmake
COPY transformer/dep_provider.cmake /app/dep_provider.cmake

# Configure and build in Release mode using every available core.
RUN mkdir build && cd build && cmake .. -DCMAKE_BUILD_TYPE=Release && make -j $(nproc)


# List the mounted directories, then run the transformer over them.
# NOTE(review): the trailing `|| sleep 460000` keeps the container alive when
# any earlier command fails - presumably so the failure can be inspected;
# confirm this is still wanted.
CMD ls -la ./downloads && ls -la ./out \
    && /app/build/bin/sedd-transformer -i /app/downloads -o /app/out -t ${SEDD_OUTPUT_TYPE} || sleep 460000
16 changes: 16 additions & 0 deletions transformer/cmake/FindLibLZMA.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#[[
Project-local replacement for CMake's FindLibLZMA module.

Rather than searching the system for liblzma, this fetches xz at a pinned tag,
adds it to the build, and publishes the result through the LIBLZMA_* variables
that consumers of the stock module read.
]]

# Force BUILD_TESTING off in the cache before xz is added to the build.
# NOTE(review): presumably this is meant to keep fetched dependencies from
# building their own tests; FORCE also overrides any user-supplied value.
set (BUILD_TESTING OFF CACHE BOOL "Build tests (forced off by FindLibLZMA.cmake)" FORCE)
set (BUILD_SHARED_LIBS ON)

include(FetchContent)

FetchContent_Declare(
    xz
    GIT_REPOSITORY https://github.com/tukaani-project/xz
    GIT_TAG v5.6.2
)
FetchContent_MakeAvailable(xz)

# Result variables. Both the singular and plural include-dir spellings are
# provided because the stock module documents LIBLZMA_INCLUDE_DIR as well.
set (LIBLZMA_INCLUDE_DIRS "${xz_SOURCE_DIR}/src/liblzma/api/")
set (LIBLZMA_INCLUDE_DIR "${LIBLZMA_INCLUDE_DIRS}")
set (LIBLZMA_LIBRARY liblzma)
set (LIBLZMA_LIBRARIES liblzma)

# find_package() and the dependency-provider protocol look at
# <PackageName>_FOUND using the exact case of the package name (LibLZMA).
# Setting only the upper-case spelling leaves the request looking unfulfilled,
# so set both; LIBLZMA_FOUND is kept for consumers that test the legacy name.
set (LIBLZMA_FOUND ON)
set (LibLZMA_FOUND ON)

# Offer the imported-target name the stock module defines, for consumers that
# link LibLZMA::LibLZMA instead of using the variables. Guarded so that a
# repeated include, or a missing liblzma target, cannot raise an error.
if (TARGET liblzma AND NOT TARGET LibLZMA::LibLZMA)
    add_library(LibLZMA::LibLZMA ALIAS liblzma)
endif()
19 changes: 19 additions & 0 deletions transformer/dep_provider.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#[[
Dependency provider for the transformer build.

It puts the project's cmake/ directory on CMAKE_MODULE_PATH, so the
project-local Find modules there are considered, and then forwards every
find_package() request to the built-in implementation.
]]
cmake_minimum_required(VERSION 3.24)

set (ROOT_DIR ${CMAKE_SOURCE_DIR})
set (DEP_OVERRIDES ${ROOT_DIR}/cmake/)

# sedd_provide_dependency(<method> <package_name> [<find_package args>...])
#
#   method        Request kind; only FIND_PACKAGE is registered below.
#   package_name  Package being requested.
#   ARGN          Remaining arguments of the original find_package() call
#                 (version, REQUIRED, QUIET, COMPONENTS, ...).
#
# This is a macro rather than a function so that the variables set by the
# forwarded find_package() call land in the scope of the original caller.
macro(sedd_provide_dependency method package_name)

    # Register the override directory once; the provider runs for every request.
    if (NOT ("${DEP_OVERRIDES}" IN_LIST CMAKE_MODULE_PATH))
        list(APPEND CMAKE_MODULE_PATH "${DEP_OVERRIDES}")
    endif()

    message(STATUS "INSTALLING DEP: ${package_name} with lookup path ${CMAKE_MODULE_PATH}")
    # Forward the caller's original arguments. Without ${ARGN}, the requested
    # version, components, REQUIRED and QUIET are dropped, so a package that
    # does not meet the request could still be reported as found.
    # BYPASS_PROVIDER stops this call from re-entering the provider.
    find_package(${package_name} ${ARGN} BYPASS_PROVIDER)
endmacro()

cmake_language(
    SET_DEPENDENCY_PROVIDER sedd_provide_dependency
    SUPPORTED_METHODS FIND_PACKAGE
)
13 changes: 7 additions & 6 deletions transformer/src/Main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,12 @@ std::map<std::string, TransformerType> strToTransformer {
};

std::shared_ptr<sedd::Transformer> getTransformer(TransformerType type) {
#define SEDD_TRANSFORMER(type, ret) {TransformerType::type, []() { spdlog::info("Using transformer " #type); return ret; }}

static auto map = std::map<TransformerType, std::function<std::shared_ptr<sedd::Transformer>()>> {
{TransformerType::JSON, []() { return std::make_shared<sedd::JSONTransformer>(); }},
{TransformerType::DRY_RUN, []() { return nullptr; }},
SEDD_TRANSFORMER(JSON, std::make_shared<sedd::JSONTransformer>()),

SEDD_TRANSFORMER(DRY_RUN, nullptr),
};

return map.at(type)();
Expand Down Expand Up @@ -71,17 +74,15 @@ int main(int argc, char* argv[]) {
};

std::filesystem::create_directories(ctx.destDir);
spdlog::info("Configuration:");
spdlog::info("Files: [source = {}, dest = {}]", ctx.sourceDir.string(), ctx.destDir.string());

for (const auto& entry : std::filesystem::directory_iterator(ctx.sourceDir)) {
spdlog::info("Now processing {}", entry.path().string());

auto parser = sedd::ArchiveParser(entry);
parser.read(ctx);

std::this_thread::sleep_for(10s);

}

std::this_thread::sleep_for(10s);
return 0;
}
5 changes: 5 additions & 0 deletions transformer/src/data/ArchiveParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,11 @@ void ArchiveParser::read(const GlobalContext& conf) {
}

}
auto err = archive_error_string(a);
if (err != nullptr) {
spdlog::critical("{}", err);
throw std::runtime_error(err);
}

if (conf.transformer) {
conf.transformer->endArchive(ctx);
Expand Down
2 changes: 1 addition & 1 deletion transformer/src/data/ArchiveWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
namespace sedd {

ArchiveWriter::ArchiveWriter(const std::filesystem::path& basePath) : archiveName(basePath.string() + ".7z"), tmpOutputDir(basePath) {
spdlog::debug("Opening archive {}", archiveName.string());
spdlog::debug("Opening output archive: {}", archiveName.string());
// TODO: figure out if archive_write_new() works
a = archive_write_new();
int r = archive_write_set_format_7zip(a);
Expand Down

0 comments on commit d86c478

Please sign in to comment.