m4tx
diff --git a/‎.gitignore
+2 b/‎.gitignore
+2
diff --git a/‎Makefile
+55 b/‎Makefile
+55
diff --git a/‎abstract.tex
+6 b/‎abstract.tex
+6
diff --git a/‎apx-benchmark-env.tex
+48 b/‎apx-benchmark-env.tex
+48
diff --git a/‎apx-idn-format.tex
+31 b/‎apx-idn-format.tex
+31
diff --git a/‎apx-model-format.tex
+84 b/‎apx-model-format.tex
+84
diff --git a/‎apx-models.tex
+43 b/‎apx-models.tex
+43
diff --git a/‎apx-test-data.tex
+27 b/‎apx-test-data.tex
+27
@@ -0,0 +1,2 @@
+build/
+*.log
@@ -0,0 +1,55 @@
+# Base name of the top-level document; the build compiles $(MAIN).tex.
+MAIN := main
+
+# Figures the document build depends on. Every benchmark scenario has a
+# "compress" and a "decompress" plot, so those eight names are generated
+# (all compress plots first, then all decompress plots).
+IMGS := \
+	build/img/herb_uj.pdf \
+	$(foreach mode,compress decompress,$(patsubst %,img/bench-%-$(mode).tex, \
+		dedicated-model known-sequencing small-data best-performance)) \
+	img/binning.tex \
+	img/idn_file.tex
+
+# Chapter and appendix (apx-*) sources, tracked as build prerequisites.
+CHAPTERS := $(addsuffix .tex, \
+	conclusion evaluation implementation introduction problem \
+	apx-benchmark-env apx-idn-format apx-model-format apx-models apx-test-data)
+
+# Remaining inputs that are neither figures nor chapters.
+REST := bibliography.bib $(addsuffix .tex,abstract titlepage)
+# File the BibTeX rule is expected to produce. BibTeX names its output after
+# the .aux file it is given, so processing build/$(MAIN).aux yields
+# build/$(MAIN).bbl (not build/bibliography.bbl, which nothing ever creates).
+BIBLIOGRAPHY := build/$(MAIN).bbl
+
+# External tools; each can be overridden on the command line, e.g.
+# `make INKSCAPE=/opt/inkscape/bin/inkscape`.
+# Despite its name, CC is the LaTeX compiler here. -shell-escape allows the
+# document to run external programs while it is being compiled.
+CC := pdflatex -shell-escape
+INKSCAPE := inkscape
+LINT := lacheck
+BIBTEX := bibtex
+
+# Files read by every LaTeX pass. $(MAIN).tex must stay first: the recipes
+# below compile "$<", i.e. the first prerequisite.
+doc_sources := $(MAIN).tex $(REST) $(IMGS) $(CHAPTERS)
+
+# One compiler run over the main file, with all output written into build/.
+# Recursively expanded on purpose so that $< is resolved inside each recipe.
+latex_pass = $(CC) -output-directory build $<
+
+# Default goal: the finished document.
+all: stage2
+
+# Final two passes, run once the bibliography has been generated.
+stage2: $(doc_sources) $(BIBLIOGRAPHY)
+	$(latex_pass)
+	$(latex_pass)
+
+# First pass; creates the output directories and compiles once.
+# NOTE(review): build/build is created deliberately by the original author --
+# presumably some -shell-escape tooling writes below build/ relative to the
+# output directory; confirm before simplifying.
+stage1: $(doc_sources)
+	mkdir -p build/build
+	$(latex_pass)
+
+# Convert an SVG from img/ into a PDF under build/img/ using Inkscape.
+# The output directory is derived from the target ($(@D)) instead of being
+# hard-coded, so an SVG kept in a subdirectory of img/ (a stem containing a
+# slash) also gets its destination directory created.
+build/img/%.pdf: img/%.svg
+	mkdir -p $(@D)
+	$(INKSCAPE) -o $@ $<
+
+# Generate the bibliography by running BibTeX on the auxiliary file in build/.
+# Depends on stage1, so the first LaTeX pass always runs beforehand.
+$(BIBLIOGRAPHY): stage1
+	$(BIBTEX) build/$(MAIN).aux
+
+# Run the configured LaTeX linter on $(MAIN).tex.
+lint:
+	$(LINT) $(MAIN).tex
+
+# Remove everything inside build/ together with LaTeX by-products lying in
+# the project root. Note that the *.pdf glob matches every PDF in the root
+# directory, not only generated ones.
+clean:
+	rm -rf build/* \
+		*.aux *.bbl *.blg *.log *.out *.pdf *.toc
+
+# None of these targets produces a file of the same name. Declaring all of
+# them phony keeps a stray file called "all", "stage1" or "stage2" from
+# convincing make that there is nothing left to do.
+.PHONY: all stage1 stage2 clean lint
@@ -0,0 +1,6 @@
+\begin{abstract}
+    In recent years, the speed of DNA sequencing methods has taken a gigantic leap, allowing us to quickly sequence the DNA of complex organisms, such as humans.
+    However, this leads to increasing demand for disk storage, as the sizes of the databases containing such data can easily reach dozens of terabytes.
+    In his article ``Context binning, model clustering and adaptivity for data compression of genetic data'', Jarek Duda proposes promising compression techniques that should help build a compressor better than the current state of the art.
+    This thesis describes the compressor built to evaluate those techniques, tests it with real-world data and compares it to other genetic data compression tools.
+\end{abstract}
@@ -0,0 +1,48 @@
+\section{Benchmarking tools and environment}
+\label{sec:benchmarking-tools-and-environment}
+
+\centering
+\begin{tblr}{
+    colspec = {|c|c|c|l|},
+    row{odd} = {white},
+    row{even} = {white},
+    row{1} = {gray8},
+    row{2-Z} = {font=\footnotesize},
+    rowsep=0.1pt,
+}
+    \hline
+    Name & Version & Type & Command Line \\
+    \hline
+    \SetCell[r=2]{c}gzip & \SetCell[r=2]{c}1.10 & Compressor & \texttt{gzip -c \$INPUT > \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{gzip -c -d \$INPUT > \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}gzip\_9 & \SetCell[r=2]{c}1.10 & Compressor & \texttt{gzip -c -9 \$INPUT > \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{gzip -c -d \$INPUT > \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}pigz & \SetCell[r=2]{c}2.6 & Compressor & \texttt{pigz -c \$INPUT > \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{pigz -c -d \$INPUT > \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}bzip2 & \SetCell[r=2]{c}1.0.8 & Compressor & \texttt{bzip2 -c \$INPUT > \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{bzip2 -c -d \$INPUT > \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}bzip2\_9 & \SetCell[r=2]{c}1.0.8 & Compressor & \texttt{bzip2 -c -9 \$INPUT > \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{bzip2 -c -d \$INPUT > \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}xz & \SetCell[r=2]{c}5.2.5 & Compressor & \texttt{xz -c {-}{-}threads=16 \$INPUT > \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{xz -c -d {-}{-}threads=16 \$INPUT > \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}fqzcomp\_q2 & \SetCell[r=2]{c}4.6 & Compressor & \texttt{fqzcomp -q2 -s5+ \$INPUT \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{fqzcomp -d \$INPUT \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}fqzcomp\_q3 & \SetCell[r=2]{c}4.6 & Compressor & \texttt{fqzcomp -q3 -s5+ \$INPUT \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{fqzcomp -d \$INPUT \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}genozip & \SetCell[r=2]{c}13.0.20 & Compressor & \texttt{genozip \$INPUT -o \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{genounzip \$INPUT -o \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}spring & \SetCell[r=2]{c}{git rev.\\g5091e1b} & Compressor & \texttt{spring -c {-}{-}no-ids -t16 -i \$INPUT -o \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{spring -d -t16 -i \$INPUT -o \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}dsrc2 & \SetCell[r=2]{c}2.02 & Compressor & \texttt{dsrc c -m1 -t16 \$INPUT \$OUTPUT} \\ \hline
+    & & Decompressor & \texttt{dsrc d -t16 \$INPUT \$OUTPUT} \\ \hline
+    \SetCell[r=2]{c}idencomp & \SetCell[r=2]{c}{git rev.\\8cd39db} & Compressor & {\texttt{idencomp compress {-}{-}threads 16} \\ \texttt{\hspace{1em}{-}{-}no-identifiers \$INPUT -o \$OUTPUT}} \\ \hline
+    & & Decompressor & {\texttt{idencomp decompress {-}{-}threads 16} \\ \texttt{\hspace{1em}\$INPUT -o \$OUTPUT}} \\ \hline
+    \SetCell[r=2]{c}{idencomp\_fast} & \SetCell[r=2]{c}{git rev.\\8cd39db} & Compressor & {\texttt{idencomp compress {-}{-}threads 16 {-}{-}fast} \\ \texttt{\hspace{1em}{-}{-}no-identifiers \$INPUT -o \$OUTPUT}} \\ \hline
+    & & Decompressor & {\texttt{idencomp decompress {-}{-}threads 16} \\ \texttt{\hspace{1em}\$INPUT -o \$OUTPUT}} \\ \hline
+\end{tblr}
+
+\vspace{1em}
+\textbf{OS}: Ubuntu 22.04 LTS (x86-64);
+\textbf{CPU}: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz (16~vCores);
+\textbf{RAM}: 128GB;
+\textbf{Disk}: 400GB SSD
@@ -0,0 +1,31 @@
+\section{IDN data format}\label{sec:idn-data-format}
+
+The IDN file starts with \texttt{IDENCOMP} bytes encoded in ASCII, followed
+by the version number (currently always 1).
+Then, there is the metadata section, which contains the number of metadata
+items and the items themselves.
+Currently, only one type of metadata item is supported: model identifiers,
+which indicate which models a file uses, to allow the decompressor to load
+the corresponding models (or throw an error if they are unavailable).
+
+The file header is followed by an arbitrary number of data blocks.
+The data block header consists of the compressed data length and
+CRC32\cite{rfc3385} checksum of all the sequence data in a block.
+Then, it is followed by an arbitrary number of slices.
+
+Each slice is either a ``switch model'' slice that instructs the decompressor
+to use a specific model for the subsequent sequences or a ``sequence'' slice,
+which contains compressed sequence data.
+For completeness, the sequence identifiers (names) may also appear in a file
+as a separate slice, containing all of the sequence names separated by
+newlines, and compressed using either Deflate (for lower compression quality
+options) or Brotli (for higher).
+
+\begin{figure}[!ht]
+    \centering
+    \input{img/idn_file}
+    \caption{%
+        High-level diagram of the IDN file format
+    }
+    \label{fig:idn-file-format}
+\end{figure}
@@ -0,0 +1,84 @@
+\section{Model data format}\label{sec:model-data-format}
+
+Because the model data does not need to be very concise, MessagePack has been
+chosen as the data storage format for the models.
+The file contains the model type (acids or quality scores), context specifier
+type (described in \Cref{subsec:contexts}), list of contexts (symbol
+probabilities), and map of context specifiers to context indices.
+More precisely, the data stored in such a MessagePack file corresponds to the
+following JSON document:
+
+\colorlet{punct}{red!60!black}
+\definecolor{background}{HTML}{EEEEEE}
+\definecolor{delim}{RGB}{20,105,176}
+\colorlet{numb}{magenta!60!black}
+
+\lstdefinelanguage{json}{
+    basicstyle=\normalfont\ttfamily,
+    numberstyle=\scriptsize,
+    commentstyle=\color{gray},
+    stepnumber=1,
+    numbersep=8pt,
+    showstringspaces=false,
+    breaklines=true,
+    frame=single,
+    comment=[l]{//},
+    backgroundcolor=\color{background},
+    literate=
+    *{0}{{{\color{numb}0}}}{1}
+        {1}{{{\color{numb}1}}}{1}
+        {2}{{{\color{numb}2}}}{1}
+        {3}{{{\color{numb}3}}}{1}
+        {4}{{{\color{numb}4}}}{1}
+        {5}{{{\color{numb}5}}}{1}
+        {6}{{{\color{numb}6}}}{1}
+        {7}{{{\color{numb}7}}}{1}
+        {8}{{{\color{numb}8}}}{1}
+        {9}{{{\color{numb}9}}}{1}
+        {:}{{{\color{punct}{:}}}}{1}
+        {,}{{{\color{punct}{,}}}}{1}
+        {\{}{{{\color{delim}{\{}}}}{1}
+        {\}}{{{\color{delim}{\}}}}}{1}
+        {[}{{{\color{delim}{[}}}}{1}
+        {]}{{{\color{delim}{]}}}}{1},
+}
+
+\begin{lstlisting}[language=json,firstnumber=1,label={lst:model-json}]
+[
+    // Model identifier (as a byte array)
+    [31, 77, 69, 112, ..., 125],
+    // Model type
+    "Acids",
+    // Context specifier type
+    "generic_ao4_qo1_pb2",
+    [
+        [
+            // Context specifiers (represented as integers)
+            [420, 2137, 2403],
+            // Context
+            [
+                // Context probability
+                // The sum of all context probabilities in a model should be 1
+                0.1234,
+                // Symbol probabilities
+                // (in this case: N, A, C, T, G, respectively)
+                // The sum of all symbol probabilities in a context should be 1
+                [0.0, 0.2, 0.3, 0.4, 0.1]
+            ]
+        ],
+        // ... more contexts
+    ]
+]
+\end{lstlisting}
+
+The \emph{model identifier} is an SHA-3\cite{1421} 256-bit checksum of the
+entire model contents.
+When deserializing the model from a file, the identifier indicates if the
+model has been read correctly.
+When reading a sequence file, the identifier list tells the decompressor
+which models to use.
+
+The identifier generation process starts with the model being serialized by
+storing the model type, context specifier type, model map sorted by keys in
+ascending order, and then the contexts themselves.
+Then, the hash of such a blob is calculated.
@@ -0,0 +1,43 @@
+
+\begin{landscape}
+    \section{Pre-defined models\hfill}\label{sec:pre-defined-models}
+
+    \centering
+    \begin{tblr}{
+        colspec = {|c|c|c|r|r|l|l|l|},
+        row{odd} = {white},
+        row{even} = {gray9},
+        row{1} = {gray8},
+        rowsep=0.3pt,
+    }
+        \hline
+        \SetCell[r=2]{c} Orig. Sample & \SetCell[r=2]{c} Type & \SetCell[r=2]{c} Context Specifier Type & \SetCell[c=2]{c} No. of contexts & & \SetCell[c=3]{c} Rate [bpv] \\
+        \hline
+        & & & \SetCell{c, gray8}Original & \SetCell{c, gray8}Binned & \SetCell{c, gray8}Original & \SetCell{c, gray8}Binned & \SetCell{c, gray8}Dummy \\
+        \hline
+        ERR174310 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 22440 & 1.8343433 & 1.8680468 & 1.9762784 \\ \hline
+        ERR174310 & Q. Scores & Generic ($N$=0, $M$=2, $P$=6) & 74854 & 18055 & 2.3280263 & 2.6904142 & 3.7842848 \\ \hline
+        SRR8861483 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 15821 & 1.8362569 & 1.8846421 & 1.9789689 \\ \hline
+        SRR8861483 & Q. Scores & Generic ($N$=2, $M$=1, $P$=6) & 4156 & 2154 & 0.5422133 & 0.55309355 & 0.7640397 \\ \hline
+        SRR2962693 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 16620 & 1.9065356 & 1.936624 & 1.999837 \\ \hline
+        SRR2962693 & Q. Scores & Generic ($N$=0, $M$=2, $P$=6) & 23633 & 1688 & 1.4530900 & 1.5139407 & 1.9596039 \\ \hline
+        SRR19549058 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 27329 & 1.7648101 & 1.797492 & 1.9204142 \\ \hline
+        SRR19549058 & Q. Scores & Generic ($N$=0, $M$=2, $P$=6) & 23221 & 363 & 1.0534605 & 1.1090238 & 1.535664 \\ \hline
+        m64187e & Acids & Generic ($N$=8, $M$=0, $P$=0) & 87093 & 53266 & 1.3062766 & 1.4495015 & 1.9995453 \\ \hline
+        m64187e & Q. Scores & Light ($N$=0, $M$=4, $P$=0, $Q_{\text{max}}$=16) & 65536 & 407 & 1.0913382 & 1.119422 & 1.4806273 \\ \hline
+        SRR5373739 & Acids & Generic ($N$=4, $M$=1, $P$=2) & 11213 & 8 & 1.9178923 & 1.9503715 & 1.9802411 \\ \hline
+        SRR5373739 & Q. Scores & Light ($N$=0, $M$=4, $P$=3, $Q_{\text{max}}$=16) & 14810 & 6 & 0.7870272 & 0.83380574 & 0.9821181 \\ \hline
+        SRR18908372 & Acids & Light ($N$=4, $M$=3, $P$=2, $Q_{\text{max}}$=8) & 27648 & 13133 & 1.9188511 & 1.928996 & 1.9859153 \\ \hline
+        SRR18908372 & Q. Scores & Light ($N$=0, $M$=4, $P$=3, $Q_{\text{max}}$=16) & 1725 & 103 & 0.4196878 & 0.42710626 & 0.4811454 \\ \hline
+        SRR20210997 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 31130 & 1.9040625 & 1.9258752 & 2.001345 \\ \hline
+        SRR20210997 & Q. Scores & Generic ($N$=3, $M$=3, $P$=0) & 2355 & 224 & 0.45801452 & 0.45916292 & 0.4842346 \\ \hline
+        SRR19609907 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 71799 & 13958 & 1.8250997 & 1.8819417 & 1.9684256 \\ \hline
+        SRR19609907 & Q. Scores & Light ($N$=0, $M$=4, $P$=3, $Q_{\text{max}}$=16) & 868 & 333 & 0.47679535 & 0.47716048 & 0.5075786 \\ \hline
+        SRR16141966 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 59081 & 38592 & 0.8189294 & 0.8645096 & 1.954075 \\ \hline
+        SRR16141966 & Q. Scores & Light ($N$=0, $M$=3, $P$=0, $Q_{\text{max}}$=32) & 27525 & 463 & 0.7264000 & 0.8849423 & 1.1798589 \\ \hline
+        ERR5462922 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 81521 & 38902 & 0.5528092 & 0.7684268 & 1.9750485 \\ \hline
+        ERR5462922 & Q. Scores & Light ($N$=2, $M$=4, $P$=2, $Q_{\text{max}}$=8) & 5041 & 9 & 0.15876424 & 0.1647008 & 0.17681403 \\ \hline
+        SRR18718246 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 75556 & 20690 & 0.33925304 & 0.38645545 & 1.9909452 \\ \hline
+        SRR18718246 & Q. Scores & Generic ($N$=4, $M$=1, $P$=2) & 24377 & 9566 & 1.9584363 & 2.020547 & 2.7235699 \\ \hline
+    \end{tblr}
+\end{landscape}
@@ -0,0 +1,27 @@
+\section{Test data\hfill}\label{sec:test-data}
+
+\centering
+\begin{tblr}{
+    colspec = {|l|l|l|},
+    row{odd} = {gray9},
+    row{even} = {white},
+    row{1} = {gray8},
+}
+    \hline
+    Dataset & Instrument Model & Species \\
+    \hline
+    ERR174310 & Illumina HiSeq 2000 & Homo sapiens (human) \\ \hline
+    SRR8861483 & Illumina NovaSeq 6000 & Homo sapiens (human) \\ \hline
+    SRR2962693 & Illumina HiSeq 2500 & Homo sapiens (human) \\ \hline
+    SRR19549058 & Illumina HiSeq 2500 & Burkholderia stabilis (bacteria) \\ \hline
+    m64187e & Sequel IIe System & SARS-CoV-2 (virus) \\ \hline
+    SRR5373739 & Illumina HiSeq 2500 & Felis catus (cat) \\ \hline
+    SRR18908372 & Illumina NovaSeq 6000 & Felis catus (Siberian cat) \\ \hline
+    SRR20210997 & Illumina HiSeq 2500 & Salmonella (bacteria) \\ \hline
+    SRR19609907 & Illumina HiSeq 2500 & Pyrus spp (pear tree) \\ \hline
+    SRR16141966 & Illumina HiSeq 2500 & E. coli (bacteria) \\ \hline
+    ERR5462922 & Illumina iSeq 100 & EBOV (Ebola virus) \\ \hline
+    SRR18718246 & Illumina MiSeq & HIV-1 (virus) \\ \hline
+    SRR6123542 & Illumina HiSeq 2500 & E. coli (bacteria) \\ \hline
+    SRR2747516 & Illumina HiSeq 2500 & Canis lupus familiaris (Shiba Inu dog) \\ \hline
+\end{tblr}