Skip to content

Commit b6b4a3f

Browse files
committed
Initial commit
0 parents  commit b6b4a3f

35 files changed

+5724
-0
lines changed

.gitignore

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
build/
2+
*.log

Makefile

+55
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
# Build script for the thesis PDF. All LaTeX output is written to build/.
#
# Pipeline (run by plain `make` / `make all`):
#   stage1  - one LaTeX pass that writes build/$(MAIN).aux
#   bibtex  - turns that .aux file into the bibliography (.bbl)
#   stage2  - two more LaTeX passes that pull the bibliography in
#
# Other targets: `make lint` (run the LaTeX linter), `make clean`.

# Delete a target whose recipe failed, so that a half-written file (e.g. a
# truncated PDF from a crashed Inkscape) is never mistaken for an up-to-date one.
.DELETE_ON_ERROR:

# Base name of the top-level document ($(MAIN).tex -> build/$(MAIN).pdf).
MAIN := main

# Figures the document depends on. Only the build/img/*.pdf entries are
# generated by this Makefile (from img/*.svg); the .tex figures are sources.
IMGS := \
    build/img/herb_uj.pdf \
    img/bench-dedicated-model-compress.tex \
    img/bench-known-sequencing-compress.tex \
    img/bench-small-data-compress.tex \
    img/bench-best-performance-compress.tex \
    img/bench-dedicated-model-decompress.tex \
    img/bench-known-sequencing-decompress.tex \
    img/bench-small-data-decompress.tex \
    img/bench-best-performance-decompress.tex \
    img/binning.tex \
    img/idn_file.tex

# Chapter and appendix sources.
CHAPTERS := \
    conclusion.tex \
    evaluation.tex \
    implementation.tex \
    introduction.tex \
    problem.tex \
    apx-benchmark-env.tex \
    apx-idn-format.tex \
    apx-model-format.tex \
    apx-models.tex \
    apx-test-data.tex

# Remaining inputs that should trigger a rebuild when they change.
REST := bibliography.bib abstract.tex titlepage.tex

# File written by `bibtex build/$(MAIN).aux`. BibTeX names its output after the
# .aux file, so the target must be $(MAIN).bbl for the rule below to describe
# the file its recipe really creates.
BIBLIOGRAPHY := build/$(MAIN).bbl

# Tools. CC is the LaTeX compiler here (name kept so existing `make CC=...`
# overrides keep working).
CC := pdflatex -shell-escape
INKSCAPE := inkscape
LINT := lacheck
BIBTEX := bibtex

# Default goal: build the complete document.
all: stage2

# Final passes. LaTeX is run twice, the usual requirement for citations and
# cross-references to settle once the bibliography exists.
# $< is $(MAIN).tex, the first prerequisite.
stage2: $(MAIN).tex $(BIBLIOGRAPHY) $(REST) $(IMGS) $(CHAPTERS)
	$(CC) -output-directory build $<
	$(CC) -output-directory build $<

# First pass: produces build/$(MAIN).aux for BibTeX.
# `mkdir -p build/build` also creates build/ itself.
# NOTE(review): the nested build/build directory is presumably needed for
# auxiliary output of inputs referenced under build/ - confirm before removing.
stage1: $(MAIN).tex $(REST) $(IMGS) $(CHAPTERS)
	mkdir -p build/build
	$(CC) -output-directory build $<

# Convert an SVG figure to PDF. $(@D) is the directory part of the target
# (build/img for the figures listed in IMGS).
build/img/%.pdf: img/%.svg
	mkdir -p $(@D)
	$(INKSCAPE) -o $@ $<

# Generate the bibliography from the .aux file written by stage1. stage1 is
# phony, so this recipe runs on every build.
$(BIBLIOGRAPHY): stage1
	$(BIBTEX) build/$(MAIN).aux

# Run the LaTeX linter on the top-level document.
lint:
	$(LINT) $(MAIN).tex

# Remove build products. Note that this also removes LaTeX by-products and
# any *.pdf file in the source directory itself.
clean:
	rm -rf build/* *.aux *.log *.bbl *.out *.pdf *.blg *.toc

# None of these targets creates a file with its own name.
.PHONY: all stage1 stage2 lint clean

abstract.tex

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
\begin{abstract}
2+
In recent years, the speed of DNA sequencing methods has taken a gigantic leap, allowing us to quickly sequence the DNA of complex organisms, such as humans.
3+
However, this leads to increasing demand for disk storage, as the sizes of the databases containing such data can easily reach dozens of terabytes.
4+
In his article ``Context binning, model clustering and adaptivity for data compression of genetic data'', Jarek Duda proposes promising compression techniques that should help build a compressor better than the current state of the art.
5+
This thesis describes the compressor built to evaluate those techniques, tests it with real-world data and compares it to other genetic data compression tools.
6+
\end{abstract}

apx-benchmark-env.tex

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
\section{Benchmarking tools and environment}
2+
\label{sec:benchmarking-tools-and-environment}
3+
4+
\centering
5+
\begin{tblr}{
6+
colspec = {|c|c|c|l|},
7+
row{odd} = {white},
8+
row{even} = {white},
9+
row{1} = {gray8},
10+
row{2-Z} = {font=\footnotesize},
11+
rowsep=0.1pt,
12+
}
13+
\hline
14+
Name & Version & Type & Command Line \\
15+
\hline
16+
\SetCell[r=2]{c}gzip & \SetCell[r=2]{c}1.10 & Compressor & \texttt{gzip -c \$INPUT > \$OUTPUT} \\ \hline
17+
& & Decompressor & \texttt{gzip -c -d \$INPUT > \$OUTPUT} \\ \hline
18+
\SetCell[r=2]{c}gzip\_9 & \SetCell[r=2]{c}1.10 & Compressor & \texttt{gzip -c -9 \$INPUT > \$OUTPUT} \\ \hline
19+
& & Decompressor & \texttt{gzip -c -d \$INPUT > \$OUTPUT} \\ \hline
20+
\SetCell[r=2]{c}pigz & \SetCell[r=2]{c}2.6 & Compressor & \texttt{pigz -c \$INPUT > \$OUTPUT} \\ \hline
21+
& & Decompressor & \texttt{pigz -c -d \$INPUT > \$OUTPUT} \\ \hline
22+
\SetCell[r=2]{c}bzip2 & \SetCell[r=2]{c}1.0.8 & Compressor & \texttt{bzip2 -c \$INPUT > \$OUTPUT} \\ \hline
23+
& & Decompressor & \texttt{bzip2 -c -d \$INPUT > \$OUTPUT} \\ \hline
24+
\SetCell[r=2]{c}bzip2\_9 & \SetCell[r=2]{c}1.0.8 & Compressor & \texttt{bzip2 -c -9 \$INPUT > \$OUTPUT} \\ \hline
25+
& & Decompressor & \texttt{bzip2 -c -d \$INPUT > \$OUTPUT} \\ \hline
26+
\SetCell[r=2]{c}xz & \SetCell[r=2]{c}5.2.5 & Compressor & \texttt{xz -c {-}{-}threads=16 \$INPUT > \$OUTPUT} \\ \hline
27+
& & Decompressor & \texttt{xz -c -d {-}{-}threads=16 \$INPUT > \$OUTPUT} \\ \hline
28+
\SetCell[r=2]{c}fqzcomp\_q2 & \SetCell[r=2]{c}4.6 & Compressor & \texttt{fqzcomp -q2 -s5+ \$INPUT \$OUTPUT} \\ \hline
29+
& & Decompressor & \texttt{fqzcomp -d \$INPUT \$OUTPUT} \\ \hline
30+
\SetCell[r=2]{c}fqzcomp\_q3 & \SetCell[r=2]{c}4.6 & Compressor & \texttt{fqzcomp -q3 -s5+ \$INPUT \$OUTPUT} \\ \hline
31+
& & Decompressor & \texttt{fqzcomp -d \$INPUT \$OUTPUT} \\ \hline
32+
\SetCell[r=2]{c}genozip & \SetCell[r=2]{c}13.0.20 & Compressor & \texttt{genozip \$INPUT -o \$OUTPUT} \\ \hline
33+
& & Decompressor & \texttt{genounzip \$INPUT -o \$OUTPUT} \\ \hline
34+
\SetCell[r=2]{c}spring & \SetCell[r=2]{c}{git rev.\\g5091e1b} & Compressor & \texttt{spring -c {-}{-}no-ids -t16 -i \$INPUT -o \$OUTPUT} \\ \hline
35+
& & Decompressor & \texttt{spring -d -t16 -i \$INPUT -o \$OUTPUT} \\ \hline
36+
\SetCell[r=2]{c}dsrc2 & \SetCell[r=2]{c}2.02 & Compressor & \texttt{dsrc c -m1 -t16 \$INPUT \$OUTPUT} \\ \hline
37+
& & Decompressor & \texttt{dsrc d -t16 \$INPUT \$OUTPUT} \\ \hline
38+
\SetCell[r=2]{c}idencomp & \SetCell[r=2]{c}{git rev.\\8cd39db} & Compressor & {\texttt{idencomp compress {-}{-}threads 16} \\ \texttt{\hspace{1em}{-}{-}no-identifiers \$INPUT -o \$OUTPUT}} \\ \hline
39+
& & Decompressor & {\texttt{idencomp decompress {-}{-}threads 16} \\ \texttt{\hspace{1em}\$INPUT -o \$OUTPUT}} \\ \hline
40+
\SetCell[r=2]{c}{idencomp\_fast} & \SetCell[r=2]{c}{git rev.\\8cd39db} & Compressor & {\texttt{idencomp compress {-}{-}threads 16 {-}{-}fast} \\ \texttt{\hspace{1em}{-}{-}no-identifiers \$INPUT -o \$OUTPUT}} \\ \hline
41+
& & Decompressor & {\texttt{idencomp decompress {-}{-}threads 16} \\ \texttt{\hspace{1em}\$INPUT -o \$OUTPUT}} \\ \hline
42+
\end{tblr}
43+
44+
\vspace{1em}
45+
\textbf{OS}: Ubuntu 22.04 LTS (x86-64);
46+
\textbf{CPU}: Intel(R) Xeon(R) Gold 6248 CPU @ 2.50GHz (16~vCores);
47+
\textbf{RAM}: 128GB;
48+
\textbf{Disk}: 400GB SSD

apx-idn-format.tex

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
\section{IDN data format}\label{sec:idn-data-format}
2+
3+
The IDN file starts with \texttt{IDENCOMP} bytes encoded in ASCII, followed
4+
by the version number (currently always 1).
5+
Then, there is the metadata section, which contains the number of metadata
6+
items and the items themselves.
7+
Currently, only one type of metadata item is supported: model identifiers,
8+
which indicate which models a file uses, to allow the decompressor to load
9+
the corresponding models (or throw an error if they are unavailable).
10+
11+
The file header is followed by an arbitrary number of data blocks.
12+
The data block header consists of the compressed data length and
13+
a CRC32~\cite{rfc3385} checksum of all the sequence data in a block.
14+
Then, it is followed by an arbitrary number of slices.
15+
16+
Each slice is either a ``switch model'' slice that instructs the decompressor
17+
to use a specific model for the subsequent sequences or a ``sequence'' slice,
18+
which contains compressed sequence data.
19+
For completeness, the sequence identifiers (names) may also appear in a file
20+
as a separate slice, containing all of the sequence names separated by
21+
newlines, and compressed using either Deflate (for lower compression quality
22+
options) or Brotli (for higher).
23+
24+
\begin{figure}[!ht]
25+
\centering
26+
\input{img/idn_file}
27+
\caption{%
28+
High-level diagram of the IDN file format
29+
}
30+
\label{fig:idn-file-format}
31+
\end{figure}

apx-model-format.tex

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
\section{Model data format}\label{sec:model-data-format}
2+
3+
Because the model data does not need to be very concise, MessagePack has been
4+
chosen as the data storage format for the models.
5+
The file contains the model type (acids or quality scores), context specifier
6+
type (described in \Cref{subsec:contexts}), list of contexts (symbol
7+
probabilities), and map of context specifiers to context indices.
8+
More precisely, the data stored in such a MessagePack file corresponds to the following
9+
JSON file:
10+
11+
\colorlet{punct}{red!60!black}
12+
\definecolor{background}{HTML}{EEEEEE}
13+
\definecolor{delim}{RGB}{20,105,176}
14+
\colorlet{numb}{magenta!60!black}
15+
16+
\lstdefinelanguage{json}{
17+
basicstyle=\normalfont\ttfamily,
18+
numberstyle=\scriptsize,
19+
commentstyle=\color{gray},
20+
stepnumber=1,
21+
numbersep=8pt,
22+
showstringspaces=false,
23+
breaklines=true,
24+
frame=single,
25+
comment=[l]{//},
26+
backgroundcolor=\color{background},
27+
literate=
28+
*{0}{{{\color{numb}0}}}{1}
29+
{1}{{{\color{numb}1}}}{1}
30+
{2}{{{\color{numb}2}}}{1}
31+
{3}{{{\color{numb}3}}}{1}
32+
{4}{{{\color{numb}4}}}{1}
33+
{5}{{{\color{numb}5}}}{1}
34+
{6}{{{\color{numb}6}}}{1}
35+
{7}{{{\color{numb}7}}}{1}
36+
{8}{{{\color{numb}8}}}{1}
37+
{9}{{{\color{numb}9}}}{1}
38+
{:}{{{\color{punct}{:}}}}{1}
39+
{,}{{{\color{punct}{,}}}}{1}
40+
{\{}{{{\color{delim}{\{}}}}{1}
41+
{\}}{{{\color{delim}{\}}}}}{1}
42+
{[}{{{\color{delim}{[}}}}{1}
43+
{]}{{{\color{delim}{]}}}}{1},
44+
}
45+
46+
\begin{lstlisting}[language=json,firstnumber=1,label={lst:model-json}]
47+
[
48+
// Model identifier (as a byte array)
49+
[31, 77, 69, 112, ..., 125],
50+
// Model type
51+
"Acids",
52+
// Context specifier type
53+
"generic_ao4_qo1_pb2",
54+
[
55+
[
56+
// Context specifiers (represented as integers)
57+
[420, 2137, 2403],
58+
// Context
59+
[
60+
// Context probability
61+
// The sum of all context probabilities in a model should be 1
62+
0.1234,
63+
// Symbol probabilities
64+
// (in this case: N, A, C, T, G, respectively)
65+
// The sum of all symbol probabilities in a context should be 1
66+
[0.0, 0.2, 0.3, 0.4, 0.1]
67+
]
68+
],
69+
// ... more contexts
70+
]
71+
]
72+
\end{lstlisting}
73+
74+
The \emph{model identifier} is an SHA-3\cite{1421} 256-bit checksum of the
75+
entire model contents.
76+
When deserializing the model from a file, the identifier indicates if the
77+
model has been read correctly.
78+
When reading a sequence file, the identifier list tells the decompressor
79+
which models to use.
80+
81+
The identifier generation process starts by serializing the model, i.e., storing the model
82+
type, context specifier type, model map sorted by keys ascending, and then
83+
the contexts themselves.
84+
Then, the hash of such a blob is calculated.

apx-models.tex

+43
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
2+
\begin{landscape}
3+
\section{Pre-defined models\hfill}\label{sec:pre-defined-models}
4+
5+
\centering
6+
\begin{tblr}{
7+
colspec = {|c|c|c|r|r|l|l|l|},
8+
row{odd} = {white},
9+
row{even} = {gray9},
10+
row{1} = {gray8},
11+
rowsep=0.3pt,
12+
}
13+
\hline
14+
\SetCell[r=2]{c} Orig. Sample & \SetCell[r=2]{c} Type & \SetCell[r=2]{c} Context Specifier Type & \SetCell[c=2]{c} No. of contexts & & \SetCell[c=3]{c} Rate [bpv] \\
15+
\hline
16+
& & & \SetCell{c, gray8}Original & \SetCell{c, gray8}Binned & \SetCell{c, gray8}Original & \SetCell{c, gray8}Binned & \SetCell{c, gray8}Dummy \\
17+
\hline
18+
ERR174310 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 22440 & 1.8343433 & 1.8680468 & 1.9762784 \\ \hline
19+
ERR174310 & Q. Scores & Generic ($N$=0, $M$=2, $P$=6) & 74854 & 18055 & 2.3280263 & 2.6904142 & 3.7842848 \\ \hline
20+
SRR8861483 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 15821 & 1.8362569 & 1.8846421 & 1.9789689 \\ \hline
21+
SRR8861483 & Q. Scores & Generic ($N$=2, $M$=1, $P$=6) & 4156 & 2154 & 0.5422133 & 0.55309355 & 0.7640397 \\ \hline
22+
SRR2962693 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 16620 & 1.9065356 & 1.936624 & 1.999837 \\ \hline
23+
SRR2962693 & Q. Scores & Generic ($N$=0, $M$=2, $P$=6) & 23633 & 1688 & 1.4530900 & 1.5139407 & 1.9596039 \\ \hline
24+
SRR19549058 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 27329 & 1.7648101 & 1.797492 & 1.9204142 \\ \hline
25+
SRR19549058 & Q. Scores & Generic ($N$=0, $M$=2, $P$=6) & 23221 & 363 & 1.0534605 & 1.1090238 & 1.535664 \\ \hline
26+
m64187e & Acids & Generic ($N$=8, $M$=0, $P$=0) & 87093 & 53266 & 1.3062766 & 1.4495015 & 1.9995453 \\ \hline
27+
m64187e & Q. Scores & Light ($N$=0, $M$=4, $P$=0, $Q_{\text{max}}$=16) & 65536 & 407 & 1.0913382 & 1.119422 & 1.4806273 \\ \hline
28+
SRR5373739 & Acids & Generic ($N$=4, $M$=1, $P$=2) & 11213 & 8 & 1.9178923 & 1.9503715 & 1.9802411 \\ \hline
29+
SRR5373739 & Q. Scores & Light ($N$=0, $M$=4, $P$=3, $Q_{\text{max}}$=16) & 14810 & 6 & 0.7870272 & 0.83380574 & 0.9821181 \\ \hline
30+
SRR18908372 & Acids & Light ($N$=4, $M$=3, $P$=2, $Q_{\text{max}}$=8) & 27648 & 13133 & 1.9188511 & 1.928996 & 1.9859153 \\ \hline
31+
SRR18908372 & Q. Scores & Light ($N$=0, $M$=4, $P$=3, $Q_{\text{max}}$=16) & 1725 & 103 & 0.4196878 & 0.42710626 & 0.4811454 \\ \hline
32+
SRR20210997 & Acids & Light ($N$=8, $M$=0, $P$=0, $Q_{\text{max}}$=1) & 65536 & 31130 & 1.9040625 & 1.9258752 & 2.001345 \\ \hline
33+
SRR20210997 & Q. Scores & Generic ($N$=3, $M$=3, $P$=0) & 2355 & 224 & 0.45801452 & 0.45916292 & 0.4842346 \\ \hline
34+
SRR19609907 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 71799 & 13958 & 1.8250997 & 1.8819417 & 1.9684256 \\ \hline
35+
SRR19609907 & Q. Scores & Light ($N$=0, $M$=4, $P$=3, $Q_{\text{max}}$=16) & 868 & 333 & 0.47679535 & 0.47716048 & 0.5075786 \\ \hline
36+
SRR16141966 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 59081 & 38592 & 0.8189294 & 0.8645096 & 1.954075 \\ \hline
37+
SRR16141966 & Q. Scores & Light ($N$=0, $M$=3, $P$=0, $Q_{\text{max}}$=32) & 27525 & 463 & 0.7264000 & 0.8849423 & 1.1798589 \\ \hline
38+
ERR5462922 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 81521 & 38902 & 0.5528092 & 0.7684268 & 1.9750485 \\ \hline
39+
ERR5462922 & Q. Scores & Light ($N$=2, $M$=4, $P$=2, $Q_{\text{max}}$=8) & 5041 & 9 & 0.15876424 & 0.1647008 & 0.17681403 \\ \hline
40+
SRR18718246 & Acids & Generic ($N$=8, $M$=0, $P$=0) & 75556 & 20690 & 0.33925304 & 0.38645545 & 1.9909452 \\ \hline
41+
SRR18718246 & Q. Scores & Generic ($N$=4, $M$=1, $P$=2) & 24377 & 9566 & 1.9584363 & 2.020547 & 2.7235699 \\ \hline
42+
\end{tblr}
43+
\end{landscape}

apx-test-data.tex

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
\section{Test data\hfill}\label{sec:test-data}
2+
3+
\centering
4+
\begin{tblr}{
5+
colspec = {|l|l|l|},
6+
row{odd} = {gray9},
7+
row{even} = {white},
8+
row{1} = {gray8},
9+
}
10+
\hline
11+
Dataset & Instrument Model & Species \\
12+
\hline
13+
ERR174310 & Illumina HiSeq 2000 & Homo sapiens (human) \\ \hline
14+
SRR8861483 & Illumina NovaSeq 6000 & Homo sapiens (human) \\ \hline
15+
SRR2962693 & Illumina HiSeq 2500 & Homo sapiens (human) \\ \hline
16+
SRR19549058 & Illumina HiSeq 2500 & Burkholderia stabilis (bacteria) \\ \hline
17+
m64187e & Sequel IIe System & SARS-CoV-2 (virus) \\ \hline
18+
SRR5373739 & Illumina HiSeq 2500 & Felis catus (cat) \\ \hline
19+
SRR18908372 & Illumina NovaSeq 6000 & Felis catus (Siberian cat) \\ \hline
20+
SRR20210997 & Illumina HiSeq 2500 & Salmonella (bacteria) \\ \hline
21+
SRR19609907 & Illumina HiSeq 2500 & Pyrus spp (pear tree) \\ \hline
22+
SRR16141966 & Illumina HiSeq 2500 & E. coli (bacteria) \\ \hline
23+
ERR5462922 & Illumina iSeq 100 & EBOV (Ebola virus) \\ \hline
24+
SRR18718246 & Illumina MiSeq & HIV-1 (virus) \\ \hline
25+
SRR6123542 & Illumina HiSeq 2500 & E. coli (bacteria) \\ \hline
26+
SRR2747516 & Illumina HiSeq 2500 & Canis lupus familiaris (Shiba Inu dog) \\ \hline
27+
\end{tblr}

0 commit comments

Comments
 (0)