-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbzip2_compressor.sf
151 lines (104 loc) · 3.56 KB
/
bzip2_compressor.sf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#!/usr/bin/ruby
# Author: Trizen
# Date: 20 August 2024
# Translated: 04 October 2024
# https://github.com/trizen
# A very basic Bzip2 compressor.
# References:
# BZIP2: Format Specification, by Joe Tsai
# https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf
require('Compression::Util')
define CompUtil = %S<Compression::Util>
define CHUNK_SIZE = (1 << 17)
STDIN.binmode(:raw)
STDOUT.binmode(:raw)
func encode_mtf_alphabet(alphabet) {
var table = Set(alphabet...)
var populated = 0
var marked = []
for i in (0..255 `by` 16) {
var enc = 0
for j in (^16) {
if (table.has(i+j)) {
enc |= (1 << j)
}
}
populated <<= 1
if (enc > 0) {
populated |= 1
marked.push(enc)
}
}
STDERR.printf("Populated: %016b\n", populated)
STDERR.say("Marked: #{marked}")
return (populated, marked)
}
func encode_code_lengths(dict) {
var lengths = []
for symbol in (0 .. (dict.keys.map{.to_i}.max \\ 0)) {
if (dict.has(symbol)) {
lengths << dict{symbol}.len
}
else {
die "Incomplete Huffman tree not supported"
lengths << 0
}
}
STDERR.say("Code lengths: #{lengths}")
var deltas = CompUtil.deltas(lengths)
STDERR.say("Code lengths deltas: #{deltas}")
var bitstring = (CompUtil.int2bits(deltas.shift, 5).to_s + '0')
for d in (deltas) {
bitstring += (((d > 0) ? ('10' * d) : ('11' * abs(d))) + '0')
}
STDERR.say("Deltas bitstring: #{bitstring}")
return bitstring
}
var s = "Hello, World!\n"
var fh = (STDIN.is_on_tty ? s.open_r(:raw) : STDIN)
print "BZh"
var level = 9
if (!level.is_int || !level.is_between(1, 9)) {
die "Invalid level value: #{level}"
}
print level
var block_header_bitstring = unpack("B48", "1AY&SY")
var block_footer_bitstring = unpack("B48", "\27rE8P\x90")
var bitstring = FileHandle.new_buf(:raw)
var stream_crc32 = 0
while (!fh.eof) {
fh.read(\var chunk, CHUNK_SIZE)
bitstring << block_header_bitstring
var crc32 = CompUtil.crc32(pack('B*', unpack('b*', chunk)))
STDERR.say("CRC32: #{crc32}")
crc32 = Num(CompUtil.int2bits_lsb(crc32, 32), 2)
STDERR.say("Bzip2-CRC32: #{crc32}")
stream_crc32 = ((crc32 ^ ((stream_crc32 << 1) | ((stream_crc32 >> 31) & 0x1))) & 0xffffffff)
bitstring << CompUtil.int2bits(crc32, 32)
bitstring << '0' # not randomized
var rle4 = CompUtil.rle4_encode(chunk)
var (bwt, bwt_idx) = CompUtil.bwt_encode(CompUtil.symbols2string(rle4))
bitstring << CompUtil.int2bits(bwt_idx, 24)
var (mtf, alphabet) = CompUtil.mtf_encode(bwt)
STDERR.say("MTF Alphabet: #{alphabet}")
var (populated, marked) = encode_mtf_alphabet(alphabet)
bitstring << CompUtil.int2bits(populated, 16)
marked.each {|v|
bitstring << CompUtil.int2bits_lsb(v, 16)
}
var zrle = CompUtil.zrle_encode(mtf.flip).flip
var eob = (alphabet.len + 1) # end-of-block symbol
STDERR.say("EOB symbol: #{eob}")
zrle << eob
var dict = CompUtil.huffman_from_symbols([zrle..., (^eob)...])
var num_sels = ceil(zrle.len / 50)
STDERR.say("Number of selectors: #{num_sels}")
bitstring << CompUtil.int2bits(2, 3)
bitstring << CompUtil.int2bits(num_sels, 15)
bitstring << ('0' * num_sels)
bitstring << (encode_code_lengths(dict) * 2)
bitstring << join('', dict{zrle...})
}
bitstring << block_footer_bitstring
bitstring << CompUtil.int2bits(stream_crc32, 32)
print pack("B*", bitstring.parent)