Skip to content

Commit 120ba32

Browse files
committed
Add more complete LZ4 testing
1 parent c371c7d commit 120ba32

24 files changed

+578
-224
lines changed

ofrak_core/src/ofrak/core/lz4.py

Lines changed: 219 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,29 @@
1+
"""
2+
Lz4 Components.
3+
4+
Lz4Unpacker currently supports unpacking modern LZ4 format (Lz4ModernData) and skippable data
5+
(Lz4SkippableData). Lz4LegacyUnpacker supports unpacking the legacy format (see Lz4LegacyData).
6+
7+
Lz4Packer supports repacking the modern LZ4 format (Lz4ModernData), matching block/checksum
8+
information extracted during unpacking. Compression level can be specified via config.
9+
10+
Lz4LegacyPacker supports repacking legacy LZ4 format (Lz4LegacyData) with compression level
11+
support (default/fast/high modes). Compression level can be specified via config.
12+
"""
113
import logging
214
from dataclasses import dataclass
315

4-
import lz4.frame # type: ignore
516
import lz4.block # type: ignore
6-
7-
from ofrak.component.identifier import Identifier
17+
import lz4.frame # type: ignore
818
from ofrak.component.packer import Packer
919
from ofrak.component.unpacker import Unpacker
1020
from ofrak.core.binary import GenericBinary
11-
from ofrak.core.magic import MagicDescriptionPattern, MagicMimePattern
21+
from ofrak.core.magic import RawMagicPattern
22+
from ofrak.model.component_model import ComponentConfig
1223
from ofrak.resource import Resource
1324
from ofrak_type.range import Range
1425

26+
1527
LOGGER = logging.getLogger(__name__)
1628

1729
# LZ4 frame magic numbers (little-endian)
@@ -33,27 +45,21 @@ class Lz4Data(GenericBinary):
3345
@dataclass
class Lz4ModernData(Lz4Data):
    """
    LZ4 modern frame format (v1.4+).

    The attributes hold the frame metadata that Lz4Unpacker reads with
    lz4.frame.get_frame_info() while unpacking, so that Lz4Packer can repack the resource
    with matching frame settings.
    """

    # Copied from frame_info["block_size"] by Lz4Unpacker.
    block_size: int
    # Copied from frame_info["block_size_id"] by Lz4Unpacker.
    block_size_id: int
    # Copied from frame_info["block_linked"] by Lz4Unpacker.
    block_linked: bool
    # Copied from frame_info["content_checksum"] by Lz4Unpacker.
    content_checksum: bool
    # Copied from frame_info["block_checksum"] by Lz4Unpacker.
    block_checksum: bool
    # Copied from frame_info["content_size"]; Lz4Packer treats 0 as "no content size stored".
    content_size: int
57+
4658

4759
@dataclass
class Lz4LegacyData(Lz4Data):
    """
    LZ4 legacy frame format (v0.1-v0.9).

    Data whose first four bytes equal LZ4_LEGACY_MAGIC is matched to this tag. Resources with
    this tag are unpacked by Lz4LegacyUnpacker and repacked by Lz4LegacyPacker.
    """
5864

5965

@@ -62,118 +68,251 @@ class Lz4SkippableData(Lz4Data):
6268
"""
6369
LZ4 skippable frame.
6470
65-
Special frame type for embedding metadata or application-specific data:
66-
- Not compressed data
67-
- Contains arbitrary bytes
68-
- LZ4 parsers can safely skip these frames
69-
- Typically used alongside regular frames
71+
Special frame type for embedding metadata or application-specific data.
7072
"""
7173

7274

73-
class Lz4Identifier(Identifier):
75+
class Lz4Unpacker(Unpacker[None]):
    """
    Decompress LZ4 resources stored in the modern frame format.

    Targets:
    - Modern frame format (Lz4ModernData)
    - Skippable frames (metadata containers: Lz4SkippableData)

    The decompressed payload is attached as a single GenericBinary child.
    """

    id = b"Lz4Unpacker"
    targets = (Lz4ModernData, Lz4SkippableData)
    children = (GenericBinary,)

    async def unpack(self, resource: Resource, config=None):
        """
        Unpack LZ4 data.

        For resources tagged Lz4ModernData, the frame metadata is also recorded on the
        resource as an Lz4ModernData view.

        :param resource: The LZ4 resource to unpack
        :param config: Unused

        :raises RuntimeError: if the data is not valid LZ4 format
        """
        compressed = await resource.get_data()

        if resource.has_tag(Lz4ModernData):
            # lz4.frame.get_frame_info() does not support legacy frames
            info = lz4.frame.get_frame_info(compressed)
            view_fields = (
                "block_size",
                "block_size_id",
                "block_linked",
                "content_checksum",
                "block_checksum",
                "content_size",
            )
            resource.add_view(Lz4ModernData(**{name: info[name] for name in view_fields}))

        try:
            payload = lz4.frame.decompress(compressed)
        except RuntimeError as e:
            LOGGER.error(f"Failed to decompress LZ4 data: {e}")
            raise

        await resource.create_child(tags=(GenericBinary,), data=payload)
107122

108123

109-
class Lz4Unpacker(Unpacker[None]):
124+
class Lz4LegacyUnpacker(Unpacker[None]):
    """
    Unpack (decompress) LZ4 legacy frame format files.

    Legacy format (v0.1-v0.9) uses lz4.block decompression instead of lz4.frame.

    A legacy frame is the 4-byte magic followed by one or more blocks, each prefixed with its
    compressed size as a 4-byte little-endian integer. The format has no end mark, so blocks
    are read until the resource data is exhausted, and the decompressed blocks are
    concatenated into a single GenericBinary child.
    """

    id = b"Lz4LegacyUnpacker"
    targets = (Lz4LegacyData,)
    children = (GenericBinary,)

    async def unpack(self, resource: Resource, config=None):
        """
        Unpack LZ4 legacy data.

        :param resource: The LZ4 legacy resource to unpack
        :param config: Unused

        :raises RuntimeError: if the data is not valid LZ4 legacy format
        """
        resource_data = await resource.get_data()

        # Smallest accepted input: 4 bytes magic + 4 bytes block size
        if len(resource_data) < 8:
            raise RuntimeError("Invalid LZ4 legacy format: file too short")

        # Legacy blocks hold at most 8 MiB of uncompressed data
        legacy_block_size = 8 * 1024 * 1024

        total_size = len(resource_data)
        offset = 4  # skip the magic
        decompressed_blocks = []

        while offset < total_size:
            if total_size - offset < 4:
                raise RuntimeError("Invalid LZ4 legacy format: truncated block header")

            # Note: The header field is the compressed block size, not uncompressed size
            block_size = int.from_bytes(resource_data[offset : offset + 4], "little")
            offset += 4
            compressed_block = resource_data[offset : offset + block_size]

            # Validate block size matches actual data
            if len(compressed_block) != block_size:
                raise RuntimeError(
                    f"Invalid LZ4 legacy format: header says {block_size} bytes but found {len(compressed_block)}"
                )
            offset += block_size

            # LZ4 legacy blocks don't store uncompressed size, so we need to provide a large
            # enough buffer. Conformant blocks fit in 8 MiB; the ratio-based bound is kept so
            # that oversized single blocks (as written by Lz4LegacyPacker) still decompress.
            max_uncompressed_size = max(legacy_block_size, block_size * 255)
            try:
                decompressed_blocks.append(
                    lz4.block.decompress(
                        compressed_block, uncompressed_size=max_uncompressed_size
                    )
                )
            except Exception as e:
                LOGGER.error(f"Failed to decompress LZ4 legacy data: {e}")
                # Chain the original error so the root cause stays visible in tracebacks
                raise RuntimeError(f"LZ4 legacy decompression failed: {e}") from e

        await resource.create_child(
            tags=(GenericBinary,),
            data=b"".join(decompressed_blocks),
        )
144174

145175

146-
class Lz4Packer(Packer[None]):
176+
@dataclass
class Lz4PackerConfig(ComponentConfig):
    """
    Configuration for LZ4 packer.

    compression_level: Compression level to use (default: 0).
        - Negative values: Fast acceleration (faster, less compression)
        - 0-2: Minimum compression (default, all produce same output)
        - 3: Minimum high-compression mode
        - 4-16: Higher compression levels (16 is maximum)

    Lz4Packer passes the level unchanged to lz4.frame.compress(). Lz4LegacyPacker maps it onto
    lz4.block.compress(): negative levels select fast mode with acceleration equal to the
    absolute value, levels 0-2 select fast mode with acceleration 0, and levels 3 and above
    select high-compression mode at that level.
    """

    compression_level: int = 0
149189

150-
Note: Only creates modern frame format. Legacy frames and skippable frames
151-
cannot be repacked:
152-
- Legacy format is deprecated and not supported by the Python lz4 library
153-
- Skippable frames are metadata containers and don't make semantic sense to pack
154190

155-
If you unpack a legacy or skippable frame and repack, it will be converted
156-
to modern frame format.
191+
class Lz4Packer(Packer[Lz4PackerConfig]):
    """
    Pack data into a compressed LZ4 file using modern frame format.

    Implementation repacks modern frame format preserving frame metadata (block size, block
    linking, checksums, and whether the content size is stored) as recorded in the
    resource's Lz4ModernData view.
    Compression level can be specified via config (default: 0).
    """

    targets = (Lz4ModernData,)

    async def pack(self, resource: Resource, config: Lz4PackerConfig = None):
        """
        Pack data into `Lz4ModernData` format.

        :param resource: The LZ4 resource to pack
        :param config: Optional configuration specifying compression level
        """
        if config is None:
            config = Lz4PackerConfig()

        lz4_child = await resource.get_only_child()
        child_data = await lz4_child.get_data()

        # Use stored compression settings from the view
        lz4_view = await resource.view_as(Lz4ModernData)

        lz4_compressed = lz4.frame.compress(
            child_data,
            compression_level=config.compression_level,
            content_checksum=lz4_view.content_checksum,
            block_checksum=lz4_view.block_checksum,
            # lz4.frame.compress() expects the block size *identifier* (a BLOCKSIZE_*
            # constant), not the maximum block size in bytes
            block_size=lz4_view.block_size_id,
            # Keep linked/independent block mode the same as the original frame
            block_linked=lz4_view.block_linked,
            # A content size of 0 means the original frame did not store it
            store_size=lz4_view.content_size != 0,
        )

        original_size = await resource.get_data_length()
        resource.queue_patch(Range(0, original_size), lz4_compressed)
175232

176233

177-
# Register magic patterns for automatic identification
178-
MagicMimePattern.register(Lz4Data, "application/x-lz4")
179-
MagicDescriptionPattern.register(Lz4Data, lambda s: s.lower().startswith("lz4 compressed data"))
234+
class Lz4LegacyPacker(Packer[Lz4PackerConfig]):
    """
    Pack data into compressed LZ4 legacy format.

    The configured compression level is mapped onto lz4.block.compress(), following the
    lz4 CLI behavior for legacy format:
    - Negative values: fast mode with acceleration = -level
    - 0-2: fast mode with acceleration = 0
    - 3 and above: high compression mode at that level
    """

    targets = (Lz4LegacyData,)

    async def pack(self, resource: Resource, config: Lz4PackerConfig = None):
        """
        Pack data into `Lz4LegacyData` format.

        The output is the legacy magic, the compressed block size as a 4-byte little-endian
        integer, and the compressed block itself.

        :param resource: The LZ4 legacy resource to pack
        :param config: Optional configuration specifying compression level
        """
        if config is None:
            config = Lz4PackerConfig()

        lz4_child = await resource.get_only_child()
        child_data = await lz4_child.get_data()

        # Translate the level into the mode-specific keyword arguments
        level = config.compression_level
        if level < 0:
            mode_kwargs = {"mode": "fast", "acceleration": abs(level)}
        elif level < 3:
            mode_kwargs = {"mode": "fast", "acceleration": 0}
        else:
            mode_kwargs = {"mode": "high_compression", "compression": level}

        # NOTE(review): the whole payload is written as one block; the legacy format is
        # understood to limit blocks to 8 MiB of uncompressed data, so larger payloads may
        # not be readable by other LZ4 tools -- confirm.
        # store_size=False: the legacy header carries only the compressed block size
        compressed_block = lz4.block.compress(child_data, store_size=False, **mode_kwargs)

        # Legacy layout: magic (4 bytes) + compressed_block_size (4 bytes) + block
        size_field = len(compressed_block).to_bytes(4, "little")
        lz4_compressed = LZ4_LEGACY_MAGIC + size_field + compressed_block

        original_size = await resource.get_data_length()
        resource.queue_patch(Range(0, original_size), lz4_compressed)
295+
296+
297+
def match_lz4_modern_magic(data: bytes) -> bool:
    """
    Return True when the first four bytes of ``data`` equal LZ4_MODERN_MAGIC.

    Inputs shorter than four bytes never match.
    """
    return len(data) >= 4 and data[:4] == LZ4_MODERN_MAGIC
301+
302+
303+
def match_lz4_legacy_magic(data: bytes) -> bool:
    """
    Return True when the first four bytes of ``data`` equal LZ4_LEGACY_MAGIC.

    Inputs shorter than four bytes never match.
    """
    return len(data) >= 4 and data[:4] == LZ4_LEGACY_MAGIC
307+
308+
309+
def match_lz4_skippable_magic(data: bytes) -> bool:
    """
    Return True when ``data`` begins with an LZ4 skippable-frame magic.

    The magic is 0x5X 0x2A 0x4D 0x18 where X is 0-F. Inputs shorter than four bytes never
    match.
    """
    if len(data) < 4:
        return False
    first_byte, magic_tail = data[0], data[1:4]
    if magic_tail != b"\x2a\x4d\x18":
        return False
    return 0x50 <= first_byte <= 0x5F
314+
315+
316+
# Associate each LZ4 tag with the function that matches its magic bytes.
RawMagicPattern.register(Lz4ModernData, match_lz4_modern_magic)
RawMagicPattern.register(Lz4LegacyData, match_lz4_legacy_magic)
RawMagicPattern.register(Lz4SkippableData, match_lz4_skippable_magic)

0 commit comments

Comments
 (0)