"""
LZ4 components.

Lz4Unpacker supports unpacking the modern LZ4 frame format (Lz4ModernData) and
skippable frames (Lz4SkippableData).

Lz4LegacyUnpacker supports unpacking the legacy LZ4 format (Lz4LegacyData).

Lz4Packer supports repacking the modern LZ4 format (Lz4ModernData), matching block/checksum
information extracted during unpacking. Compression level can be specified via config.

Lz4LegacyPacker supports repacking legacy LZ4 format (Lz4LegacyData) with compression level
support (default/fast/high modes). Compression level can be specified via config.
"""
113import logging
214from dataclasses import dataclass
315
4- import lz4 .frame # type: ignore
516import lz4 .block # type: ignore
6-
7- from ofrak .component .identifier import Identifier
17+ import lz4 .frame # type: ignore
818from ofrak .component .packer import Packer
919from ofrak .component .unpacker import Unpacker
1020from ofrak .core .binary import GenericBinary
11- from ofrak .core .magic import MagicDescriptionPattern , MagicMimePattern
21+ from ofrak .core .magic import RawMagicPattern
22+ from ofrak .model .component_model import ComponentConfig
1223from ofrak .resource import Resource
1324from ofrak_type .range import Range
1425
26+
1527LOGGER = logging .getLogger (__name__ )
1628
1729# LZ4 frame magic numbers (little-endian)
@@ -33,27 +45,21 @@ class Lz4Data(GenericBinary):
@dataclass
class Lz4ModernData(Lz4Data):
    """
    LZ4 modern frame format (v1.4+).

    The attributes mirror the frame descriptor values returned by
    ``lz4.frame.get_frame_info()``; `Lz4Unpacker` fills them in while unpacking and
    `Lz4Packer` reads them back so a repacked frame can reuse the original settings.
    """

    # frame_info["block_size"] of the unpacked frame
    block_size: int
    # frame_info["block_size_id"] of the unpacked frame
    block_size_id: int
    # frame_info["block_linked"] of the unpacked frame
    block_linked: bool
    # frame_info["content_checksum"]; forwarded to lz4.frame.compress() on repack
    content_checksum: bool
    # frame_info["block_checksum"]; forwarded to lz4.frame.compress() on repack
    block_checksum: bool
    # frame_info["content_size"]; Lz4Packer treats 0 as "content size not stored"
    content_size: int
57+
4658
@dataclass
class Lz4LegacyData(Lz4Data):
    """
    LZ4 legacy frame format (v0.1-v0.9).

    As read and written by the legacy unpacker/packer in this module, the data starts
    with a 4-byte magic number followed by a 4-byte little-endian compressed block size
    and the compressed block itself.
    """
5864
5965
@@ -62,118 +68,251 @@ class Lz4SkippableData(Lz4Data):
6268 """
6369 LZ4 skippable frame.
6470
65- Special frame type for embedding metadata or application-specific data:
66- - Not compressed data
67- - Contains arbitrary bytes
68- - LZ4 parsers can safely skip these frames
69- - Typically used alongside regular frames
71+ Special frame type for embedding metadata or application-specific data.
7072 """
7173
7274
73- class Lz4Identifier ( Identifier ):
class Lz4Unpacker(Unpacker[None]):
    """
    Decompress LZ4 frame data into a single `GenericBinary` child.

    Handles resources tagged as:
    - Lz4ModernData (modern frame format)
    - Lz4SkippableData (skippable metadata frames)
    """

    id = b"Lz4Unpacker"
    targets = (Lz4ModernData, Lz4SkippableData)
    children = (GenericBinary,)

    async def unpack(self, resource: Resource, config=None):
        """
        Decompress the resource and attach the result as a child.

        For resources tagged `Lz4ModernData`, the frame descriptor values are also
        recorded on the resource as an `Lz4ModernData` view.

        :param resource: The LZ4 resource to unpack
        :param config: Unused

        :raises RuntimeError: if the data is not valid LZ4 format
        """
        compressed = await resource.get_data()

        if resource.has_tag(Lz4ModernData):
            # Frame metadata is only queried for modern frames.
            info = lz4.frame.get_frame_info(compressed)
            recorded_fields = (
                "block_size",
                "block_size_id",
                "block_linked",
                "content_checksum",
                "block_checksum",
                "content_size",
            )
            view = Lz4ModernData(**{name: info[name] for name in recorded_fields})
            resource.add_view(view)

        try:
            payload = lz4.frame.decompress(compressed)
        except RuntimeError as e:
            # Log for context, then let the original exception propagate unchanged.
            LOGGER.error(f"Failed to decompress LZ4 data: {e}")
            raise

        await resource.create_child(tags=(GenericBinary,), data=payload)
107122
108123
109- class Lz4Unpacker (Unpacker [None ]):
class Lz4LegacyUnpacker(Unpacker[None]):
    """
    Unpack (decompress) LZ4 legacy frame format files.

    Legacy format (v0.1-v0.9) uses lz4.block decompression instead of lz4.frame.

    A legacy frame is a 4-byte magic number followed by one or more blocks. Each block
    is a 4-byte little-endian compressed size followed by that many bytes of compressed
    data; there is no end mark, the frame simply ends with the data. Every block
    decompresses to at most 8 MB.
    """

    id = b"Lz4LegacyUnpacker"
    targets = (Lz4LegacyData,)
    children = (GenericBinary,)

    # Maximum uncompressed size of one legacy block, per the LZ4 legacy frame format.
    _MAX_BLOCK_SIZE = 8 * 1024 * 1024
    # Size of the magic number and of each block-size field.
    _FIELD_SIZE = 4
    # Worst-case LZ4 expansion factor, used only for oversized non-standard blocks.
    _MAX_RATIO = 255

    async def unpack(self, resource: Resource, config=None):
        """
        Unpack LZ4 legacy data.

        All blocks of the frame are decompressed in order and concatenated into a
        single `GenericBinary` child.

        :param resource: The LZ4 legacy resource to unpack
        :param config: Unused

        :raises RuntimeError: if the data is not valid LZ4 legacy format
        """
        resource_data = await resource.get_data()

        # Smallest valid input: 4 bytes magic + 4 bytes block size
        if len(resource_data) < 2 * self._FIELD_SIZE:
            raise RuntimeError("Invalid LZ4 legacy format: file too short")

        decompressed_data = self._decompress_frame(resource_data)

        await resource.create_child(
            tags=(GenericBinary,),
            data=decompressed_data,
        )

    @classmethod
    def _decompress_frame(cls, frame: bytes) -> bytes:
        """Walk the size-prefixed blocks after the magic and return their joined output."""
        total = len(frame)
        offset = cls._FIELD_SIZE  # skip the magic number
        chunks = []

        while offset < total:
            payload_start = offset + cls._FIELD_SIZE
            if payload_start > total:
                raise RuntimeError("Invalid LZ4 legacy format: truncated block header")

            # Note: the header field is the compressed block size, not uncompressed size
            block_size = int.from_bytes(frame[offset:payload_start], "little")
            payload_end = payload_start + block_size

            if block_size == 0:
                raise RuntimeError("Invalid LZ4 legacy format: empty block")
            if payload_end > total:
                raise RuntimeError(
                    f"Invalid LZ4 legacy format: header says {block_size} bytes "
                    f"but found {total - payload_start}"
                )

            chunks.append(cls._decompress_block(frame[payload_start:payload_end]))
            offset = payload_end

        return b"".join(chunks)

    @classmethod
    def _decompress_block(cls, block: bytes) -> bytes:
        """Decompress one raw LZ4 block whose uncompressed size is not stored."""
        # Legacy blocks don't store their uncompressed size, so an upper bound must be
        # supplied. The format's 8 MB limit covers every conforming block; the
        # worst-case ratio bound is tried only afterwards so that a non-standard
        # oversized single block can still be read.
        bounds = [cls._MAX_BLOCK_SIZE]
        worst_case = len(block) * cls._MAX_RATIO
        if worst_case > cls._MAX_BLOCK_SIZE:
            bounds.append(worst_case)

        last_error = None
        for bound in bounds:
            try:
                return lz4.block.decompress(block, uncompressed_size=bound)
            except Exception as e:
                last_error = e

        LOGGER.error(f"Failed to decompress LZ4 legacy data: {last_error}")
        raise RuntimeError(f"LZ4 legacy decompression failed: {last_error}") from last_error
144174
145175
146- class Lz4Packer (Packer [None ]):
@dataclass
class Lz4PackerConfig(ComponentConfig):
    """
    Configuration for LZ4 packer.

    Used by both `Lz4Packer` (modern frames) and `Lz4LegacyPacker` (legacy frames).

    compression_level: Compression level to use (default: 0).
        - Negative values: Fast acceleration (faster, less compression)
        - 0-2: Minimum compression (default, all produce same output)
        - 3: Minimum high-compression mode
        - 4-16: Higher compression levels (16 is maximum)
    """

    # Passed to lz4.frame.compress() as-is by Lz4Packer; mapped onto lz4.block.compress()
    # mode/acceleration/compression arguments by Lz4LegacyPacker.
    compression_level: int = 0
149189
150- Note: Only creates modern frame format. Legacy frames and skippable frames
151- cannot be repacked:
152- - Legacy format is deprecated and not supported by the Python lz4 library
153- - Skippable frames are metadata containers and don't make semantic sense to pack
154190
155- If you unpack a legacy or skippable frame and repack, it will be converted
156- to modern frame format.
class Lz4Packer(Packer[Lz4PackerConfig]):
    """
    Pack data into a compressed LZ4 file using modern frame format.

    Implementation repacks modern frame format preserving frame metadata
    (block size, block linking, checksums, stored content size) recorded on the
    `Lz4ModernData` view during unpacking.
    Compression level can be specified via config (default: 0).
    """

    targets = (Lz4ModernData,)

    async def pack(self, resource: Resource, config: Lz4PackerConfig = None):
        """
        Pack data into `Lz4ModernData` format.

        The resource's only child is compressed and the result replaces the
        resource's entire current data.

        :param resource: The LZ4 resource to pack
        :param config: Optional configuration specifying compression level
        """
        if config is None:
            config = Lz4PackerConfig()

        lz4_child = await resource.get_only_child()
        child_data = await lz4_child.get_data()

        # Use stored compression settings from the view
        lz4_view = await resource.view_as(Lz4ModernData)

        lz4_compressed = lz4.frame.compress(
            child_data,
            compression_level=config.compression_level,
            content_checksum=lz4_view.content_checksum,
            block_checksum=lz4_view.block_checksum,
            # lz4.frame.compress() takes the block size *identifier* (the
            # BLOCKSIZE_* constants), not the size in bytes.
            block_size=lz4_view.block_size_id,
            # Preserve the original frame's block-linking mode.
            block_linked=lz4_view.block_linked,
            # A recorded content size of 0 means the original frame did not store one.
            store_size=lz4_view.content_size != 0,
        )

        original_size = await resource.get_data_length()
        resource.queue_patch(Range(0, original_size), lz4_compressed)
175232
176233
177- # Register magic patterns for automatic identification
178- MagicMimePattern .register (Lz4Data , "application/x-lz4" )
179- MagicDescriptionPattern .register (Lz4Data , lambda s : s .lower ().startswith ("lz4 compressed data" ))
class Lz4LegacyPacker(Packer[Lz4PackerConfig]):
    """
    Pack data into compressed LZ4 legacy format.

    The compression level is mapped onto lz4.block.compress() as follows:
    - Negative values: fast mode with acceleration = -level
    - 0-2: fast mode with acceleration = 0
    - 3 and above: high compression mode at that level

    The input is split into blocks of at most 8 MB (the legacy format's block limit);
    each block is written as a 4-byte little-endian compressed size followed by the
    compressed bytes. Input of 8 MB or less yields exactly one block.
    """

    targets = (Lz4LegacyData,)

    # Maximum uncompressed size of one legacy block, per the LZ4 legacy frame format.
    _MAX_BLOCK_SIZE = 8 * 1024 * 1024

    async def pack(self, resource: Resource, config: Lz4PackerConfig = None):
        """
        Pack data into `Lz4LegacyData` format.

        The resource's only child is compressed and the result replaces the
        resource's entire current data.

        :param resource: The LZ4 legacy resource to pack
        :param config: Optional configuration specifying compression level
        """
        if config is None:
            config = Lz4PackerConfig()

        lz4_child = await resource.get_only_child()
        child_data = await lz4_child.get_data()

        compress_kwargs = self._compress_kwargs(config.compression_level)

        # Always emit at least one block, even for empty input, so the output keeps
        # the magic + block-size header layout.
        step = self._MAX_BLOCK_SIZE
        chunks = [
            child_data[start : start + step] for start in range(0, len(child_data), step)
        ] or [child_data]

        # Frame layout: magic (4 bytes), then per block: compressed size (4 bytes) + data
        parts = [LZ4_LEGACY_MAGIC]
        for chunk in chunks:
            compressed_block = lz4.block.compress(chunk, store_size=False, **compress_kwargs)
            parts.append(len(compressed_block).to_bytes(4, "little"))
            parts.append(compressed_block)
        lz4_compressed = b"".join(parts)

        original_size = await resource.get_data_length()
        resource.queue_patch(Range(0, original_size), lz4_compressed)

    @staticmethod
    def _compress_kwargs(compression_level: int) -> dict:
        """Translate a compression level into lz4.block.compress() keyword arguments."""
        # This matches the lz4 CLI behavior for legacy format.
        if compression_level < 0:
            return {"mode": "fast", "acceleration": abs(compression_level)}
        if compression_level < 3:
            return {"mode": "fast", "acceleration": 0}
        return {"mode": "high_compression", "compression": compression_level}
295+
296+
def match_lz4_modern_magic(data: bytes) -> bool:
    """Return True when *data* begins with the modern LZ4 frame magic number."""
    return len(data) >= 4 and data[:4] == LZ4_MODERN_MAGIC
301+
302+
def match_lz4_legacy_magic(data: bytes) -> bool:
    """Return True when *data* begins with the legacy LZ4 frame magic number."""
    return len(data) >= 4 and data[:4] == LZ4_LEGACY_MAGIC
307+
308+
def match_lz4_skippable_magic(data: bytes) -> bool:
    """Return True when *data* begins with an LZ4 skippable-frame magic number."""
    # Skippable magics are 0x5X 0x2A 0x4D 0x18 with X in 0-F: the first byte has a
    # high nibble of 5 and the next three bytes are fixed.
    return len(data) >= 4 and (data[0] & 0xF0) == 0x50 and data[1:4] == b"\x2a\x4d\x18"
314+
315+
316+ RawMagicPattern .register (Lz4ModernData , match_lz4_modern_magic )
317+ RawMagicPattern .register (Lz4LegacyData , match_lz4_legacy_magic )
318+ RawMagicPattern .register (Lz4SkippableData , match_lz4_skippable_magic )
0 commit comments