diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
new file mode 100644
index 0000000..56c2ac4
--- /dev/null
+++ b/.github/workflows/main.yml
@@ -0,0 +1,19 @@
+name: Ruff
+on: push
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.13.7"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install ruff pytest
+      - name: Run Ruff
+        run: ruff check --output-format=github .
+      - name: Run unit-tests
+        run: python -m pytest tests/*
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..bc63beb
--- /dev/null
+++ b/src/__init__.py
@@ -0,0 +1 @@
+# empty
\ No newline at end of file
diff --git a/src/huffman/huffman_binary.py b/src/huffman/huffman_binary.py
new file mode 100644
index 0000000..92a5ba6
--- /dev/null
+++ b/src/huffman/huffman_binary.py
@@ -0,0 +1,32 @@
+def bits_to_bytes(bits: str) -> tuple[bytes, int]:
+    """Pack a string of '0'/'1' characters into bytes.
+
+    The bit string is right-padded with zeros to a whole number of bytes.
+
+    Returns:
+        A ``(data, padding)`` pair, where ``padding`` is the number of zero
+        bits (0-7) appended to fill the last byte.
+    """
+    if not bits:
+        # int("", 2) raises ValueError, so empty input is handled explicitly
+        return b"", 0
+    padding = (8 - len(bits) % 8) % 8
+    padded_bits = bits + "0" * padding
+    data = int(padded_bits, 2).to_bytes(len(padded_bits) // 8, "big")
+    return data, padding
+
+
+def bytes_to_bits(data: bytes, padding: int) -> str:
+    """Unpack bytes into a bit string, dropping the trailing padding bits.
+
+    Args:
+        data: Bytes to unpack, e.g. as produced by ``bits_to_bytes``.
+        padding: Number of padding bits to strip from the end.
+    """
+    if not data:
+        # bin(0)[2:] is "0", which would otherwise add a spurious bit
+        return ""
+    raw_bits = bin(int.from_bytes(data, "big"))[2:].zfill(len(data) * 8)
+    if padding:
+        raw_bits = raw_bits[:-padding]
+    return raw_bits
diff --git a/src/huffman/huffman_codec.py b/src/huffman/huffman_codec.py
new file mode 100644
index 0000000..f1fc627
--- /dev/null
+++ b/src/huffman/huffman_codec.py
@@ -0,0 +1,46 @@
+import json
+from .huffman_encoding import encode, decode
+from .huffman_binary import bits_to_bytes, bytes_to_bits
+
+
+def encode_file(input_path: str, output_path: str):
+    """Compress the UTF-8 text file at input_path into output_path.
+
+    Output layout: a 4-byte big-endian code-table length, the JSON code
+    table, one byte holding the padding bit count, then the packed bits.
+    """
+    # read the source text; newline="" disables newline translation so the
+    # original line endings survive the round trip
+    with open(input_path, "r", encoding="utf-8", newline="") as f:
+        text = f.read()
+
+    # encode into bits
+    encoded_bits, codes = encode(text)
+
+    # pack into bytes
+    data_bytes, padding = bits_to_bytes(encoded_bits)
+
+    # serialize the code table
+    codes_json = json.dumps(codes).encode("utf-8")
+
+    with open(output_path, "wb") as f:
+        f.write(len(codes_json).to_bytes(4, "big"))  # table length
+        f.write(codes_json)  # table
+        f.write(bytes([padding]))  # padding
+        f.write(data_bytes)  # data
+
+
+def decode_file(input_path: str, output_path: str):
+    """Decompress a file written by encode_file into a UTF-8 text file."""
+    with open(input_path, "rb") as f:
+        table_size = int.from_bytes(f.read(4), "big")
+        codes = json.loads(f.read(table_size).decode("utf-8"))
+        padding = f.read(1)[0]
+        data_bytes = f.read()
+
+    bits = bytes_to_bits(data_bytes, padding)
+    text = decode(bits, codes)
+
+    # newline="" writes the decoded line endings back unchanged
+    with open(output_path, "w", encoding="utf-8", newline="") as f:
+        f.write(text)
diff --git a/src/huffman/huffman_encoding.py b/src/huffman/huffman_encoding.py
new file mode 100644
index 0000000..6122fa5
--- /dev/null
+++ b/src/huffman/huffman_encoding.py
@@ -0,0 +1,70 @@
+from collections import Counter
+
+
+def build_huffman_codes(text: str) -> dict[str, str]:
+    """Return a Huffman code table (character -> bit string) for text."""
+    # empty text: return an empty table
+    if not text:
+        return {}
+
+    # count the frequency of each character (first-seen order is kept)
+    frequency = Counter(text)
+
+    # a single distinct character gets a trivial one-bit code
+    if len(frequency) == 1:
+        only_char = next(iter(frequency))
+        return {only_char: "0"}
+
+    # tree nodes: (frequency, character or subtree)
+    nodes = [(freq, char) for char, freq in frequency.items()]
+
+    # build the Huffman tree by merging the two least frequent nodes;
+    # the stable sort keeps tie-breaking, and thus the codes, deterministic
+    while len(nodes) > 1:
+        nodes.sort(key=lambda x: x[0])
+        freq1, node1 = nodes.pop(0)
+        freq2, node2 = nodes.pop(0)
+        nodes.append((freq1 + freq2, (node1, node2)))
+
+    # tree root
+    (_, root) = nodes[0]
+    codes = {}
+
+    def traverse(node, code):
+        if isinstance(node, str):
+            codes[node] = code
+        else:
+            left, right = node
+            traverse(left, code + "0")
+            traverse(right, code + "1")
+
+    traverse(root, "")
+    return codes
+
+
+def encode(text: str) -> tuple[str, dict[str, str]]:
+    """Encode text and return ``(bit string, code table)``."""
+    codes = build_huffman_codes(text)
+    encoded_bits = "".join(codes[ch] for ch in text)
+    return encoded_bits, codes
+
+
+def decode(bits: str, codes: dict[str, str]) -> str:
+    """Decode a bit string with the given code table.
+
+    Trailing bits that do not complete a code are ignored.
+    """
+    if not bits or not codes:
+        return ""
+
+    reverse_codes = {code: ch for ch, code in codes.items()}
+    decoded = []
+    current = ""
+
+    for bit in bits:
+        current += bit
+        if current in reverse_codes:
+            decoded.append(reverse_codes[current])
+            current = ""
+
+    return "".join(decoded)
diff --git a/tests/test_huffman.py b/tests/test_huffman.py
new file mode 100644
index 0000000..7eb73ec
--- /dev/null
+++ b/tests/test_huffman.py
@@ -0,0 +1,114 @@
+from src.huffman.huffman_binary import bits_to_bytes, bytes_to_bits
+from src.huffman.huffman_encoding import encode, decode
+from src.huffman.huffman_codec import encode_file, decode_file
+
+
+def test_encode_decode_basic_string():
+    text = "hello huffman"
+    encoded, codes = encode(text)
+    assert decode(encoded, codes) == text
+
+
+def test_encode_decode_single_char():
+    text = "aaaaaa"
+    encoded, codes = encode(text)
+    assert codes == {"a": "0"}
+    assert encoded == "000000"
+    assert decode(encoded, codes) == text
+
+
+def test_encode_decode_empty_string():
+    text = ""
+    encoded, codes = encode(text)
+    assert encoded == ""
+    assert codes == {}
+    assert decode(encoded, codes) == text
+
+
+def test_encode_two_chars_manual():
+    text = "ab"
+    encoded, codes = encode(text)
+
+    assert codes == {
+        "a": "0",
+        "b": "1",
+    }
+    assert encoded == "01"
+    assert decode(encoded, codes) == text
+
+
+def test_encode_aaab_manual():
+    text = "aaab"
+    encoded, codes = encode(text)
+
+    assert codes == {
+        "b": "0",
+        "a": "1",
+    }
+    assert encoded == "1110"
+    assert decode(encoded, codes) == text
+
+
+def test_encode_abc_manual():
+    text = "abc"
+    encoded, codes = encode(text)
+
+    assert codes == {
+        "c": "0",
+        "a": "10",
+        "b": "11",
+    }
+    assert encoded == "10110"
+    assert decode(encoded, codes) == text
+
+
+def test_encode_decode_file(tmp_path):
+    input_file = tmp_path / "input.txt"
+    compressed_file = tmp_path / "compressed.huf"
+    output_file = tmp_path / "output.txt"
+
+    text = "This is a test of the Huffman codec.\n" * 10
+    input_file.write_text(text, encoding="utf-8")
+
+    encode_file(str(input_file), str(compressed_file))
+    decode_file(str(compressed_file), str(output_file))
+
+    assert output_file.read_text(encoding="utf-8") == text
+
+
+def test_encode_decode_empty_file(tmp_path):
+    input_file = tmp_path / "empty.txt"
+    compressed_file = tmp_path / "empty.huf"
+    output_file = tmp_path / "output.txt"
+
+    input_file.write_text("", encoding="utf-8")
+
+    encode_file(str(input_file), str(compressed_file))
+    decode_file(str(compressed_file), str(output_file))
+
+    assert output_file.read_text(encoding="utf-8") == ""
+
+
+def test_encode_decode_file_keeps_line_endings(tmp_path):
+    input_file = tmp_path / "input.txt"
+    compressed_file = tmp_path / "compressed.huf"
+    output_file = tmp_path / "output.txt"
+
+    raw = b"line one\r\nline two\n"
+    input_file.write_bytes(raw)
+
+    encode_file(str(input_file), str(compressed_file))
+    decode_file(str(compressed_file), str(output_file))
+
+    assert output_file.read_bytes() == raw
+
+
+def test_bits_bytes_roundtrip():
+    data, padding = bits_to_bytes("10110")
+    assert (data, padding) == (b"\xb0", 3)
+    assert bytes_to_bits(data, padding) == "10110"
+
+
+def test_bits_bytes_empty():
+    assert bits_to_bytes("") == (b"", 0)
+    assert bytes_to_bits(b"", 0) == ""