Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# CI workflow: lints the repository with Ruff and then runs the pytest suite.
name: Ruff
# Runs on every push to any branch.
on: push
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      # Fetch the repository contents into the runner workspace.
      - uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps the version as a string.
          python-version: "3.13.7"
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install ruff pytest
      - name: Run Ruff
        # Lint the whole repository; output uses Ruff's "github" format.
        run: ruff check --output-format=github .
      - name: Run unit-tests
        # Invoked via "python -m pytest" so the repository root is on sys.path
        # and the tests can import the "src" package.
        run: python -m pytest tests/*
1 change: 1 addition & 0 deletions src/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# empty
14 changes: 14 additions & 0 deletions src/huffman/huffman_binary.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
def bits_to_bytes(bits: str) -> tuple[bytes, int]:
    """Pack a bit string into bytes and return ``(data, padding)``.

    The bit string is right-padded with zero bits up to the next byte
    boundary before being packed in big-endian order.

    Args:
        bits: String of ``"0"``/``"1"`` characters; may be empty.

    Returns:
        A ``(data, padding)`` pair where ``data`` is the packed bytes and
        ``padding`` is the number of zero bits (0-7) appended to fill the
        final byte.

    Raises:
        ValueError: If ``bits`` is non-empty and cannot be parsed by
            ``int(bits, 2)``.
    """
    # int("", 2) raises ValueError, so an empty bit string must be handled
    # explicitly: it packs to zero bytes with no padding.
    if not bits:
        return b"", 0

    padding = (8 - len(bits) % 8) % 8
    padded_bits = bits + "0" * padding
    data = int(padded_bits, 2).to_bytes(len(padded_bits) // 8, "big")
    return data, padding


def bytes_to_bits(data: bytes, padding: int) -> str:
    """Unpack bytes into a bit string, dropping the trailing padding bits.

    Args:
        data: Packed bytes, interpreted in big-endian order; may be empty.
        padding: Number of trailing bits to strip from the result.

    Returns:
        A string of ``"0"``/``"1"`` characters, eight per input byte minus
        the ``padding`` trailing bits. Empty ``data`` yields ``""``.
    """
    # Without this guard, bin(0)[2:].zfill(0) would produce "0" for empty
    # input, i.e. one phantom bit that was never encoded.
    if not data:
        return ""

    # zfill restores the leading zero bits that bin() drops.
    raw_bits = bin(int.from_bytes(data, "big"))[2:].zfill(len(data) * 8)
    if padding:
        raw_bits = raw_bits[:-padding]
    return raw_bits
36 changes: 36 additions & 0 deletions src/huffman/huffman_codec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import json
from .huffman_encoding import encode, decode
from .huffman_binary import bits_to_bytes, bytes_to_bits


def encode_file(input_path: str, output_path: str):
    """Compress a UTF-8 text file with Huffman coding.

    The output file layout is:
      * 4 bytes, big-endian: length of the JSON-serialized code table;
      * the code table as UTF-8 JSON;
      * 1 byte: number of padding bits appended to the payload;
      * the packed payload bytes.

    Args:
        input_path: Path of the UTF-8 text file to compress.
        output_path: Path of the compressed file to write (overwritten).
    """
    # Read the source text; the context manager closes the handle promptly
    # instead of leaving it to the garbage collector.
    with open(input_path, "r", encoding="utf-8") as source:
        text = source.read()

    # Encode the text into a bit string plus its code table.
    encoded_bits, codes = encode(text)

    # Pack the bit string into bytes.
    data_bytes, padding = bits_to_bytes(encoded_bits)

    # Serialize the code table.
    codes_json = json.dumps(codes).encode("utf-8")

    with open(output_path, "wb") as f:
        f.write(len(codes_json).to_bytes(4, "big"))  # table length
        f.write(codes_json)  # code table
        f.write(bytes([padding]))  # padding bit count
        f.write(data_bytes)  # payload


def decode_file(input_path: str, output_path: str):
    """Decompress a file produced by ``encode_file`` back into UTF-8 text.

    Reads, in order: a 4-byte big-endian table length, the JSON code table,
    one byte holding the padding bit count, and the packed payload.

    Args:
        input_path: Path of the compressed file to read.
        output_path: Path of the text file to write (overwritten).
    """
    with open(input_path, "rb") as f:
        table_size = int.from_bytes(f.read(4), "big")
        codes = json.loads(f.read(table_size).decode("utf-8"))
        padding = f.read(1)[0]
        data_bytes = f.read()

    bits = bytes_to_bits(data_bytes, padding)
    text = decode(bits, codes)

    # The context manager guarantees the text is flushed and the handle
    # closed before returning, rather than whenever the object is collected.
    with open(output_path, "w", encoding="utf-8") as out:
        out.write(text)
62 changes: 62 additions & 0 deletions src/huffman/huffman_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
def build_huffman_codes(text: str) -> dict[str, str]:
    """Build a Huffman code table for the characters of ``text``.

    Args:
        text: Input text; may be empty.

    Returns:
        A mapping from each distinct character to its bit-string code.
        Empty text yields ``{}``; text with a single distinct character
        maps that character to ``"0"``.
    """
    # Function-scope imports keep this block self-contained.
    from collections import Counter
    from heapq import heapify, heappop, heappush

    # Empty text: nothing to encode.
    if not text:
        return {}

    # Character frequencies, in order of first occurrence.
    frequency = Counter(text)

    # A single distinct character would otherwise get an empty code.
    if len(frequency) == 1:
        only_char = next(iter(frequency))
        return {only_char: "0"}

    # Heap entries are (frequency, sequence, node). A node is either a
    # character (leaf) or a (left, right) tuple (subtree). The sequence
    # number breaks frequency ties in insertion order, so merged nodes sort
    # after existing nodes of equal frequency and the nodes themselves are
    # never compared.
    heap = [
        (freq, order, char)
        for order, (char, freq) in enumerate(frequency.items())
    ]
    heapify(heap)
    next_order = len(heap)

    # Repeatedly merge the two lowest-frequency nodes.
    while len(heap) > 1:
        freq1, _, node1 = heappop(heap)
        freq2, _, node2 = heappop(heap)
        heappush(heap, (freq1 + freq2, next_order, (node1, node2)))
        next_order += 1

    root = heap[0][2]
    codes = {}

    # Pre-order walk with an explicit stack: left branch appends "0", right
    # branch appends "1". The right child is pushed first so the left one is
    # visited first.
    stack = [(root, "")]
    while stack:
        node, code = stack.pop()
        if isinstance(node, str):
            codes[node] = code
        else:
            left, right = node
            stack.append((right, code + "1"))
            stack.append((left, code + "0"))

    return codes


def encode(text: str) -> tuple[str, dict[str, str]]:
    """Huffman-encode ``text``.

    Returns:
        A ``(bits, codes)`` pair: the encoded bit string and the code table
        that was used to produce it. Empty text yields ``("", codes)``.
    """
    table = build_huffman_codes(text)
    if not text:
        return "", table
    pieces = [table[symbol] for symbol in text]
    return "".join(pieces), table


def decode(bits: str, codes: dict[str, str]) -> str:
    """Decode a bit string using a character-to-code table.

    Bits are consumed left to right; as soon as the pending bits match a
    code, the corresponding character is emitted. Trailing bits that do not
    complete a code are dropped.

    Returns:
        The decoded text, or ``""`` when ``bits`` or ``codes`` is empty.
    """
    if not (bits and codes):
        return ""

    # Invert the table: code -> character.
    symbol_for = {pattern: symbol for symbol, pattern in codes.items()}

    symbols = []
    start = 0
    for end in range(1, len(bits) + 1):
        candidate = bits[start:end]
        if candidate in symbol_for:
            symbols.append(symbol_for[candidate])
            start = end

    return "".join(symbols)
75 changes: 75 additions & 0 deletions tests/test_huffman.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
from src.huffman.huffman_encoding import encode, decode
from src.huffman.huffman_codec import encode_file, decode_file


def test_encode_decode_basic_string():
    """An ordinary string survives an encode/decode round trip."""
    original = "hello huffman"
    bits, table = encode(original)
    restored = decode(bits, table)
    assert restored == original


def test_encode_decode_single_char():
    """Text with one distinct character gets the code "0" and round-trips."""
    original = "aaaaaa"
    bits, table = encode(original)

    assert table == {"a": "0"}
    assert bits == "000000"

    restored = decode(bits, table)
    assert restored == original


def test_encode_decode_empty_string():
    """Empty text encodes to no bits and an empty table, and round-trips."""
    original = ""
    bits, table = encode(original)

    assert bits == ""
    assert table == {}

    restored = decode(bits, table)
    assert restored == original


def test_encode_two_chars_manual():
    """Two equally frequent characters get the hand-computed codes."""
    original = "ab"
    expected_table = {"a": "0", "b": "1"}

    bits, table = encode(original)

    assert table == expected_table
    assert bits == "01"

    restored = decode(bits, table)
    assert restored == original


def test_encode_aaab_manual():
    """The rarer character gets "0" and the more frequent one gets "1"."""
    original = "aaab"
    expected_table = {"b": "0", "a": "1"}

    bits, table = encode(original)

    assert table == expected_table
    assert bits == "1110"

    restored = decode(bits, table)
    assert restored == original


def test_encode_abc_manual():
    """Three equally frequent characters get the hand-computed codes."""
    original = "abc"
    expected_table = {"c": "0", "a": "10", "b": "11"}

    bits, table = encode(original)

    assert table == expected_table
    assert bits == "10110"

    restored = decode(bits, table)
    assert restored == original


def test_encode_decode_file(tmp_path):
    """Compressing a file and decompressing it reproduces the original text."""
    source_path = tmp_path / "input.txt"
    packed_path = tmp_path / "compressed.huf"
    restored_path = tmp_path / "output.txt"

    original = "This is a test of the Huffman codec.\n" * 10
    source_path.write_text(original, encoding="utf-8")

    encode_file(str(source_path), str(packed_path))
    decode_file(str(packed_path), str(restored_path))

    restored = restored_path.read_text(encoding="utf-8")
    assert restored == original