Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -173,4 +173,7 @@ poetry.toml
# LSP config files
pyrightconfig.json

# Haffman code test files
haffman_code_test_files

# End of https://www.toptal.com/developers/gitignore/api/python
108 changes: 108 additions & 0 deletions src/haffman_code/haffman_code.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
from pathlib import Path


def encode(msg: str) -> tuple[str, dict[str, str]]:
    """Encode *msg* with a Huffman code.

    Args:
        msg: Text to encode. May be empty.

    Returns:
        A pair ``(encoded_msg, decode_table)``: ``encoded_msg`` is a string of
        ``"0"``/``"1"`` characters and ``decode_table`` maps every code word
        back to the character it stands for. An empty message yields
        ``("", {})``.
    """
    # Count the occurrences of every character (first-occurrence order is kept).
    char_count: dict[str, int] = {}
    for char in msg:
        char_count[char] = char_count.get(char, 0) + 1

    # With a single distinct symbol the merge loop below never runs, which
    # would leave that symbol with an empty code and lose the message length.
    # Give it an explicit one-bit code instead.
    if len(char_count) == 1:
        (only_char,) = char_count
        return "0" * len(msg), {"0": only_char}

    code_table = dict.fromkeys(char_count, "")
    # Each entry is (characters in the subtree, total frequency of the subtree).
    subtrees = list(char_count.items())
    while len(subtrees) > 1:
        # Stable sort by frequency so the two rarest subtrees come first.
        subtrees.sort(key=lambda item: item[1])

        smallest_one = subtrees.pop(0)
        smallest_two = subtrees.pop(0)
        # Merging adds one more leading bit to every character in both
        # subtrees: "0" for the first branch, "1" for the second.
        for char in smallest_one[0]:
            code_table[char] = "0" + code_table[char]
        for char in smallest_two[0]:
            code_table[char] = "1" + code_table[char]

        subtrees.append(
            (
                smallest_one[0] + smallest_two[0],
                smallest_one[1] + smallest_two[1],
            )
        )

    encoded_msg = "".join(code_table[char] for char in msg)
    decode_table = {code: char for char, code in code_table.items()}

    return encoded_msg, decode_table


def decode(encoded: str, table: dict[str, str]) -> str:
    """Decode a Huffman-encoded bit string using a code-to-character table.

    Bits are consumed left to right; as soon as the accumulated bits form a
    key of *table*, the matching character is emitted and accumulation starts
    over. Trailing bits that never complete a code word are ignored.
    """
    pieces = []
    pending_bits = ""
    for bit in encoded:
        pending_bits += bit
        if pending_bits not in table:
            continue
        pieces.append(table[pending_bits])
        pending_bits = ""

    return "".join(pieces)


def file_encode(filepath: str) -> None:
    """Huffman-encode a text file into a sibling ``<stem>.encoded`` file.

    The output file holds the encoded bit string on the first line, followed
    by the code table: space-separated entries, each being a character
    immediately followed by its code. An existing output file is replaced.
    Both files are opened in text mode with the platform default encoding.

    Args:
        filepath: Path of the file to encode.

    Raises:
        FileNotFoundError: If *filepath* is not an existing regular file.
    """
    path = Path(filepath)

    if not path.is_file():
        raise FileNotFoundError("File doesn`t exist.")

    # Read and encode before touching the output file: opening the output
    # first would truncate it, which destroys the input when it is itself
    # named "*.encoded" and leaves an empty file behind if encoding fails.
    with open(path, "r") as file:
        encoded_msg, table = encode(file.read())

    # Serialize the table as "<char><code>" entries separated by spaces.
    string_table = " ".join(char + code for code, char in table.items())

    with open(path.with_name(path.stem + ".encoded"), "w") as encoded_file:
        encoded_file.write(encoded_msg + "\n")
        encoded_file.write(string_table)


def file_decode(filepath: str) -> None:
    """Decode a ``*.encoded`` file into a sibling ``decoded-<stem>.txt`` file.

    The input is expected to hold the encoded bit string on the first line,
    followed by the code table: space-separated entries, each being a
    character immediately followed by its code. An existing output file is
    replaced. Both files are opened in text mode with the platform default
    encoding.

    Args:
        filepath: Path of the ``*.encoded`` file to decode.

    Raises:
        FileNotFoundError: If *filepath* is not an existing regular file.
        ValueError: If the file suffix is not ``.encoded``.
    """
    path = Path(filepath)

    if not path.is_file():
        raise FileNotFoundError("File doesn`t exist.")
    if path.suffix != ".encoded":
        raise ValueError("File suffix isn`t '.encoded'.")

    with open(path, "r") as encoded_file:
        content = encoded_file.read()

    # Split on the first newline only: the table itself may contain "\n" as
    # an encoded character. partition() also copes with a file that has no
    # newline at all (e.g. a zero-byte file) instead of failing to unpack.
    encoded_msg, _, string_table = content.partition("\n")

    # Parse the table positionally: the first character of an entry is the
    # symbol (it may be a space), everything up to the next space is its code.
    table = {}
    position = 0
    while position < len(string_table):
        char = string_table[position]
        separator = string_table.find(" ", position + 1)
        if separator == -1:
            separator = len(string_table)
        table[string_table[position + 1 : separator]] = char
        position = separator + 1

    decoded_msg = decode(encoded_msg, table)

    # Open the output only after decoding succeeded, so a malformed input
    # does not leave a truncated result file behind.
    with open(path.with_name("decoded-" + path.stem + ".txt"), "w") as decoded_file:
        decoded_file.write(decoded_msg)


# To be honest, I did not forget that files can be handled in binary mode; I just found no use for it here.
59 changes: 59 additions & 0 deletions tests/haffman_code_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from haffman_code.haffman_code import encode, decode, file_encode, file_decode
from random import randint
from pathlib import Path


def test_encode_decode_biection():
    """A random non-ASCII string must survive an encode/decode round trip."""
    length = randint(1, 1000)
    original = "".join([chr(randint(128, 255)) for _ in range(length)])
    bits, decode_table = encode(original)
    restored = decode(bits, decode_table)
    assert original == restored


def test_encode_decode_empty_biection():
    """The empty string must survive an encode/decode round trip."""
    bits, decode_table = encode("")
    restored = decode(bits, decode_table)
    assert "" == restored


def test_encode_uniqueness():
    """No code word produced by encode may be a prefix of another code word."""
    string = "".join(chr(randint(128, 255)) for _ in range(randint(1, 1000)))
    table = encode(string)[1]
    codes = list(table)
    for code in codes:
        for other_code in codes:
            if code == other_code:
                continue
            # A prefix clash would make decoding ambiguous.
            assert not other_code.startswith(code), (
                f"code {code!r} is a prefix of code {other_code!r}"
            )


def test_file_encode_decode_biection():
    """A random text file must survive a file_encode/file_decode round trip."""
    Path("./haffman_code_test_files").mkdir(exist_ok=True)
    length = randint(1, 1000)
    string = "".join([chr(randint(128, 255)) for _ in range(length)])
    Path("haffman_code_test_files/haffman_code_test_text_file.txt").write_text(string)

    file_encode("haffman_code_test_files/haffman_code_test_text_file.txt")
    file_decode("haffman_code_test_files/haffman_code_test_text_file.encoded")

    decoded_text = Path(
        "haffman_code_test_files/decoded-haffman_code_test_text_file.txt"
    ).read_text()
    assert string == decoded_text


def test_file_encode_decode_empty_biection():
    """An empty text file must survive a file_encode/file_decode round trip."""
    Path("./haffman_code_test_files").mkdir(exist_ok=True)
    string = ""
    Path("haffman_code_test_files/haffman_code_test_text_file.txt").write_text(string)

    file_encode("haffman_code_test_files/haffman_code_test_text_file.txt")
    file_decode("haffman_code_test_files/haffman_code_test_text_file.encoded")

    decoded_text = Path(
        "haffman_code_test_files/decoded-haffman_code_test_text_file.txt"
    ).read_text()
    assert string == decoded_text