-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwords-extractor.py
44 lines (35 loc) · 1.36 KB
/
words-extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
import os
import json
from bs4 import BeautifulSoup
from collections import defaultdict
# Directory containing the HTML files to process; only its direct entries
# whose names end in ".html" are read.
html_folder = './files/'
# Function to clean and split text into words
def extract_words(text):
    """Lowercase *text* and split it on runs of whitespace.

    Returns the resulting tokens as a list, in their original order. Empty or
    whitespace-only input yields an empty list.
    """
    # NOTE: additional processing will be added later
    return text.lower().split()
# Leading characters removed from a word before counting, so that a prefixed
# form and its bare form are tallied under the same key.
# NOTE(review): presumably the Amharic prefixes "ye-" / "be-" -- confirm.
_STRIPPED_PREFIXES = ('የ', 'በ')


def _strip_prefix(word):
    """Return *word* without its first character if that character is a known prefix.

    Single-character words are returned unchanged, so stripping never
    produces an empty string.
    """
    if len(word) > 1 and word.startswith(_STRIPPED_PREFIXES):
        return word[1:]
    return word


# Maps each (prefix-stripped) word to its number of occurrences across all files.
word_dict = defaultdict(int)

# os.listdir() returns entries in arbitrary order; sorting makes the progress
# output and the key order of the resulting dictionary reproducible.
for filename in sorted(os.listdir(html_folder)):
    if not filename.endswith(".html"):
        continue

    filepath = os.path.join(html_folder, filename)
    # Skip directories (or other non-files) whose name merely ends in ".html";
    # open() would otherwise raise on them and abort the whole run.
    if not os.path.isfile(filepath):
        continue

    print(filename)

    # Open and parse the HTML file
    with open(filepath, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'html.parser')

    text = soup.get_text()  # Extract text from the HTML
    words = extract_words(text)
    print(len(words))

    # Update word count dictionary
    for word in words:
        word_dict[_strip_prefix(word)] += 1
# Convert defaultdict to regular dict
word_dict = dict(word_dict)

# Save the word dictionary to a JSON file
output_file = 'outputs/word_dictionary.json'

# Create the output directory on demand: without it, open() raises
# FileNotFoundError and the counts computed above are lost.
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as json_file:
    # ensure_ascii=False writes non-ASCII words as-is instead of \uXXXX escapes.
    json.dump(word_dict, json_file, ensure_ascii=False, indent=4)

print(f"Word dictionary saved to {output_file}")