-
Notifications
You must be signed in to change notification settings - Fork 2
/
main_parser.py
76 lines (60 loc) · 2.79 KB
/
main_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import requests
from bs4 import BeautifulSoup
import csv
import datetime
import os
##DO NOT EDIT OR ADD
class WebsiteParser:
    """Base parser that turns saved HTML pages into one combined CSV file.

    This class is not usable on its own. It relies on names it does not
    define, which a subclass (or the caller) has to supply:

    * ``parse_product_blocks(soup, category)`` -- called by
      ``parse_website``. ``parse_directory`` treats the first row of the
      result as a header row and the remaining rows as data.
    * ``self.directory`` and ``self.brand`` -- read by ``write_to_csv`` to
      build the output file path; ``__init__`` does not set them.
    """

    # Filename suffixes that parse_directory treats as saved pages.
    _PAGE_EXTENSIONS = ('.txt', '.html')

    def __init__(self):
        """Create a ``requests.Session`` and store it on ``self.session``."""
        self.session = requests.Session()

    def read_from_file(self, file_path):
        """Return the full contents of ``file_path`` decoded as UTF-8 text."""
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    def convert_to_tsv(self, data):
        """Return a new list of rows with every cell converted via ``str``.

        Args:
            data: Iterable of rows, each row an iterable of cells.

        Returns:
            A list of lists of strings; the input rows are not modified.
        """
        return [[str(item) for item in row] for row in data]

    def write_to_tsv(self, file_path, tsv_data):
        """Write ``tsv_data`` (an iterable of rows) to ``file_path``, tab-delimited."""
        # newline='' lets the csv module control line endings itself.
        with open(file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file, delimiter='\t')
            writer.writerows(tsv_data)

    def write_to_csv(self, csv_data):
        """Write ``csv_data`` to ``<directory>/<brand>_output_<dd_mm_YYYY>.csv``.

        The date is today's local date. ``self.directory`` and ``self.brand``
        must already be set on the instance; otherwise this raises
        ``AttributeError``. An existing file with the same name is
        overwritten.
        """
        current_date = datetime.datetime.now().strftime("%d_%m_%Y")
        file_path = f'{self.directory}/{self.brand}_output_{current_date}.csv'
        # Write data to CSV
        with open(file_path, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file, delimiter=',')
            writer.writerows(csv_data)
        print(f"Data saved to '{file_path}'")

    def parse_directory(self, directory_path):
        """Parse every ``.txt``/``.html`` file in a directory into one table.

        Each matching file is passed to ``parse_website`` with the filename
        (minus extension) as its category. The header row is taken from the
        first file that yields any rows and gets an extra ``'filename'``
        column; every data row gets the source filename appended. The
        combined table is written with ``write_to_csv`` and also returned.

        Args:
            directory_path: Directory to scan (not recursive).

        Returns:
            List of rows: one header row (if any file produced output)
            followed by all data rows.
        """
        # List the directory once so the count and the loop cannot disagree,
        # and sort because os.listdir returns entries in arbitrary order.
        page_files = sorted(
            name for name in os.listdir(directory_path)
            if name.endswith(self._PAGE_EXTENSIONS)
        )
        total_files = len(page_files)
        print(f"Found {total_files} HTML files in the directory.")
        print("Processing files...")

        all_data = []
        header_added = False
        for processed_files, filename in enumerate(page_files, start=1):
            file_path = os.path.join(directory_path, filename)
            category = os.path.splitext(filename)[0]  # Use the filename as the category
            tsv_output = self.parse_website(file_path, category)

            # A file that yields no rows has no header to index; skip its
            # content but still count it as processed below.
            if tsv_output:
                if not header_added:
                    header_row = tsv_output[0]
                    header_row.append('filename')  # New column name for the source file
                    all_data.append(header_row)  # Header row is added only once
                    header_added = True
                # Add the filename as a new column to the parsed data
                for row in tsv_output[1:]:
                    row.append(filename)
                    all_data.append(row)

            # total_files >= 1 whenever the loop body runs, so no division by zero.
            progress = (processed_files / total_files) * 100
            print(f"Progress: {progress:.2f}% ({processed_files}/{total_files} files processed)")

        print("Writing data to CSV file...")
        self.write_to_csv(all_data)
        return all_data

    def parse_website(self, source, category):
        """Parse one saved page and return its rows as lists of strings.

        Reads ``source`` from disk, parses it with BeautifulSoup's
        ``html.parser``, hands the soup and ``category`` to
        ``self.parse_product_blocks`` (not defined on this class), and
        stringifies the result with ``convert_to_tsv``.
        """
        html_content = self.read_from_file(source)
        soup = BeautifulSoup(html_content, 'html.parser')
        parsed_data = self.parse_product_blocks(soup, category)
        return self.convert_to_tsv(parsed_data)