-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_operation.py
135 lines (118 loc) · 4.4 KB
/
data_operation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
@Description:
@Author: Ambrose
@Date: 2024-04-17 18:48:56
@LastEditTime: 2024-04-22 10:41:34
@LastEditors: Ambrose
"""
import os
from PIL import Image
import numpy as np
import jieba
import openpyxl
from wordcloud import WordCloud
from setting import setting as st
class data_operate(object):
    """Word-frequency pipeline for Chinese text files.

    Reads each input file listed in ``setting.file_info``, tokenizes it with
    jieba, removes stop words and single-character tokens, counts word
    frequencies, and exports the result as txt / csv / xlsx and a word-cloud
    image.
    """

    def __init__(self):
        # Input file names are the keys of the project's setting table.
        self.file_path = st.file_info.keys()
        self.text = ""    # raw contents of the file currently being processed
        self.words = []   # cleaned tokens of the current file
        self.dict = {}    # word -> occurrence count, sorted by count descending

    # 数据读入 — read one input file
    def data_read(self, file_name):
        """Load the whole file (UTF-8) into ``self.text``."""
        with open(file_name, "r", encoding="utf-8") as f:
            self.text = f.read()

    # 数据分割与数据清洗 — tokenize and clean
    def data_split(self):
        """Tokenize ``self.text``, filter stop words, and build the
        frequency dict sorted by descending count."""
        self.dict = {}
        # BUG FIX: reset the token list per file. Previously tokens
        # accumulated across files, so every later file's counts and word
        # cloud silently included all earlier files' words.
        self.words = []
        # Teach jieba the project-specific words so they are not split.
        if len(st.special_word_list) != 0:
            for word in st.special_word_list:
                jieba.suggest_freq(word, tune=True)
        # Load the stop-word list, one word per line.
        # BUG FIX: the file was previously read in binary mode, producing a
        # list of ``bytes``; jieba tokens are ``str``, so the membership
        # test below compared str to bytes and never filtered anything.
        # A set also makes each membership test O(1).
        with open(st.stop_words_path, "r", encoding="utf-8") as stop_dic:
            stop_word_list = set(stop_dic.read().splitlines())
        # Cut and clean: keep tokens that are not stop words, not a blank,
        # and longer than one character.
        for seg in jieba.cut(self.text):
            if seg not in stop_word_list and seg != " " and len(seg) != 1:
                # NOTE(review): this replace appears to be a no-op (space ->
                # space); it may originally have replaced a full-width space
                # (U+3000) that was normalized in transit — confirm.
                self.words.append(seg.replace(" ", " "))
        # Count occurrences, then rebuild the dict ordered by descending
        # count (dicts preserve insertion order, so later iteration yields
        # the most frequent words first).
        counts = {}
        for word in self.words:
            counts[word] = counts.get(word, 0) + 1
        self.dict = dict(sorted(counts.items(), key=lambda x: x[1], reverse=True))

    # 以txt方式数据存储 — save as txt
    def data_storage_txt(self, file_name):
        """Write ``word: count`` lines to ``./output/txt/<file_name>``."""
        # Mode "w" truncates an existing file, so the previous
        # exists()/remove() dance was redundant and has been dropped.
        with open("./output/txt/{}".format(file_name), "w", encoding="utf-8") as f:
            for word, count in self.dict.items():
                f.write(f"{word}: {count}\n")

    # 以csv方式数据存储 — save as csv
    def data_storage_csv(self, file_name):
        """Write ``word,count`` lines to ``./output/csv/<name>.csv``."""
        # NOTE(review): ``file_name[:4]`` keeps only the first 4 characters
        # of the name — correct for 4-character base names (e.g. "三国演义.txt")
        # but it truncates anything else; possibly ``[:-4]`` (strip ".txt")
        # was intended. Preserved as-is to keep output paths stable — confirm.
        # NOTE(review): values are not CSV-escaped; fine as long as jieba
        # tokens never contain commas.
        with open(
            "./output/csv/{}.csv".format(file_name[:4]), "w", encoding="utf-8"
        ) as f:
            for word, count in self.dict.items():
                f.write(f"{word},{count}\n")

    # 以excel方式数据存储 — save as xlsx
    def data_storage_xlsx(self, file_name):
        """Write a two-column workbook (word, count) to
        ``./output/excel/<name>.xlsx``, auto-sizing both columns."""
        # openpyxl's save() overwrites an existing workbook, so no need to
        # delete it first.
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        sheet["A1"] = "词"
        sheet["B1"] = "出现数量"
        row = 2
        for word, count in self.dict.items():
            sheet.cell(row, column=1, value=word)
            sheet.cell(row, column=2, value=count)
            row += 1
        # Widen each column to fit its longest value (plus padding).
        for column_cells in sheet.columns:
            length = max(len(str(cell.value)) for cell in column_cells)
            sheet.column_dimensions[column_cells[0].column_letter].width = length + 2
        workbook.save("./output/excel/{}.xlsx".format(file_name[:4]))

    # 生成词云 — render the word cloud image
    def word_cloud_generate(self, file_name):
        """Render the current file's tokens as a shaped word cloud and save
        it to ``./output/word_cloud/wordCloud<name>.png``."""
        # The background image provides the cloud's silhouette mask.
        shape_mask = np.array(Image.open(st.background_img))
        text = " ".join(self.words)
        word_cloud = WordCloud(
            background_color="white",
            font_path=st.font_path,
            mask=shape_mask,
            width=1000,
            height=700,
            max_words=50,
        ).generate(text)
        word_cloud.to_file("./output/word_cloud/wordCloud{}.png".format(file_name[:4]))

    # 数据操作入口 — pipeline entry point
    def run(self, save):
        """Process every configured input file.

        ``save`` selects the export format: "txt", "csv", "xlsx", or "all"
        for every format. A word cloud is generated for each file.
        """
        for file_name in self.file_path:
            self.data_read("./resource/{}".format(file_name))
            self.data_split()
            if save == "txt":
                self.data_storage_txt(file_name)
            elif save == "csv":
                self.data_storage_csv(file_name)
            elif save == "xlsx":
                self.data_storage_xlsx(file_name)
            elif save == "all":
                self.data_storage_txt(file_name)
                self.data_storage_csv(file_name)
                self.data_storage_xlsx(file_name)
            # NOTE(review): source indentation was lost in transit; the word
            # cloud is assumed to be generated for every file regardless of
            # ``save`` (identical behavior under save="all") — confirm.
            self.word_cloud_generate(file_name)
if __name__ == "__main__":
    # Run the full pipeline, exporting results in every supported format.
    data_operate().run(save="all")