-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathdata_processing.py
More file actions
131 lines (107 loc) · 4.53 KB
/
data_processing.py
File metadata and controls
131 lines (107 loc) · 4.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 7 17:53:55 2018
@author: Anan
"""
import numpy as np
import pickle
from collections import Counter
import random
import tensorflow as tf
# Length every encoded comment is padded or truncated to (passed as `maxlen` to pad_sequences).
max_comment_length = 200
# Fraction of the shuffled samples that split_dataset places in the training split.
split_frac = 0.8
# Number of label classes; width of the one-hot label matrix built at module level.
num_labels = 3
def is_CN_char(ch):
    """Return True when *ch* lies in the range U+4E00..U+9FA5 (inclusive).

    The check is a plain string comparison against the two boundary
    characters, so it is intended to be called with a single character.
    """
    return u'\u4e00' <= ch <= u'\u9fa5'
def cut(string):
    """Segment *string* with ``jieba.cut`` and return the pieces as a list."""
    # NOTE(review): `jieba` is not imported in the visible import block of
    # this file - confirm it is brought into scope before this is called.
    segments = jieba.cut(string)
    return list(segments)
def get_stopwords(filename = "D:/Code/Pycharm/Data_source/chinese_stopwords.txt"):
    """Read a stop-word file and return its lines as a list of strings.

    Args:
        filename: Path of a UTF-8 encoded text file with one stop word per
            line. The default is a machine-specific absolute path.

    Returns:
        A list with one entry per line of the file, in file order, each
        stripped of leading and trailing whitespace. Blank lines are kept as
        empty strings.

    Raises:
        OSError: If the file cannot be opened.
    """
    # The context manager guarantees the handle is closed even if reading or
    # decoding fails part-way through.
    with open(filename, encoding='utf-8') as stopwords_file:
        return [line.strip() for line in stopwords_file]
def convert2simple(word):
    """Run *word* through an ``OpenCC`` converter configured with ``'tw2sp'``.

    Presumably this maps Traditional Chinese text to Simplified Chinese, as
    the function name suggests - confirm against the OpenCC configuration.
    """
    # NOTE(review): `OpenCC` is not imported in the visible import block of
    # this file - confirm it is brought into scope before this is called.
    return OpenCC('tw2sp').convert(word)
def clean_sentence(sentence, stopwords=None):
    """Normalise a raw comment into a space-separated string of tokens.

    Steps: keep only the characters accepted by ``is_CN_char``, pass the
    result through ``convert2simple``, segment it with ``cut``, then drop
    tokens of length 1 and tokens found in the stop-word collection.

    Args:
        sentence: The raw comment text.
        stopwords: Optional iterable of stop words. When omitted, the list is
            loaded with ``get_stopwords()`` (which reads the stop-word file),
            so callers cleaning many sentences can load it once and pass it in.

    Returns:
        The surviving tokens joined by single spaces; an empty string when no
        token survives.
    """
    if stopwords is None:
        stopwords = get_stopwords()
    # A set gives constant-time membership tests instead of scanning the whole
    # stop-word list once per token.
    stopword_set = set(stopwords)
    chinese_only = ''.join(filter(is_CN_char, sentence))
    simplified = convert2simple(chinese_only)
    kept = [w for w in cut(simplified) if len(w) > 1 and w not in stopword_set]
    return ' '.join(kept)
def save_txt(data, filename):
    """Serialise *data* to *filename* with ``pickle``.

    Despite the name, the file is written in binary mode and holds a pickle
    stream, not plain text. An existing file is overwritten.
    """
    with open(filename, "wb") as sink:
        pickle.dump(data, sink)
def get_comment_each_star():
    """Clean the raw comments and write one pickle file per star rating.

    Reads ``douban_movie_comments.csv`` (decoded as gb18030), replaces missing
    comments with empty strings, cleans every comment with
    ``clean_sentence``, and for each star value 1..5 saves the non-empty
    cleaned comments to ``comments_star<N>.txt`` via ``save_txt``.
    """
    # Imported locally because the file's import block does not provide `pd`.
    import pandas as pd

    content = pd.read_csv('douban_movie_comments.csv', encoding='gb18030')
    content['comment'] = content['comment'].fillna('').apply(clean_sentence)
    for star in range(1, 6):
        # A boolean mask selects the rows in one pass instead of issuing two
        # positional row lookups per row.
        selected = content.loc[content['star'] == star, 'comment']
        # Comments that became empty after cleaning are dropped, so no extra
        # zero-length check is needed before saving.
        comments = [c for c in selected if len(c) != 0]
        save_txt(comments, "comments_star{}.txt".format(star))
def read_txt(filename):
    """Load and return the object pickled in *filename*.

    The file is opened in binary mode and decoded with ``pickle.load``.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, "rb") as source:
        return pickle.load(source)
def get_random_comment(data, length=6000):
    """Return *length* items of *data* chosen at random without replacement.

    Distinct positions are drawn with ``random.sample`` and the items are
    returned in the order the positions were drawn. ``random.sample`` raises
    ``ValueError`` when *length* exceeds ``len(data)``.
    """
    chosen_positions = random.sample(range(len(data)), length)
    picked = []
    for position in chosen_positions:
        picked.append(data[position])
    return picked
def get__binary_comment(star, length):
    """Return *length* randomly chosen comments for the given *star* rating.

    The comments are loaded from ``comments_star<star>.txt`` with
    ``read_txt`` and sampled with ``get_random_comment``.
    """
    pool = read_txt("comments_star{}.txt".format(star))
    return get_random_comment(pool, length=length)
def word_to_id(vocab):
    """Build word<->id lookup tables ordered by descending word frequency.

    Args:
        vocab: Iterable of words, with repetitions.

    Returns:
        A ``(word_to_id, id_to_word)`` pair of dicts. Ids start at 0 for the
        most frequent word; words with equal counts keep their order of first
        appearance.
    """
    # NOTE(review): id 0 is also the fill value used by pad_sequences, so the
    # most frequent word and padding share an id - confirm this is intended.
    ranked = [token for token, _ in Counter(vocab).most_common()]
    forward = {}
    backward = {}
    for index, token in enumerate(ranked):
        forward[token] = index
        backward[index] = token
    return forward, backward
def comment_to_id(word_to_id, comments):
    """Encode each comment as the list of ids of its whitespace-split words.

    Args:
        word_to_id: Mapping from word to integer id.
        comments: Iterable of strings whose words are separated by whitespace.

    Returns:
        A list with one list of ids per comment, in input order.

    Raises:
        KeyError: If a word is missing from *word_to_id*.
    """
    return [
        [word_to_id[token] for token in text.split()]
        for text in comments
    ]
def pad_sequences(comment_to_id, maxlen, padding='post', truncating='post'):
    """Pack variable-length id sequences into a fixed-width integer matrix.

    Args:
        comment_to_id: Sequence of id sequences (one per comment).
        maxlen: Number of columns in the result.
        padding: ``'post'`` places a short sequence at the start of its row
            (zeros after it); ``'pre'`` places it at the end (zeros before).
        truncating: ``'post'`` keeps the first *maxlen* ids of a long
            sequence; ``'pre'`` keeps the last *maxlen* ids.

    Returns:
        An integer ``numpy`` array of shape ``(len(comment_to_id), maxlen)``
        filled with 0 wherever no id was written. A row whose applicable
        option is neither ``'pre'`` nor ``'post'`` is left as zeros.
    """
    features = np.zeros((len(comment_to_id), maxlen), dtype=int)
    for i, comment in enumerate(comment_to_id):
        size = len(comment)
        if size == 0:
            # Nothing to copy. This also avoids the slice `-0:`, which selects
            # the whole row and made 'pre' padding fail on empty sequences.
            continue
        if size > maxlen:
            if truncating == 'post':
                features[i, :] = comment[:maxlen]
            elif truncating == 'pre':
                features[i, :] = comment[size - maxlen:]
        elif padding == 'pre':
            features[i, -size:] = comment
        elif padding == 'post':
            features[i, :size] = comment
    return features
def split_dataset(pad_comments, labels, frac=None):
    """Shuffle the samples and split them into training and test parts.

    Args:
        pad_comments: Sequence of feature rows.
        labels: Sequence of labels aligned with *pad_comments*.
        frac: Fraction of samples placed in the training part. Defaults to
            the module-level ``split_frac`` when omitted.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` as tuples. Each feature row
        stays paired with its label; the order is randomised with
        ``random.shuffle``. Empty input yields four empty tuples.
    """
    if frac is None:
        frac = split_frac
    split_index = int(len(pad_comments) * frac)
    data_list = list(zip(pad_comments, labels))
    if not data_list:
        # zip(*[]) yields nothing to unpack, so handle empty input explicitly.
        return (), (), (), ()
    random.shuffle(data_list)
    pad_comments, labels = zip(*data_list)
    x_train, x_test = pad_comments[:split_index], pad_comments[split_index:]
    y_train, y_test = labels[:split_index], labels[split_index:]
    return x_train, y_train, x_test, y_test
# Build the three-class dataset: sample a fixed number of cleaned comments per
# star rating. 1- and 2-star comments form the negative class, 3-star comments
# the middle class and 5-star comments the positive class; 4-star comments are
# not used.
comment_star1 = get__binary_comment(1,10000)
comment_star2 = get__binary_comment(2,5000)
comment_star3 = get__binary_comment(3,14500)
negetive_comment = comment_star1 + comment_star2
positive_comment = get__binary_comment(5,15000)
all_comment = positive_comment + comment_star3 + negetive_comment

# Class ids follow the concatenation order above (2 = positive, 1 = 3-star,
# 0 = negative). Counts are taken from the lists themselves so the labels
# cannot drift out of sync with the sample sizes requested above.
labels = [2]*len(positive_comment) + [1]*len(comment_star3) + [0]*len(negetive_comment)
# One-hot encode: compare each class id against arange(num_labels).
labels = (np.arange(num_labels) == np.array(labels)[:,None]).astype(np.float32)

vocab = ' '.join(all_comment).split()
vocab.append('unknown')
# NOTE: the next two statements rebind the function names `word_to_id` and
# `comment_to_id` to their results, so the functions cannot be called again
# after this point.
word_to_id, id_to_word = word_to_id(vocab)
comment_to_id = comment_to_id(word_to_id,all_comment)
pad_comments = pad_sequences(comment_to_id,maxlen=max_comment_length,padding='post',truncating='post')
x_train,y_train,x_test,y_test = split_dataset(pad_comments,labels)

# Persist the splits and the vocabulary (each file is written exactly once).
save_txt(x_train,"x_train_sentiment")
save_txt(y_train,"y_train_sentiment")
save_txt(x_test,"x_test_sentiment")
save_txt(y_test,"y_test_sentiment")
# NOTE(review): id_to_word is built but never saved - confirm whether it
# should be persisted alongside word_to_id.
save_txt(word_to_id,"word_to_id_sentiment")