-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
65 lines (53 loc) · 2.02 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from pandas import DataFrame
# Load the whole chat export into memory as one string.
# NOTE: no encoding is passed, so the platform default encoding is used.
with open('../data/Chats.txt', 'r') as chat_file:
    raw_text = chat_file.read()

# Empty frame with the two columns that the next step fills in.
df: DataFrame = DataFrame(columns=['sender', 'msg'])

# Break the text into lines and skip the first one. From every remaining
# line the first four space-separated tokens are discarded (presumably the
# date/time prefix of the export format -- confirm against the data file).
chats: list = [
    ' '.join(line.split(' ')[4:])
    for line in raw_text.split('\n')[1:]
]
# Turn every chat line into a {'sender', 'msg'} row and build the DataFrame.
#
# The rows are collected in a plain list and the frame is constructed once:
# DataFrame.append was removed in pandas 2.0, and calling it per row copied
# the entire frame on every iteration.
rows: list = []
for chat in chats:
    # Split on the first colon only, so colons inside the message survive.
    # A line without any colon yields the whole line as the name and an
    # empty message.
    name, _, text = chat.partition(':')
    rows.append({
        # remove names from the data: only 'Me' / 'Other' is kept
        'sender': 'Me' if name == 'Shabd Saran' else 'Other',
        # drop the first character after the colon (the separating space
        # in "name: text")
        'msg': text[1:],
    })
df = DataFrame(rows, columns=['sender', 'msg'])
# Discard the rows whose message is exactly the '<Media omitted>' placeholder,
# then renumber the remaining rows from 0.
media_rows = df.index[df['msg'].eq('<Media omitted>')]
df.drop(index=media_rows, inplace=True)
df.reset_index(drop=True, inplace=True)
# Group the conversation: consecutive rows from the same sender are merged
# into a single row whose message is the individual messages joined by '. '.
#
# The frame is rebuilt from the merged runs in one pass instead of writing
# back through `df['msg'].iloc[...] = ...`; that form is chained assignment,
# which pandas warns about and which does not update `df` at all when
# Copy-on-Write is active. A single pass also avoids rescanning each run
# once per row.
runs: list = []  # one (sender, [messages...]) entry per run of equal senders
for run_sender, run_msg in zip(df['sender'], df['msg']):
    if runs and runs[-1][0] == run_sender:
        # same sender as the previous row: extend the current run
        runs[-1][1].append(run_msg)
    else:
        # sender changed (or first row): start a new run
        runs.append((run_sender, [run_msg]))

# The new frame gets a fresh 0..n-1 index, so no separate reset is needed.
df = DataFrame(
    [(run_sender, '. '.join(parts)) for run_sender, parts in runs],
    columns=['sender', 'msg'],
)
# Remove pairs with no replies: whenever one of two neighbouring rows has an
# empty message, both rows of that pair are dropped.
#
# A `while` loop is used because the position must jump past a removed pair.
# Re-assigning the loop variable of `for i in range(...)` has no effect on
# the next iteration, so with a `for` loop the windows overlap and the
# non-empty rows on *both* sides of a blank message get removed.
#
# Assumes `df` has a default 0..n-1 index here, so positions equal labels.
messages: list = df['msg'].tolist()
drop_indices: list = []
i = 0
while i < len(messages) - 1:
    if '' in (messages[i], messages[i + 1]):
        drop_indices.extend((i, i + 1))
        i += 2  # both rows of the pair are gone; continue after them
    else:
        i += 1

# A blank message in the last position that was not consumed as part of a
# pair is dropped on its own, so no empty message survives this step.
if i == len(messages) - 1 and messages[i] == '':
    drop_indices.append(i)

df.drop(index=drop_indices, inplace=True)
df.reset_index(drop=True, inplace=True)  # reset index values
# Store the data in train.from & train.to: messages whose sender is 'Me' are
# written to train.from, all other messages to train.to, one per line.
# Both files are opened in append mode, so existing content is kept.
# NOTE: no encoding is passed, so the platform default encoding is used.
with open('train.from', 'a') as train_from, open('train.to', 'a') as train_to:
    for who, text in zip(df['sender'], df['msg']):
        target = train_from if who == 'Me' else train_to
        target.write(text + '\n')