-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchatOnlineSoln.py
85 lines (73 loc) · 2.46 KB
/
chatOnlineSoln.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
import sys
import re
def plot_bar(df):
    """Display a bar chart of the number of texts sent by each sender.

    Args:
        df: DataFrame with a 'sender' column (used as x tick labels) and a
            'numOfTxt' column (used as bar heights).

    Returns:
        None. The figure is rendered through ``plt.show()``.
    """
    senders = df['sender']
    # One bar slot per sender row; the same positions anchor the tick labels.
    positions = np.arange(len(senders))

    plt.bar(positions, df['numOfTxt'])
    plt.xlabel('Sender', fontsize=13)
    plt.ylabel('No of Texts', fontsize=13)
    plt.xticks(positions, senders, fontsize=10, rotation=15)
    plt.title('Texts sent by each sender in the convo')
    plt.show()
def parse_file(text_file):
    """Parse an exported chat log into a DataFrame with one row per message.

    A line matching ``<n>/<n>/<n>, <n>:<n> <word> - `` starts a new record;
    every other line is treated as a continuation of the record before it,
    so a multi-line message ends up in a single row. Lines that appear
    before the first record header are ignored.

    Within a record, the timestamp is the text before the first ' - ', the
    sender is the text between that ' - ' and the first ': ', and the message
    is everything after that ': ' with newlines replaced by spaces.

    Args:
        text_file: Path of the chat export to read.

    Returns:
        A DataFrame with columns 'timestamp' (converted with
        ``pd.to_datetime``), 'sender' and 'message'. Records without a
        ``sender: `` prefix and records whose message is exactly
        '<Media omitted>' are removed, and the index is reset.
    """
    header = re.compile(r"^\d+/\d+/\d+, \d+:\d+ \w+ - ")

    # Group physical lines into logical records before splitting fields, so
    # that a continuation line never produces a row of its own.
    records = []
    current = None
    with open(text_file, 'r') as chat:
        for line in chat:
            if header.search(line):
                if current is not None:
                    records.append(current)
                current = line
            elif current is not None:
                current += line
    if current is not None:
        records.append(current)

    timestamps = []
    senders = []
    messages = []
    for record in records:
        stamp, _, rest = record.partition(' - ')
        # Split on the first ': ' after the timestamp separator rather than
        # on a bare '-', so sender names containing a hyphen stay intact.
        name, found, text = rest.partition(': ')
        timestamps.append(stamp)
        if found:
            senders.append(name.strip())
            messages.append(text.replace('\n', ' ').strip())
        else:
            # No "sender: " prefix: recorded with an empty sender so the row
            # is filtered out below.
            senders.append('')
            messages.append('')

    df = pd.DataFrame(
        {'timestamp': timestamps, 'sender': senders, 'message': messages},
        columns=['timestamp', 'sender', 'message'])
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # Boolean filtering returns a new frame; the previous per-row
    # ``df.drop(...)`` discarded its result and therefore removed nothing.
    keep = (df['sender'] != '') & (df['message'] != '<Media omitted>')
    return df[keep].reset_index(drop=True)
def mssgPerUser(df):
    """Count how many messages each sender wrote.

    Args:
        df: DataFrame with a 'sender' column holding one entry per message.

    Returns:
        A new DataFrame with columns 'sender' and 'numOfTxt', containing one
        row per distinct sender in order of first appearance.
    """
    names = list(df['sender'].unique())
    # A single counting pass replaces the per-row ``names.index`` lookup.
    counts = Counter(df['sender'])
    return pd.DataFrame(
        {'sender': names, 'numOfTxt': [counts[name] for name in names]},
        columns=['sender', 'numOfTxt'])
# Script body: parse the chat export named on the command line and derive
# per-message statistics.
dataDf = parse_file(sys.argv[1])
dataDf['characters'] = dataDf.message.apply(len)
dataDf['words'] = dataDf.message.apply(lambda x: len(x.split()))

# Read the stop-word list as individual words. Keeping the file contents as a
# single string made the filter below iterate over characters, not words.
# NOTE(review): assumes the file holds whitespace-separated words - confirm.
with open("stopwordsHinglish.txt") as stopword_file:
    stopwords = stopword_file.read().split()
stopword_set = set(stopwords)

# Keywords are the message tokens that are not stop words. Building a new
# list drops every occurrence, whereas list.remove only drops the first one.
dataDf['keywords'] = dataDf.message.apply(
    lambda x: [word for word in x.split() if word not in stopword_set])
# print dataDf
# words = ''
# for i in dataDf.message.values:
# words += '{} '.format(i.lower()) # make words lowercase
# print (pd.DataFrame(Counter(words.split()).most_common(), columns=['word', 'frequency']))