-
Notifications
You must be signed in to change notification settings - Fork 0
/
helper.py
131 lines (103 loc) · 4.74 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
from urlextract import URLExtract
from wordcloud import WordCloud
from collections import Counter
import pandas as pd
import emoji
# Compute headline statistics (messages, words, media files, links) for the chat data
def show_stat(selected_user, df):
    """Return headline statistics for the selected user's messages.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with at least ``User`` and ``Message`` columns;
            ``Message`` values must be strings.

    Returns:
        A 4-tuple ``(total_msg, total_word, total_media_file, link)``:
        the number of rows, the number of whitespace-separated words, the
        number of messages equal to ``'<Media omitted>'`` and the number of
        URLs reported by ``URLExtract.find_urls``.
    """
    if selected_user != 'Overall':
        df = df[df['User'] == selected_user]

    messages = df['Message']

    # 1. Every row counts as one message.
    message_count = df.shape[0]

    # 2. Words are whitespace-separated tokens, summed over all messages.
    word_count = sum(len(text.split()) for text in messages)

    # 3. Media attachments show up as this exact placeholder text.
    media_count = len(df[messages == '<Media omitted>'])

    # 4. Links: count every URL the extractor finds in each message.
    url_finder = URLExtract()
    link_count = sum(len(url_finder.find_urls(text)) for text in messages)

    return message_count, word_count, media_count, link_count
# Function will show the busiest users
def show_busy_user(df):
    """Rank users by how many rows of ``df`` they account for.

    Args:
        df: DataFrame with a ``User`` column.

    Returns:
        A pair ``(top, share)``. ``top`` holds the message counts of the
        first five users in descending order of count. ``share`` holds, for
        every user, the count as a percentage of all rows in ``df``,
        rounded to two decimals.
    """
    per_user = df['User'].value_counts()

    top_users = per_user.head()  # head() defaults to the first five entries
    percentage = (per_user / df.shape[0]) * 100

    return top_users, percentage.round(2)
# Function will show word cloud
def show_word_cloud(selected_user, df, stop_words_path='stop_hinglish.txt'):
    """Build a word cloud from the selected user's messages.

    Rows whose ``User`` is ``'Group_notification'`` and rows whose
    ``Message`` is ``'<Media omitted>'`` are ignored, and every word listed
    in the stop-word file is removed before the cloud is generated.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User`` and ``Message`` columns. It is not
            modified.
        stop_words_path: Path of a whitespace-separated stop-word file.
            Defaults to ``'stop_hinglish.txt'`` in the working directory.

    Returns:
        The object returned by ``WordCloud.generate`` for the cleaned text.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    if selected_user != 'Overall':
        df = df[selected_user == df['User']]

    keep = (df['User'] != 'Group_notification') & (df['Message'] != '<Media omitted>')
    messages = df.loc[keep, 'Message']

    # Read the file inside a context manager so the handle is always closed.
    # A set gives whole-word membership; testing against the raw file text
    # would be a substring match and would also drop any word that merely
    # appears inside a stop word (e.g. "he" inside "the").
    with open(stop_words_path, encoding='utf-8') as f:
        stop_words = set(f.read().lower().split())

    def remove_stop_word(msg):
        return " ".join(wd for wd in msg.lower().split() if wd not in stop_words)

    wc = WordCloud(width=500, height=500, min_font_size=10, background_color='white')

    # Work on a separate Series instead of assigning into a filtered slice of
    # the caller's frame (avoids pandas' SettingWithCopy warning).
    cleaned = messages.apply(remove_stop_word)
    return wc.generate(cleaned.str.cat(sep=" "))
# Function will show the most common words
def show_most_com_word(selected_user, df, top_n=15, stop_words_path='stop_hinglish.txt'):
    """Return the most frequent non-stop words in the selected messages.

    Rows whose ``User`` is ``'Group_notification'`` and rows whose
    ``Message`` is ``'<Media omitted>'`` are ignored. Messages are
    lower-cased and split on whitespace before counting.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User`` and ``Message`` columns.
        top_n: Maximum number of words to return (default 15).
        stop_words_path: Path of a whitespace-separated stop-word file.
            Defaults to ``'stop_hinglish.txt'`` in the working directory.

    Returns:
        A DataFrame with column ``0`` (the word) and column ``1`` (its
        count), ordered from most to least frequent. Both columns exist
        even when no word qualifies.

    Raises:
        OSError: If the stop-word file cannot be opened.
    """
    if selected_user != 'Overall':
        df = df[selected_user == df['User']]

    keep = (df['User'] != 'Group_notification') & (df['Message'] != '<Media omitted>')
    messages = df.loc[keep, 'Message']

    # Read the file inside a context manager so the handle is always closed.
    # A set gives whole-word membership; testing against the raw file text
    # would be a substring match and would also drop any word that merely
    # appears inside a stop word (e.g. "he" inside "the").
    with open(stop_words_path, encoding='utf-8') as f:
        stop_words = set(f.read().lower().split())

    word_counts = Counter(
        wd
        for msg in messages
        for wd in msg.lower().split()
        if wd not in stop_words
    )

    # Explicit column labels keep columns 0 and 1 present for an empty result.
    return pd.DataFrame(word_counts.most_common(top_n), columns=[0, 1])
# Function will show emoji analysis
def show_emoji_ana(selected_user, df):
    """Count how often each emoji occurs in the selected messages.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User`` and ``Message`` columns; ``Message``
            values must be strings.

    Returns:
        A DataFrame with column ``0`` (the emoji) and column ``1`` (its
        count), ordered from most to least frequent. Both columns exist
        even when no emoji is found.
    """
    if selected_user != 'Overall':
        df = df[selected_user == df['User']]

    # emoji_list() reports whole emoji, so sequences made of several code
    # points (flags, skin-tone variants, ZWJ families) are counted as one
    # emoji rather than being split into their individual characters.
    emoji_counts = Counter(
        match['emoji']
        for msg in df['Message']
        for match in emoji.emoji_list(msg)
    )

    # most_common() with no argument returns every entry, most frequent first.
    return pd.DataFrame(emoji_counts.most_common(), columns=[0, 1])
def show_month_timeline(selected_user, df):
    """Count messages per (year, month) and label each period.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User``, ``Year``, ``Month_name``, ``Month`` and
            ``Message`` columns.

    Returns:
        A DataFrame with columns ``Year``, ``Month_name``, ``Month``,
        ``Message`` (count of non-null messages in the period) and ``Time``
        (``"<Month_name>-<Year>"``), sorted by ``Year`` then ``Month``.
    """
    if selected_user != 'Overall':
        df = df[selected_user == df['User']]

    timeline = (
        df.groupby(['Year', 'Month_name', 'Month'])['Message']
        .count()
        .reset_index()
    )

    # groupby orders rows by Year and then alphabetically by Month_name,
    # which puts e.g. April before January. Re-sort on Month so the timeline
    # reads in calendar order.
    # NOTE(review): assumes ``Month`` holds the numeric month; confirm
    # against the preprocessing step.
    timeline = timeline.sort_values(['Year', 'Month']).reset_index(drop=True)

    # Build the label column in one vectorized step instead of row by row.
    timeline['Time'] = (
        timeline['Month_name'].astype(str) + "-" + timeline['Year'].astype(str)
    )
    return timeline
def show_day_timeline(selected_user, df):
    """Count messages per distinct ``Full_date`` value.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User``, ``Full_date`` and ``Message`` columns.

    Returns:
        A DataFrame with columns ``Full_date`` and ``Message``, where
        ``Message`` is the number of non-null messages on that date.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]

    per_day = scoped.groupby(scoped['Full_date'])['Message'].count()
    return per_day.reset_index()
def show_day_name_ana(selected_user, df):
    """Count rows per ``Day_name`` value.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User`` and ``Day_name`` columns.

    Returns:
        The ``value_counts()`` Series of ``Day_name``, most frequent first.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]
    return scoped['Day_name'].value_counts()
def show_month_name_ana(selected_user, df):
    """Count rows per ``Month_name`` value.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User`` and ``Month_name`` columns.

    Returns:
        The ``value_counts()`` Series of ``Month_name``, most frequent first.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]
    return scoped['Month_name'].value_counts()
def time_wise_active(selected_user, df):
    """Build a ``Day_name`` x ``Period`` table of message counts.

    Args:
        selected_user: Value to match against the ``User`` column, or
            ``'Overall'`` to use every row of ``df``.
        df: DataFrame with ``User``, ``Day_name``, ``Period`` and
            ``Message`` columns.

    Returns:
        A pivot table indexed by ``Day_name`` with one column per
        ``Period``; each cell is the number of messages, with 0 for
        combinations that never occur.
    """
    scoped = df if selected_user == 'Overall' else df[df['User'] == selected_user]

    counts = scoped.pivot_table(
        index='Day_name',
        columns='Period',
        values='Message',
        aggfunc='count',
    )
    # Combinations with no messages come back as NaN; report them as 0.
    return counts.fillna(0)