import spacy
import pytextrank
import pandas as pd
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

import embeddings
import clustering

# Shared VADER sentiment analyser (requires NLTK's 'vader_lexicon' data).
sid = SentimentIntensityAnalyzer()
def split_into_sentences(utterance):
    """
    Splits a SpaCy doc into sentences.
    (The 'sentencizer' component must be included in the SpaCy NLP pipeline
    so that sentence boundaries are available.)
    :utterance: SpaCy doc
    :return: list of SpaCy spans, one per sentence.
    """
    return [sentence for sentence in utterance.sents]
def get_sentence_count(utterance):
    """
    Gets the number of sentences in a given utterance.
    :utterance: SpaCy doc
    :return: an integer >= 0 indicating the number of sentences.
    """
    return len(split_into_sentences(utterance))
def get_word_count(utterance):
"""
This function counts the number of word tokens in an utterance.
:utterance: string of text or SpaCy doc
:return: integer >= 0 for the total number of tokens
"""
if isinstance(utterance, str):
return sum(1 for word in utterance.split())
else:
return sum(1 for word in utterance.text.split())
def get_word_density(utterance):
    """Calculates the average length of the words in the utterance.
    :utterance: SpaCy doc
    :return: float >= 0 for the average word length.
    """
    word_count = get_word_count(utterance)
    if word_count == 0:
        return 0.0  # avoid division by zero on empty utterances
    total_characters = 0
    for token in utterance:
        if not token.is_punct and token.is_alpha:
            total_characters += len(token.text)
    return total_characters / word_count
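# Illustrative example: for a doc built from "Hello, world!", get_word_count
# splits on whitespace (2 tokens) while the numerator counts only alphabetic
# characters (5 + 5 = 10), so get_word_density would return 10 / 2 = 5.0.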
def get_punctuation_count(utterance):
    """
    Returns the number of punctuation tokens in the input utterance.
    :utterance: SpaCy doc
    :return: integer >= 0 for the number of punctuation tokens.
    """
    return sum(1 for token in utterance if token.is_punct)
def get_uppercase_count(utterance):
"""
Gets the number of uppercase letters in an input SpaCy doc.
:utterance: SpaCy doc
:return: integer >= 0 for the number of uppercase letters.
"""
return sum(1 for c in utterance.text if c.isupper())
def get_lowercase_count(utterance):
"""
Gets the number of lowercase letters in an input SpaCy doc.
:utterance: SpaCy doc
:return: integer >= 0 for the number of lowercase letters.
"""
return sum(1 for c in utterance.text if c.islower())
def get_case_ratio(utterance):
    """Gets the proportion of letters that are uppercase.
    :utterance: SpaCy doc
    :return: float between 0 and 1 for the share of uppercase letters among all letters.
    """
    uppercase = get_uppercase_count(utterance)
    lowercase = get_lowercase_count(utterance)
    if uppercase + lowercase == 0:
        return 0.0  # avoid division by zero when the utterance has no letters
    return uppercase / (uppercase + lowercase)
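# Illustrative example: for a doc built from "Hello World" there are 2 uppercase
# and 8 lowercase letters, so get_case_ratio would return 2 / 10 = 0.2.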
def get_sentiment(utterance):
    """
    Returns the sentiment polarity between -1 and 1 indicating how
    negative or positive the utterance is.
    This function uses NLTK's VADER lexicon-based approach to score
    the sentiment.
    :utterance: SpaCy doc
    :return: float between -1 and 1 for utterance sentiment.
    """
    return sid.polarity_scores(utterance.text)['compound']
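# Illustrative example: VADER's compound score is positive for text such as
# "I love this product!" and negative for "This was a terrible experience.";
# values near 0 indicate roughly neutral sentiment.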
def get_cluster_labels(utterances):
    """
    Calculates cluster labels for all the utterances.
    If there are 100 or fewer documents, PCA is used for dimension reduction
    and k-means for clustering. If there are more than 100, UMAP and HDBSCAN
    are used together.
    :utterances: list of strings
    :return: list of integers indicating which cluster each utterance belongs to.
    """
    n_docs = len(utterances)
    if n_docs <= 100:
        document_embeddings = embeddings.pretrained_transformer_embeddings(utterances)
        dims = min(clustering.get_optimal_n_components(document_embeddings), 10)
        reduced = clustering.reduce_dimensions_pca(document_embeddings, dimensions=dims)
        return clustering.kmeans_clustering(reduced, max_num_clusters=min(n_docs, 80))
    else:
        document_embeddings = embeddings.word2vec_sif_embeddings(utterances, model_name=None)
        reduced = clustering.reduce_dimensions_umap(document_embeddings)
        return clustering.hdbscan_clustering(reduced)
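# Usage sketch (assumes the local `embeddings` and `clustering` helper modules
# are importable):
#   labels = get_cluster_labels(["refund my order", "reset my password", ...])
# `labels` has one integer per utterance; with the HDBSCAN branch, -1 typically
# marks noise points that were not assigned to any cluster.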
def get_top_words(utterances, n=20):
    """
    This function returns the top n words in a list of SpaCy docs.
    :utterances: list of SpaCy docs.
    :return: list of (word, count) tuples in descending order of count.
    """
    words = []
    for utterance in utterances:
        for word in utterance:
            if word.is_alpha:
                words.append(word.text)
    counter = Counter(words)
    top_words = counter.most_common(n)
    return top_words
def get_top_n_ngrams(corpus, N=1, n=None):
    """
    Returns the n most frequent N-grams in the corpus along with their counts,
    in descending order of frequency.
    :corpus: list of strings
    :N: size of the N-gram (1 = unigram, 2 = bigram, ...)
    :n: number of top N-grams to return (None returns all of them)
    :return: list of (ngram, count) tuples.
    """
    vec = CountVectorizer(ngram_range=(N, N)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
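# Illustrative example: get_top_n_ngrams(["the cat sat", "the cat ran"], N=2, n=2)
# would return something like [('the cat', 2), ('cat sat', 1)], since
# CountVectorizer with ngram_range=(2, 2) counts every bigram across the corpus.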
def get_top_pos(utterances, pos='VERB', n=20):
    """
    Takes a list of SpaCy docs, a part-of-speech tag (e.g. 'VERB', 'NOUN') and
    the number n of top words to return. It collects the lemma of every token
    with that part of speech and counts how often each lemma occurs.
    :utterances: list of SpaCy docs
    :pos: SpaCy coarse part-of-speech tag to filter on
    :n: number of top lemmas to return
    :return: list of (lemma, count) tuples for the top n words of type pos.
    """
    words = []
    for utterance in utterances:
        for word in utterance:
            if word.pos_ == pos:
                words.append(word.lemma_)
    counter = Counter(words)
    top_words = counter.most_common(n)
    return top_words
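# Illustrative example: get_top_pos(nlp_docs, pos='NOUN', n=5) would return the
# five most frequent noun lemmas, e.g. [('order', 12), ('refund', 9), ...]
# (counts here are hypothetical).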
def get_top_entities(utterances, entity_types, n=20):
    """
    Returns the n most frequent named entities of the given types.
    :utterances: list of SpaCy docs
    :entity_types: list of SpaCy entity labels to include (e.g. ['PERSON', 'ORG'])
    :n: number of top entities to return
    :return: list of (entity text, count) tuples.
    """
    entities = []
    for utterance in utterances:
        for word in utterance:
            if word.ent_type_ in entity_types:
                entities.append(word.text)
    counter = Counter(entities)
    top_entities = counter.most_common(n)
    return top_entities
def get_top_keywords(keywords, n=20):
    """
    Returns the n most frequent keyword lemmas.
    :keywords: list (one entry per document) of iterables of SpaCy spans or tokens
    :n: number of top keywords to return
    :return: list of (lemma, count) tuples.
    """
    words = []
    for doc_keywords in keywords:
        for keyword in doc_keywords:
            words.append(keyword.lemma_)
    counter = Counter(words)
    top_words = counter.most_common(n)
    return top_words
class text_EDA():
    def __init__(self, utterances, pipes=['entity_ruler', 'sentencizer', 'textrank']) -> None:
        self.data = pd.DataFrame(utterances, columns=['Raw Utterances'])
        self.top_features = None
        self.nlp_utterances = None
        # Load the SpaCy model
        self.nlp = spacy.load('en_core_web_sm')
        # Load the SpaCy pipeline components
        if pipes is not None:
            self.load_nlp_pipe(pipes)

    # Load the NLP pipeline
    def load_nlp_pipe(self, pipes):
        """
        This function creates and loads the requested pipes into the SpaCy NLP pipeline.
        :pipes: list of pipe names as strings
        """
        for pipe in pipes:
            if pipe == 'sentencizer':  # needs to go before the parser
                nlp_pipe = self.nlp.create_pipe(pipe)
                self.nlp.add_pipe(nlp_pipe, before='parser')
            elif pipe == 'textrank':
                tr = pytextrank.TextRank()
                self.nlp.add_pipe(tr.PipelineComponent, name=pipe, last=True)
            else:
                nlp_pipe = self.nlp.create_pipe(pipe)
                self.nlp.add_pipe(nlp_pipe)
    def explore(self):
        """
        Runs the full exploratory analysis: per-utterance features are added as
        columns to self.data and corpus-level top features are collected in
        self.top_features.
        """
        if self.nlp_utterances is None:
            self.nlp_utterances = list(self.nlp.pipe(self.data['Raw Utterances']))
self.data['Sentence Counts'] = list(map(get_sentence_count, self.nlp_utterances))
self.data['Word Counts'] = list(map(get_word_count, self.nlp_utterances))
self.data['Word Densities'] = list(map(get_word_density, self.nlp_utterances))
self.data['Punctuation Counts'] = list(map(get_punctuation_count, self.nlp_utterances))
self.data['Case Ratios'] = list(map(get_case_ratio, self.nlp_utterances))
self.data['Sentiments'] = list(map(get_sentiment, self.nlp_utterances))
self.data['Categories'] = get_cluster_labels(self.data['Raw Utterances'])
top_words = pd.DataFrame(get_top_words(self.nlp_utterances), \
columns=['Top Words', 'Top Words Counts'])
n_grams = []
corpus = [utterance.text for utterance in self.nlp_utterances]
for n in range(1,3):
n_grams.append(pd.DataFrame(get_top_n_ngrams(corpus,N=n,n=20), columns=[f'Top n-grams ({n})', 'Top n-gram Counts']))
top_nouns = pd.DataFrame(get_top_pos(self.nlp_utterances, pos='NOUN', n=20),\
columns=['Top Nouns', 'Top Nouns Counts'])
top_verbs = pd.DataFrame(get_top_pos(self.nlp_utterances, pos='VERB', n=20),\
columns=['Top Verbs', 'Top Verbs Counts'])
top_people = pd.DataFrame(get_top_entities(self.nlp_utterances, ['PERSON'], 20),\
columns=['Top People', 'Top People Counts'])
        top_organisations = pd.DataFrame(get_top_entities(self.nlp_utterances, ['ORG', 'NORP'], 20),\
            columns=['Top Organisations', 'Top Organisations Counts'])
top_locations = pd.DataFrame(get_top_entities(self.nlp_utterances, ['FAC', 'GPE', 'LOC'], 20),\
columns=['Top Locations', 'Top Locations Counts'])
keyphrases_corpus = [[keyphrase.chunks[0] for keyphrase in utterance._.phrases] for utterance in self.nlp_utterances]
top_keyphrases = pd.DataFrame(get_top_keywords(keyphrases_corpus,n=20), columns=['Top Keyphrases', 'Top Keyphrases Counts'])
self.top_features = top_words
for n_gram in n_grams:
self.top_features = pd.concat([self.top_features, n_gram], axis=1)
self.top_features = pd.concat([self.top_features, top_nouns], axis=1)
self.top_features = pd.concat([self.top_features, top_verbs], axis=1)
self.top_features = pd.concat([self.top_features, top_people], axis=1)
self.top_features = pd.concat([self.top_features, top_organisations], axis=1)
self.top_features = pd.concat([self.top_features, top_locations], axis=1)
self.top_features = pd.concat([self.top_features, top_keyphrases], axis=1)
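
# Minimal usage sketch (assumes the en_core_web_sm model, NLTK's vader_lexicon
# data, the local `embeddings`/`clustering` modules, and spaCy/pytextrank
# versions compatible with the pipeline-loading calls above; the sample
# utterances below are hypothetical).
if __name__ == '__main__':
    sample_utterances = [
        'I would like a refund for my last order.',
        'How do I reset my password?',
        'The delivery arrived two days late!',
    ]
    eda = text_EDA(sample_utterances)
    eda.explore()
    print(eda.data.head())
    print(eda.top_features.head())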