From f9284bbeb050a640cb3e1dd1cc982a86e3bd0f34 Mon Sep 17 00:00:00 2001 From: Christine Wong <55353492+Lillliant@users.noreply.github.com> Date: Wed, 29 Nov 2023 14:12:10 -0500 Subject: [PATCH 1/7] Add fasttext sentiment preprocessing help method --- src/aml/fast.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/aml/fast.py b/src/aml/fast.py index 00b8896..60e8098 100644 --- a/src/aml/fast.py +++ b/src/aml/fast.py @@ -18,6 +18,12 @@ def add_label(r): for k in j: s[k] = "__label__" + s[k] if s[k].find("__label__") == -1 else s[k] return r_ +def add_label_sentiment(r): + r_ = copy.deepcopy(r) + for i, s in enumerate(r_.sentences): + for _, _, sentiment in r.aos[i]: + s.append("__label__" + sentiment) + def review_formatted_file(path, corpus): with open(path, 'w', encoding='utf-8') as f: for r in corpus: f.write(' '.join(r) + '\n') From 50616b0ba43e65a86b2aec78cd0992c27fe99986 Mon Sep 17 00:00:00 2001 From: Christine Wong <55353492+Lillliant@users.noreply.github.com> Date: Wed, 29 Nov 2023 14:31:20 -0500 Subject: [PATCH 2/7] Add inheritance and remove unused imports --- src/aml/fast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aml/fast.py b/src/aml/fast.py index 60e8098..c2bc25e 100644 --- a/src/aml/fast.py +++ b/src/aml/fast.py @@ -7,7 +7,7 @@ import fasttext import gensim -from .mdl import AbstractAspectModel, AspectPairType, BatchPairsType, flatten +from .mdl import AbstractAspectModel, AbstractSentimentModel, flatten from cmn.review import Review # Utility functions @@ -29,7 +29,7 @@ def review_formatted_file(path, corpus): for r in corpus: f.write(' '.join(r) + '\n') -class Fast(AbstractAspectModel): +class Fast(AbstractAspectModel, AbstractSentimentModel): def __init__(self, naspects, nwords): super().__init__(naspects, nwords) self.aspect_word_prob = None From 77ea811d422b32d26471e7e36368b14b7a74768e Mon Sep 17 00:00:00 2001 From: Christine Wong <55353492+Lillliant@users.noreply.github.com> Date: Wed, 29 Nov 2023 14:38:07 -0500 Subject: [PATCH 3/7] Add fasttext sentiment analysis methods --- src/aml/fast.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/src/aml/fast.py b/src/aml/fast.py index c2bc25e..583b7e8 100644 --- a/src/aml/fast.py +++ b/src/aml/fast.py @@ -11,7 +11,7 @@ from cmn.review import Review # Utility functions -def add_label(r): +def add_label_aspect(r): r_ = copy.deepcopy(r) for i, s in enumerate(r_.sentences): for j, _, _ in r.aos[i]: # j is the index of aspect words in sentence s @@ -24,6 +24,10 @@ def add_label_sentiment(r): for _, _, sentiment in r.aos[i]: s.append("__label__" + sentiment) +def add_label(r, label_type): + if label_type == 'aspect': return add_label(r) + elif label_type == 'sentiment': return add_label_sentiment(r) + def review_formatted_file(path, corpus): with open(path, 'w', encoding='utf-8') as f: for r in corpus: f.write(' '.join(r) + '\n') @@ -60,14 +64,14 @@ def infer(self, review: Review, doctype: str): return self.mdl.predict(review.get_txt(), k=self.naspects) @staticmethod - def preprocess(doctype, reviews, settings=None): + def preprocess(doctype, reviews, settings=None, label_type='aspect'): if not AbstractAspectModel.stop_words: import nltk AbstractAspectModel.stop_words = nltk.corpus.stopwords.words('english') reviews_ = [] - if doctype == 'rvw': reviews_ = [np.concatenate(add_label(r).sentences) for r in reviews] - elif doctype == 'snt': reviews_ = [s for r in reviews for s in add_label(r).sentences] + if doctype == 'rvw': reviews_ = [np.concatenate(add_label(r, label_type).sentences) for r in reviews] + elif doctype == 'snt': reviews_ = [s for r in reviews for s in add_label(r, label_type).sentences] reviews_ = [[word for word in doc if word not in AbstractAspectModel.stop_words and len(word) > 3 and (re.match('[a-zA-Z]+', word) or re.search('__label__', word))] for doc in reviews_] dict = gensim.corpora.Dictionary(reviews_) @@ -117,4 +121,18 @@ def merge_aspects_words(self, r_pred_aspects, nwords): result.append(sorted(flatten(subr_pred_aspects_words), reverse=True, key=lambda t: t[1])) - return result \ No newline at end of file + return result + + def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output): + corpus, self.dict = self.preprocess(doctype, reviews_train, no_extremes) + review_formatted_file(f'{output}model.train', corpus) + self.mdl = fasttext.train_supervised(f'{output}model.train', **settings, label_type='sentiment') + self.aspect_word_prob = self.generate_aspect_words() + + self.dict.save(f'{output}model.dict') + self.mdl.save_model(f'{output}model') + pd.to_pickle(self.aspect_word_prob, f'{output}model_sword_prob.pkl') + # do we need cas and perplexity? + + def infer_sentiment(self, review: Review, doctype: str): + return self.mdl.predict(review.get_txt(), k=self.naspects) \ No newline at end of file From 4643cbc9cf4058f813d4537a19da8d0b45d623d6 Mon Sep 17 00:00:00 2001 From: Christine Wong <55353492+Lillliant@users.noreply.github.com> Date: Wed, 29 Nov 2023 14:55:03 -0500 Subject: [PATCH 4/7] bug fix --- src/aml/fast.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/aml/fast.py b/src/aml/fast.py index 583b7e8..918bcff 100644 --- a/src/aml/fast.py +++ b/src/aml/fast.py @@ -25,7 +25,7 @@ def add_label_sentiment(r): s.append("__label__" + sentiment) def add_label(r, label_type): - if label_type == 'aspect': return add_label(r) + if label_type == 'aspect': return add_label_aspect(r) elif label_type == 'sentiment': return add_label_sentiment(r) def review_formatted_file(path, corpus): @@ -81,6 +81,7 @@ def preprocess(doctype, reviews, settings=None, label_type='aspect'): def get_aspect_words(self, aspect, nwords): words_prob = [] + print(sorted(self.aspect_word_prob[aspect].items(), key=lambda item: item[1], reverse=True)[:nwords]) for wp in sorted(self.aspect_word_prob[aspect].items(), key=lambda item: item[1], reverse=True)[:nwords]: words_prob.append(wp) return words_prob From b19212298c7fef9cf09853e537e2d1829388c351 Mon Sep 17 00:00:00 2001 From: Christine Wong <55353492+Lillliant@users.noreply.github.com> Date: Wed, 29 Nov 2023 16:05:54 -0500 Subject: [PATCH 5/7] Bug fix (fast training methods) and optimization --- src/aml/fast.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/aml/fast.py b/src/aml/fast.py index 918bcff..ca486c5 100644 --- a/src/aml/fast.py +++ b/src/aml/fast.py @@ -1,9 +1,7 @@ import copy import pickle -import re +import re, numpy as np, pandas as pd, random from typing import List -import numpy as np -import pandas as pd import fasttext import gensim @@ -23,6 +21,7 @@ def add_label_sentiment(r): for i, s in enumerate(r_.sentences): for _, _, sentiment in r.aos[i]: s.append("__label__" + sentiment) + return r_ def add_label(r, label_type): if label_type == 'aspect': return add_label_aspect(r) @@ -34,8 +33,10 @@ def review_formatted_file(path, corpus): class Fast(AbstractAspectModel, AbstractSentimentModel): + capabilities = ['aspect_detection', 'sentiment_analysis'] + def __init__(self, naspects, nwords): - super().__init__(naspects, nwords) + super().__init__(naspects=naspects, nwords=nwords, capabilities=self.capabilities) self.aspect_word_prob = None def load(self, path): @@ -80,11 +81,7 @@ def preprocess(doctype, reviews, settings=None, label_type='aspect'): return reviews_, dict def get_aspect_words(self, aspect, nwords): - words_prob = [] - print(sorted(self.aspect_word_prob[aspect].items(), key=lambda item: item[1], reverse=True)[:nwords]) - for wp in sorted(self.aspect_word_prob[aspect].items(), key=lambda item: item[1], reverse=True)[:nwords]: - words_prob.append(wp) - return words_prob + return sorted(self.aspect_word_prob[aspect].items(), key=lambda item: item[1], reverse=True)[:nwords] def generate_aspect_words(self): aw_prob = dict() @@ -125,14 +122,14 @@ def merge_aspects_words(self, r_pred_aspects, nwords): return result def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_extremes, output): - corpus, self.dict = self.preprocess(doctype, reviews_train, no_extremes) + corpus, self.dict = self.preprocess(doctype, reviews_train, no_extremes, label_type='sentiment') review_formatted_file(f'{output}model.train', corpus) - self.mdl = fasttext.train_supervised(f'{output}model.train', **settings, label_type='sentiment') + self.mdl = fasttext.train_supervised(f'{output}model.train', **settings) self.aspect_word_prob = self.generate_aspect_words() self.dict.save(f'{output}model.dict') self.mdl.save_model(f'{output}model') - pd.to_pickle(self.aspect_word_prob, f'{output}model_sword_prob.pkl') + pd.to_pickle(self.aspect_word_prob, f'{output}model_aspword_prob.pkl') # do we need cas and perplexity? def infer_sentiment(self, review: Review, doctype: str): From 9984bd53d04187d33792d673d37224f4fcab46cc Mon Sep 17 00:00:00 2001 From: Christine Wong <55353492+Lillliant@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:31:24 -0500 Subject: [PATCH 6/7] Add infer_sentiment --- src/aml/fast.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/aml/fast.py b/src/aml/fast.py index ca486c5..b4285fb 100644 --- a/src/aml/fast.py +++ b/src/aml/fast.py @@ -132,5 +132,10 @@ def train_sentiment(self, reviews_train, reviews_valid, settings, doctype, no_ex pd.to_pickle(self.aspect_word_prob, f'{output}model_aspword_prob.pkl') # do we need cas and perplexity? - def infer_sentiment(self, review: Review, doctype: str): - return self.mdl.predict(review.get_txt(), k=self.naspects) \ No newline at end of file + def infer_sentiment(self, review, doctype): + review_s_prob = [] + review_, _ = super().preprocess(doctype, [review]) + for r in review_: + pred = self.mdl.predict(" ".join(r)) # default k=1 + review_s_prob.append((pred[0][0], pred[1][0])) + return review_s_prob \ No newline at end of file From 93fce8763e729a3a1510fc39711d6d80d72c50e6 Mon Sep 17 00:00:00 2001 From: Christine Wong <55353492+Lillliant@users.noreply.github.com> Date: Wed, 29 Nov 2023 19:54:57 -0500 Subject: [PATCH 7/7] Bug fix: infer_sentiment --- src/aml/fast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aml/fast.py b/src/aml/fast.py index b4285fb..033fa6e 100644 --- a/src/aml/fast.py +++ b/src/aml/fast.py @@ -137,5 +137,5 @@ def infer_sentiment(self, review, doctype): review_, _ = super().preprocess(doctype, [review]) for r in review_: pred = self.mdl.predict(" ".join(r)) # default k=1 - review_s_prob.append((pred[0][0], pred[1][0])) + review_s_prob.append([(pred[0][0], pred[1][0])]) return review_s_prob \ No newline at end of file