Add Word2Vec Embeddings to Cluster Summarizers [resolves #150] #162

Open · wants to merge 8 commits into base: develop
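This PR wires gensim Word2Vec embeddings into the cluster-based summarizers as an alternative to BERT: sentence vectors are the mean of the in-vocabulary token vectors, and each summarizer gains an `embedding_type` parameter. A minimal sketch of the resulting API, assuming the bundled `extended_model.model` is available for the active tokenizer and the usual `(doc, k)` summarizer call signature:

from sadedegel import Doc
from sadedegel.summarize import KMeansSummarizer  # import path assumed from the package layout

doc = Doc("Ali topu tuttu. Ömer ılık süt içti.")

# New in this PR: embedding_type switches the clustering space from BERT to Word2Vec.
summarizer = KMeansSummarizer(embedding_type="word2vec")
print(summarizer(doc, k=1))  # top sentence; the (doc, k) call signature is assumed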
3 changes: 3 additions & 0 deletions .gitattributes
@@ -2,3 +2,6 @@ sadedegel/dataset/raw/*.txt filter=lfs diff=lfs merge=lfs -text
sadedegel/dataset/sents/*.json filter=lfs diff=lfs merge=lfs -text
sadedegel/dataset/annotated/*.json filter=lfs diff=lfs merge=lfs -text
sadedegel/bblock/data/vocabulary.json filter=lfs diff=lfs merge=lfs -text
sadedegel/ml/model/*/*.model filter=lfs diff=lfs merge=lfs -text
sadedegel/ml/model/*.model filter=lfs diff=lfs merge=lfs -text
sadedegel/ml/model/*/*.npy filter=lfs diff=lfs merge=lfs -text
Binary file added ml
Binary file not shown.
Binary file added model
Binary file not shown.
51 changes: 51 additions & 0 deletions sadedegel/bblock/doc.py
@@ -6,6 +6,9 @@

import numpy as np # type:ignore

from pathlib import Path
from os.path import dirname

from loguru import logger
from scipy.sparse import csr_matrix

@@ -158,6 +161,8 @@ def span_features(self):

class Sentences:
    tokenizer = get_default_word_tokenizer()
    wv_model = None
    wv_model_name = 'extended_model.model'

    def __init__(self, id_: int, text: str, doc):
        self.id = id_
@@ -166,6 +171,9 @@ def __init__(self, id_: int, text: str, doc):
        self._tokens = None
        self.document = doc
        self._bert = None
        self._w2v = None
        self._has_w2v = None
        self._oov = {}
        self.toks = None

    @staticmethod
@@ -227,6 +235,42 @@ def idf(self):

        return v

    @property
    def word2vec(self):
        if self._w2v is None:
            tok_dir = tr_lower(Sentences.tokenizer.__name__.split("Tokenizer")[0])
            if Sentences.wv_model is None:
                logger.info("Loading Gensim Word2Vec Model...")
                from gensim.models import KeyedVectors
                model_path = Path(dirname(__file__)) / ".." / "ml" / "model" / tok_dir / Sentences.wv_model_name
                try:
                    kv = KeyedVectors.load(str(model_path))
                except FileNotFoundError as exc:
                    raise FileNotFoundError("Make sure you have a Gensim vocabulary built with the currently "
                                            f"configured tokenizer: {tok_dir}") from exc
                Sentences.wv_model = kv

            tok_vecs = []
            oov = []
            for token in self.tokens:
                low_tok = tr_lower(token)
                try:
                    # Missing keys raise KeyError; collect them as out-of-vocabulary tokens.
                    tok_vecs.append(Sentences.wv_model.wv[low_tok])
                except KeyError:
                    oov.append(token)
            self._oov[f"word2vec_{tok_dir}"] = oov

            if tok_vecs:
                self._w2v = np.mean(np.vstack(tok_vecs), axis=0)
                self._has_w2v = True
            else:
                logger.info(f"All tokens in this sentence are out of vocabulary. Sentence: {self.text}")
                self._w2v = np.zeros(Sentences.wv_model.vector_size, dtype=np.float32)
                self._has_w2v = False

        return self._w2v

    def __str__(self):
        return self.text

@@ -251,6 +295,7 @@ def __init__(self, raw: Union[str, None]):

        self.raw = raw
        self._bert = None
        self._w2v = None
        self._sents = []
        self.spans = None

@@ -380,6 +425,12 @@ def tfidf_embeddings(self):

        return m

    @property
    def word2vec_embeddings(self):
        if self._w2v is None:
            self._w2v = np.vstack([sent.word2vec for sent in self])
        return self._w2v

    @property
    def tf(self):
        indptr = [0]
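A short sketch of how the two new lazy properties in doc.py compose; the example text is arbitrary, and iterating a `Doc` over its `Sentences` (as well as `len(doc)`) is existing sadedegel behaviour:

from sadedegel import Doc

d = Doc("Ali topu tuttu. Ömer ılık süt içti.")

for sent in d:
    vec = sent.word2vec  # mean of the sentence's in-vocabulary token vectors
    print(vec.shape)     # (100,) under the training CLI's default embedding size

m = d.word2vec_embeddings  # one row per sentence, stacked with np.vstack
assert m.shape[0] == len(d)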
68 changes: 68 additions & 0 deletions sadedegel/ml/__main__.py
@@ -0,0 +1,68 @@
import click
from gensim.models import Word2Vec
from multiprocessing import cpu_count
from pathlib import Path
from os.path import dirname
from tqdm import tqdm
from .word2vec_utils import GCorpus


@click.group(help="Gensim Word2Vec Commandline")
def cli():
    pass


@cli.command(help="Train a gensim based word2vec model.")
@click.option('--model-name', '-m', default='gensim_model')
@click.option('--corpus', '-c', type=click.Choice(['standard', 'extended', 'tokenization']), default='standard')
@click.option('--tokenizer', '-t', type=click.Choice(['simple', 'bert']), default='simple')
@click.option('--num-epochs', '-e', help='Training epochs', default=10)
@click.option('--skip-gram', '-s', help='Skip Gram or CBOW. Defaults to True for Skip Gram', default=True)
@click.option('--retrain-from', '-r', help='Name of an existing model to continue training from', default=None)
@click.option('--embedding-size', '-d', default=100)  # '-d' for dimension; '-s' is taken by --skip-gram
def train_word2vec(model_name, corpus, tokenizer, num_epochs, skip_gram, retrain_from, embedding_size):
    model_name += '.model'
    modelpath = (Path(dirname(__file__)) / 'model' / tokenizer / model_name).absolute()

    if not retrain_from:
        sentences = GCorpus(sadedegel_corpus=corpus, tokenizer=tokenizer)
        model = Word2Vec(size=embedding_size,
                         workers=cpu_count(),
                         min_count=3,
                         sg=skip_gram,
                         seed=42)
        click.secho('Building Vocab...', fg='yellow')
        model.build_vocab(sentences)
    else:
        loadpath = (Path(dirname(__file__)) / 'model' / tokenizer / (retrain_from + '.model')).absolute()
        click.echo("Loading model from " + click.style(str(loadpath), fg='blue'))
        model = Word2Vec.load(str(loadpath))

        sentences = GCorpus(sadedegel_corpus=corpus, tokenizer=tokenizer)
        model.build_vocab(sentences, update=True)

    click.secho('Training model...', fg='yellow')
    for _ in tqdm(range(num_epochs)):
        # GCorpus streams the corpus once per pass, so a fresh instance is created for every epoch.
        sentences = GCorpus(sadedegel_corpus=corpus, tokenizer=tokenizer)
        model.train(sentences,
                    epochs=1,
                    total_examples=model.corpus_count,
                    report_delay=1)

    click.echo("Saving model to " + click.style(str(modelpath), fg='blue'))
    model.save(str(modelpath))  # persist the trained (or retrained) model


if __name__ == '__main__':
    cli()
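A hedged example of exercising the new command programmatically with click's test runner; the command name is the one click derives from the function name (dashes in recent click versions), and all option values are illustrative:

from click.testing import CliRunner
from sadedegel.ml.__main__ import cli

runner = CliRunner()
result = runner.invoke(cli, [
    "train-word2vec",
    "--model-name", "extended_model",
    "--corpus", "extended",
    "--tokenizer", "simple",
    "--num-epochs", "10",
])
print(result.output)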
3 changes: 3 additions & 0 deletions sadedegel/ml/model/bert/extended_model.model
Git LFS file not shown
Binary file added sadedegel/ml/model/simple/extended_model.model
Binary file not shown.
Binary file not shown.
3 changes: 3 additions & 0 deletions sadedegel/ml/model/simple/extended_model.model.wv.vectors.npy
Git LFS file not shown
3 changes: 3 additions & 0 deletions sadedegel/ml/model/simple/gensim_model.model
Git LFS file not shown
35 changes: 35 additions & 0 deletions sadedegel/ml/word2vec_utils.py
@@ -0,0 +1,35 @@
from sadedegel.dataset import load_raw_corpus
from sadedegel.dataset.extended import load_extended_raw_corpus
from sadedegel import Doc
from sadedegel.bblock.word_tokenizer_helper import puncts
from sadedegel.bblock.util import tr_lower
from sadedegel.config import tokenizer_context

from tqdm import tqdm


class GCorpus:
    """Stream sadedegel corpora as token lists suitable for gensim Word2Vec training."""

    def __init__(self, sadedegel_corpus='standard', tokenizer='simple'):
        self._corpus_type = sadedegel_corpus
        self._corpus = None
        self.toker = tokenizer

        if self._corpus_type == 'standard':
            self._corpus = load_raw_corpus()
            self.total = 98  # documents in the standard raw corpus
        elif self._corpus_type == 'extended':
            self._corpus = load_extended_raw_corpus()
            self.total = 36131  # documents in the extended raw corpus
        elif self._corpus_type == 'tokenization':
            raise NotImplementedError('Tokenization Corpus is not yet implemented.')

    def __iter__(self):
        for document in tqdm(self._corpus, total=self.total):
            with tokenizer_context(self.toker):
                d = Doc(document)
                for sentence in d:
                    # One token list per sentence: lower-cased, punctuation dropped.
                    yield [tr_lower(token) for token in sentence.tokens if token not in puncts]
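`GCorpus` yields one token list per sentence, which is exactly the iterable gensim's `Word2Vec` consumes. A minimal sketch of inspecting the stream (output is illustrative):

from sadedegel.ml.word2vec_utils import GCorpus

corpus = GCorpus(sadedegel_corpus="standard", tokenizer="simple")
first_sentence_tokens = next(iter(corpus))
print(first_sentence_tokens)  # e.g. ['ali', 'topu', 'tuttu']: lower-cased, punctuation dropped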
6 changes: 6 additions & 0 deletions sadedegel/summarize/README.md
@@ -51,6 +51,12 @@ ground truth human annotation (Best possible total `relevance` score that can be
| KMeans Summarizer - bert | 0.6599 | 0.7434 | 0.8344 |
| AutoKMeans Summarizer - bert | 0.6608 | 0.7418 | 0.8333 |
| DecomposedKMeans Summarizer - bert | 0.6579 | 0.7440 | 0.8341 |
| KMeans Summarizer (Word2Vec-SimpleTokenizer) - simple | 0.6223 | 0.7188 | 0.8204 |
| AutoKMeans Summarizer (Word2Vec-SimpleTokenizer) - simple | 0.6093 | 0.7117 | 0.8147 |
| DecomposedKMeans Summarizer (Word2Vec-SimpleTokenizer) - simple | 0.6221 | 0.7189 | 0.8207 |
| KMeans Summarizer (Word2Vec-BertTokenizer) - bert | 0.6151 | 0.7162 | 0.8207 |
| AutoKMeans Summarizer (Word2Vec-BertTokenizer) - bert | 0.5976 | 0.7062 | 0.8131 |
| DecomposedKMeans Summarizer (Word2Vec-BertTokenizer) - bert | 0.6152 | 0.7162 | 0.8209 |
| TextRank(0.05) Summarizer (BERT) - bert | 0.6212 | 0.7010 | 0.8000 |
| TextRank(0.5) Summarizer (BERT) - bert | 0.6232 | 0.7005 | 0.7998 |
| TFIDF Summarizer - bert | 0.6781 | **0.7592** | 0.8504 |
13 changes: 12 additions & 1 deletion sadedegel/summarize/__main__.py
@@ -31,6 +31,14 @@
    ('KMeans Summarizer', KMeansSummarizer()),
    ('AutoKMeans Summarizer', AutoKMeansSummarizer()),
    ('DecomposedKMeans Summarizer', DecomposedKMeansSummarizer()),
    ('KMeans Summarizer (Word2Vec-BertTokenizer)', KMeansSummarizer(embedding_type="word2vec")),
    ('AutoKMeans Summarizer (Word2Vec-BertTokenizer)', AutoKMeansSummarizer(embedding_type="word2vec")),
    ('DecomposedKMeans Summarizer (Word2Vec-BertTokenizer)', DecomposedKMeansSummarizer(embedding_type="word2vec")),
    ('KMeans Summarizer (Word2Vec-SimpleTokenizer)', KMeansSummarizer(embedding_type="word2vec")),
    ('AutoKMeans Summarizer (Word2Vec-SimpleTokenizer)', AutoKMeansSummarizer(embedding_type="word2vec")),
    ('DecomposedKMeans Summarizer (Word2Vec-SimpleTokenizer)', DecomposedKMeansSummarizer(embedding_type="word2vec")),
    ("TextRank(0.05) Summarizer (BERT)", TextRank(alpha=0.05)),
    ("TextRank(0.15) Summarizer (BERT)", TextRank(alpha=0.15)),
    ("TextRank(0.30) Summarizer (BERT)", TextRank(alpha=0.30)),
@@ -91,7 +99,10 @@ def evaluate(table_format, tag, debug):
        click.echo(click.style(f" {name} ", fg="magenta"), nl=False)
        # Skip the simple tokenizer for clustering/rank/TFIDF models, except for the
        # Word2Vec variants that were trained with the simple tokenizer.
        if ("cluster" in summarizer or "rank" in summarizer or name == "TFIDF Summarizer") and \
                word_tokenizer == "simple" and "Word2Vec-SimpleTokenizer" not in name:
            click.echo(click.style("SKIP", fg="yellow"))
            continue
        # Conversely, skip the SimpleTokenizer Word2Vec variants under the bert tokenizer.
        if "cluster" in summarizer and word_tokenizer == "bert" and "Word2Vec-SimpleTokenizer" in name:
            click.echo(click.style("SKIP", fg="yellow"))
            continue

77 changes: 52 additions & 25 deletions sadedegel/summarize/cluster.py
@@ -13,47 +13,67 @@
class KMeansSummarizer(ExtractiveSummarizer):
    tags = ExtractiveSummarizer.tags + ['cluster', 'ml']

    def __init__(self, n_clusters=2, random_state=42, normalize=True, embedding_type='bert'):
        super().__init__(normalize)
        self.n_clusters = n_clusters
        self.random_state = random_state
        self.embedding_type = embedding_type

    def _predict(self, sentences: List[Sentences]):
        if len(sentences) == 0:
            raise ValueError("Ensure that document contains a few sentences for summarization")

        doc = sentences[0].document

        effective_n_clusters = min(self.n_clusters, len(doc))

        if self.embedding_type == 'bert':
            with tokenizer_context('bert', warning=True):
                return 1 / (KMeans(n_clusters=effective_n_clusters, random_state=self.random_state).fit_transform(
                    doc.bert_embeddings).min(axis=1) + 1e-10)
        elif self.embedding_type == 'word2vec':
            return 1 / (KMeans(n_clusters=effective_n_clusters, random_state=self.random_state).fit_transform(
                doc.word2vec_embeddings).min(axis=1) + 1e-10)
        else:
            raise ValueError(f"{self.embedding_type} is not a valid embedding type supported by SadedeGel")


class AutoKMeansSummarizer(ExtractiveSummarizer):
    """KMeans clustering that automatically decides on the number of clusters based on document length."""

    tags = ExtractiveSummarizer.tags + ['cluster', 'ml']

    def __init__(self, n_cluster_to_length=0.05, min_n_cluster=2, random_state=42, normalize=True,
                 embedding_type='bert'):
        super().__init__(normalize)

        self.n_cluster_to_length = n_cluster_to_length
        self.min_n_cluster = min_n_cluster
        self.random_state = random_state
        self.embedding_type = embedding_type

    def _predict(self, sentences: List[Sentences]):
        if len(sentences) == 0:
            raise ValueError("Ensure that document contains a few sentences for summarization")

        doc = sentences[0].document

        effective_n_clusters = min(max(ceil(len(doc) * self.n_cluster_to_length), self.min_n_cluster), len(doc))

        if self.embedding_type == 'bert':
            with tokenizer_context('bert', warning=True):
                return 1 / (KMeans(n_clusters=effective_n_clusters, random_state=self.random_state).fit_transform(
                    doc.bert_embeddings).min(axis=1) + 1e-10)
        elif self.embedding_type == 'word2vec':
            return 1 / (KMeans(n_clusters=effective_n_clusters, random_state=self.random_state).fit_transform(
                doc.word2vec_embeddings).min(axis=1) + 1e-10)
        else:
            raise ValueError(f"{self.embedding_type} is not a valid embedding type supported by SadedeGel")


class DecomposedKMeansSummarizer(ExtractiveSummarizer):
@@ -66,24 +86,31 @@ class DecomposedKMeansSummarizer(ExtractiveSummarizer):

    tags = ExtractiveSummarizer.tags + ['cluster', 'ml']

    def __init__(self, n_clusters=2, n_components=48, random_state=42, normalize=True, embedding_type='bert'):
        super().__init__(normalize)
        self.n_clusters = n_clusters
        self.n_components = n_components
        self.random_state = random_state
        self.embedding_type = embedding_type

    def _predict(self, sentences: List[Sentences]):
        if len(sentences) == 0:
            raise ValueError("Ensure that document contains a few sentences for summarization")

        doc = sentences[0].document

        effective_n_clusters = min(self.n_clusters, len(doc))
        effective_n_components = min(self.n_components, len(doc))

        pipeline = Pipeline(
            [('pca', PCA(effective_n_components)),
             ('kmeans', KMeans(effective_n_clusters, random_state=self.random_state))])

        if self.embedding_type == 'bert':
            with tokenizer_context('bert', warning=True):
                return 1 / (pipeline.fit_transform(doc.bert_embeddings).min(axis=1) + 1e-10)
        elif self.embedding_type == 'word2vec':
            return 1 / (pipeline.fit_transform(doc.word2vec_embeddings).min(axis=1) + 1e-10)
        else:
            raise ValueError(f"{self.embedding_type} is not a valid embedding type supported by SadedeGel")
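All three cluster summarizers now take the same switch. A usage sketch, assuming the `(doc, k)` call signature of `ExtractiveSummarizer` and a pretrained Word2Vec model for the active tokenizer:

from sadedegel import Doc
from sadedegel.summarize import (KMeansSummarizer, AutoKMeansSummarizer,
                                 DecomposedKMeansSummarizer)  # import paths assumed

doc = Doc("Ali topu tuttu. Ömer ılık süt içti. Kedi sütü devirdi.")

for cls in (KMeansSummarizer, AutoKMeansSummarizer, DecomposedKMeansSummarizer):
    summarizer = cls(embedding_type="word2vec")
    print(cls.__name__, summarizer(doc, k=1))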
1 change: 1 addition & 0 deletions tests/context.py
@@ -11,6 +11,7 @@
from sadedegel.bblock.util import tr_upper, tr_lower, __tr_lower__, __tr_upper__ # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.bblock.util import flatten, is_eos # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.ml import create_model, load_model, save_model # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.ml.word2vec_utils import GCorpus # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.metrics import rouge1_score # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel.server.__main__ import app # noqa # pylint: disable=unused-import, wrong-import-position
from sadedegel import tokenizer_context # noqa # pylint: disable=unused-import, wrong-import-position