# -*- coding: utf-8 -*-
"""Keyword_Extraction_Algorithms.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1-WpVwNHiXSYS4rTbT2mf4aGK0B-L9Fi9
"""
!pip install git+https://github.com/boudinfl/pke.git
text = """ The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more parallelizable and requiring significantly
less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including
ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,
our model establishes a new single-model state-of-the-art BLEU score of 41.8 after
training for 3.5 days on eight GPUs, a small fraction of the training costs of the
best models from the literature. We show that the Transformer generalizes well to
other tasks by applying it successfully to English constituency parsing both with
large and limited training data.
"""
import pke
def position_rank_extractor(text):
    """
    Uses PositionRank to extract the top 5 keywords from a text
    Arguments: text (str)
    Returns: list of keywords (list)
    """
    # define the valid parts of speech to occur in the graph
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor = pke.unsupervised.PositionRank()
    extractor.load_document(text, language='en')
    extractor.candidate_selection(maximum_word_number=5)
    # weight the candidates using the sum of their words' scores, computed with
    # a random walk biased by the position of the words in the document; in the
    # graph, nodes are words (restricted to the POS set above) that are
    # connected if they occur within a window of 3 words
    extractor.candidate_weighting(window=3, pos=pos)
    # get the 5 highest-scored candidates as keyphrases
    keyphrases = extractor.get_n_best(n=5)
    results = []
    for keyword, score in keyphrases:
        results.append(keyword)
    return results
position_rank_extractor(text)
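# A hedged sketch (not part of the original notebook): pke ships other graph-based
# extractors that follow the same load -> select -> weight pipeline used above.
# Assuming pke.unsupervised.TextRank accepts the same POS filter and window
# parameters, a position-agnostic TextRank baseline could look like this.
def textrank_extractor(text):
    """
    Sketch: uses TextRank to extract the top 5 keywords from a text
    Arguments: text (str)
    Returns: list of keywords (list)
    """
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor = pke.unsupervised.TextRank()
    extractor.load_document(text, language='en')
    extractor.candidate_selection(pos=pos)
    # unlike PositionRank, the random walk here is not biased by word position
    extractor.candidate_weighting(window=3, pos=pos)
    return [keyword for keyword, score in extractor.get_n_best(n=5)]

textrank_extractor(text)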
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from rake_nltk import Rake
def rake_extractor(text):
    """
    Uses Rake to extract the top 5 keywords from a text
    Arguments: text (str)
    Returns: list of keywords (list)
    """
    r = Rake()
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()[:5]
rake_extractor(text)
"""# Deep-learning-based methods
The appearance of deep learning has enabled embedding-based methods. Researchers have developed several keyword extraction methods that use document embeddings and enable the model to be based on the semantic similarity.
# Keyword Extraction using BERT
It is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings.
1. Candidate Keywords/Keyphrases:
- Creating a list of candidate keywords or keyphrases from a document.
- CountVectorizer. This allows us to specify the length of the keywords and make them into keyphrases. It also is a nice method for quickly removing stop words.
2. Embedding:
- We use BERT for this purpose as it has shown great results for both semantic similarity and paraphrase.
3. Similarity:
- Find the candidates that are most similar to the document.
We will be using the cosine similarity between vectors
"""
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# 1. Extract candidate words/phrases
n_gram_range = (2, 2)
stop_words = "english"
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
candidates = count.get_feature_names_out()
# 2. Embedding
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([text])
candidate_embeddings = model.encode(candidates)
# 3. Similarity
top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
keywords
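# A hedged sketch (not part of the original notebook): the plain cosine-similarity
# ranking above can return near-duplicate keyphrases. Maximal Marginal Relevance
# (MMR) trades off relevance to the document against similarity to the candidates
# already selected; this is the kind of diversification that KeyBERT's use_mmr
# option (used further below) performs. The helper below reuses the doc_embedding,
# candidate_embeddings and candidates computed above; the function name is ours.
import numpy as np

def mmr_keywords(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.5):
    """Sketch: select top_n candidates balancing relevance and diversity."""
    # similarity of each candidate to the document and to every other candidate
    doc_sim = cosine_similarity(candidate_embeddings, doc_embedding)  # shape (n, 1)
    cand_sim = cosine_similarity(candidate_embeddings)                # shape (n, n)
    # start from the single most relevant candidate
    selected = [int(np.argmax(doc_sim))]
    remaining = [i for i in range(len(candidates)) if i not in selected]
    while len(selected) < top_n and remaining:
        relevance = doc_sim[remaining, 0]
        redundancy = cand_sim[np.ix_(remaining, selected)].max(axis=1)
        mmr_scores = (1 - diversity) * relevance - diversity * redundancy
        best = remaining[int(np.argmax(mmr_scores))]
        selected.append(best)
        remaining.remove(best)
    return [candidates[i] for i in selected]

mmr_keywords(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.5)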
import torch
from keybert import KeyBERT
import pandas as pd
import numpy as np
import string
string.punctuation
import datetime
# date string (dd_mm_YYYY) that can be used in output file names
now = datetime.datetime.today().strftime('%d_%m_%Y')
# NLTK for NLP preprocessing
import nltk
stopwords = nltk.corpus.stopwords.words('english')
from nltk.tokenize import sent_tokenize, word_tokenize
# initialize KeyBERT model
kw_model = KeyBERT()
# remove punctuation from the text, lowercase it, and tokenize it
new_text = "".join([i for i in text if i not in string.punctuation])
new_text = new_text.lower()
new_text = word_tokenize(new_text)
# English stop word list (extra domain-specific words could be appended here)
stopwords = nltk.corpus.stopwords.words('english')
# remove stop words
new_text = [i for i in new_text if i not in stopwords]
# join the remaining tokens back into a single string
new_text = " ".join([i for i in new_text if i not in string.punctuation])
keywords = kw_model.extract_keywords(docs=new_text, keyphrase_ngram_range=(2, 2), stop_words='english', use_mmr=True, top_n=5, diversity=0.5)
results = []
for keyword, score in keywords:
    results.append(keyword)
results
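# A hedged usage sketch (not part of the original notebook): run the extractors
# defined above on the same abstract and print their keyphrases side by side for
# a quick qualitative comparison. Assumes the previous cells were run in order.
comparison = {
    'PositionRank': position_rank_extractor(text),
    'RAKE': rake_extractor(text),
    'KeyBERT (MMR)': results,
}
for method, keyphrases in comparison.items():
    print(f"{method}: {keyphrases}")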