# -*- coding: utf-8 -*-
"""Keyword_Extraction_Algorithms.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1-WpVwNHiXSYS4rTbT2mf4aGK0B-L9Fi9
"""
!pip install git+https://github.com/boudinfl/pke.git
text = """ The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more parallelizable and requiring significantly
less time to train. Our model achieves 28.4 BLEU on the WMT 2014 English-to-German translation task, improving over the existing best results, including
ensembles, by over 2 BLEU. On the WMT 2014 English-to-French translation task,
our model establishes a new single-model state-of-the-art BLEU score of 41.8 after
training for 3.5 days on eight GPUs, a small fraction of the training costs of the
best models from the literature. We show that the Transformer generalizes well to
other tasks by applying it successfully to English constituency parsing both with
large and limited training data.
"""
import pke
def position_rank_extractor(text):
    """
    Uses PositionRank to extract the top 5 keywords from a text
    Arguments: text (str)
    Returns: list of keywords (list)
    """
    # define the valid parts of speech to occur in the graph
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor = pke.unsupervised.PositionRank()
    extractor.load_document(text, language='en')
    extractor.candidate_selection(maximum_word_number=5)
    # weight the candidates using the sum of their words' scores, computed with
    # a random walk biased by the position of the words in the document; in the
    # graph, nodes are words (restricted to the POS set above) that are
    # connected if they occur within a window of 3 words
    extractor.candidate_weighting(window=3, pos=pos)
    # get the 5 highest-scored candidates as keyphrases
    keyphrases = extractor.get_n_best(n=5)
    results = []
    for keyword, score in keyphrases:
        results.append(keyword)
    return results
position_rank_extractor(text)
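# A hedged sketch (not part of the original notebook): pke ships other graph-based
# extractors that follow the same load -> select -> weight pipeline used above.
# Assuming pke.unsupervised.TextRank accepts the same POS filter and window
# parameters, a position-agnostic TextRank baseline could look like this.
def textrank_extractor(text):
    """
    Sketch: uses TextRank to extract the top 5 keywords from a text
    Arguments: text (str)
    Returns: list of keywords (list)
    """
    pos = {'NOUN', 'PROPN', 'ADJ', 'ADV'}
    extractor = pke.unsupervised.TextRank()
    extractor.load_document(text, language='en')
    extractor.candidate_selection(pos=pos)
    # unlike PositionRank, the random walk here is not biased by word position
    extractor.candidate_weighting(window=3, pos=pos)
    return [keyword for keyword, score in extractor.get_n_best(n=5)]

textrank_extractor(text)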
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from rake_nltk import Rake
def rake_extractor(text):
    """
    Uses Rake to extract the top 5 keywords from a text
    Arguments: text (str)
    Returns: list of keywords (list)
    """
    r = Rake()
    r.extract_keywords_from_text(text)
    return r.get_ranked_phrases()[:5]
rake_extractor(text)
"""# Deep-learning-based methods
The appearance of deep learning has enabled embedding-based methods. Researchers have developed several keyword extraction methods that use document embeddings and enable the model to be based on the semantic similarity.
# Keyword Extraction using BERT
It is a minimal and easy-to-use keyword extraction technique that leverages BERT embeddings.
1. Candidate Keywords/Keyphrases:
- Creating a list of candidate keywords or keyphrases from a document.
- CountVectorizer. This allows us to specify the length of the keywords and make them into keyphrases. It also is a nice method for quickly removing stop words.
2. Embedding:
- We use BERT for this purpose as it has shown great results for both semantic similarity and paraphrase.
3. Similarity:
- Find the candidates that are most similar to the document.
We will be using the cosine similarity between vectors
"""
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# 1. Extract candidate words/phrases
n_gram_range = (2, 2)
stop_words = "english"
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([text])
candidates = count.get_feature_names_out()
# 2. Embedding
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([text])
candidate_embeddings = model.encode(candidates)
# 3. Similarity
top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
keywords
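# A hedged sketch (not part of the original notebook): the plain cosine-similarity
# ranking above can return near-duplicate keyphrases. Maximal Marginal Relevance
# (MMR) trades off relevance to the document against similarity to the candidates
# already selected; this is the kind of diversification that KeyBERT's use_mmr
# option (used further below) performs. The helper below reuses the doc_embedding,
# candidate_embeddings and candidates computed above; the function name is ours.
import numpy as np

def mmr_keywords(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.5):
    """Sketch: select top_n candidates balancing relevance and diversity."""
    # similarity of each candidate to the document and to every other candidate
    doc_sim = cosine_similarity(candidate_embeddings, doc_embedding)  # shape (n, 1)
    cand_sim = cosine_similarity(candidate_embeddings)                # shape (n, n)
    # start from the single most relevant candidate
    selected = [int(np.argmax(doc_sim))]
    remaining = [i for i in range(len(candidates)) if i not in selected]
    while len(selected) < top_n and remaining:
        relevance = doc_sim[remaining, 0]
        redundancy = cand_sim[np.ix_(remaining, selected)].max(axis=1)
        mmr_scores = (1 - diversity) * relevance - diversity * redundancy
        best = remaining[int(np.argmax(mmr_scores))]
        selected.append(best)
        remaining.remove(best)
    return [candidates[i] for i in selected]

mmr_keywords(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.5)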
import torch
from keybert import KeyBERT
import pandas as pd
import numpy as np
import string
string.punctuation
import datetime
# date string (dd_mm_YYYY) that can be used in output file names
now = datetime.datetime.today().strftime('%d_%m_%Y')
# NLTK for NLP preprocessing
import nltk
stopwords = nltk.corpus.stopwords.words('english')
from nltk.tokenize import sent_tokenize, word_tokenize
# initialize KeyBERT model
kw_model = KeyBERT()
# remove punctuation from the text, lowercase it, and tokenize it
new_text = "".join([i for i in text if i not in string.punctuation])
new_text = new_text.lower()
new_text = word_tokenize(new_text)
# English stop word list (extra domain-specific words could be appended here)
stopwords = nltk.corpus.stopwords.words('english')
# remove stop words
new_text = [i for i in new_text if i not in stopwords]
# join the remaining tokens back into a single string
new_text = " ".join([i for i in new_text if i not in string.punctuation])
keywords = kw_model.extract_keywords(docs=new_text, keyphrase_ngram_range=(2, 2), stop_words='english', use_mmr=True, top_n=5, diversity=0.5)
results = []
for keyword, score in keywords:
    results.append(keyword)
results
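# A hedged usage sketch (not part of the original notebook): run the extractors
# defined above on the same abstract and print their keyphrases side by side for
# a quick qualitative comparison. Assumes the previous cells were run in order.
comparison = {
    'PositionRank': position_rank_extractor(text),
    'RAKE': rake_extractor(text),
    'KeyBERT (MMR)': results,
}
for method, keyphrases in comparison.items():
    print(f"{method}: {keyphrases}")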