From 6219419aa904e503fcbef7ed1e8e49d52bd71baa Mon Sep 17 00:00:00 2001 From: Hugo-ter-Doest Date: Fri, 9 Aug 2024 11:00:51 +0200 Subject: [PATCH] Bug repair and improvements --- examples/tokenizer/testSentenceTokenizer.js | 24 +++++++++++++++++ lib/natural/tfidf/index.d.ts | 1 + lib/natural/tokenizers/sentence_tokenizer.js | 27 +++++++++++--------- lib/natural/util/abbreviations_en.js | 2 ++ lib/natural/util/index.js | 1 + 5 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 examples/tokenizer/testSentenceTokenizer.js diff --git a/examples/tokenizer/testSentenceTokenizer.js b/examples/tokenizer/testSentenceTokenizer.js new file mode 100644 index 00000000..ebe5a005 --- /dev/null +++ b/examples/tokenizer/testSentenceTokenizer.js @@ -0,0 +1,24 @@ +const Tokenizer = require('../../lib/natural').SentenceTokenizer + +const abbreviations = require('../../lib/natural').abbreviations +const sentenceDemarkers = ['.', '!', '?'] +const tokenizer = new Tokenizer(abbreviations, sentenceDemarkers) + +const testData = `Breaking News: Renewable Energy on the Rise + +In recent years, the adoption of renewable energy sources has been on a significant rise. Governments around the world are investing heavily in solar, wind, and hydroelectric power to reduce their carbon footprints and combat climate change. + +In the United States, the Biden administration has set ambitious goals to achieve net-zero emissions by 2050. This involves a massive shift from fossil fuels to cleaner energy sources. "We are at a pivotal moment in history," said President Biden. "Our actions today will determine the health of our planet for future generations." + +Meanwhile, in Europe, the European Union has been at the forefront of renewable energy adoption. Countries like Germany and Denmark are leading the charge with substantial investments in wind farms and solar panels. The EU's Green Deal aims to make Europe the first climate-neutral continent by 2050. 
+ +China, the world's largest emitter of greenhouse gases, is also making strides in renewable energy. The country has become the largest producer of solar panels and has invested heavily in wind energy. "China is committed to a green future," said President Xi Jinping during a recent summit. + +Despite these advancements, challenges remain. The transition to renewable energy requires enormous financial investments, technological innovations, and policy changes. Additionally, the intermittency of renewable sources like solar and wind poses a challenge for grid stability. + +Experts believe that with continued global cooperation and investment, renewable energy can become the dominant source of power in the coming decades. "The future is bright for renewable energy," said Dr. Jane Goodall, a renowned environmentalist. "We have the technology, the resources, and the will to make this change. Now, we must act." + +Stay tuned for more updates on this developing story.` + +const result = tokenizer.tokenize(testData) +console.log(result) \ No newline at end of file diff --git a/lib/natural/tfidf/index.d.ts b/lib/natural/tfidf/index.d.ts index 8e227228..015b9c58 100644 --- a/lib/natural/tfidf/index.d.ts +++ b/lib/natural/tfidf/index.d.ts @@ -43,6 +43,7 @@ export class TfIdf { constructor (deserialized?: Record) idf (term: string, force?: boolean): number addDocument (document: string | string[] | Record, key?: Record | any, restoreCache?: boolean): void + removeDocument (key: any): boolean addFileSync (path: string, encoding?: string, key?: string, restoreCache?: boolean): void tfidf (terms: string | string[], d: number): number tfidfs (terms: string | string[], callback?: TfIdfCallback): number[] diff --git a/lib/natural/tokenizers/sentence_tokenizer.js b/lib/natural/tokenizers/sentence_tokenizer.js index a4dbeea4..3cba5fd5 100644 --- a/lib/natural/tokenizers/sentence_tokenizer.js +++ b/lib/natural/tokenizers/sentence_tokenizer.js @@ -31,7 +31,9 @@ const ABBREV = 
'ABBREV' const DEBUG = false function generateUniqueCode (base, index) { - return `${base}_${index}` + // Surround the placeholder with {{}} to prevent shorter numbers from being recognized + // inside larger numbers + return `{{${base}_${index}}}` } function escapeRegExp (string) { @@ -39,13 +41,12 @@ function escapeRegExp (string) { } class SentenceTokenizer extends Tokenizer { - constructor (abbreviations, sentenceDemarkers) { + constructor (abbreviations) { super() - this.abbreviations = abbreviations - if (sentenceDemarkers) { - this.sentenceDemarkers = sentenceDemarkers + if (abbreviations) { + this.abbreviations = abbreviations } else { - this.sentenceDemarkers = ['.', '!', '?'] + this.abbreviations = [] } this.replacementMap = null this.replacementCounter = 0 @@ -64,7 +65,10 @@ class SentenceTokenizer extends Tokenizer { } replaceAbbreviations (text) { - const pattern = new RegExp(`(${this.abbreviations.map(abbrev => escapeRegExp(abbrev)).join('|')})`, 'g') + if (this.abbreviations.length === 0) { + return text + } + const pattern = new RegExp(`(${this.abbreviations.map(abbrev => escapeRegExp(abbrev)).join('|')})`, 'gi') const replacedText = text.replace(pattern, match => { const code = generateUniqueCode(ABBREV, this.replacementCounter++) this.replacementMap.set(code, match) @@ -77,12 +81,11 @@ class SentenceTokenizer extends Tokenizer { replaceDelimitersWithPlaceholders (text) { // Regular expression for sentence delimiters optionally followed by a bracket or quote // Multiple delimiters with spaces in between are allowed - // The look ahead makes sure that there is punctuation symbol as next symbol - const delimiterPattern = /(?=[.?!…])([.?!… ]+)(["'”’)}\]]?)/g - - const modifiedText = text.replace(delimiterPattern, (match, p1, p2) => { + // The expression makes sure that the sentence delimiter group ends with a sentence delimiter + const delimiterPattern = /([.?!… ]*)([.?!…])(["'”’)}\]]?)/g; + const modifiedText = text.replace(delimiterPattern, (match, p1, p2, 
p3) => { const placeholder = generateUniqueCode(DELIM, this.replacementCounter++) - this.delimiterMap.set(placeholder, p1 + p2) + this.delimiterMap.set(placeholder, p1 + p2 + p3) return placeholder }) diff --git a/lib/natural/util/abbreviations_en.js b/lib/natural/util/abbreviations_en.js index 4252ec3b..5ff2b926 100644 --- a/lib/natural/util/abbreviations_en.js +++ b/lib/natural/util/abbreviations_en.js @@ -7,6 +7,8 @@ const knownAbbreviations = [ 'c/o', 'dept.', 'D.I.Y.', + 'Dr.', + 'e.g.', 'est.', 'E.T.A.', 'Inc.', diff --git a/lib/natural/util/index.js b/lib/natural/util/index.js index 1d0a9a83..79152f73 100644 --- a/lib/natural/util/index.js +++ b/lib/natural/util/index.js @@ -23,6 +23,7 @@ THE SOFTWARE. 'use strict' exports.stopwords = require('./stopwords').words +exports.abbreviations = require('./abbreviations_en').knownAbbreviations exports.ShortestPathTree = require('./shortest_path_tree') exports.LongestPathTree = require('./longest_path_tree') exports.DirectedEdge = require('./directed_edge')