Bug repair and improvements

NaturalNode · Aug 9, 2024 · 6219419 · 6219419
1 parent bcf56ae
commit 6219419
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 12 deletions.
diff --git a/examples/tokenizer/testSentenceTokenizer.js b/examples/tokenizer/testSentenceTokenizer.js
@@ -0,0 +1,24 @@
+const Tokenizer = require('../../lib/natural').SentenceTokenizer
+
+const abbreviations = require('../../lib/natural').abbreviations
+const sentenceDemarkers = ['.', '!', '?']
+const tokenizer = new Tokenizer(abbreviations, sentenceDemarkers)
+
+const testData = `Breaking News: Renewable Energy on the Rise
+
+In recent years, the adoption of renewable energy sources has been on a significant rise. Governments around the world are investing heavily in solar, wind, and hydroelectric power to reduce their carbon footprints and combat climate change.
+
+In the United States, the Biden administration has set ambitious goals to achieve net-zero emissions by 2050. This involves a massive shift from fossil fuels to cleaner energy sources. "We are at a pivotal moment in history," said President Biden. "Our actions today will determine the health of our planet for future generations."
+
+Meanwhile, in Europe, the European Union has been at the forefront of renewable energy adoption. Countries like Germany and Denmark are leading the charge with substantial investments in wind farms and solar panels. The EU's Green Deal aims to make Europe the first climate-neutral continent by 2050.
+
+China, the world's largest emitter of greenhouse gases, is also making strides in renewable energy. The country has become the largest producer of solar panels and has invested heavily in wind energy. "China is committed to a green future," said President Xi Jinping during a recent summit.
+
+Despite these advancements, challenges remain. The transition to renewable energy requires enormous financial investments, technological innovations, and policy changes. Additionally, the intermittency of renewable sources like solar and wind poses a challenge for grid stability.
+
+Experts believe that with continued global cooperation and investment, renewable energy can become the dominant source of power in the coming decades. "The future is bright for renewable energy," said Dr. Jane Goodall, a renowned environmentalist. "We have the technology, the resources, and the will to make this change. Now, we must act."
+
+Stay tuned for more updates on this developing story.`
+
+const result = tokenizer.tokenize(testData)
+console.log(result)
diff --git a/lib/natural/tfidf/index.d.ts b/lib/natural/tfidf/index.d.ts
@@ -43,6 +43,7 @@ export class TfIdf {
   constructor (deserialized?: Record<string, unknown>)
   idf (term: string, force?: boolean): number
   addDocument (document: string | string[] | Record<string, string>, key?: Record<string, any> | any, restoreCache?: boolean): void
+  removeDocument (key: any): boolean
   addFileSync (path: string, encoding?: string, key?: string, restoreCache?: boolean): void
   tfidf (terms: string | string[], d: number): number
   tfidfs (terms: string | string[], callback?: TfIdfCallback): number[]

diff --git a/lib/natural/tokenizers/sentence_tokenizer.js b/lib/natural/tokenizers/sentence_tokenizer.js
@@ -31,21 +31,22 @@ const ABBREV = 'ABBREV'
 const DEBUG = false
 
 function generateUniqueCode (base, index) {
-  return `${base}_${index}`
+  // Wrap the placeholder in {{ }} so that a code with a shorter index (e.g. ABBREV_1)
+  // is never matched inside a code with a longer index (e.g. ABBREV_10)
+  return `{{${base}_${index}}}`
 }
 
 function escapeRegExp (string) {
   return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')
 }
 
 class SentenceTokenizer extends Tokenizer {
-  constructor (abbreviations, sentenceDemarkers) {
+  constructor (abbreviations) {
     super()
-    this.abbreviations = abbreviations
-    if (sentenceDemarkers) {
-      this.sentenceDemarkers = sentenceDemarkers
+    if (abbreviations) {
+      this.abbreviations = abbreviations
     } else {
-      this.sentenceDemarkers = ['.', '!', '?']
+      this.abbreviations = []
     }
     this.replacementMap = null
     this.replacementCounter = 0
@@ -64,7 +65,10 @@ class SentenceTokenizer extends Tokenizer {
   }
 
   replaceAbbreviations (text) {
-    const pattern = new RegExp(`(${this.abbreviations.map(abbrev => escapeRegExp(abbrev)).join('|')})`, 'g')
+    if (this.abbreviations.length === 0) {
+      return text
+    }
+    const pattern = new RegExp(`(${this.abbreviations.map(abbrev => escapeRegExp(abbrev)).join('|')})`, 'gi')
     const replacedText = text.replace(pattern, match => {
       const code = generateUniqueCode(ABBREV, this.replacementCounter++)
       this.replacementMap.set(code, match)
@@ -77,12 +81,11 @@ class SentenceTokenizer extends Tokenizer {
   replaceDelimitersWithPlaceholders (text) {
     // Regular expression for sentence delimiters optionally followed by a bracket or quote
     // Multiple delimiters with spaces in between are allowed
-    // The look ahead makes sure that there is punctuation symbol as next symbol
-    const delimiterPattern = /(?=[.?!…])([.?!… ]+)(["'”’)}\]]?)/g
-
-    const modifiedText = text.replace(delimiterPattern, (match, p1, p2) => {
+    // The pattern requires each run of delimiters and spaces to end in a delimiter character, not a space
+    const delimiterPattern = /([.?!… ]*)([.?!…])(["'”’)}\]]?)/g;
+    const modifiedText = text.replace(delimiterPattern, (match, p1, p2, p3) => {
       const placeholder = generateUniqueCode(DELIM, this.replacementCounter++)
-      this.delimiterMap.set(placeholder, p1 + p2)
+      this.delimiterMap.set(placeholder, p1 + p2 + p3)
       return placeholder
     })
 

diff --git a/lib/natural/util/abbreviations_en.js b/lib/natural/util/abbreviations_en.js
@@ -7,6 +7,8 @@ const knownAbbreviations = [
   'c/o',
   'dept.',
   'D.I.Y.',
+  'Dr.',
+  'e.g.',
   'est.',
   'E.T.A.',
   'Inc.',

diff --git a/lib/natural/util/index.js b/lib/natural/util/index.js
@@ -23,6 +23,7 @@ THE SOFTWARE.
 'use strict'
 
 exports.stopwords = require('./stopwords').words
+exports.abbreviations = require('./abbreviations_en').knownAbbreviations
 exports.ShortestPathTree = require('./shortest_path_tree')
 exports.LongestPathTree = require('./longest_path_tree')
 exports.DirectedEdge = require('./directed_edge')