diff --git a/README.md b/README.md index b8589b7..3e01dd0 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ const newString = sw.removeStopwords(oldString, [ 'even', 'a', 'custom', 'stopwo ### <language code> -Arrays of stopwords for the following 27 languages are supplied: +Arrays of stopwords for the following 28 languages are supplied: * `ar` - Modern Standard Arabic * `bn` - Bengali @@ -75,6 +75,7 @@ Arrays of stopwords for the following 27 languages are supplied: * `st` - Sotho * `sv` - Swedish * `sw` - Swahili +* `vi` - Vietnamese * `yo` - Yoruba * `zh` - Chinese Simplified * `zu` - Zulu diff --git a/lib/stopword.js b/lib/stopword.js index c59b2c0..937b30c 100644 --- a/lib/stopword.js +++ b/lib/stopword.js @@ -35,6 +35,7 @@ exports.so = require('./stopwords_so.js').words exports.st = require('./stopwords_st.js').words exports.sv = require('./stopwords_sv.js').words exports.sw = require('./stopwords_sw.js').words +exports.vi = require('./stopwords_vi.js').words exports.yo = require('./stopwords_yo.js').words exports.zh = require('./stopwords_zh.js').words exports.zu = require('./stopwords_zu.js').words diff --git a/lib/stopwords_vi.js b/lib/stopwords_vi.js new file mode 100644 index 0000000..1603455 --- /dev/null +++ b/lib/stopwords_vi.js @@ -0,0 +1,35 @@ +/* +Copyright (c) 2011, David Przybilla, Chris Umbel +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +// a list of commonly used words that have little meaning and can be excluded +// from analysis. Each word is listed exactly once. +var words = [ + 'bị', 'bởi', 'cả', 'các', 'cái', 'cần', 'càng', 'chỉ', + 'chiếc', 'cho', 'chứ', 'chưa', 'chuyện', 'có', 'có thể', + 'cứ', 'của', 'cùng', 'cũng', 'đã', 'đang', 'để', 'đến nỗi', + 'đều', 'điều', 'do', 'đó', 'được', 'dưới', 'gì', 'khi', + 'không', 'là', 'lại', 'lên', 'lúc', 'mà', 'mỗi', 'một cách', + 'này', 'nên', 'nếu', 'ngay', 'nhiều', 'như', 'nhưng', 'những', + 'nơi', 'nữa', 'phải', 'qua', 'ra', 'rằng', 'rất', 'rồi', + 'sau', 'sẽ', 'so', 'sự', 'tại', 'theo', 'thì', 'trên', 'trước', 'từ', 'từng', 'và', + 'vẫn', 'vào', 'vậy', 'vì', 'việc', 'với', 'vừa', 'vâng', 'à', 'ừ' +] + +// tell the world about the noise words. +exports.words = words \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index f840264..37366e7 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,6 +1,6 @@ { "name": "stopword", - "version": "0.1.17", + "version": "0.2.2", "lockfileVersion": 1, "requires": true, "dependencies": { diff --git a/package.json b/package.json index 9b58dcf..2eb4f03 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "stopword", - "version": "0.2.1", - "description": "A module for node.js that takes in text and returns text that is stripped of stopwords. 
Has pre-defined stopword lists for 27 languages and also takes lists with custom stopwords as input.", + "version": "0.2.2", + "description": "A module for node.js that takes in text and returns text that is stripped of stopwords. Has pre-defined stopword lists for 28 languages and also takes lists with custom stopwords as input.", "main": "lib/stopword.js", "scripts": { "test": "mocha" diff --git a/test/test.js b/test/test.js index ce81930..f9f7475 100644 --- a/test/test.js +++ b/test/test.js @@ -182,6 +182,12 @@ describe('general stopwordiness:', function () { newString.should.eql(['celcelis', 'qaarada', 'antarktika', 'tan', 'qaboow', 'qalalsan', 'ee', 'dabaysha', 'badan', 'qaaradaha', 'caalamka']) }) + it('should remove vietnamese stopwords', function () { + const oldString = 'Đà Lạt luôn hiện lên như một thành phố nghỉ dưỡng miền núi kiểu mẫu với cảnh quan thiên nhiên tươi đẹp'.split(' ') + const newString = sw.removeStopwords(oldString, sw.vi) + newString.should.eql(['Đà', 'Lạt', 'luôn', 'hiện', 'một', 'thành', 'phố', 'nghỉ', 'dưỡng', 'miền', 'núi', 'kiểu', 'mẫu', 'cảnh', 'quan', 'thiên', 'nhiên', 'tươi', 'đẹp']) + }) + // Right to Left languages