Merge pull request #65 from bact/add-license-spdx
Add SPDX header and CITATION
bact authored Nov 9, 2024
2 parents 0225e5c + fbc02c4 commit 9b3ff64
Showing 16 changed files with 147 additions and 46 deletions.
30 changes: 30 additions & 0 deletions CITATION.cff
@@ -0,0 +1,30 @@
cff-version: "1.2.0"
title: "nlpO3"
message: >-
  If you use this software, please cite it using these
  metadata.
type: software
authors:
  - family-names: Suntorntip
    given-names: Thanathip
repository-code: "https://github.com/PyThaiNLP/nlpo3/"
repository: "https://github.com/PyThaiNLP/nlpo3/"
url: "https://github.com/PyThaiNLP/nlpo3/"
abstract: "Thai natural language processing library in Rust, with Python and Node bindings. Formerly oxidized-thainlp."
keywords:
- "tokenizer"
- "tokenization"
- "Thai"
- "natural language processing"
- "NLP"
- "Rust"
- "Node.js"
- "Node"
- "Python"
- "text processing"
- "word segmentation"
- "Thai language"
- "Thai NLP"
license: Apache-2.0
version: v1.3.2
date-released: "2023-04-14"
25 changes: 18 additions & 7 deletions README.md
@@ -1,15 +1,19 @@
---
SPDX-FileCopyrightText: 2024 PyThaiNLP Project
SPDX-License-Identifier: Apache-2.0
---

# nlpO3

Thai Natural Language Processing library in Rust,
Thai natural language processing library in Rust,
with Python and Node bindings. Formerly oxidized-thainlp.

## Features

- Thai word tokenizer
- use maximal-matching dictionary-based tokenization algorithm and honor Thai Character Cluster boundaries
- Use maximal-matching dictionary-based tokenization algorithm and honor Thai Character Cluster boundaries
- [2.5x faster](https://github.com/PyThaiNLP/nlpo3/blob/main/nlpo3-python/notebooks/nlpo3_segment_benchmarks.ipynb) than similar pure Python implementation (PyThaiNLP's newmm)
- load a dictionary from a plain text file (one word per line) or from `Vec<String>`

- Load a dictionary from a plain text file (one word per line) or from `Vec<String>`

## Dictionary file

@@ -19,7 +23,6 @@ with Python and Node bindings. Formerly oxidized-thainlp.
- [words_th.tx](https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th.txt) from [PyThaiNLP](https://github.com/PyThaiNLP/pythainlp/) - around 62,000 words (CC0)
- [word break dictionary](https://github.com/tlwg/libthai/tree/master/data) from [libthai](https://github.com/tlwg/libthai/) - consists of dictionaries in different categories, with make script (LGPL-2.1)


## Usage

### Command-line interface
@@ -31,6 +34,7 @@ echo "ฉันกินข้าว" | nlpo3 segment
```

### Bindings

- [Node.js](nlpo3-nodejs/)
- [Python](nlpo3-python/) <a href="https://pypi.python.org/pypi/nlpo3"><img alt="pypi" src="https://img.shields.io/pypi/v/nlpo3.svg"/></a>

@@ -42,6 +46,7 @@ segment("สวัสดีครับ", "dict_name")
```

### As Rust library

<a href="https://crates.io/crates/nlpo3/"><img alt="crates.io" src="https://img.shields.io/crates/v/nlpo3.svg"/></a>

In `Cargo.toml`:
@@ -54,6 +59,7 @@ nlpo3 = "1.3.2"

Create a tokenizer using a dictionary from file,
then use it to tokenize a string (safe mode = true, and parallel mode = false):

```rust
use nlpo3::tokenizer::newmm::NewmmTokenizer;
use nlpo3::tokenizer::tokenizer_trait::Tokenizer;
@@ -63,17 +69,20 @@ let tokens = tokenizer.segment("ห้องสมุดประชาชน",
```

Create a tokenizer using a dictionary from a vector of Strings:

```rust
let words = vec!["ปาลิเมนต์".to_string(), "คอนสติติวชั่น".to_string()];
let tokenizer = NewmmTokenizer::from_word_list(words);
```

Add words to an existing tokenizer:

```rust
tokenizer.add_word(&["มิวเซียม"]);
```

Remove words from an existing tokenizer:

```rust
tokenizer.remove_word(&["กระเพรา", "ชานชลา"]);
```
@@ -87,27 +96,29 @@ tokenizer.remove_word(&["กระเพรา", "ชานชลา"]);
### Steps

Generic test:

```bash
cargo test
```

Build API document and open it to check:

```bash
cargo doc --open
```

Build (remove `--release` to keep debug information):

```bash
cargo build --release
```

Check `target/` for build artifacts.


## Development documents

- [Notes on custom string](src/NOTE_ON_STRING.md)

## Issues

Please report issues at https://github.com/PyThaiNLP/nlpo3/issues
Please report issues at <https://github.com/PyThaiNLP/nlpo3/issues>
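
Taken together, the Rust fragments in this README assemble into a short end-to-end program. A minimal sketch, assuming the `NewmmTokenizer::new(path)` constructor and the `segment(text, safe, parallel)` call shown in the fragments above, with a placeholder dictionary path:

```rust
use nlpo3::tokenizer::newmm::NewmmTokenizer;
use nlpo3::tokenizer::tokenizer_trait::Tokenizer;

fn main() {
    // Placeholder path: any one-word-per-line dictionary file works,
    // such as the words_th.txt linked in the "Dictionary file" section.
    let tokenizer = NewmmTokenizer::new("path/to/words_th.txt");
    // safe mode = true, parallel mode = false, as in the README example.
    let tokens = tokenizer
        .segment("ห้องสมุดประชาชน", true, false)
        .expect("tokenization failed");
    println!("{:?}", tokens);
}
```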
19 changes: 12 additions & 7 deletions src/NOTE_ON_STRING.md
@@ -1,24 +1,29 @@
---
SPDX-FileCopyrightText: 2024 PyThaiNLP Project
SPDX-License-Identifier: Apache-2.0
---

# Why Use Handroll Bytes Slice As "CustomString" Instead of Rust String?

Rust String (and &str) is actually a slice of valid UTF-8 bytes which is
Rust `String` (and `&str`) is actually a slice of valid UTF-8 bytes which is
variable-length. It has no way of accessing a random index UTF-8 "character"
with O(1) time complexity.
with O(1) time complexity.

This means any algorithm with operations based on "character" index position
will be horribly slow on Rust String.

Hence, "fixed_bytes_str" which is transformed from a slice of valid UTF-8
Hence, `fixed_bytes_str` which is transformed from a slice of valid UTF-8
bytes into a slice of 4-bytes length - padded left with 0.

Consequently, regular expressions must be padded with \x00 for each unicode
Consequently, regular expressions must be padded with `\x00` for each Unicode
character to have 4 bytes.

Thai characters are 3-bytes length, so every Thai char in regex is padded
with \x00 one time.

For "space" in regex, it is padded with \x00\x00\x00.
with `\x00` one time.

For "space" in regex, it is padded with `\x00\x00\x00`.

## References

- [Rust String indexing and internal representation](https://doc.rust-lang.org/book/ch08-02-strings.html#indexing-into-strings)
- Read more about [UTF-8](https://en.wikipedia.org/wiki/UTF-8) at Wikipedia.
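
The padding scheme this note describes fits in a few lines of Rust. A standalone sketch (the helper name `to_four_byte_padded` is hypothetical, not part of nlpO3's API):

```rust
/// Illustrative helper: every UTF-8 character becomes exactly
/// 4 bytes, left-padded with 0x00.
fn to_four_byte_padded(s: &str) -> Vec<u8> {
    let mut out = Vec::with_capacity(s.chars().count() * 4);
    for ch in s.chars() {
        let mut cell = [0u8; 4]; // zero left-padding
        let len = ch.len_utf8();
        ch.encode_utf8(&mut cell[4 - len..]);
        out.extend_from_slice(&cell);
    }
    out
}

fn main() {
    // "ก" is 3 bytes in UTF-8 (one 0x00 pad); a space is 1 byte (three pads).
    let padded = to_four_byte_padded("ก ");
    assert_eq!(padded, [0x00, 0xE0, 0xB8, 0x81, 0x00, 0x00, 0x00, b' ']);
    // Character i now lives at byte offset i * 4, giving O(1) indexing.
    println!("{:02x?}", padded);
}
```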
3 changes: 3 additions & 0 deletions src/four_bytes_str.rs
@@ -1,2 +1,5 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

pub mod custom_regex;
pub mod custom_string;
12 changes: 9 additions & 3 deletions src/four_bytes_str/custom_regex.rs
@@ -1,7 +1,13 @@
// This is a result of an attempt to create a formatter
// which translates normal, human readable thai regex
// into 4-bytes zero-left-pad bytes regex pattern string
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

/**
* Regex for a custom four-byte string.
*
* This is a result of an attempt to create a formatter
* which translates normal, human readable thai regex
* into 4-bytes zero-left-pad bytes regex pattern string
*/
use anyhow::{Error as AnyError, Result};
use regex_syntax::{
hir::{Anchor, Class, Group, Literal as LiteralEnum, Repetition},
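
To make the header comment concrete: in the 4-byte representation, a pattern for the Thai character "เ" (3 bytes in UTF-8) must also match its single pad byte. A minimal sketch using the `regex` crate this module already imports (illustrative only; nlpO3 generates such patterns through `regex_pattern_to_custom_pattern`):

```rust
use regex::bytes::Regex;

fn main() {
    // "เ" (U+0E40) is E0 B9 80 in UTF-8, so its 4-byte cell carries one
    // 0x00 pad, and the pattern spells the pad out. (?-u) makes the
    // pattern match raw bytes rather than Unicode codepoints.
    let re = Regex::new(r"(?-u)\x00\xE0\xB9\x80").unwrap();
    let cell: &[u8] = &[0x00, 0xE0, 0xB9, 0x80];
    assert!(re.is_match(cell));
}
```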
9 changes: 7 additions & 2 deletions src/four_bytes_str/custom_string.rs
@@ -1,5 +1,10 @@
/// Functions dealing with a custom four-byte string.
/// For more details, see src/NOTE_ON_STRING.md
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

/**
* Functions dealing with a custom four-byte string.
* For more details, see src/NOTE_ON_STRING.md
*/
use std::{
error::{self, Error},
fmt::Display,
3 changes: 3 additions & 0 deletions src/lib.rs
@@ -1,2 +1,5 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

mod four_bytes_str;
pub mod tokenizer;
3 changes: 3 additions & 0 deletions src/tokenizer.rs
@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

mod dict_reader;
pub mod newmm;
pub(crate) mod tcc;
6 changes: 6 additions & 0 deletions src/tokenizer/dict_reader.rs
@@ -1,3 +1,9 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

/**
* Dictionary reader.
*/
use crate::four_bytes_str::custom_string::CustomString;

use super::trie_char::TrieChar as Trie;
27 changes: 15 additions & 12 deletions src/tokenizer/newmm.rs
@@ -1,15 +1,18 @@
/**
Dictionary-based maximal matching word segmentation, constrained with
Thai Character Cluster (TCC) boundaries.
The code is based on the notebooks created by Korakot Chaovavanich,
with heuristic graph size limit added to avoid exponential wait time.
:See Also:
* \
https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/tokenize/newmm.py
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

Rust implementation: ["Thanathip Suntorntip"]
/**
* Dictionary-based maximal matching word segmentation, constrained with
* Thai Character Cluster (TCC) boundaries.
*
* The code is based on the notebooks created by Korakot Chaovavanich,
* with heuristic graph size limit added to avoid exponential wait time.
*
* :See Also:
* * \
* https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/tokenize/newmm.py
*
* Rust implementation: ["Thanathip Suntorntip"]
*/
use std::{collections::VecDeque, error::Error, fmt::Display, path::PathBuf};

@@ -167,7 +170,7 @@ impl NewmmTokenizer {

fn one_cut<'a>(
input: &'a CustomString,
custom_dict: & Trie,
custom_dict: &Trie,
) -> AnyResult<Vec<&'a CustomStringBytesSlice>> {
let text = input;
let input_char_len = text.chars_len();
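
As background for the header above: "maximal matching" picks, among all dictionary-consistent segmentations, one with the fewest words. A toy dynamic-programming sketch of that idea (it omits the TCC-boundary constraint and the graph-size limit the real module adds):

```rust
use std::collections::HashSet;

// Toy maximal matching: fewest dictionary words covering the text.
fn segment(text: &[char], dict: &HashSet<String>, max_len: usize) -> Option<Vec<String>> {
    let n = text.len();
    // best[i] = Some((word count, previous cut)) for text[..i]
    let mut best: Vec<Option<(usize, usize)>> = vec![None; n + 1];
    best[0] = Some((0, 0));
    for i in 1..=n {
        for j in i.saturating_sub(max_len)..i {
            if let Some((count, _)) = best[j] {
                let word: String = text[j..i].iter().collect();
                if dict.contains(&word) && best[i].map_or(true, |(c, _)| count + 1 < c) {
                    best[i] = Some((count + 1, j));
                }
            }
        }
    }
    // Walk the recorded cut points back to recover the tokens.
    let (mut cuts, mut i) = (vec![n], n);
    while i > 0 {
        let (_, j) = best[i]?; // None: no segmentation exists
        cuts.push(j);
        i = j;
    }
    cuts.reverse();
    Some(cuts.windows(2).map(|w| text[w[0]..w[1]].iter().collect::<String>()).collect())
}

fn main() {
    let dict: HashSet<String> = ["ตา", "กลม", "ตาก", "ลม"].iter().map(|s| s.to_string()).collect();
    let text: Vec<char> = "ตากลม".chars().collect();
    // "ตากลม" splits as ตา|กลม or ตาก|ลม; both use two words.
    println!("{:?}", segment(&text, &dict, 3));
}
```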
5 changes: 4 additions & 1 deletion src/tokenizer/tcc.rs
@@ -1,2 +1,5 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

pub(crate) mod tcc_rules;
pub(crate) mod tcc_tokenizer;
pub(crate) mod tcc_rules;
8 changes: 7 additions & 1 deletion src/tokenizer/tcc/tcc_rules.rs
@@ -1,3 +1,9 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

/**
* Rules for TCC (Thai Character Cluster) tokenization.
*/
use crate::four_bytes_str::custom_regex::regex_pattern_to_custom_pattern;
use lazy_static::lazy_static;
use regex::bytes::Regex;
@@ -132,7 +138,7 @@ fn tcc_regex_test_cases() {
let case_20 = replace_tcc_symbol("^แccc์");
let case_21 = replace_tcc_symbol("^โctะ");
let case_22 = replace_tcc_symbol("^[เ-ไ]ct");

// This is the only Karan case.
assert_eq!(
regex_pattern_to_custom_pattern(&case_1).unwrap(),
7 changes: 6 additions & 1 deletion src/tokenizer/tcc/tcc_tokenizer.rs
@@ -1,3 +1,9 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

/**
* TCC (Thai Character Cluster) tokenizer.
*/
use super::tcc_rules::{LOOKAHEAD_TCC, NON_LOOKAHEAD_TCC};

use crate::four_bytes_str::custom_string::{
@@ -17,7 +23,6 @@ Credits:
* Rust Code Translation: Thanathip Suntorntip
*/


/// Returns a set of "character" indice at the end of each token
pub fn tcc_pos(custom_text_type: &CustomStringBytesSlice) -> HashSet<usize> {
let mut set: HashSet<usize> = HashSet::default();
3 changes: 3 additions & 0 deletions src/tokenizer/tokenizer_trait.rs
@@ -1,3 +1,6 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

use anyhow::Result as AnyResult;

pub trait Tokenizer {
27 changes: 15 additions & 12 deletions src/tokenizer/trie_char.rs
@@ -1,15 +1,18 @@
///This module is meant to be a direct implementation of Dict Trie in PythaiNLP.
///
///Many functions are implemented as a recursive function because of the limits imposed by
///Rust Borrow Checker and this author's (Thanathip) little experience.
///
///Rust Code: Thanathip Suntorntip (Gorlph)
///
/// For basic information of trie, visit this wikipedia page https://en.wikipedia.org/wiki/Trie



// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

/**
* This module is meant to be a direct implementation of Dict Trie in PyThaiNLP.
*
* Many functions are implemented as a recursive function
* because of the limits imposed by Rust Borrow Checker and
* this author's (Thanathip) little experience.
*
* Rust Code: Thanathip Suntorntip (Gorlph)
*
* For basic information of trie, visit this wikipedia page
* https://en.wikipedia.org/wiki/Trie
*/
use crate::four_bytes_str::custom_string::{
CustomString, CustomStringBytesSlice, CustomStringBytesVec, FixedCharsLengthByteSlice,
};
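
For readers following the module header: the shape of a dictionary trie, and the prefix query a maximal-matching tokenizer runs at every text position, look roughly like this. A self-contained sketch over Rust `char`s (nlpO3's `TrieChar` works on its custom 4-byte characters instead):

```rust
use std::collections::HashMap;

#[derive(Default)]
struct TrieNode {
    children: HashMap<char, TrieNode>,
    is_word: bool,
}

impl TrieNode {
    fn insert(&mut self, word: &str) {
        let mut node = self;
        for ch in word.chars() {
            node = node.children.entry(ch).or_default();
        }
        node.is_word = true;
    }

    // Lengths (in chars) of all dictionary words that are prefixes of
    // `text`; this is the query asked at every position during segmentation.
    fn prefix_lengths(&self, text: &str) -> Vec<usize> {
        let (mut node, mut lengths) = (self, Vec::new());
        for (i, ch) in text.chars().enumerate() {
            match node.children.get(&ch) {
                Some(next) => {
                    node = next;
                    if node.is_word {
                        lengths.push(i + 1);
                    }
                }
                None => break,
            }
        }
        lengths
    }
}

fn main() {
    let mut trie = TrieNode::default();
    trie.insert("ตา");
    trie.insert("ตาก");
    println!("{:?}", trie.prefix_lengths("ตากลม")); // [2, 3]
}
```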
6 changes: 6 additions & 0 deletions tests/test_tokenizer.rs
@@ -1,3 +1,9 @@
// SPDX-FileCopyrightText: 2024 PyThaiNLP Project
// SPDX-License-Identifier: Apache-2.0

/**
* Test the NewmmTokenizer with the default dictionary.
*/
use nlpo3::tokenizer::newmm::NewmmTokenizer;
use nlpo3::tokenizer::tokenizer_trait::Tokenizer;

