From 183748cc5320db0ead862da26c58d9ae94f79315 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 19 Jan 2026 14:51:16 +0000 Subject: [PATCH 01/27] Initial plan From 9654117d36edf6ce20c4f0c31db81d32012006bd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 19 Jan 2026 14:56:20 +0000 Subject: [PATCH 02/27] Add Qwen3 language model support to pythainlp.lm Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- pyproject.toml | 2 + pythainlp/lm/__init__.py | 14 ++- pythainlp/lm/qwen3.py | 195 +++++++++++++++++++++++++++++++++++++++ tests/core/test_lm.py | 56 ++++++++--- 4 files changed, 253 insertions(+), 14 deletions(-) create mode 100644 pythainlp/lm/qwen3.py diff --git a/pyproject.toml b/pyproject.toml index 059b68b31..e7c81f9cc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -143,6 +143,8 @@ wtp = ["transformers>=4.22.1", "wtpsplit>=1.0.1"] wunsen = ["wunsen>=0.0.3"] +qwen3 = ["torch>=1.0.0", "transformers>=4.22.1"] + # Compact dependencies - safe small set of optional dependencies compact = [ "nlpo3>=1.3.1", diff --git a/pythainlp/lm/__init__.py b/pythainlp/lm/__init__.py index 259f101d2..4440e50e6 100644 --- a/pythainlp/lm/__init__.py +++ b/pythainlp/lm/__init__.py @@ -2,9 +2,21 @@ # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -__all__ = ["calculate_ngram_counts", "remove_repeated_ngrams"] +__all__ = ["calculate_ngram_counts", "remove_repeated_ngrams", "Qwen3"] from pythainlp.lm.text_util import ( calculate_ngram_counts, remove_repeated_ngrams, ) + +try: + from pythainlp.lm.qwen3 import Qwen3 +except ImportError: + # If dependencies are not installed, make Qwen3 available but raise + # error when instantiated + class Qwen3: # type: ignore + def __init__(self): + raise ImportError( + "Qwen3 requires additional dependencies. " + "Install with: pip install pythainlp[qwen3]" + ) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py new file mode 100644 index 000000000..846c89b16 --- /dev/null +++ b/pythainlp/lm/qwen3.py @@ -0,0 +1,195 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + +import torch + + +class Qwen3: + """Qwen3-0.6B language model for Thai text generation. + + A small but capable language model from Alibaba Cloud's Qwen family, + optimized for various NLP tasks including Thai language processing. + """ + + def __init__(self): + self.model = None + self.tokenizer = None + self.device = None + self.torch_dtype = None + self.model_path = None + + def load_model( + self, + model_path: str = "Qwen/Qwen3-0.6B", + device: str = "cuda", + torch_dtype=torch.float16, + low_cpu_mem_usage: bool = True, + ): + """Load Qwen3 model. 
+ + :param str model_path: model path or HuggingFace model ID + :param str device: device (cpu, cuda or other) + :param torch_dtype torch_dtype: torch dtype + :param bool low_cpu_mem_usage: low cpu mem usage + + :Example: + :: + + from pythainlp.lm import Qwen3 + import torch + + model = Qwen3() + model.load_model(device="cpu", torch_dtype=torch.bfloat16) + """ + from transformers import AutoModelForCausalLM, AutoTokenizer + + self.device = device + self.torch_dtype = torch_dtype + self.model_path = model_path + + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + self.model = AutoModelForCausalLM.from_pretrained( + self.model_path, + device_map=device, + torch_dtype=torch_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + + def generate( + self, + text: str, + max_new_tokens: int = 512, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50, + do_sample: bool = True, + skip_special_tokens: bool = True, + ) -> str: + """Generate text from a prompt. + + :param str text: input text prompt + :param int max_new_tokens: maximum number of new tokens to generate + :param float temperature: temperature for sampling (higher = more random) + :param float top_p: top p for nucleus sampling + :param int top_k: top k for top-k sampling + :param bool do_sample: whether to use sampling or greedy decoding + :param bool skip_special_tokens: skip special tokens in output + :return: generated text + :rtype: str + + :Example: + :: + + from pythainlp.lm import Qwen3 + import torch + + model = Qwen3() + model.load_model(device="cpu", torch_dtype=torch.bfloat16) + + result = model.generate("สวัสดี") + print(result) + """ + if self.model is None or self.tokenizer is None: + raise RuntimeError( + "Model not loaded. Please call load_model() first." + ) + + inputs = self.tokenizer(text, return_tensors="pt") + input_ids = inputs["input_ids"].to(self.device) + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + max_new_tokens=max_new_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + do_sample=do_sample, + ) + + # Decode only the newly generated tokens + generated_text = self.tokenizer.decode( + output_ids[0][len(input_ids[0]) :], + skip_special_tokens=skip_special_tokens, + ) + + return generated_text + + def chat( + self, + messages: list[dict[str, str]], + max_new_tokens: int = 512, + temperature: float = 0.7, + top_p: float = 0.9, + top_k: int = 50, + do_sample: bool = True, + skip_special_tokens: bool = True, + ) -> str: + """Generate text using chat format. + + :param list messages: list of message dictionaries with 'role' and 'content' keys + :param int max_new_tokens: maximum number of new tokens to generate + :param float temperature: temperature for sampling + :param float top_p: top p for nucleus sampling + :param int top_k: top k for top-k sampling + :param bool do_sample: whether to use sampling + :param bool skip_special_tokens: skip special tokens in output + :return: generated response + :rtype: str + + :Example: + :: + + from pythainlp.lm import Qwen3 + import torch + + model = Qwen3() + model.load_model(device="cpu", torch_dtype=torch.bfloat16) + + messages = [{"role": "user", "content": "สวัสดีครับ"}] + response = model.chat(messages) + print(response) + """ + if self.model is None or self.tokenizer is None: + raise RuntimeError( + "Model not loaded. Please call load_model() first." 
+ ) + + # Apply chat template if available, otherwise format manually + if hasattr(self.tokenizer, "apply_chat_template"): + text = self.tokenizer.apply_chat_template( + messages, + tokenize=False, + add_generation_prompt=True, + ) + else: + # Simple fallback format + text = "" + for msg in messages: + role = msg.get("role", "user") + content = msg.get("content", "") + text += f"{role}: {content}\n" + text += "assistant: " + + inputs = self.tokenizer(text, return_tensors="pt") + input_ids = inputs["input_ids"].to(self.device) + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + max_new_tokens=max_new_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + do_sample=do_sample, + ) + + # Decode only the newly generated tokens + generated_text = self.tokenizer.decode( + output_ids[0][len(input_ids[0]) :], + skip_special_tokens=skip_special_tokens, + ) + + return generated_text diff --git a/tests/core/test_lm.py b/tests/core/test_lm.py index ec182f187..3c2ed8e7d 100644 --- a/tests/core/test_lm.py +++ b/tests/core/test_lm.py @@ -4,30 +4,60 @@ import unittest -from pythainlp.lm import calculate_ngram_counts, remove_repeated_ngrams +from pythainlp.lm import Qwen3, calculate_ngram_counts, remove_repeated_ngrams class LMTestCase(unittest.TestCase): def test_calculate_ngram_counts(self): self.assertEqual( - calculate_ngram_counts(['1', '2', '3', '4']), + calculate_ngram_counts(["1", "2", "3", "4"]), { - ('1', '2'): 1, - ('2', '3'): 1, - ('3', '4'): 1, - ('1', '2', '3'): 1, - ('2', '3', '4'): 1, - ('1', '2', '3', '4'): 1 - } + ("1", "2"): 1, + ("2", "3"): 1, + ("3", "4"): 1, + ("1", "2", "3"): 1, + ("2", "3", "4"): 1, + ("1", "2", "3", "4"): 1, + }, ) def test_remove_repeated_ngrams(self): - texts = ['เอา', 'เอา', 'แบบ', 'แบบ', 'แบบ', 'ไหน'] + texts = ["เอา", "เอา", "แบบ", "แบบ", "แบบ", "ไหน"] self.assertEqual( - remove_repeated_ngrams(texts, n=1), - ['เอา', 'แบบ', 'ไหน'] + remove_repeated_ngrams(texts, n=1), ["เอา", "แบบ", "ไหน"] ) self.assertEqual( remove_repeated_ngrams(texts, n=2), - ['เอา', 'เอา', 'แบบ', 'แบบ', 'ไหน'] + ["เอา", "เอา", "แบบ", "แบบ", "ไหน"], ) + + def test_qwen3_initialization(self): + # Test that Qwen3 can be instantiated + try: + model = Qwen3() + self.assertIsNotNone(model) + self.assertIsNone(model.model) + self.assertIsNone(model.tokenizer) + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") + + def test_qwen3_generate_without_load(self): + # Test that generate raises error when model is not loaded + try: + model = Qwen3() + with self.assertRaises(RuntimeError): + model.generate("test") + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") + + def test_qwen3_chat_without_load(self): + # Test that chat raises error when model is not loaded + try: + model = Qwen3() + with self.assertRaises(RuntimeError): + model.chat([{"role": "user", "content": "test"}]) + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") From 0e4b368167da944b65f62f7166fc89501d668e81 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 19 Jan 2026 14:59:24 +0000 Subject: [PATCH 03/27] Address code review feedback: Fix type annotations and docstrings Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- pythainlp/lm/qwen3.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git 
a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 846c89b16..3cb0532ea 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from typing import Any + import torch @@ -31,7 +33,7 @@ def load_model( :param str model_path: model path or HuggingFace model ID :param str device: device (cpu, cuda or other) - :param torch_dtype torch_dtype: torch dtype + :param torch.dtype torch_dtype: torch dtype :param bool low_cpu_mem_usage: low cpu mem usage :Example: @@ -119,7 +121,7 @@ def generate( def chat( self, - messages: list[dict[str, str]], + messages: list[dict[str, Any]], max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9, @@ -129,7 +131,7 @@ def chat( ) -> str: """Generate text using chat format. - :param list messages: list of message dictionaries with 'role' and 'content' keys + :param list[dict[str, Any]] messages: list of message dictionaries with 'role' and 'content' keys :param int max_new_tokens: maximum number of new tokens to generate :param float temperature: temperature for sampling :param float top_p: top p for nucleus sampling From e53c7777ab6726e0738bd0da2aa15038f2c546cd Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:01:35 +0000 Subject: [PATCH 04/27] Fix device validation and improve docstring clarity Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- pythainlp/lm/qwen3.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 3cb0532ea..2d758968a 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -33,7 +33,7 @@ def load_model( :param str model_path: model path or HuggingFace model ID :param str device: device (cpu, cuda or other) - :param torch.dtype torch_dtype: torch dtype + :param torch_dtype: torch data type (e.g., torch.float16, torch.bfloat16) :param bool low_cpu_mem_usage: low cpu mem usage :Example: @@ -93,7 +93,7 @@ def generate( result = model.generate("สวัสดี") print(result) """ - if self.model is None or self.tokenizer is None: + if self.model is None or self.tokenizer is None or self.device is None: raise RuntimeError( "Model not loaded. Please call load_model() first." ) @@ -154,7 +154,7 @@ def chat( response = model.chat(messages) print(response) """ - if self.model is None or self.tokenizer is None: + if self.model is None or self.tokenizer is None or self.device is None: raise RuntimeError( "Model not loaded. Please call load_model() first." 
) From fe937d242b488688dd0206218b58593128a7bbd0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:04:18 +0000 Subject: [PATCH 05/27] Fix device handling: use .to() instead of device_map parameter Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- pythainlp/lm/qwen3.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 2d758968a..63da7841f 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -54,10 +54,10 @@ def load_model( self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) self.model = AutoModelForCausalLM.from_pretrained( self.model_path, - device_map=device, torch_dtype=torch_dtype, low_cpu_mem_usage=low_cpu_mem_usage, ) + self.model.to(device) def generate( self, From 20188132f849dcf1b5fe37137b2272f8f2e28c43 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:08:58 +0000 Subject: [PATCH 06/27] Move Qwen3 tests from tests/core to tests/extra Co-authored-by: wannaphong <8536487+wannaphong@users.noreply.github.com> --- tests/core/test_lm.py | 56 ++++++++++------------------------------- tests/extra/testx_lm.py | 40 +++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 43 deletions(-) create mode 100644 tests/extra/testx_lm.py diff --git a/tests/core/test_lm.py b/tests/core/test_lm.py index 3c2ed8e7d..ec182f187 100644 --- a/tests/core/test_lm.py +++ b/tests/core/test_lm.py @@ -4,60 +4,30 @@ import unittest -from pythainlp.lm import Qwen3, calculate_ngram_counts, remove_repeated_ngrams +from pythainlp.lm import calculate_ngram_counts, remove_repeated_ngrams class LMTestCase(unittest.TestCase): def test_calculate_ngram_counts(self): self.assertEqual( - calculate_ngram_counts(["1", "2", "3", "4"]), + calculate_ngram_counts(['1', '2', '3', '4']), { - ("1", "2"): 1, - ("2", "3"): 1, - ("3", "4"): 1, - ("1", "2", "3"): 1, - ("2", "3", "4"): 1, - ("1", "2", "3", "4"): 1, - }, + ('1', '2'): 1, + ('2', '3'): 1, + ('3', '4'): 1, + ('1', '2', '3'): 1, + ('2', '3', '4'): 1, + ('1', '2', '3', '4'): 1 + } ) def test_remove_repeated_ngrams(self): - texts = ["เอา", "เอา", "แบบ", "แบบ", "แบบ", "ไหน"] + texts = ['เอา', 'เอา', 'แบบ', 'แบบ', 'แบบ', 'ไหน'] self.assertEqual( - remove_repeated_ngrams(texts, n=1), ["เอา", "แบบ", "ไหน"] + remove_repeated_ngrams(texts, n=1), + ['เอา', 'แบบ', 'ไหน'] ) self.assertEqual( remove_repeated_ngrams(texts, n=2), - ["เอา", "เอา", "แบบ", "แบบ", "ไหน"], + ['เอา', 'เอา', 'แบบ', 'แบบ', 'ไหน'] ) - - def test_qwen3_initialization(self): - # Test that Qwen3 can be instantiated - try: - model = Qwen3() - self.assertIsNotNone(model) - self.assertIsNone(model.model) - self.assertIsNone(model.tokenizer) - except ImportError: - # Skip if dependencies not installed - self.skipTest("Qwen3 dependencies not installed") - - def test_qwen3_generate_without_load(self): - # Test that generate raises error when model is not loaded - try: - model = Qwen3() - with self.assertRaises(RuntimeError): - model.generate("test") - except ImportError: - # Skip if dependencies not installed - self.skipTest("Qwen3 dependencies not installed") - - def test_qwen3_chat_without_load(self): - # Test that chat raises error when model is not loaded - try: - model = Qwen3() - with self.assertRaises(RuntimeError): - model.chat([{"role": "user", "content": "test"}]) - except ImportError: - # Skip if dependencies not installed - 
self.skipTest("Qwen3 dependencies not installed") diff --git a/tests/extra/testx_lm.py b/tests/extra/testx_lm.py new file mode 100644 index 000000000..9ef56c20e --- /dev/null +++ b/tests/extra/testx_lm.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileType: SOURCE +# SPDX-License-Identifier: Apache-2.0 + +import unittest + +from pythainlp.lm import Qwen3 + + +class LMTestCaseX(unittest.TestCase): + def test_qwen3_initialization(self): + # Test that Qwen3 can be instantiated + try: + model = Qwen3() + self.assertIsNotNone(model) + self.assertIsNone(model.model) + self.assertIsNone(model.tokenizer) + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") + + def test_qwen3_generate_without_load(self): + # Test that generate raises error when model is not loaded + try: + model = Qwen3() + with self.assertRaises(RuntimeError): + model.generate("test") + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") + + def test_qwen3_chat_without_load(self): + # Test that chat raises error when model is not loaded + try: + model = Qwen3() + with self.assertRaises(RuntimeError): + model.chat([{"role": "user", "content": "test"}]) + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") From 69fbb42f7d19cbfcd796565c50be036361096225 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 19 Jan 2026 15:51:20 +0000 Subject: [PATCH 07/27] Update copyright year and sort qwen3 dependency alphabetically Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pyproject.toml | 4 ++-- pythainlp/lm/qwen3.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e7c81f9cc..5906e7bbc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -101,6 +101,8 @@ onnx = ["numpy>=1.22", "onnxruntime>=1.10.0", "sentencepiece>=0.1.91"] oskut = ["oskut>=1.3"] +qwen3 = ["torch>=1.0.0", "transformers>=4.22.1"] + sefr_cut = ["sefr_cut>=1.1"] spacy_thai = ["spacy_thai>=0.7.1"] @@ -143,8 +145,6 @@ wtp = ["transformers>=4.22.1", "wtpsplit>=1.0.1"] wunsen = ["wunsen>=0.0.3"] -qwen3 = ["torch>=1.0.0", "transformers>=4.22.1"] - # Compact dependencies - safe small set of optional dependencies compact = [ "nlpo3>=1.3.1", diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 63da7841f..97ef40283 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations From c81681856b79bb3fe9779891e2c2fea506ab054e Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Mon, 19 Jan 2026 16:08:43 +0000 Subject: [PATCH 08/27] Update copyright year in testx_lm.py --- tests/extra/testx_lm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/extra/testx_lm.py b/tests/extra/testx_lm.py index 9ef56c20e..44305189e 100644 --- a/tests/extra/testx_lm.py +++ b/tests/extra/testx_lm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project +# SPDX-FileCopyrightText: 2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 From 772a80b7b4e40eaabc4e657f27b0a6ecf83763a9 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 23 Jan 2026 12:27:25 +0000 
Subject: [PATCH 09/27] Update pythainlp/lm/qwen3.py --- pythainlp/lm/qwen3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 97ef40283..a68d56a20 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: 2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 + from __future__ import annotations from typing import Any From e2a14de526139931d9a1d71909c0817926300db4 Mon Sep 17 00:00:00 2001 From: Copilot Date: Sun, 1 Feb 2026 11:05:38 +0000 Subject: [PATCH 10/27] Update CITATION.cff from codemeta.json --- CITATION.cff | 1 + 1 file changed, 1 insertion(+) diff --git a/CITATION.cff b/CITATION.cff index 2d85753aa..923f0d4fe 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -23,6 +23,7 @@ authors: abstract: PyThaiNLP is a Thai natural language processing library for Python. It provides standard linguistic analysis for the Thai language, including tokenization and part-of-speech tagging. Additionally, it offers standard Thai locale utility functions, such as Thai Buddhist Era date formatting and the conversion of numbers into Thai text. repository-code: "https://github.com/PyThaiNLP/pythainlp" type: software +doi: 10.5281/zenodo.3519354 version: 5.2.0 license-url: "https://spdx.org/licenses/Apache-2.0" keywords: From c73b2a35a55bba983b69c6b23d4f154a049513c7 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 2 Feb 2026 11:13:05 +0000 Subject: [PATCH 11/27] Apply code review feedback: Add input validation, error handling, and type hints Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/lm/qwen3.py | 68 ++++++++++++++++++++++++++++++++++------- tests/extra/__init__.py | 1 + tests/extra/testx_lm.py | 26 ++++++++++++++++ 3 files changed, 84 insertions(+), 11 deletions(-) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index a68d56a20..9a3589a97 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -4,7 +4,7 @@ from __future__ import annotations -from typing import Any +from typing import Any, Optional import torch @@ -27,7 +27,7 @@ def load_model( self, model_path: str = "Qwen/Qwen3-0.6B", device: str = "cuda", - torch_dtype=torch.float16, + torch_dtype: Optional[torch.dtype] = torch.float16, low_cpu_mem_usage: bool = True, ): """Load Qwen3 model. @@ -52,13 +52,49 @@ def load_model( self.torch_dtype = torch_dtype self.model_path = model_path - self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) - self.model = AutoModelForCausalLM.from_pretrained( - self.model_path, - torch_dtype=torch_dtype, - low_cpu_mem_usage=low_cpu_mem_usage, - ) - self.model.to(device) + try: + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path) + except OSError as exc: + raise RuntimeError( + f"Failed to load tokenizer from '{self.model_path}'. " + "Check the model path or your network connection." + ) from exc + + try: + self.model = AutoModelForCausalLM.from_pretrained( + self.model_path, + torch_dtype=torch_dtype, + low_cpu_mem_usage=low_cpu_mem_usage, + ) + except OSError as exc: + raise RuntimeError( + f"Failed to load model from '{self.model_path}'. " + "This can happen due to an invalid model path, missing files, " + "or insufficient disk space." + ) from exc + except RuntimeError as exc: + raise RuntimeError( + "Failed to load model weights. " + "This can be caused by insufficient memory or an incompatible " + "torch_dtype setting." 
+ ) from exc + + if isinstance(device, str) and device.startswith("cuda"): + if not torch.cuda.is_available(): + raise RuntimeError( + "CUDA device requested but CUDA is not available. " + "Check your PyTorch installation and GPU drivers, or use " + "device='cpu' instead." + ) + + try: + self.model.to(device) + except RuntimeError as exc: + raise RuntimeError( + f"Failed to move model to device '{device}'. " + "Ensure the device exists and has enough memory, and that your " + "PyTorch installation supports this device." + ) from exc def generate( self, @@ -99,6 +135,11 @@ def generate( "Model not loaded. Please call load_model() first." ) + if not text or not isinstance(text, str): + raise ValueError( + "text parameter must be a non-empty string." + ) + inputs = self.tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(self.device) @@ -160,6 +201,11 @@ def chat( "Model not loaded. Please call load_model() first." ) + if not messages or not isinstance(messages, list): + raise ValueError( + "messages parameter must be a non-empty list of message dictionaries." + ) + # Apply chat template if available, otherwise format manually if hasattr(self.tokenizer, "apply_chat_template"): text = self.tokenizer.apply_chat_template( @@ -171,8 +217,8 @@ def chat( # Simple fallback format text = "" for msg in messages: - role = msg.get("role", "user") - content = msg.get("content", "") + role = str(msg.get("role", "user")).replace("\n", " ") + content = str(msg.get("content", "")).replace("\n", "\\n") text += f"{role}: {content}\n" text += "assistant: " diff --git a/tests/extra/__init__.py b/tests/extra/__init__.py index a1ce14666..1679968ed 100644 --- a/tests/extra/__init__.py +++ b/tests/extra/__init__.py @@ -13,6 +13,7 @@ "tests.extra.testx_augment", "tests.extra.testx_benchmarks", "tests.extra.testx_cli", + "tests.extra.testx_lm", "tests.extra.testx_spell", "tests.extra.testx_tag", "tests.extra.testx_tokenize", diff --git a/tests/extra/testx_lm.py b/tests/extra/testx_lm.py index 44305189e..bd5f63949 100644 --- a/tests/extra/testx_lm.py +++ b/tests/extra/testx_lm.py @@ -38,3 +38,29 @@ def test_qwen3_chat_without_load(self): except ImportError: # Skip if dependencies not installed self.skipTest("Qwen3 dependencies not installed") + + def test_qwen3_generate_empty_text(self): + # Test that generate validates text input + try: + model = Qwen3() + model.model = object() # Mock to bypass load check + model.tokenizer = object() + model.device = "cpu" + with self.assertRaises(ValueError): + model.generate("") + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") + + def test_qwen3_chat_empty_messages(self): + # Test that chat validates messages input + try: + model = Qwen3() + model.model = object() # Mock to bypass load check + model.tokenizer = object() + model.device = "cpu" + with self.assertRaises(ValueError): + model.chat([]) + except ImportError: + # Skip if dependencies not installed + self.skipTest("Qwen3 dependencies not installed") From 79a42045e142ca3962b3fbe8de39aa1785e0f086 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 2 Feb 2026 11:31:30 +0000 Subject: [PATCH 12/27] Apply second round of code review feedback: Fix copyright headers, improve error handling, use device_map Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/lm/qwen3.py | 47 +++++++++++++++++++---------------------- tests/extra/testx_lm.py | 2 +- 2 files changed, 23 
insertions(+), 26 deletions(-) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 9a3589a97..4fafffe27 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 @@ -34,7 +34,7 @@ def load_model( :param str model_path: model path or HuggingFace model ID :param str device: device (cpu, cuda or other) - :param torch_dtype: torch data type (e.g., torch.float16, torch.bfloat16) + :param Optional[torch.dtype] torch_dtype: torch data type (e.g., torch.float16, torch.bfloat16) :param bool low_cpu_mem_usage: low cpu mem usage :Example: @@ -48,6 +48,15 @@ def load_model( """ from transformers import AutoModelForCausalLM, AutoTokenizer + # Check CUDA availability early before loading model + if isinstance(device, str) and device.startswith("cuda"): + if not torch.cuda.is_available(): + raise RuntimeError( + "CUDA device requested but CUDA is not available. " + "Check your PyTorch installation and GPU drivers, or use " + "device='cpu' instead." + ) + self.device = device self.torch_dtype = torch_dtype self.model_path = model_path @@ -63,37 +72,25 @@ def load_model( try: self.model = AutoModelForCausalLM.from_pretrained( self.model_path, + device_map=device, torch_dtype=torch_dtype, low_cpu_mem_usage=low_cpu_mem_usage, ) except OSError as exc: + # Clean up tokenizer on failure + self.tokenizer = None raise RuntimeError( f"Failed to load model from '{self.model_path}'. " "This can happen due to an invalid model path, missing files, " "or insufficient disk space." ) from exc - except RuntimeError as exc: - raise RuntimeError( - "Failed to load model weights. " - "This can be caused by insufficient memory or an incompatible " - "torch_dtype setting." - ) from exc - - if isinstance(device, str) and device.startswith("cuda"): - if not torch.cuda.is_available(): - raise RuntimeError( - "CUDA device requested but CUDA is not available. " - "Check your PyTorch installation and GPU drivers, or use " - "device='cpu' instead." - ) - - try: - self.model.to(device) - except RuntimeError as exc: + except Exception as exc: + # Clean up tokenizer on failure + self.tokenizer = None raise RuntimeError( - f"Failed to move model to device '{device}'. " - "Ensure the device exists and has enough memory, and that your " - "PyTorch installation supports this device." + f"Failed to load model weights: {exc}. " + "This can be caused by insufficient memory, an incompatible " + "torch_dtype setting, or other configuration issues." 
) from exc def generate( @@ -214,11 +211,11 @@ def chat( add_generation_prompt=True, ) else: - # Simple fallback format + # Simple fallback format - remove newlines to avoid confusion text = "" for msg in messages: role = str(msg.get("role", "user")).replace("\n", " ") - content = str(msg.get("content", "")).replace("\n", "\\n") + content = str(msg.get("content", "")).replace("\n", " ") text += f"{role}: {content}\n" text += "assistant: " diff --git a/tests/extra/testx_lm.py b/tests/extra/testx_lm.py index bd5f63949..eebd36cb6 100644 --- a/tests/extra/testx_lm.py +++ b/tests/extra/testx_lm.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: 2026 PyThaiNLP Project +# SPDX-FileCopyrightText: 2016-2026 PyThaiNLP Project # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 From e775b9e7792ba40bbd18311c220d42359abb80f0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 2 Feb 2026 16:55:25 +0000 Subject: [PATCH 13/27] Add type annotations for torch_dtype in WangChanGLM and ChatBotModel for consistency Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/chat/core.py | 6 ++++-- pythainlp/generate/wangchanglm.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index 18e2735a4..6dfd13f11 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -3,6 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations +from typing import Optional + import torch @@ -23,7 +25,7 @@ def load_model( return_dict: bool = True, load_in_8bit: bool = False, device: str = "cuda", - torch_dtype=torch.float16, + torch_dtype: Optional[torch.dtype] = torch.float16, offload_folder: str = "./", low_cpu_mem_usage: bool = True, ): @@ -33,7 +35,7 @@ def load_model( :param bool return_dict: return_dict :param bool load_in_8bit: load model in 8bit :param str device: device (cpu, cuda or other) - :param torch_dtype torch_dtype: torch_dtype + :param Optional[torch.dtype] torch_dtype: torch_dtype :param str offload_folder: offload folder :param bool low_cpu_mem_usage: low cpu mem usage """ diff --git a/pythainlp/generate/wangchanglm.py b/pythainlp/generate/wangchanglm.py index 51ef44261..fdd47d522 100644 --- a/pythainlp/generate/wangchanglm.py +++ b/pythainlp/generate/wangchanglm.py @@ -4,6 +4,7 @@ from __future__ import annotations import re +from typing import Optional import torch @@ -29,7 +30,7 @@ def load_model( return_dict: bool = True, load_in_8bit: bool = False, device: str = "cuda", - torch_dtype=torch.float16, + torch_dtype: Optional[torch.dtype] = torch.float16, offload_folder: str = "./", low_cpu_mem_usage: bool = True, ): @@ -39,7 +40,7 @@ def load_model( :param bool return_dict: return dict :param bool load_in_8bit: load model in 8bit :param str device: device (cpu, cuda or other) - :param torch_dtype torch_dtype: torch_dtype + :param Optional[torch.dtype] torch_dtype: torch_dtype :param str offload_folder: offload folder :param bool low_cpu_mem_usage: low cpu mem usage """ From 5ecd2478007f0e0efe6834c74db2581c70277fce Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 18:29:09 +0000 Subject: [PATCH 14/27] Complete type annotations for Qwen3 class Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/lm/qwen3.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 
4fafffe27..8ca84c85b 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -16,12 +16,12 @@ class Qwen3: optimized for various NLP tasks including Thai language processing. """ - def __init__(self): - self.model = None - self.tokenizer = None - self.device = None - self.torch_dtype = None - self.model_path = None + def __init__(self) -> None: + self.model: Any = None + self.tokenizer: Any = None + self.device: Optional[str] = None + self.torch_dtype: Optional[torch.dtype] = None + self.model_path: Optional[str] = None def load_model( self, @@ -29,7 +29,7 @@ def load_model( device: str = "cuda", torch_dtype: Optional[torch.dtype] = torch.float16, low_cpu_mem_usage: bool = True, - ): + ) -> None: """Load Qwen3 model. :param str model_path: model path or HuggingFace model ID From 88762b0dff05c9ef2553634d94000b719e2b0d7a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 18:39:38 +0000 Subject: [PATCH 15/27] Replace Any type annotations with specific types from transformers library - pythainlp/lm/qwen3.py: Use PreTrainedModel and PreTrainedTokenizerBase - pythainlp/chat/core.py: Use WangChanGLM type instead of Any - pythainlp/phayathaibert/core.py: Use Pipeline, PreTrainedTokenizerBase, AutoTokenizer, AutoModelForMaskedLM, and AutoModelForTokenClassification These changes provide more precise type information for better IDE support and type checking, while maintaining runtime compatibility through TYPE_CHECKING guards. Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/chat/core.py | 7 ++++--- pythainlp/lm/qwen3.py | 9 ++++++--- pythainlp/phayathaibert/core.py | 25 ++++++++++++++++--------- 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index ed8611b37..d4a8a2fca 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -3,15 +3,16 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -from typing import TYPE_CHECKING, Any, Optional, cast +from typing import TYPE_CHECKING, Optional, cast if TYPE_CHECKING: import torch + from pythainlp.generate.wangchanglm import WangChanGLM class ChatBotModel: history: list[tuple[str, str]] - model: Any + model: "WangChanGLM" def __init__(self) -> None: """Chat using AI generation""" @@ -49,7 +50,7 @@ def load_model( if model_name == "wangchanglm": from pythainlp.generate.wangchanglm import WangChanGLM - self.model: Any = WangChanGLM() + self.model: "WangChanGLM" = WangChanGLM() self.model.load_model( model_path="pythainlp/wangchanglm-7.5B-sft-en-sharded", return_dict=return_dict, diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 8ca84c85b..f0864da51 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -4,7 +4,10 @@ from __future__ import annotations -from typing import Any, Optional +from typing import TYPE_CHECKING, Any, Optional + +if TYPE_CHECKING: + from transformers import PreTrainedModel, PreTrainedTokenizerBase import torch @@ -17,8 +20,8 @@ class Qwen3: """ def __init__(self) -> None: - self.model: Any = None - self.tokenizer: Any = None + self.model: Optional["PreTrainedModel"] = None + self.tokenizer: Optional["PreTrainedTokenizerBase"] = None self.device: Optional[str] = None self.torch_dtype: Optional[torch.dtype] = None self.model_path: Optional[str] = None diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index e27ad3eec..c4898cdf8 100644 --- a/pythainlp/phayathaibert/core.py +++ 
b/pythainlp/phayathaibert/core.py @@ -7,10 +7,17 @@ import re import warnings from collections.abc import Callable -from typing import TYPE_CHECKING, Any, Union +from typing import TYPE_CHECKING, Union if TYPE_CHECKING: - from transformers import CamembertTokenizer + from transformers import ( + AutoModelForMaskedLM, + AutoModelForTokenClassification, + AutoTokenizer, + CamembertTokenizer, + Pipeline, + PreTrainedTokenizerBase, + ) from transformers import ( CamembertTokenizer, @@ -211,13 +218,13 @@ def __init__(self) -> None: pipeline, ) - self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained( + self.tokenizer: "AutoTokenizer" = AutoTokenizer.from_pretrained( _model_name ) - self.model_for_masked_lm: AutoModelForMaskedLM = ( + self.model_for_masked_lm: "AutoModelForMaskedLM" = ( AutoModelForMaskedLM.from_pretrained(_model_name) ) - self.model: Any = pipeline( # transformers.Pipeline + self.model: "Pipeline" = pipeline( # transformers.Pipeline "fill-mask", tokenizer=self.tokenizer, model=self.model_for_masked_lm, @@ -310,8 +317,8 @@ def __init__(self, model: str = "lunarlist/pos_thai_phayathai") -> None: AutoTokenizer, ) - self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model) - self.model: AutoModelForTokenClassification = ( + self.tokenizer: "PreTrainedTokenizerBase" = AutoTokenizer.from_pretrained(model) + self.model: "AutoModelForTokenClassification" = ( AutoModelForTokenClassification.from_pretrained(model) ) @@ -355,8 +362,8 @@ def __init__(self, model: str = "Pavarissy/phayathaibert-thainer") -> None: AutoTokenizer, ) - self.tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(model) - self.model: AutoModelForTokenClassification = ( + self.tokenizer: "PreTrainedTokenizerBase" = AutoTokenizer.from_pretrained(model) + self.model: "AutoModelForTokenClassification" = ( AutoModelForTokenClassification.from_pretrained(model) ) From 1e3f0b6624a0b1d043b793185e35118c789617c5 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 20:10:33 +0000 Subject: [PATCH 16/27] Fix type annotation reassignment in ChatBotModel Remove redundant type annotation on self.model assignment in load_model method to avoid type annotation reassignment warning. The class-level annotation on line 15 already declares the type. 
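The same point as a minimal standalone sketch (hypothetical names `Holder` and `Widget`, standing in for ChatBotModel and WangChanGLM; not code from this patch):

    class Widget:
        pass

    class Holder:
        item: Widget  # class-level annotation declares the attribute type once

        def load(self) -> None:
            # Re-annotating on assignment (e.g. `self.item: Widget = Widget()`)
            # repeats the declaration above and is flagged by type checkers;
            # a plain assignment is enough.
            self.item = Widget()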
Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/chat/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index d4a8a2fca..295c27870 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -50,7 +50,7 @@ def load_model( if model_name == "wangchanglm": from pythainlp.generate.wangchanglm import WangChanGLM - self.model: "WangChanGLM" = WangChanGLM() + self.model = WangChanGLM() self.model.load_model( model_path="pythainlp/wangchanglm-7.5B-sft-en-sharded", return_dict=return_dict, From 28869e06a87d7f4b6e43bca554ebab947cbeadca Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 4 Feb 2026 21:21:18 +0000 Subject: [PATCH 17/27] Import torch in TYPE_CHECKING block --- pythainlp/chat/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index 295c27870..72ed40d0a 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -6,8 +6,8 @@ from typing import TYPE_CHECKING, Optional, cast if TYPE_CHECKING: - import torch from pythainlp.generate.wangchanglm import WangChanGLM + import torch class ChatBotModel: From 06b5d5ff367eaa9d3a72ec9abbaff963478e2b87 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 4 Feb 2026 22:17:05 +0000 Subject: [PATCH 18/27] Add import for torch in ChatBotModel --- pythainlp/chat/core.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index 72ed40d0a..4ee909788 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -7,6 +7,7 @@ if TYPE_CHECKING: from pythainlp.generate.wangchanglm import WangChanGLM + import torch From aa8751bcc045b54e0d912a997e3351fbab71b805 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Wed, 4 Feb 2026 22:30:14 +0000 Subject: [PATCH 19/27] Remove duplicate import statement Removed duplicate import of WangChanGLM. --- pythainlp/chat/core.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index 051391530..b0ee0a719 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -10,9 +10,6 @@ import torch - from pythainlp.generate.wangchanglm import WangChanGLM - - class ChatBotModel: history: list[tuple[str, str]] model: "WangChanGLM" From 3d550ce852f6231e0c2ce797b55a5fd769ae12ac Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 4 Feb 2026 22:33:42 +0000 Subject: [PATCH 20/27] Fix ruff import sorting error in chat/core.py Reorder imports in TYPE_CHECKING block to fix ruff I001 error. Moved torch import before pythainlp.generate.wangchanglm import. 
Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/chat/core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pythainlp/chat/core.py b/pythainlp/chat/core.py index b0ee0a719..442dda45f 100644 --- a/pythainlp/chat/core.py +++ b/pythainlp/chat/core.py @@ -6,10 +6,10 @@ from typing import TYPE_CHECKING, Optional, cast if TYPE_CHECKING: - from pythainlp.generate.wangchanglm import WangChanGLM - import torch + from pythainlp.generate.wangchanglm import WangChanGLM + class ChatBotModel: history: list[tuple[str, str]] model: "WangChanGLM" From b9bf7fc793d3f396e62a18f6a68e4ca0111c7919 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 21:54:21 +0000 Subject: [PATCH 21/27] Apply code review feedback: Improve code quality and documentation 1. Remove redundant isinstance check for device parameter (already typed as str) 2. Add documentation about do_sample behavior with temperature parameters 3. Preserve newlines in message content for chat fallback template 4. Add type annotation (-> None) to fallback Qwen3.__init__ in __init__.py 5. Add comment explaining tensor shape assumptions for slicing operation All changes improve code clarity and maintainability without altering functionality. Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/lm/__init__.py | 2 +- pythainlp/lm/qwen3.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/pythainlp/lm/__init__.py b/pythainlp/lm/__init__.py index 4440e50e6..779935bab 100644 --- a/pythainlp/lm/__init__.py +++ b/pythainlp/lm/__init__.py @@ -15,7 +15,7 @@ # If dependencies are not installed, make Qwen3 available but raise # error when instantiated class Qwen3: # type: ignore - def __init__(self): + def __init__(self) -> None: raise ImportError( "Qwen3 requires additional dependencies. " "Install with: pip install pythainlp[qwen3]" diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index f0864da51..9baf020ba 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -52,7 +52,7 @@ def load_model( from transformers import AutoModelForCausalLM, AutoTokenizer # Check CUDA availability early before loading model - if isinstance(device, str) and device.startswith("cuda"): + if device.startswith("cuda"): if not torch.cuda.is_available(): raise RuntimeError( "CUDA device requested but CUDA is not available. 
" @@ -143,6 +143,8 @@ def generate( inputs = self.tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(self.device) + # Note: When do_sample=False (greedy decoding), temperature, top_p, + # and top_k parameters are ignored by the transformers library with torch.inference_mode(): output_ids = self.model.generate( input_ids, @@ -154,6 +156,8 @@ def generate( ) # Decode only the newly generated tokens + # output_ids and input_ids are guaranteed to be 2D tensors with + # batch size 1 from the tokenizer call above generated_text = self.tokenizer.decode( output_ids[0][len(input_ids[0]) :], skip_special_tokens=skip_special_tokens, @@ -214,17 +218,19 @@ def chat( add_generation_prompt=True, ) else: - # Simple fallback format - remove newlines to avoid confusion - text = "" + # Simple fallback format - preserve content newlines + lines = [] for msg in messages: role = str(msg.get("role", "user")).replace("\n", " ") - content = str(msg.get("content", "")).replace("\n", " ") - text += f"{role}: {content}\n" - text += "assistant: " + content = str(msg.get("content", "")) + lines.append(f"{role}: {content}") + text = "\n".join(lines) + "\nassistant: " inputs = self.tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(self.device) + # Note: When do_sample=False (greedy decoding), temperature, top_p, + # and top_k parameters are ignored by the transformers library with torch.inference_mode(): output_ids = self.model.generate( input_ids, @@ -236,6 +242,8 @@ def chat( ) # Decode only the newly generated tokens + # output_ids and input_ids are guaranteed to be 2D tensors with + # batch size 1 from the tokenizer call above generated_text = self.tokenizer.decode( output_ids[0][len(input_ids[0]) :], skip_special_tokens=skip_special_tokens, From ff07202814874742d9183c228f502df86a3bfafe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Thu, 5 Feb 2026 22:25:08 +0000 Subject: [PATCH 22/27] Apply dependency and import improvements from code review 1. Update torch minimum version from 1.0.0 to 1.9.0 in pyproject.toml - torch.inference_mode used in code requires torch>=1.9.0 2. Change torch_dtype default from torch.float16 to None in method signature - Move torch import from module level to inside load_model method - Handle default value assignment (torch.float16) inside load_model - Reduces import-time dependencies - torch only imported when load_model is called - Follows pattern used in WangChanGLM and ChatBotModel classes These changes improve lazy loading and ensure dependency versions match actual requirements. 
Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pyproject.toml | 2 +- pythainlp/lm/qwen3.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bc46dd0de..8a1845d18 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -124,7 +124,7 @@ onnx = ["numpy>=1.22", "onnxruntime>=1.10.0", "sentencepiece>=0.1.91"] oskut = ["oskut>=1.3"] -qwen3 = ["torch>=1.0.0", "transformers>=4.22.1"] +qwen3 = ["torch>=1.9.0", "transformers>=4.22.1"] sefr_cut = ["sefr_cut>=1.1"] diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index 9baf020ba..a28d86377 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -7,10 +7,9 @@ from typing import TYPE_CHECKING, Any, Optional if TYPE_CHECKING: + import torch from transformers import PreTrainedModel, PreTrainedTokenizerBase -import torch - class Qwen3: """Qwen3-0.6B language model for Thai text generation. @@ -23,14 +22,14 @@ def __init__(self) -> None: self.model: Optional["PreTrainedModel"] = None self.tokenizer: Optional["PreTrainedTokenizerBase"] = None self.device: Optional[str] = None - self.torch_dtype: Optional[torch.dtype] = None + self.torch_dtype: Optional["torch.dtype"] = None self.model_path: Optional[str] = None def load_model( self, model_path: str = "Qwen/Qwen3-0.6B", device: str = "cuda", - torch_dtype: Optional[torch.dtype] = torch.float16, + torch_dtype: Optional["torch.dtype"] = None, low_cpu_mem_usage: bool = True, ) -> None: """Load Qwen3 model. @@ -49,8 +48,13 @@ def load_model( model = Qwen3() model.load_model(device="cpu", torch_dtype=torch.bfloat16) """ + import torch from transformers import AutoModelForCausalLM, AutoTokenizer + # Set default torch_dtype if not provided + if torch_dtype is None: + torch_dtype = torch.float16 + # Check CUDA availability early before loading model if device.startswith("cuda"): if not torch.cuda.is_available(): From df97e0c85cf8335d288e0737f9f66bbe49e60d9a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Feb 2026 10:31:53 +0000 Subject: [PATCH 23/27] Fix torch import scope and type annotation consistency 1. Add 'import torch' in generate() and chat() methods (pythainlp/lm/qwen3.py) - torch.inference_mode() requires torch to be imported in method scope - Previously torch was only imported in load_model() method - Both generate() (line 147) and chat() (line 235) now import torch locally 2. Change AutoTokenizer type annotation to PreTrainedTokenizerBase (pythainlp/phayathaibert/core.py:221) - AutoTokenizer.from_pretrained() returns PreTrainedTokenizerBase instance - Matches type annotations on lines 320 and 365 for consistency - More accurate type annotation (AutoTokenizer is a factory class, not instantiated directly) These changes fix import scope issues and improve type annotation accuracy. Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/lm/qwen3.py | 4 ++++ pythainlp/phayathaibert/core.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index a28d86377..f76f4bf32 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -144,6 +144,8 @@ def generate( "text parameter must be a non-empty string." 
) + import torch + inputs = self.tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(self.device) @@ -230,6 +232,8 @@ def chat( lines.append(f"{role}: {content}") text = "\n".join(lines) + "\nassistant: " + import torch + inputs = self.tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(self.device) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index c4898cdf8..6b5a16f98 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -218,7 +218,7 @@ def __init__(self) -> None: pipeline, ) - self.tokenizer: "AutoTokenizer" = AutoTokenizer.from_pretrained( + self.tokenizer: "PreTrainedTokenizerBase" = AutoTokenizer.from_pretrained( _model_name ) self.model_for_masked_lm: "AutoModelForMaskedLM" = ( From ffb73eee19637b2fade39bd275fbf5b3b28e7054 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 6 Feb 2026 10:57:07 +0000 Subject: [PATCH 24/27] Remove AutoTokenizer import from core.py Removed AutoTokenizer import from transformers. --- pythainlp/phayathaibert/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pythainlp/phayathaibert/core.py b/pythainlp/phayathaibert/core.py index 6b5a16f98..dba072907 100644 --- a/pythainlp/phayathaibert/core.py +++ b/pythainlp/phayathaibert/core.py @@ -13,7 +13,6 @@ from transformers import ( AutoModelForMaskedLM, AutoModelForTokenClassification, - AutoTokenizer, CamembertTokenizer, Pipeline, PreTrainedTokenizerBase, From 91303d359bd298fd4b89ef7358adc3d9c7e75b0c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Feb 2026 11:06:31 +0000 Subject: [PATCH 25/27] Move Qwen3 tests from extra to noauto_torch suite and sync with dev branch 1. Sync with dev branch (54 commits merged) - Test suite reorganization: noauto tests now grouped by dependency ecosystem - Added noauto_torch, noauto_tensorflow, noauto_onnx, noauto_cython, noauto_network suites - Added braille module and comprehensive translation improvements 2. Move Qwen3 tests to appropriate suite - Moved from tests/extra/testx_lm.py to tests/noauto_torch/testn_lm_torch.py - Changed test class suffix from TestCaseX to TestCaseN (noauto convention) - Registered in tests/noauto_torch/__init__.py test_packages list - Removed from tests/extra/__init__.py test_packages list Reason: Qwen3 uses torch and transformers (PyTorch ecosystem) which are heavy dependencies (~2-3 GB). Per the updated test suite organization, tests with heavy ML framework dependencies belong in noauto_torch suite, not extra suite. 
Co-authored-by: bact <128572+bact@users.noreply.github.com> --- tests/extra/__init__.py | 1 - tests/noauto_torch/__init__.py | 1 + tests/{extra/testx_lm.py => noauto_torch/testn_lm_torch.py} | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename tests/{extra/testx_lm.py => noauto_torch/testn_lm_torch.py} (98%) diff --git a/tests/extra/__init__.py b/tests/extra/__init__.py index 3391a4866..b72414ca3 100644 --- a/tests/extra/__init__.py +++ b/tests/extra/__init__.py @@ -13,7 +13,6 @@ "tests.extra.testx_augment", "tests.extra.testx_benchmarks", "tests.extra.testx_cli", - "tests.extra.testx_lm", "tests.extra.testx_spell", "tests.extra.testx_tag", "tests.extra.testx_tokenize", diff --git a/tests/noauto_torch/__init__.py b/tests/noauto_torch/__init__.py index 675181b6c..eba6b8584 100644 --- a/tests/noauto_torch/__init__.py +++ b/tests/noauto_torch/__init__.py @@ -23,6 +23,7 @@ # Names of module to be tested test_packages: list[str] = [ + "tests.noauto_torch.testn_lm_torch", "tests.noauto_torch.testn_spell_torch", "tests.noauto_torch.testn_tag_torch", "tests.noauto_torch.testn_tokenize_torch", diff --git a/tests/extra/testx_lm.py b/tests/noauto_torch/testn_lm_torch.py similarity index 98% rename from tests/extra/testx_lm.py rename to tests/noauto_torch/testn_lm_torch.py index eebd36cb6..7de647d76 100644 --- a/tests/extra/testx_lm.py +++ b/tests/noauto_torch/testn_lm_torch.py @@ -7,7 +7,7 @@ from pythainlp.lm import Qwen3 -class LMTestCaseX(unittest.TestCase): +class LMTestCaseN(unittest.TestCase): def test_qwen3_initialization(self): # Test that Qwen3 can be instantiated try: From 432c2abc522579f74aa8bf200a278d90d5531a64 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Feb 2026 11:34:57 +0000 Subject: [PATCH 26/27] Improve dependency error handling and type annotations in lm module 1. Add type annotation to __all__ in pythainlp/lm/__init__.py - Changed from __all__ = [...] to __all__: list[str] = [...] - Matches codebase convention (e.g., pythainlp/chat/__init__.py) 2. Remove try/except wrapper around Qwen3 import in __init__.py - The wrapper was ineffective since qwen3.py uses lazy imports - Users would get raw ModuleNotFoundError instead of helpful message - Better to handle missing deps in the methods that actually need them 3. Add dependency checking in Qwen3 methods (qwen3.py) - load_model(): Wrap torch/transformers imports in try/except - generate(): Wrap torch import in try/except - chat(): Wrap torch import in try/except - All raise ImportError with helpful message: "Install them with: pip install 'pythainlp[qwen3]'" - Catches both ImportError and ModuleNotFoundError for robustness These changes ensure users get clear, actionable error messages when trying to use Qwen3 without installing the optional dependencies, rather than cryptic import errors. 
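The guarded-import pattern the message describes, distilled into a standalone sketch (the helper name `_require_torch` is illustrative; in the patch the try/except sits directly inside load_model(), generate(), and chat()):

    def _require_torch():
        # Translate a missing optional dependency into an actionable error
        # instead of a bare ModuleNotFoundError.
        try:
            import torch
        except (ImportError, ModuleNotFoundError) as exc:
            # ModuleNotFoundError subclasses ImportError; catching both
            # mirrors the patch for robustness.
            raise ImportError(
                "Qwen3 language model requires optional dependencies. "
                "Install them with: pip install 'pythainlp[qwen3]'"
            ) from exc
        return torch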
Co-authored-by: bact <128572+bact@users.noreply.github.com> --- pythainlp/lm/__init__.py | 14 ++------------ pythainlp/lm/qwen3.py | 26 ++++++++++++++++++++++---- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/pythainlp/lm/__init__.py b/pythainlp/lm/__init__.py index 779935bab..ec7bd0147 100644 --- a/pythainlp/lm/__init__.py +++ b/pythainlp/lm/__init__.py @@ -2,21 +2,11 @@ # SPDX-FileType: SOURCE # SPDX-License-Identifier: Apache-2.0 -__all__ = ["calculate_ngram_counts", "remove_repeated_ngrams", "Qwen3"] +__all__: list[str] = ["calculate_ngram_counts", "remove_repeated_ngrams", "Qwen3"] from pythainlp.lm.text_util import ( calculate_ngram_counts, remove_repeated_ngrams, ) -try: - from pythainlp.lm.qwen3 import Qwen3 -except ImportError: - # If dependencies are not installed, make Qwen3 available but raise - # error when instantiated - class Qwen3: # type: ignore - def __init__(self) -> None: - raise ImportError( - "Qwen3 requires additional dependencies. " - "Install with: pip install pythainlp[qwen3]" - ) +from pythainlp.lm.qwen3 import Qwen3 diff --git a/pythainlp/lm/qwen3.py b/pythainlp/lm/qwen3.py index f76f4bf32..b67d20ab5 100644 --- a/pythainlp/lm/qwen3.py +++ b/pythainlp/lm/qwen3.py @@ -48,8 +48,14 @@ def load_model( model = Qwen3() model.load_model(device="cpu", torch_dtype=torch.bfloat16) """ - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer + try: + import torch + from transformers import AutoModelForCausalLM, AutoTokenizer + except (ImportError, ModuleNotFoundError) as exc: + raise ImportError( + "Qwen3 language model requires optional dependencies. " + "Install them with: pip install 'pythainlp[qwen3]'" + ) from exc # Set default torch_dtype if not provided if torch_dtype is None: @@ -144,7 +150,13 @@ def generate( "text parameter must be a non-empty string." ) - import torch + try: + import torch + except (ImportError, ModuleNotFoundError) as exc: + raise ImportError( + "Qwen3 language model requires optional dependencies. " + "Install them with: pip install 'pythainlp[qwen3]'" + ) from exc inputs = self.tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(self.device) @@ -232,7 +244,13 @@ def chat( lines.append(f"{role}: {content}") text = "\n".join(lines) + "\nassistant: " - import torch + try: + import torch + except (ImportError, ModuleNotFoundError) as exc: + raise ImportError( + "Qwen3 language model requires optional dependencies. " + "Install them with: pip install 'pythainlp[qwen3]'" + ) from exc inputs = self.tokenizer(text, return_tensors="pt") input_ids = inputs["input_ids"].to(self.device) From f529a9bcf98a11a8c4299bcca5e4cbc9ed7a4bf1 Mon Sep 17 00:00:00 2001 From: Arthit Suriyawongkul Date: Fri, 6 Feb 2026 15:20:41 +0000 Subject: [PATCH 27/27] Reorder import statements in __init__.py --- pythainlp/lm/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pythainlp/lm/__init__.py b/pythainlp/lm/__init__.py index ec7bd0147..36ff60a7c 100644 --- a/pythainlp/lm/__init__.py +++ b/pythainlp/lm/__init__.py @@ -4,9 +4,8 @@ __all__: list[str] = ["calculate_ngram_counts", "remove_repeated_ngrams", "Qwen3"] +from pythainlp.lm.qwen3 import Qwen3 from pythainlp.lm.text_util import ( calculate_ngram_counts, remove_repeated_ngrams, ) - -from pythainlp.lm.qwen3 import Qwen3
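With the series applied, pythainlp.lm exposes Qwen3 alongside the n-gram utilities. A usage sketch assembled from the docstring examples in the patches above (CPU device and bfloat16 dtype as in those examples; assumes the qwen3 extra is installed):

    from pythainlp.lm import Qwen3
    import torch

    model = Qwen3()
    model.load_model(device="cpu", torch_dtype=torch.bfloat16)

    # Plain prompt completion
    print(model.generate("สวัสดี", max_new_tokens=64))

    # Chat-style generation
    messages = [{"role": "user", "content": "สวัสดีครับ"}]
    print(model.chat(messages, max_new_tokens=64))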