From 4cb56f2c3f9bb1d5b7ace5480eb6f3c6d8a7dfb1 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Fri, 2 Jan 2026 18:27:23 +0800 Subject: [PATCH 1/9] Pin transformers==5.0.0rc1 Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- dev/modal/tests_bwd.py | 35 ----------------------------------- setup.py | 2 +- 2 files changed, 1 insertion(+), 36 deletions(-) delete mode 100644 dev/modal/tests_bwd.py diff --git a/dev/modal/tests_bwd.py b/dev/modal/tests_bwd.py deleted file mode 100644 index f71773d99..000000000 --- a/dev/modal/tests_bwd.py +++ /dev/null @@ -1,35 +0,0 @@ -from pathlib import Path - -import modal - -ROOT_PATH = Path(__file__).parent.parent.parent -REMOTE_ROOT_PATH = "/root/liger-kernel" -PYTHON_VERSION = "3.12" - -image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install("uv") - -app = modal.App("liger_tests_bwd", image=image) - -# mount: add local files to the remote container -repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH) - - -@app.function(gpu="H100!", image=repo, timeout=90 * 60) -def liger_bwd_tests(): - import subprocess - - subprocess.run( - ["uv pip install -e '.[dev]' --system"], - check=True, - shell=True, - cwd=REMOTE_ROOT_PATH, - ) - # force install transformers==4.49.0 - subprocess.run( - ["uv pip install transformers==4.49.0 --system"], - check=True, - shell=True, - cwd=REMOTE_ROOT_PATH, - ) - subprocess.run(["make test"], check=True, shell=True, cwd=REMOTE_ROOT_PATH) - subprocess.run(["make test-convergence"], check=True, shell=True, cwd=REMOTE_ROOT_PATH) diff --git a/setup.py b/setup.py index 8e73d905f..3d4a4656e 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def get_optional_dependencies(): """Get optional dependency groups.""" return { "dev": [ - "transformers>=4.49.0", + "transformers==5.0.0rc1", "matplotlib>=3.7.2", "ruff>=0.12.0", "pytest>=7.1.2", From 98f059388c2d63d44afea5d71fb70f7755110c13 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Wed, 14 Jan 2026 17:48:11 +0800 Subject: [PATCH 2/9] bump to rc2 Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3d4a4656e..5af130ea4 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def get_optional_dependencies(): """Get optional dependency groups.""" return { "dev": [ - "transformers==5.0.0rc1", + "transformers==5.0.0rc2", "matplotlib>=3.7.2", "ruff>=0.12.0", "pytest>=7.1.2", From 08219acf2786469f1565db139c1e351d9768f802 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Fri, 16 Jan 2026 18:48:55 +0800 Subject: [PATCH 3/9] Update rope related config and tokenizers for transformers v5 (#1014) ## Summary Fix #1013 Transformers v5 introduces a new attribute `rope_parameters` in model config, containing all rope related parameters, and deprecate standalone rope attribute, such as `rope_scaling`, `rope_theta`, etc. 
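For illustration, the change for a typical text config looks roughly like this (a minimal sketch only; the classes and values mirror the mini-model configs updated in the diff below):

```python
from transformers import LlamaConfig

# transformers v4.x style (deprecated in v5): standalone rope attributes
#   LlamaConfig(rope_theta=500000.0, rope_scaling=None, ...)

# transformers v5 style: all rope settings live in a single `rope_parameters` dict
config = LlamaConfig(
    num_hidden_layers=4,
    rope_parameters=dict(
        rope_theta=500000.0,
    ),
)

# mrope-style configs (e.g. Qwen2-VL) fold `mrope_section` into the same dict:
#   rope_parameters=dict(rope_theta=1000000.0, mrope_section=[16, 24, 24])
```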
Most `TokenizerFast`s are now default tokenizers in v5, hence `tokenization_xxx_fast` paths are removed This PR - replaces deprecated configs with `rope_parameters` - replaces fast tokenizers path with default ones ## Testing Done - Hardware Type: - [ ] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [ ] run `make test-convergence` to ensure convergence --------- Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- test/convergence/bf16/test_mini_models.py | 145 +++++++++------- .../bf16/test_mini_models_multimodal.py | 84 +++++---- .../bf16/test_mini_models_with_logits.py | 130 ++++++++------ test/convergence/fp32/test_mini_models.py | 160 ++++++++++-------- .../fp32/test_mini_models_multimodal.py | 92 +++++----- .../fp32/test_mini_models_with_logits.py | 144 +++++++++------- test/transformers/test_monkey_patch.py | 37 ++-- 7 files changed, 456 insertions(+), 336 deletions(-) diff --git a/test/convergence/bf16/test_mini_models.py b/test/convergence/bf16/test_mini_models.py index 19ca2044e..332755e73 100644 --- a/test/convergence/bf16/test_mini_models.py +++ b/test/convergence/bf16/test_mini_models.py @@ -333,8 +333,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -362,7 +363,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -391,7 +394,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=None, # defaults to num_attention_heads rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=None, tie_word_embeddings=False, use_cache=True, @@ -416,7 +421,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -441,7 +448,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -476,7 +485,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -504,7 +515,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -532,7 +545,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", @@ -561,8 +576,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -590,7 +606,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -616,8 +634,9 @@ 
rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, use_sliding_window=False, sliding_window=4096, @@ -693,7 +712,9 @@ bos_token_id=2, eos_token_id=1, tie_word_embeddings=True, - rope_theta=10000.0, # 1000000 + rope_parameters=dict( + rope_theta=10000.0, + ), # 1000000 attention_bias=False, attention_dropout=0.0, attn_implementation="eager", @@ -721,14 +742,14 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=dict( + rope_parameters=dict( factor=8.0, high_freq_factor=4.0, low_freq_factor=1.0, original_max_position_embeddings=8192, rope_type="llama3", + rope_theta=500_000, ), - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -762,9 +783,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -814,9 +834,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -870,9 +889,8 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), use_cache=True, @@ -923,9 +941,8 @@ num_key_value_heads=2, head_dim=128, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), use_cache=True, @@ -977,8 +994,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1010,8 +1028,9 @@ num_hidden_layers=4, num_key_value_heads=2, pretraining_tp=1, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, max_position_embeddings=4096, # llava-1.5-7b-hf @@ -1069,8 +1088,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1098,8 +1118,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1128,8 +1149,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1165,8 +1187,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1182,11 +1202,10 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, "rms_norm_eps": 1e-5, - "rope_scaling": { - "type": "default", - "mrope_section": [8, 12, 12], 
# (temporal, height, width) - }, - "rope_theta": 500_000, + "rope_parameters": dict( + rope_theta=500_000, + mrope_section=[8, 12, 12], # (temporal, height, width) + ), "vocab_size": 32000, "attention_bias": True, }, @@ -1232,8 +1251,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1249,11 +1266,10 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, "rms_norm_eps": 1e-5, - "rope_scaling": { - "type": "default", - "mrope_section": [8, 12, 12], # (temporal, height, width) - }, - "rope_theta": 500_000, + "rope_parameters": dict( + rope_theta=500_000, + mrope_section=[8, 12, 12], # (temporal, height, width) + ), "vocab_size": 32000, "attention_bias": True, "attention_dropout": 0.0, @@ -1303,8 +1319,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1396,8 +1413,9 @@ rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, use_sliding_window=False, sliding_window=4096, @@ -1437,7 +1455,9 @@ initializer_range=0.02, norm_eps=1e-6, num_key_value_heads=2, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), partial_rotary_factor=1.0, vocab_size=32000, use_cache=True, @@ -1468,8 +1488,9 @@ eod_token_id=3, sep_token_id=4, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, num_experts=2, @@ -1496,7 +1517,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), tie_word_embeddings=True, use_cache=True, vocab_size=32000, diff --git a/test/convergence/bf16/test_mini_models_multimodal.py b/test/convergence/bf16/test_mini_models_multimodal.py index bd090e060..df60cef49 100644 --- a/test/convergence/bf16/test_mini_models_multimodal.py +++ b/test/convergence/bf16/test_mini_models_multimodal.py @@ -8,7 +8,7 @@ from datasets import load_dataset from torch.utils.data import DataLoader from transformers import PreTrainedTokenizerFast -from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast +from transformers.models.gemma.tokenization_gemma import GemmaTokenizer from transformers.models.siglip.configuration_siglip import SiglipVisionConfig from liger_kernel.transformers import apply_liger_kernel_to_gemma3 @@ -54,7 +54,7 @@ import transformers from packaging import version - from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration @@ -70,7 +70,7 @@ import transformers from packaging import version - from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig from 
transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessor @@ -82,7 +82,7 @@ QWEN2_5_VL_AVAILABLE = False try: - from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig @@ -138,7 +138,7 @@ from packaging import version from transformers.models.gemma.configuration_gemma import GemmaConfig - from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast + from transformers.models.gemma.tokenization_gemma import GemmaTokenizer from transformers.models.gemma2.configuration_gemma2 import Gemma2Config from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration @@ -191,7 +191,7 @@ try: # SmolVLM2 is only available in transformers>=4.50.0 - from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast + from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer from transformers.models.smolvlm.configuration_smolvlm import SmolVLMConfig from transformers.models.smolvlm.image_processing_smolvlm import SmolVLMImageProcessor from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration @@ -268,7 +268,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -315,14 +317,14 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=dict( + rope_parameters=dict( factor=8.0, high_freq_factor=4.0, low_freq_factor=1.0, original_max_position_embeddings=8192, rope_type="llama3", + rope_theta=500_000, ), - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -372,7 +374,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -421,7 +425,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -466,7 +472,16 @@ rms_norm_eps=1e-06, use_cache=True, tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + full_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + sliding_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + ), attention_bias=False, attention_dropout=0.0, ).to_dict(), @@ -503,9 +518,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -545,8 +559,9 @@ num_hidden_layers=4, num_key_value_heads=2, pretraining_tp=1, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, 
max_position_embeddings=4096, # llava-1.5-7b-hf @@ -637,7 +652,9 @@ num_hidden_layers=4, # 30 -> reduced to 4 for testing num_key_value_heads=3, # 3 for 256M model rms_norm_eps=1e-5, - rope_theta=100000, + rope_parameters=dict( + rope_theta=100000, + ), tie_word_embeddings=False, vocab_size=49280, ), @@ -680,10 +697,9 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", - mrope_section=[16, 24, 24], # (temporal, height, width) + rope_parameters=dict( + rope_theta=1000000.0, + mrope_section=[16, 24, 24], ), sliding_window=4096, tie_word_embeddings=True, @@ -742,9 +758,8 @@ rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -794,9 +809,8 @@ rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -825,7 +839,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Qwen2VLImageProcessor() video_processor = Qwen2VLVideoProcessor() return Qwen2VLProcessor( @@ -847,7 +861,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Qwen2VLImageProcessor() video_processor = Qwen2VLVideoProcessor() return Qwen2_5_VLProcessor( @@ -869,7 +883,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Qwen2VLImageProcessor(patch_size=16, temporal_patch_size=2, merge_size=2) video_processor = Qwen3VLVideoProcessor() return Qwen3VLProcessor( @@ -926,7 +940,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = GotOcr2ImageProcessorFast( crop_to_patches=False, min_patches=1, max_patches=12, size={"height": 448, "width": 448} ) @@ -950,7 +964,7 @@ def create_processor(model_name: str): ) ] ) - gpt2_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + gpt2_tokenizer = GPT2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = SmolVLMImageProcessor(size={"longest_edge": 512}) video_processor = SmolVLMVideoProcessor() @@ -1020,7 +1034,7 @@ def create_processor(model_name: str): ] ) - fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = SiglipImageProcessor(size={"height": 224, "width": 224}, image_seq_length=256) return PaliGemmaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer) @@ -1040,7 +1054,7 @@ def create_processor(model_name: str): ) ] ) - fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, 
**tokenizer_config) + fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Gemma3ImageProcessor() return Gemma3Processor(image_processor=image_processor, tokenizer=fast_tokenizer) diff --git a/test/convergence/bf16/test_mini_models_with_logits.py b/test/convergence/bf16/test_mini_models_with_logits.py index e329d1c26..88cd8b0fe 100644 --- a/test/convergence/bf16/test_mini_models_with_logits.py +++ b/test/convergence/bf16/test_mini_models_with_logits.py @@ -315,8 +315,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -344,7 +345,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -373,7 +376,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=None, # defaults to num_attention_heads rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=None, tie_word_embeddings=False, use_cache=True, @@ -398,7 +403,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -423,7 +430,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -458,7 +467,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -486,7 +497,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -514,7 +527,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", @@ -543,8 +558,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -571,7 +587,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -597,8 +615,9 @@ rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, use_sliding_window=False, sliding_window=4096, @@ -642,9 +661,8 @@ num_key_value_heads=2, pad_token_id=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), sliding_window=131072, @@ -697,9 +715,8 @@ num_key_value_heads=2, pad_token_id=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), 
sliding_window=131072, @@ -778,7 +795,7 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=dict( + rope_parameters=dict( factor=8.0, high_freq_factor=4.0, low_freq_factor=1.0, @@ -819,9 +836,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -871,9 +887,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -923,8 +938,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -957,8 +973,9 @@ num_hidden_layers=4, num_key_value_heads=2, pretraining_tp=1, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, max_position_embeddings=4096, # llava-1.5-7b-hf @@ -1016,8 +1033,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1045,8 +1063,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1075,8 +1094,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1111,8 +1131,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1128,11 +1146,10 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, "rms_norm_eps": 1e-5, - "rope_scaling": { - "type": "default", + "rope_parameters": { + "rope_theta": 500_000, "mrope_section": [8, 12, 12], # (temporal, height, width) }, - "rope_theta": 500_000, "vocab_size": 32000, "attention_bias": True, }, @@ -1178,8 +1195,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1195,11 +1210,11 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, "rms_norm_eps": 1e-5, - "rope_scaling": { + "rope_parameters": { + "rope_theta": 500_000, "type": "default", "mrope_section": [8, 12, 12], # (temporal, height, width) }, - "rope_theta": 500_000, "vocab_size": 32000, "attention_bias": True, "attention_dropout": 0.0, @@ -1249,8 +1264,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1341,8 +1357,9 @@ rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + 
rope_theta=10000.0, + ), attention_bias=False, use_sliding_window=False, sliding_window=4096, @@ -1383,7 +1400,9 @@ initializer_range=0.02, norm_eps=1e-6, num_key_value_heads=2, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), partial_rotary_factor=1.0, vocab_size=32000, use_cache=True, @@ -1414,8 +1433,9 @@ eod_token_id=3, sep_token_id=4, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, num_experts=2, @@ -1442,7 +1462,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), tie_word_embeddings=True, use_cache=True, vocab_size=32000, diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py index 7b1ffabd1..01b75e932 100644 --- a/test/convergence/fp32/test_mini_models.py +++ b/test/convergence/fp32/test_mini_models.py @@ -332,8 +332,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -361,7 +362,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -390,7 +393,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=None, # defaults to num_attention_heads rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=None, tie_word_embeddings=False, use_cache=True, @@ -415,7 +420,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -440,7 +447,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -475,7 +484,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -503,7 +514,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -531,7 +544,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", @@ -559,8 +574,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -588,7 +604,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -614,8 +632,9 @@ rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, 
use_sliding_window=False, sliding_window=4096, @@ -651,14 +670,14 @@ rms_norm_eps=1e-5, use_cache=True, tie_word_embeddings=False, - rope_parameters={ - "rope_type": "yarn", - "factor": 8.0, - "beta_fast": 32.0, - "beta_slow": 1.0, - "truncate": False, - "original_max_position_embeddings": 4096, - }, + rope_parameters=dict( + rope_type="yarn", + factor=8.0, + beta_fast=32.0, + beta_slow=1.0, + truncate=False, + original_max_position_embeddings=4096, + ), attention_dropout=0.0, num_local_experts=8, # Reduced from 32 for mini model num_experts_per_tok=2, # Reduced from 4 for mini model @@ -691,7 +710,9 @@ bos_token_id=2, eos_token_id=1, tie_word_embeddings=True, - rope_theta=10000.0, # 1000000 + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", @@ -718,14 +739,14 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=dict( + rope_parameters=dict( + rope_theta=500_000, factor=8.0, high_freq_factor=4.0, low_freq_factor=1.0, original_max_position_embeddings=8192, rope_type="llama3", ), - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -759,9 +780,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -811,9 +831,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -866,9 +885,8 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), use_cache=True, @@ -919,9 +937,8 @@ num_key_value_heads=2, head_dim=128, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), use_cache=True, @@ -973,8 +990,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1006,8 +1024,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1035,8 +1054,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1065,8 +1085,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1102,8 +1123,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1119,11 +1138,10 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, 
"rms_norm_eps": 1e-5, - "rope_scaling": { - "type": "default", - "mrope_section": [8, 12, 12], # (temporal, height, width) - }, - "rope_theta": 500_000, + "rope_parameters": dict( + rope_theta=500_000, + mrope_section=[8, 12, 12], # (temporal, height, width) + ), "vocab_size": 32000, "attention_bias": True, }, @@ -1169,8 +1187,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1186,11 +1202,10 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, "rms_norm_eps": 1e-5, - "rope_scaling": { - "type": "default", - "mrope_section": [8, 12, 12], # (temporal, height, width) - }, - "rope_theta": 500_000, + "rope_parameters": dict( + rope_theta=500_000, + mrope_section=[8, 12, 12], # (temporal, height, width) + ), "vocab_size": 32000, "attention_bias": True, "attention_dropout": 0.0, @@ -1238,8 +1253,9 @@ num_hidden_layers=4, num_key_value_heads=2, pretraining_tp=1, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, max_position_embeddings=4096, # llava-1.5-7b-hf @@ -1298,8 +1314,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1390,8 +1407,9 @@ rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, use_sliding_window=False, sliding_window=4096, @@ -1430,7 +1448,9 @@ initializer_range=0.02, norm_eps=1e-6, num_key_value_heads=2, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), partial_rotary_factor=1.0, vocab_size=32000, use_cache=True, @@ -1456,7 +1476,9 @@ initializer_range=0.02, norm_eps=1e-6, num_key_value_heads=2, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), partial_rotary_factor=1.0, vocab_size=32000, num_experts=8, @@ -1484,7 +1506,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), tie_word_embeddings=True, use_cache=True, vocab_size=32000, diff --git a/test/convergence/fp32/test_mini_models_multimodal.py b/test/convergence/fp32/test_mini_models_multimodal.py index ac1f0ee92..448d95c95 100644 --- a/test/convergence/fp32/test_mini_models_multimodal.py +++ b/test/convergence/fp32/test_mini_models_multimodal.py @@ -9,7 +9,7 @@ from datasets import load_dataset from torch.utils.data import DataLoader from transformers import PreTrainedTokenizerFast -from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast +from transformers.models.gemma.tokenization_gemma import GemmaTokenizer from transformers.models.siglip.configuration_siglip import SiglipVisionConfig from liger_kernel.transformers import apply_liger_kernel_to_gemma3 @@ -54,7 +54,7 @@ import transformers from packaging import version - from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration @@ 
-70,7 +70,7 @@ import transformers from packaging import version - from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessor @@ -83,7 +83,7 @@ try: - from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig @@ -108,7 +108,7 @@ QWEN3_VL_MOE_AVAILABLE = False try: - from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast + from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig @@ -214,7 +214,7 @@ try: # SmolVLM2 is only available in transformers>=4.50.0 - from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast + from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer from transformers.models.smolvlm.configuration_smolvlm import SmolVLMConfig from transformers.models.smolvlm.image_processing_smolvlm import SmolVLMImageProcessor from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration @@ -291,7 +291,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -339,14 +341,14 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=dict( + rope_parameters=dict( + rope_theta=500_000, factor=8.0, high_freq_factor=4.0, low_freq_factor=1.0, original_max_position_embeddings=8192, rope_type="llama3", ), - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -396,7 +398,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -446,7 +450,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -492,7 +498,16 @@ rms_norm_eps=1e-06, use_cache=True, tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + full_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + sliding_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + ), attention_bias=False, attention_dropout=0.0, ), @@ -528,9 +543,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -570,8 +584,9 @@ 
num_hidden_layers=4, num_key_value_heads=2, pretraining_tp=1, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, max_position_embeddings=4096, # llava-1.5-7b-hf @@ -662,7 +677,9 @@ num_hidden_layers=4, # 30 -> reduced to 4 for testing num_key_value_heads=3, # 3 for 256M model rms_norm_eps=1e-5, - rope_theta=100000, + rope_parameters=dict( + rope_theta=100000, + ), tie_word_embeddings=False, vocab_size=49280, ), @@ -705,9 +722,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -767,9 +783,8 @@ rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -819,9 +834,8 @@ rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -879,9 +893,8 @@ rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -931,9 +944,8 @@ rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -962,7 +974,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Qwen2VLImageProcessor() video_processor = Qwen2VLVideoProcessor() return Qwen2VLProcessor( @@ -984,7 +996,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Qwen2VLImageProcessor() video_processor = Qwen2VLVideoProcessor() return Qwen2_5_VLProcessor( @@ -1006,7 +1018,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Qwen2VLImageProcessor(patch_size=16, temporal_patch_size=2, merge_size=2) video_processor = Qwen3VLVideoProcessor() return Qwen3VLProcessor( @@ -1063,7 +1075,7 @@ def create_processor(model_name: str): ) ] ) - qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = GotOcr2ImageProcessorFast( crop_to_patches=False, min_patches=1, max_patches=12, size={"height": 448, "width": 448} ) @@ -1087,7 +1099,7 @@ def create_processor(model_name: str): ) ] ) - gpt2_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + gpt2_tokenizer = GPT2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = 
SmolVLMImageProcessor(size={"longest_edge": 512}) video_processor = SmolVLMVideoProcessor() @@ -1157,7 +1169,7 @@ def create_processor(model_name: str): ) ] ) - fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = SiglipImageProcessor(size={"height": 224, "width": 224}, image_seq_length=256) return PaliGemmaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer) @@ -1177,7 +1189,7 @@ def create_processor(model_name: str): ) ] ) - fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config) + fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config) image_processor = Gemma3ImageProcessor() return Gemma3Processor(image_processor=image_processor, tokenizer=fast_tokenizer) diff --git a/test/convergence/fp32/test_mini_models_with_logits.py b/test/convergence/fp32/test_mini_models_with_logits.py index ace68f8ab..2d2688476 100644 --- a/test/convergence/fp32/test_mini_models_with_logits.py +++ b/test/convergence/fp32/test_mini_models_with_logits.py @@ -335,8 +335,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -364,7 +365,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -393,7 +396,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=None, # defaults to num_attention_heads rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=None, tie_word_embeddings=False, use_cache=True, @@ -418,7 +423,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -443,7 +450,9 @@ num_hidden_layers=4, # 32 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), sliding_window=4096, tie_word_embeddings=False, use_cache=True, @@ -478,7 +487,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -506,7 +517,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, ), @@ -534,7 +547,9 @@ bos_token_id=1, # 128000 eos_token_id=2, # 128001 tie_word_embeddings=True, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", @@ -562,8 +577,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -590,7 +606,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), sliding_window=131072, tie_word_embeddings=True, use_cache=True, @@ -616,8 +634,9 @@ rms_norm_eps=1e-6, 
use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, use_sliding_window=False, sliding_window=4096, @@ -656,7 +675,9 @@ bos_token_id=2, eos_token_id=1, tie_word_embeddings=True, - rope_theta=10000.0, # 1000000 + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", @@ -683,14 +704,14 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=dict( + rope_parameters=dict( + rope_theta=500_000, factor=8.0, high_freq_factor=4.0, low_freq_factor=1.0, original_max_position_embeddings=8192, rope_type="llama3", ), - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -724,9 +745,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -776,9 +796,8 @@ num_hidden_layers=4, # 80 num_key_value_heads=2, # 8 rms_norm_eps=1e-6, # 1e-5 - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], # (temporal, height, width) ), sliding_window=4096, @@ -833,9 +852,8 @@ num_key_value_heads=2, pad_token_id=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), sliding_window=131072, @@ -888,9 +906,8 @@ num_key_value_heads=2, pad_token_id=2, rms_norm_eps=1e-6, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), sliding_window=131072, @@ -942,8 +959,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -976,8 +994,9 @@ num_hidden_layers=4, num_key_value_heads=2, pretraining_tp=1, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, max_position_embeddings=4096, # llava-1.5-7b-hf @@ -1035,8 +1054,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1064,8 +1084,9 @@ num_hidden_layers=4, # 40 num_key_value_heads=2, # 8 rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1094,8 +1115,9 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, + rope_parameters=dict( + rope_theta=500_000, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1131,8 +1153,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1148,11 +1168,10 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, "rms_norm_eps": 1e-5, - "rope_scaling": { - "type": "default", - "mrope_section": [8, 12, 12], # (temporal, height, 
width) - }, - "rope_theta": 500_000, + "rope_parameters": dict( + rope_theta=500_000, + mrope_section=[8, 12, 12], # (temporal, height, width) + ), "vocab_size": 32000, "attention_bias": True, }, @@ -1197,8 +1216,6 @@ num_hidden_layers=4, # 61 num_key_value_heads=2, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500_000, tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 151552 @@ -1214,11 +1231,10 @@ "num_hidden_layers": 4, "num_key_value_heads": 2, "rms_norm_eps": 1e-5, - "rope_scaling": { - "type": "default", - "mrope_section": [8, 12, 12], # (temporal, height, width) - }, - "rope_theta": 500_000, + "rope_parameters": dict( + rope_theta=500_000, + mrope_section=[8, 12, 12], # (temporal, height, width) + ), "vocab_size": 32000, "attention_bias": True, "attention_dropout": 0.0, @@ -1268,8 +1284,9 @@ num_key_value_heads=2, # 8 pretraining_tp=1, rms_norm_eps=1e-5, - rope_scaling=None, - rope_theta=500000.0, + rope_parameters=dict( + rope_theta=500000.0, + ), tie_word_embeddings=False, use_cache=True, vocab_size=32000, # 128256, @@ -1360,8 +1377,9 @@ rms_norm_eps=1e-6, use_cache=True, tie_word_embeddings=False, - rope_theta=10000.0, - rope_scaling=None, + rope_parameters=dict( + rope_theta=10000.0, + ), attention_bias=False, use_sliding_window=False, sliding_window=4096, @@ -1402,7 +1420,9 @@ initializer_range=0.02, norm_eps=1e-6, num_key_value_heads=2, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), partial_rotary_factor=1.0, vocab_size=32000, use_cache=True, @@ -1428,7 +1448,9 @@ initializer_range=0.02, norm_eps=1e-6, num_key_value_heads=2, - rope_theta=10000.0, + rope_parameters=dict( + rope_theta=10000.0, + ), partial_rotary_factor=1.0, vocab_size=32000, num_experts=8, @@ -1456,7 +1478,9 @@ num_hidden_layers=4, num_key_value_heads=2, rms_norm_eps=1e-5, - rope_theta=1000000.0, + rope_parameters=dict( + rope_theta=1000000.0, + ), tie_word_embeddings=True, use_cache=True, vocab_size=32000, diff --git a/test/transformers/test_monkey_patch.py b/test/transformers/test_monkey_patch.py index 71ebf592a..d9682fe00 100755 --- a/test/transformers/test_monkey_patch.py +++ b/test/transformers/test_monkey_patch.py @@ -497,9 +497,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_for_conditional_generation( rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -598,9 +597,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl(): rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -675,9 +673,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_text(): rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -771,9 +768,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_moe_for_conditional_generat rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -877,9 +873,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_moe(): rms_norm_eps=1e-6, 
use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -959,9 +954,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_moe_text(): rms_norm_eps=1e-6, use_cache=False, tie_word_embeddings=True, - rope_theta=1000000.0, - rope_scaling=dict( - type="mrope", + rope_parameters=dict( + rope_theta=1000000.0, mrope_section=[16, 24, 24], ), attention_dropout=0.0, @@ -1107,6 +1101,13 @@ def test_apply_liger_kernel_to_instance_for_mllama_for_conditional_generation(): intermediate_size=64, hidden_act="silu", num_hidden_layers=2, + rope_parameters=dict( + factor=8.0, + high_freq_factor=4.0, + low_freq_factor=1.0, + max_position_embeddings=8192, + rope_type="llama3", + ), rope_scaling=dict( factor=8.0, high_freq_factor=4.0, From 063c4471c3bc1b6bbef656b26f9f19d06b2d0d67 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Fri, 16 Jan 2026 19:18:09 +0800 Subject: [PATCH 4/9] bump to rc3 Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5af130ea4..965bc3fcc 100644 --- a/setup.py +++ b/setup.py @@ -32,7 +32,7 @@ def get_optional_dependencies(): """Get optional dependency groups.""" return { "dev": [ - "transformers==5.0.0rc2", + "transformers==5.0.0rc3", "matplotlib>=3.7.2", "ruff>=0.12.0", "pytest>=7.1.2", From 5ace73769c7527b1ab4e1cb89d6252cb722a1543 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Fri, 16 Jan 2026 19:19:06 +0800 Subject: [PATCH 5/9] Remove test-bwd ci Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- .github/workflows/nvi-ci.yml | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/.github/workflows/nvi-ci.yml b/.github/workflows/nvi-ci.yml index b828651ec..a551af14a 100644 --- a/.github/workflows/nvi-ci.yml +++ b/.github/workflows/nvi-ci.yml @@ -64,29 +64,3 @@ jobs: - name: Run tests run: | modal run dev.modal.tests - - tests-bwd: - runs-on: ubuntu-latest - needs: [checkstyle] - env: - MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} - MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} - REBUILD_IMAGE: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }} - - steps: - - name: Checkout code - uses: actions/checkout@v6 - - - name: Set up Python - uses: actions/setup-python@v6 - with: - python-version: '3.10' - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install modal - - - name: Run tests - run: | - modal run dev.modal.tests_bwd \ No newline at end of file From 70b2423b9c747aa028d61ce3ad118c5426fbb8d4 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Fri, 16 Jan 2026 19:23:40 +0800 Subject: [PATCH 6/9] split correctness and convergence test Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- .github/workflows/nvi-ci.yml | 54 +++++++++++++++++++++++++++++++++-- dev/modal/test_convergence.py | 27 ++++++++++++++++++ dev/modal/test_correctness.py | 27 ++++++++++++++++++ 3 files changed, 106 insertions(+), 2 deletions(-) create mode 100644 dev/modal/test_convergence.py create mode 100644 dev/modal/test_correctness.py diff --git a/.github/workflows/nvi-ci.yml b/.github/workflows/nvi-ci.yml index a551af14a..f334216b5 100644 --- a/.github/workflows/nvi-ci.yml +++ 
b/.github/workflows/nvi-ci.yml @@ -40,7 +40,57 @@ jobs: - name: Run checkstyle run: make checkstyle - tests: + # tests: + # runs-on: ubuntu-latest + # needs: [checkstyle] + # env: + # MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + # MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + + # steps: + # - name: Checkout code + # uses: actions/checkout@v6 + + # - name: Set up Python + # uses: actions/setup-python@v6 + # with: + # python-version: '3.10' + + # - name: Install dependencies + # run: | + # python -m pip install --upgrade pip + # pip install modal + + # - name: Run tests + # run: | + # modal run dev.modal.tests + + correctness: + runs-on: ubuntu-latest + needs: [checkstyle] + env: + MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} + MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} + + steps: + - name: Checkout code + uses: actions/checkout@v6 + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: '3.10' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install modal + + - name: Run tests + run: | + modal run dev.modal.test_correctness + + convergence: runs-on: ubuntu-latest needs: [checkstyle] env: @@ -63,4 +113,4 @@ jobs: - name: Run tests run: | - modal run dev.modal.tests + modal run dev.modal.test_convergence diff --git a/dev/modal/test_convergence.py b/dev/modal/test_convergence.py new file mode 100644 index 000000000..a83d6f6c5 --- /dev/null +++ b/dev/modal/test_convergence.py @@ -0,0 +1,27 @@ +from pathlib import Path + +import modal + +ROOT_PATH = Path(__file__).parent.parent.parent +REMOTE_ROOT_PATH = "/root/liger-kernel" +PYTHON_VERSION = "3.12" + +image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install("uv") + +app = modal.App("liger_tests", image=image) + +# mount: add local files to the remote container +repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH) + + +@app.function(gpu="H100!", image=repo, timeout=90 * 60) +def liger_tests(): + import subprocess + + subprocess.run( + ["uv pip install -e '.[dev]' --system"], + check=True, + shell=True, + cwd=REMOTE_ROOT_PATH, + ) + subprocess.run(["make test-convergence"], check=True, shell=True, cwd=REMOTE_ROOT_PATH) diff --git a/dev/modal/test_correctness.py b/dev/modal/test_correctness.py new file mode 100644 index 000000000..2b55b4b3b --- /dev/null +++ b/dev/modal/test_correctness.py @@ -0,0 +1,27 @@ +from pathlib import Path + +import modal + +ROOT_PATH = Path(__file__).parent.parent.parent +REMOTE_ROOT_PATH = "/root/liger-kernel" +PYTHON_VERSION = "3.12" + +image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install("uv") + +app = modal.App("liger_tests", image=image) + +# mount: add local files to the remote container +repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH) + + +@app.function(gpu="H100!", image=repo, timeout=90 * 60) +def liger_tests(): + import subprocess + + subprocess.run( + ["uv pip install -e '.[dev]' --system"], + check=True, + shell=True, + cwd=REMOTE_ROOT_PATH, + ) + subprocess.run(["make test"], check=True, shell=True, cwd=REMOTE_ROOT_PATH) From a796ed275178d1dbd39ef71db6444ae1ded415fe Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Fri, 16 Jan 2026 19:40:50 +0800 Subject: [PATCH 7/9] Fix Gemma3TextConfig rope parameters in convergence tests (#1025) ## Summary Follow-up to #1014 Change all occurences in all convergence tests. 
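For reference, the new shape of the Gemma3 rope config is sketched below. This is illustrative only: the placeholder sizes and the top-level `Gemma3TextConfig` import are assumptions of the sketch, not part of this patch; the values actually used by the tests are in the diff below.

```python
# Illustrative sketch (not part of the diff): under transformers v5 the Gemma3 text
# config takes one rope_parameters dict per attention layer type instead of a single
# rope_theta. Assumes transformers>=5.0.0rc3 exposes Gemma3TextConfig at the top level.
from transformers import Gemma3TextConfig

config = Gemma3TextConfig(
    hidden_size=1024,       # placeholder value for the sketch
    num_hidden_layers=4,    # placeholder value for the sketch
    rope_parameters=dict(
        full_attention=dict(rope_theta=10000.0, rope_type="default"),
        sliding_attention=dict(rope_theta=10000.0, rope_type="default"),
    ),
)
```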
## Testing Done - Hardware Type: - [ ] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [ ] run `make test-convergence` to ensure convergence Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- test/convergence/bf16/test_mini_models.py | 11 +++++++++-- test/convergence/bf16/test_mini_models_with_logits.py | 11 ++++++++++- test/convergence/fp32/test_mini_models.py | 9 ++++++++- test/convergence/fp32/test_mini_models_with_logits.py | 9 ++++++++- 4 files changed, 35 insertions(+), 5 deletions(-) diff --git a/test/convergence/bf16/test_mini_models.py b/test/convergence/bf16/test_mini_models.py index 332755e73..13799ee0d 100644 --- a/test/convergence/bf16/test_mini_models.py +++ b/test/convergence/bf16/test_mini_models.py @@ -713,8 +713,15 @@ eos_token_id=1, tie_word_embeddings=True, rope_parameters=dict( - rope_theta=10000.0, - ), # 1000000 + full_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + sliding_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", diff --git a/test/convergence/bf16/test_mini_models_with_logits.py b/test/convergence/bf16/test_mini_models_with_logits.py index 88cd8b0fe..70fa45119 100644 --- a/test/convergence/bf16/test_mini_models_with_logits.py +++ b/test/convergence/bf16/test_mini_models_with_logits.py @@ -768,7 +768,16 @@ bos_token_id=2, eos_token_id=1, tie_word_embeddings=True, - rope_theta=10000.0, # 1000000 + rope_parameters=dict( + full_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + sliding_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + ), attention_bias=False, attention_dropout=0.0, attn_implementation="eager", diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py index 01b75e932..22a1dcd6f 100644 --- a/test/convergence/fp32/test_mini_models.py +++ b/test/convergence/fp32/test_mini_models.py @@ -711,7 +711,14 @@ eos_token_id=1, tie_word_embeddings=True, rope_parameters=dict( - rope_theta=10000.0, + full_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + sliding_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), ), attention_bias=False, attention_dropout=0.0, diff --git a/test/convergence/fp32/test_mini_models_with_logits.py b/test/convergence/fp32/test_mini_models_with_logits.py index 2d2688476..6822ffdda 100644 --- a/test/convergence/fp32/test_mini_models_with_logits.py +++ b/test/convergence/fp32/test_mini_models_with_logits.py @@ -676,7 +676,14 @@ eos_token_id=1, tie_word_embeddings=True, rope_parameters=dict( - rope_theta=10000.0, + full_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), + sliding_attention=dict( + rope_theta=10000.0, + rope_type="default", + ), ), attention_bias=False, attention_dropout=0.0, From 2cd6e3954ecb7b6075086bae1077494497046e13 Mon Sep 17 00:00:00 2001 From: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> Date: Fri, 16 Jan 2026 20:56:15 +0800 Subject: [PATCH 8/9] Remove deprecated argument `position_id` (#1027) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary `position_id` has been removed from `apply_rotary_pos_emb` in https://github.com/huggingface/transformers/pull/43255 ## Testing Done ``` ❯ python3 -m pytest test/transformers/test_rope.py -q test/transformers/test_rope.py::test_correctness[True-dtype0-1e-05-1e-05-1-128-32-32-64] PASSED [ 2%] 
test/transformers/test_rope.py::test_correctness[True-dtype0-1e-05-1e-05-2-128-32-32-64] PASSED [ 5%] test/transformers/test_rope.py::test_correctness[True-dtype0-1e-05-1e-05-1-128-32-8-64] PASSED [ 8%] test/transformers/test_rope.py::test_correctness[True-dtype0-1e-05-1e-05-2-128-32-8-64] PASSED [ 11%] test/transformers/test_rope.py::test_correctness[True-dtype0-1e-05-1e-05-3-423-73-213-92] PASSED [ 13%] test/transformers/test_rope.py::test_correctness[True-dtype0-1e-05-1e-05-3-423-73-155-92] PASSED [ 16%] test/transformers/test_rope.py::test_correctness[True-dtype1-0.1-1e-05-1-128-32-32-64] PASSED [ 19%] test/transformers/test_rope.py::test_correctness[True-dtype1-0.1-1e-05-2-128-32-32-64] PASSED [ 22%] test/transformers/test_rope.py::test_correctness[True-dtype1-0.1-1e-05-1-128-32-8-64] PASSED [ 25%] test/transformers/test_rope.py::test_correctness[True-dtype1-0.1-1e-05-2-128-32-8-64] PASSED [ 27%] test/transformers/test_rope.py::test_correctness[True-dtype1-0.1-1e-05-3-423-73-213-92] PASSED [ 30%] test/transformers/test_rope.py::test_correctness[True-dtype1-0.1-1e-05-3-423-73-155-92] PASSED [ 33%] test/transformers/test_rope.py::test_correctness[False-dtype0-1e-05-1e-05-1-128-32-32-64] PASSED [ 36%] test/transformers/test_rope.py::test_correctness[False-dtype0-1e-05-1e-05-2-128-32-32-64] PASSED [ 38%] test/transformers/test_rope.py::test_correctness[False-dtype0-1e-05-1e-05-1-128-32-8-64] PASSED [ 41%] test/transformers/test_rope.py::test_correctness[False-dtype0-1e-05-1e-05-2-128-32-8-64] PASSED [ 44%] test/transformers/test_rope.py::test_correctness[False-dtype0-1e-05-1e-05-3-423-73-213-92] PASSED [ 47%] test/transformers/test_rope.py::test_correctness[False-dtype0-1e-05-1e-05-3-423-73-155-92] PASSED [ 50%] test/transformers/test_rope.py::test_correctness[False-dtype1-0.1-1e-05-1-128-32-32-64] PASSED [ 52%] test/transformers/test_rope.py::test_correctness[False-dtype1-0.1-1e-05-2-128-32-32-64] PASSED [ 55%] test/transformers/test_rope.py::test_correctness[False-dtype1-0.1-1e-05-1-128-32-8-64] PASSED [ 58%] test/transformers/test_rope.py::test_correctness[False-dtype1-0.1-1e-05-2-128-32-8-64] PASSED [ 61%] test/transformers/test_rope.py::test_correctness[False-dtype1-0.1-1e-05-3-423-73-213-92] PASSED [ 63%] test/transformers/test_rope.py::test_correctness[False-dtype1-0.1-1e-05-3-423-73-155-92] PASSED [ 66%] test/transformers/test_rope.py::test_functional_correctness[True-dtype0-1e-05-1e-05-1-2-2-2-8] PASSED [ 69%] test/transformers/test_rope.py::test_functional_correctness[True-dtype0-1e-05-1e-05-1-2-1-2-8] PASSED [ 72%] test/transformers/test_rope.py::test_functional_correctness[True-dtype0-1e-05-1e-05-9-7-41-41-41] PASSED [ 75%] test/transformers/test_rope.py::test_functional_correctness[True-dtype1-0.1-1e-05-1-2-2-2-8] PASSED [ 77%] test/transformers/test_rope.py::test_functional_correctness[True-dtype1-0.1-1e-05-1-2-1-2-8] PASSED [ 80%] test/transformers/test_rope.py::test_functional_correctness[True-dtype1-0.1-1e-05-9-7-41-41-41] PASSED [ 83%] test/transformers/test_rope.py::test_functional_correctness[False-dtype0-1e-05-1e-05-1-2-2-2-8] PASSED [ 86%] test/transformers/test_rope.py::test_functional_correctness[False-dtype0-1e-05-1e-05-1-2-1-2-8] PASSED [ 88%] test/transformers/test_rope.py::test_functional_correctness[False-dtype0-1e-05-1e-05-9-7-41-41-41] PASSED [ 91%] test/transformers/test_rope.py::test_functional_correctness[False-dtype1-0.1-1e-05-1-2-2-2-8] PASSED [ 94%] test/transformers/test_rope.py::test_functional_correctness[False-dtype1-0.1-1e-05-1-2-1-2-8] PASSED [ 
97%] test/transformers/test_rope.py::test_functional_correctness[False-dtype1-0.1-1e-05-9-7-41-41-41] PASSED [100%] ``` - Hardware Type: - [ ] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [ ] run `make test-convergence` to ensure convergence Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com> --- test/transformers/test_rope.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/transformers/test_rope.py b/test/transformers/test_rope.py index a7623a236..4df7da938 100644 --- a/test/transformers/test_rope.py +++ b/test/transformers/test_rope.py @@ -83,7 +83,7 @@ def test_correctness( cos, sin = rotary_emb(k1, pos_ids) # validate forward pass - hf_q, hf_k = apply_rotary_pos_emb(q1, k1, cos, sin, pos_ids) + hf_q, hf_k = apply_rotary_pos_emb(q1, k1, cos, sin) tt_q, tt_k = liger_rotary_pos_emb(q2, k2, cos, sin) assert torch.allclose(hf_q, tt_q, atol=atol, rtol=rtol) assert torch.allclose(hf_k, tt_k, atol=atol, rtol=rtol) From 6a28a3f6fef423dfe50420e46e92b27b7e2dff70 Mon Sep 17 00:00:00 2001 From: Yunwei Dai Date: Tue, 20 Jan 2026 14:46:13 +0800 Subject: [PATCH 9/9] Fix/transformers v5 gemma tokenizer (#1030) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Update Gemma tokenizer usage in convergence tests for Transformers v5 by removing deprecated `GemmaTokenizerFast` imports and renaming usages to the supported non-fast tokenizer class. This fixes the `No module named transformers.models.gemma.tokenization_gemma_fast` error when running convergence tests under Transformers v5. ## Details Transformers v5 moves away from parallel “fast” and “slow” tokenizer implementations and adopts a single tokenizer implementation (see [huggingface/transformers#40936](https://github.com/huggingface/transformers/pull/40936#issue-3425973764)). - Convergence tests were importing and instantiating the fast tokenizer class, causing import errors. - This change updates both: 1) the import path, and 2) the tokenizer class name used in code (`GemmaTokenizerFast` → `GemmaTokenizer`), following the new Transformers v5 API. ## Testing Done - Hardware Type: A100-40G-PCIe - [ ] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [ ] run `make test-convergence` to ensure convergence
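
A minimal sketch of the rename described above, for reference. The example model id and the commented `from_pretrained` call are illustrations only (the model may require authentication to download); the actual changes are in this patch's diff.

```python
# Sketch of the tokenizer migration (illustrative; not taken from this patch's diff).

# Before (transformers v4) — removed in v5, raises
# "No module named 'transformers.models.gemma.tokenization_gemma_fast'":
#   from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
#   tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-2b")

# After (transformers v5) — single tokenizer implementation, one class name:
from transformers import GemmaTokenizer

# tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2b")  # example model id
```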