diff --git a/.github/workflows/nvi-ci.yml b/.github/workflows/nvi-ci.yml
index b828651ec..f334216b5 100644
--- a/.github/workflows/nvi-ci.yml
+++ b/.github/workflows/nvi-ci.yml
@@ -40,7 +40,32 @@ jobs:
       - name: Run checkstyle
         run: make checkstyle
 
-  tests:
+  # tests:
+  #   runs-on: ubuntu-latest
+  #   needs: [checkstyle]
+  #   env:
+  #     MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+  #     MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v6
+
+  #     - name: Set up Python
+  #       uses: actions/setup-python@v6
+  #       with:
+  #         python-version: '3.10'
+
+  #     - name: Install dependencies
+  #       run: |
+  #         python -m pip install --upgrade pip
+  #         pip install modal
+
+  #     - name: Run tests
+  #       run: |
+  #         modal run dev.modal.tests
+
+  correctness:
     runs-on: ubuntu-latest
     needs: [checkstyle]
     env:
@@ -63,15 +88,14 @@ jobs:
       - name: Run tests
         run: |
-          modal run dev.modal.tests
+          modal run dev.modal.test_correctness
 
-  tests-bwd:
+  convergence:
     runs-on: ubuntu-latest
     needs: [checkstyle]
     env:
       MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
       MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
-      REBUILD_IMAGE: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' }}
 
     steps:
       - name: Checkout code
@@ -89,4 +113,4 @@ jobs:
       - name: Run tests
         run: |
-          modal run dev.modal.tests_bwd
\ No newline at end of file
+          modal run dev.modal.test_convergence
diff --git a/dev/modal/tests_bwd.py b/dev/modal/test_convergence.py
similarity index 65%
rename from dev/modal/tests_bwd.py
rename to dev/modal/test_convergence.py
index f71773d99..a83d6f6c5 100644
--- a/dev/modal/tests_bwd.py
+++ b/dev/modal/test_convergence.py
@@ -8,14 +8,14 @@
 
 image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install("uv")
 
-app = modal.App("liger_tests_bwd", image=image)
+app = modal.App("liger_tests", image=image)
 
 # mount: add local files to the remote container
 repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
 
 
 @app.function(gpu="H100!", image=repo, timeout=90 * 60)
-def liger_bwd_tests():
+def liger_tests():
     import subprocess
 
     subprocess.run(
@@ -24,12 +24,4 @@ def liger_tests():
         shell=True,
         cwd=REMOTE_ROOT_PATH,
     )
-    # force install transformers==4.49.0
-    subprocess.run(
-        ["uv pip install transformers==4.49.0 --system"],
-        check=True,
-        shell=True,
-        cwd=REMOTE_ROOT_PATH,
-    )
-    subprocess.run(["make test"], check=True, shell=True, cwd=REMOTE_ROOT_PATH)
     subprocess.run(["make test-convergence"], check=True, shell=True, cwd=REMOTE_ROOT_PATH)
diff --git a/dev/modal/test_correctness.py b/dev/modal/test_correctness.py
new file mode 100644
index 000000000..2b55b4b3b
--- /dev/null
+++ b/dev/modal/test_correctness.py
@@ -0,0 +1,27 @@
+from pathlib import Path
+
+import modal
+
+ROOT_PATH = Path(__file__).parent.parent.parent
+REMOTE_ROOT_PATH = "/root/liger-kernel"
+PYTHON_VERSION = "3.12"
+
+image = modal.Image.debian_slim(python_version=PYTHON_VERSION).pip_install("uv")
+
+app = modal.App("liger_tests", image=image)
+
+# mount: add local files to the remote container
+repo = image.add_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
+
+
+@app.function(gpu="H100!", image=repo, timeout=90 * 60)
+def liger_tests():
+    import subprocess
+
+    subprocess.run(
+        ["uv pip install -e '.[dev]' --system"],
+        check=True,
+        shell=True,
+        cwd=REMOTE_ROOT_PATH,
+    )
+    subprocess.run(["make test"], check=True, shell=True, cwd=REMOTE_ROOT_PATH)
diff --git a/setup.py b/setup.py
index 8e73d905f..965bc3fcc 100644
--- a/setup.py
+++ b/setup.py
@@ -32,7 +32,7 @@ def get_optional_dependencies():
     """Get optional dependency groups."""
     return {
         "dev": [
-            "transformers>=4.49.0",
+            "transformers==5.0.0rc3",
             "matplotlib>=3.7.2",
             "ruff>=0.12.0",
             "pytest>=7.1.2",
diff --git a/test/convergence/bf16/test_mini_models.py b/test/convergence/bf16/test_mini_models.py
index 19ca2044e..13799ee0d 100644
--- a/test/convergence/bf16/test_mini_models.py
+++ b/test/convergence/bf16/test_mini_models.py
@@ -333,8 +333,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -362,7 +363,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -391,7 +394,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=None,  # defaults to num_attention_heads
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=None,
         tie_word_embeddings=False,
         use_cache=True,
@@ -416,7 +421,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -441,7 +448,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -476,7 +485,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -504,7 +515,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -532,7 +545,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
             attn_implementation="eager",
@@ -561,8 +576,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -590,7 +606,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -616,8 +634,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -693,7 +712,16 @@
         bos_token_id=2,
         eos_token_id=1,
         tie_word_embeddings=True,
-        rope_theta=10000.0,  # 1000000
+        rope_parameters=dict(
+            full_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+            sliding_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+        ),
         attention_bias=False,
         attention_dropout=0.0,
         attn_implementation="eager",
@@ -721,14 +749,14 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=dict(
+        rope_parameters=dict(
             factor=8.0,
             high_freq_factor=4.0,
             low_freq_factor=1.0,
             original_max_position_embeddings=8192,
             rope_type="llama3",
+            rope_theta=500_000,
         ),
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -762,9 +790,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -814,9 +841,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -870,9 +896,8 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],
         ),
         use_cache=True,
@@ -923,9 +948,8 @@
         num_key_value_heads=2,
         head_dim=128,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],
         ),
         use_cache=True,
@@ -977,8 +1001,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1010,8 +1035,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         pretraining_tp=1,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         max_position_embeddings=4096,  # llava-1.5-7b-hf
@@ -1069,8 +1095,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1098,8 +1125,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1128,8 +1156,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1165,8 +1194,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1182,11 +1209,10 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
-                "type": "default",
-                "mrope_section": [8, 12, 12],  # (temporal, height, width)
-            },
-            "rope_theta": 500_000,
+            "rope_parameters": dict(
+                rope_theta=500_000,
+                mrope_section=[8, 12, 12],  # (temporal, height, width)
+            ),
             "vocab_size": 32000,
             "attention_bias": True,
         },
@@ -1232,8 +1258,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1249,11 +1273,10 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
-                "type": "default",
-                "mrope_section": [8, 12, 12],  # (temporal, height, width)
-            },
-            "rope_theta": 500_000,
+            "rope_parameters": dict(
+                rope_theta=500_000,
+                mrope_section=[8, 12, 12],  # (temporal, height, width)
+            ),
             "vocab_size": 32000,
             "attention_bias": True,
             "attention_dropout": 0.0,
@@ -1303,8 +1326,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1396,8 +1420,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -1437,7 +1462,9 @@
         initializer_range=0.02,
         norm_eps=1e-6,
         num_key_value_heads=2,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         partial_rotary_factor=1.0,
         vocab_size=32000,
         use_cache=True,
@@ -1468,8 +1495,9 @@
         eod_token_id=3,
         sep_token_id=4,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         attention_dropout=0.0,
         num_experts=2,
@@ -1496,7 +1524,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         tie_word_embeddings=True,
         use_cache=True,
         vocab_size=32000,
diff --git a/test/convergence/bf16/test_mini_models_multimodal.py b/test/convergence/bf16/test_mini_models_multimodal.py
index bd090e060..df60cef49 100644
--- a/test/convergence/bf16/test_mini_models_multimodal.py
+++ b/test/convergence/bf16/test_mini_models_multimodal.py
@@ -8,7 +8,7 @@
 from datasets import load_dataset
 from torch.utils.data import DataLoader
 from transformers import PreTrainedTokenizerFast
-from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
+from transformers.models.gemma.tokenization_gemma import GemmaTokenizer
 from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
 
 from liger_kernel.transformers import apply_liger_kernel_to_gemma3
@@ -54,7 +54,7 @@
     import transformers
     from packaging import version
-    from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+    from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
     from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig
     from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
     from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
@@ -70,7 +70,7 @@
     import transformers
     from packaging import version
-    from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+    from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
     from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig
     from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
     from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessor
@@ -82,7 +82,7 @@
     QWEN2_5_VL_AVAILABLE = False
 
 try:
-    from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+    from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
     from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
     from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
     from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig
@@ -138,7 +138,7 @@
     from packaging import version
     from transformers.models.gemma.configuration_gemma import GemmaConfig
-    from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
+    from transformers.models.gemma.tokenization_gemma import GemmaTokenizer
     from transformers.models.gemma2.configuration_gemma2 import Gemma2Config
     from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig
     from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
@@ -191,7 +191,7 @@
 try:
     # SmolVLM2 is only available in transformers>=4.50.0
-    from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
+    from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
     from transformers.models.smolvlm.configuration_smolvlm import SmolVLMConfig
     from transformers.models.smolvlm.image_processing_smolvlm import SmolVLMImageProcessor
     from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration
@@ -268,7 +268,9 @@
             num_hidden_layers=4,  # 40
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-5,
-            rope_theta=500_000,
+            rope_parameters=dict(
+                rope_theta=500_000,
+            ),
             tie_word_embeddings=False,
             use_cache=True,
             vocab_size=32000,  # 128256,
@@ -315,14 +317,14 @@
             num_hidden_layers=4,  # 40
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-5,
-            rope_scaling=dict(
+            rope_parameters=dict(
                 factor=8.0,
                 high_freq_factor=4.0,
                 low_freq_factor=1.0,
                 original_max_position_embeddings=8192,
                 rope_type="llama3",
+                rope_theta=500_000,
             ),
-            rope_theta=500_000,
             tie_word_embeddings=False,
             use_cache=True,
             vocab_size=32000,  # 128256,
@@ -372,7 +374,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -421,7 +425,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -466,7 +472,16 @@
             rms_norm_eps=1e-06,
             use_cache=True,
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                full_attention=dict(
+                    rope_theta=10000.0,
+                    rope_type="default",
+                ),
+                sliding_attention=dict(
+                    rope_theta=10000.0,
+                    rope_type="default",
+                ),
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ).to_dict(),
@@ -503,9 +518,8 @@
             num_hidden_layers=4,  # 80
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-6,  # 1e-5
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],  # (temporal, height, width)
             ),
             sliding_window=4096,
@@ -545,8 +559,9 @@
             num_hidden_layers=4,
             num_key_value_heads=2,
             pretraining_tp=1,
-            rope_scaling=None,
-            rope_theta=500000.0,
+            rope_parameters=dict(
+                rope_theta=500000.0,
+            ),
             tie_word_embeddings=False,
             use_cache=True,
             max_position_embeddings=4096,  # llava-1.5-7b-hf
@@ -637,7 +652,9 @@
             num_hidden_layers=4,  # 30 -> reduced to 4 for testing
             num_key_value_heads=3,  # 3 for 256M model
             rms_norm_eps=1e-5,
-            rope_theta=100000,
+            rope_parameters=dict(
+                rope_theta=100000,
+            ),
             tie_word_embeddings=False,
             vocab_size=49280,
         ),
@@ -680,10 +697,9 @@
             num_hidden_layers=4,  # 80
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-6,  # 1e-5
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
-                mrope_section=[16, 24, 24],  # (temporal, height, width)
+            rope_parameters=dict(
+                rope_theta=1000000.0,
+                mrope_section=[16, 24, 24],
             ),
             sliding_window=4096,
             tie_word_embeddings=True,
@@ -742,9 +758,8 @@
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -794,9 +809,8 @@
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -825,7 +839,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = Qwen2VLImageProcessor()
     video_processor = Qwen2VLVideoProcessor()
     return Qwen2VLProcessor(
@@ -847,7 +861,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
    image_processor = Qwen2VLImageProcessor()
     video_processor = Qwen2VLVideoProcessor()
     return Qwen2_5_VLProcessor(
@@ -869,7 +883,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = Qwen2VLImageProcessor(patch_size=16, temporal_patch_size=2, merge_size=2)
     video_processor = Qwen3VLVideoProcessor()
     return Qwen3VLProcessor(
@@ -926,7 +940,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = GotOcr2ImageProcessorFast(
         crop_to_patches=False, min_patches=1, max_patches=12, size={"height": 448, "width": 448}
     )
@@ -950,7 +964,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    gpt2_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    gpt2_tokenizer = GPT2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = SmolVLMImageProcessor(size={"longest_edge": 512})
     video_processor = SmolVLMVideoProcessor()
 
@@ -1020,7 +1034,7 @@ def create_processor(model_name: str):
         ]
     )
 
-    fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = SiglipImageProcessor(size={"height": 224, "width": 224}, image_seq_length=256)
     return PaliGemmaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)
 
@@ -1040,7 +1054,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = Gemma3ImageProcessor()
     return Gemma3Processor(image_processor=image_processor, tokenizer=fast_tokenizer)
 
diff --git a/test/convergence/bf16/test_mini_models_with_logits.py b/test/convergence/bf16/test_mini_models_with_logits.py
index e329d1c26..70fa45119 100644
--- a/test/convergence/bf16/test_mini_models_with_logits.py
+++ b/test/convergence/bf16/test_mini_models_with_logits.py
@@ -315,8 +315,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -344,7 +345,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -373,7 +376,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=None,  # defaults to num_attention_heads
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=None,
         tie_word_embeddings=False,
         use_cache=True,
@@ -398,7 +403,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -423,7 +430,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -458,7 +467,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -486,7 +497,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -514,7 +527,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
             attn_implementation="eager",
@@ -543,8 +558,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -571,7 +587,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -597,8 +615,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -642,9 +661,8 @@
             num_key_value_heads=2,
             pad_token_id=2,
             rms_norm_eps=1e-6,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             sliding_window=131072,
@@ -697,9 +715,8 @@
             num_key_value_heads=2,
             pad_token_id=2,
             rms_norm_eps=1e-6,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             sliding_window=131072,
@@ -751,7 +768,16 @@
         bos_token_id=2,
         eos_token_id=1,
         tie_word_embeddings=True,
-        rope_theta=10000.0,  # 1000000
+        rope_parameters=dict(
+            full_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+            sliding_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+        ),
         attention_bias=False,
         attention_dropout=0.0,
         attn_implementation="eager",
@@ -778,7 +804,7 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=dict(
+        rope_parameters=dict(
             factor=8.0,
             high_freq_factor=4.0,
             low_freq_factor=1.0,
@@ -819,9 +845,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -871,9 +896,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -923,8 +947,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -957,8 +982,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         pretraining_tp=1,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         max_position_embeddings=4096,  # llava-1.5-7b-hf
@@ -1016,8 +1042,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1045,8 +1072,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1075,8 +1103,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1111,8 +1140,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1128,11 +1155,10 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
-                "type": "default",
+            "rope_parameters": {
+                "rope_theta": 500_000,
                 "mrope_section": [8, 12, 12],  # (temporal, height, width)
             },
-            "rope_theta": 500_000,
             "vocab_size": 32000,
             "attention_bias": True,
         },
@@ -1178,8 +1204,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1195,11 +1219,11 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
+            "rope_parameters": {
+                "rope_theta": 500_000,
                 "type": "default",
                 "mrope_section": [8, 12, 12],  # (temporal, height, width)
             },
-            "rope_theta": 500_000,
             "vocab_size": 32000,
             "attention_bias": True,
             "attention_dropout": 0.0,
@@ -1249,8 +1273,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1341,8 +1366,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -1383,7 +1409,9 @@
         initializer_range=0.02,
         norm_eps=1e-6,
         num_key_value_heads=2,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         partial_rotary_factor=1.0,
         vocab_size=32000,
         use_cache=True,
@@ -1414,8 +1442,9 @@
         eod_token_id=3,
         sep_token_id=4,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         attention_dropout=0.0,
         num_experts=2,
@@ -1442,7 +1471,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         tie_word_embeddings=True,
         use_cache=True,
         vocab_size=32000,
diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py
index 7b1ffabd1..22a1dcd6f 100644
--- a/test/convergence/fp32/test_mini_models.py
+++ b/test/convergence/fp32/test_mini_models.py
@@ -332,8 +332,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -361,7 +362,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -390,7 +393,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=None,  # defaults to num_attention_heads
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=None,
         tie_word_embeddings=False,
         use_cache=True,
@@ -415,7 +420,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -440,7 +447,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -475,7 +484,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -503,7 +514,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -531,7 +544,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
             attn_implementation="eager",
@@ -559,8 +574,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -588,7 +604,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -614,8 +632,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -651,14 +670,14 @@
         rms_norm_eps=1e-5,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_parameters={
-            "rope_type": "yarn",
-            "factor": 8.0,
-            "beta_fast": 32.0,
-            "beta_slow": 1.0,
-            "truncate": False,
-            "original_max_position_embeddings": 4096,
-        },
+        rope_parameters=dict(
+            rope_type="yarn",
+            factor=8.0,
+            beta_fast=32.0,
+            beta_slow=1.0,
+            truncate=False,
+            original_max_position_embeddings=4096,
+        ),
         attention_dropout=0.0,
         num_local_experts=8,  # Reduced from 32 for mini model
         num_experts_per_tok=2,  # Reduced from 4 for mini model
@@ -691,7 +710,16 @@
         bos_token_id=2,
         eos_token_id=1,
         tie_word_embeddings=True,
-        rope_theta=10000.0,  # 1000000
+        rope_parameters=dict(
+            full_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+            sliding_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+        ),
         attention_bias=False,
         attention_dropout=0.0,
         attn_implementation="eager",
@@ -718,14 +746,14 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=dict(
+        rope_parameters=dict(
+            rope_theta=500_000,
             factor=8.0,
             high_freq_factor=4.0,
             low_freq_factor=1.0,
             original_max_position_embeddings=8192,
             rope_type="llama3",
         ),
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -759,9 +787,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -811,9 +838,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -866,9 +892,8 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],
         ),
         use_cache=True,
@@ -919,9 +944,8 @@
         num_key_value_heads=2,
         head_dim=128,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],
         ),
         use_cache=True,
@@ -973,8 +997,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1006,8 +1031,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1035,8 +1061,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1065,8 +1092,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1102,8 +1130,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1119,11 +1145,10 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
-                "type": "default",
-                "mrope_section": [8, 12, 12],  # (temporal, height, width)
-            },
-            "rope_theta": 500_000,
+            "rope_parameters": dict(
+                rope_theta=500_000,
+                mrope_section=[8, 12, 12],  # (temporal, height, width)
+            ),
             "vocab_size": 32000,
             "attention_bias": True,
         },
@@ -1169,8 +1194,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1186,11 +1209,10 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
-                "type": "default",
-                "mrope_section": [8, 12, 12],  # (temporal, height, width)
-            },
-            "rope_theta": 500_000,
+            "rope_parameters": dict(
+                rope_theta=500_000,
+                mrope_section=[8, 12, 12],  # (temporal, height, width)
+            ),
             "vocab_size": 32000,
             "attention_bias": True,
             "attention_dropout": 0.0,
@@ -1238,8 +1260,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         pretraining_tp=1,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         max_position_embeddings=4096,  # llava-1.5-7b-hf
@@ -1298,8 +1321,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1390,8 +1414,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -1430,7 +1455,9 @@
         initializer_range=0.02,
         norm_eps=1e-6,
         num_key_value_heads=2,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         partial_rotary_factor=1.0,
         vocab_size=32000,
         use_cache=True,
@@ -1456,7 +1483,9 @@
         initializer_range=0.02,
         norm_eps=1e-6,
         num_key_value_heads=2,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         partial_rotary_factor=1.0,
         vocab_size=32000,
         num_experts=8,
@@ -1484,7 +1513,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         tie_word_embeddings=True,
         use_cache=True,
         vocab_size=32000,
diff --git a/test/convergence/fp32/test_mini_models_multimodal.py b/test/convergence/fp32/test_mini_models_multimodal.py
index ac1f0ee92..448d95c95 100644
--- a/test/convergence/fp32/test_mini_models_multimodal.py
+++ b/test/convergence/fp32/test_mini_models_multimodal.py
@@ -9,7 +9,7 @@
 from datasets import load_dataset
 from torch.utils.data import DataLoader
 from transformers import PreTrainedTokenizerFast
-from transformers.models.gemma.tokenization_gemma_fast import GemmaTokenizerFast
+from transformers.models.gemma.tokenization_gemma import GemmaTokenizer
 from transformers.models.siglip.configuration_siglip import SiglipVisionConfig
 
 from liger_kernel.transformers import apply_liger_kernel_to_gemma3
@@ -54,7 +54,7 @@
     import transformers
     from packaging import version
-    from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+    from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
     from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLConfig
     from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
     from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
@@ -70,7 +70,7 @@
     import transformers
     from packaging import version
-    from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+    from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
     from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLConfig
     from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
     from transformers.models.qwen2_5_vl.processing_qwen2_5_vl import Qwen2_5_VLProcessor
@@ -83,7 +83,7 @@
 
 try:
-    from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+    from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
     from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
     from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
     from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig
@@ -108,7 +108,7 @@
     QWEN3_VL_MOE_AVAILABLE = False
 
 try:
-    from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast
+    from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer
     from transformers.models.qwen2_vl.image_processing_qwen2_vl import Qwen2VLImageProcessor
     from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLConfig
     from transformers.models.qwen3_vl.configuration_qwen3_vl import Qwen3VLTextConfig
@@ -214,7 +214,7 @@
 try:
     # SmolVLM2 is only available in transformers>=4.50.0
-    from transformers.models.gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
+    from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
     from transformers.models.smolvlm.configuration_smolvlm import SmolVLMConfig
     from transformers.models.smolvlm.image_processing_smolvlm import SmolVLMImageProcessor
     from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration
@@ -291,7 +291,9 @@
             num_hidden_layers=4,  # 40
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-5,
-            rope_theta=500_000,
+            rope_parameters=dict(
+                rope_theta=500_000,
+            ),
             tie_word_embeddings=False,
             use_cache=True,
             vocab_size=32000,  # 128256,
@@ -339,14 +341,14 @@
             num_hidden_layers=4,  # 40
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-5,
-            rope_scaling=dict(
+            rope_parameters=dict(
+                rope_theta=500_000,
                 factor=8.0,
                 high_freq_factor=4.0,
                 low_freq_factor=1.0,
                 original_max_position_embeddings=8192,
                 rope_type="llama3",
             ),
-            rope_theta=500_000,
             tie_word_embeddings=False,
             use_cache=True,
             vocab_size=32000,  # 128256,
@@ -396,7 +398,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -446,7 +450,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -492,7 +498,16 @@
             rms_norm_eps=1e-06,
             use_cache=True,
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                full_attention=dict(
+                    rope_theta=10000.0,
+                    rope_type="default",
+                ),
+                sliding_attention=dict(
+                    rope_theta=10000.0,
+                    rope_type="default",
+                ),
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -528,9 +543,8 @@
             num_hidden_layers=4,  # 80
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-6,  # 1e-5
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],  # (temporal, height, width)
             ),
             sliding_window=4096,
@@ -570,8 +584,9 @@
             num_hidden_layers=4,
             num_key_value_heads=2,
             pretraining_tp=1,
-            rope_scaling=None,
-            rope_theta=500000.0,
+            rope_parameters=dict(
+                rope_theta=500000.0,
+            ),
             tie_word_embeddings=False,
             use_cache=True,
             max_position_embeddings=4096,  # llava-1.5-7b-hf
@@ -662,7 +677,9 @@
             num_hidden_layers=4,  # 30 -> reduced to 4 for testing
             num_key_value_heads=3,  # 3 for 256M model
             rms_norm_eps=1e-5,
-            rope_theta=100000,
+            rope_parameters=dict(
+                rope_theta=100000,
+            ),
             tie_word_embeddings=False,
             vocab_size=49280,
         ),
@@ -705,9 +722,8 @@
             num_hidden_layers=4,  # 80
             num_key_value_heads=2,  # 8
             rms_norm_eps=1e-6,  # 1e-5
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],  # (temporal, height, width)
             ),
             sliding_window=4096,
@@ -767,9 +783,8 @@
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -819,9 +834,8 @@
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -879,9 +893,8 @@
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -931,9 +944,8 @@
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -962,7 +974,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = Qwen2VLImageProcessor()
     video_processor = Qwen2VLVideoProcessor()
     return Qwen2VLProcessor(
@@ -984,7 +996,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = Qwen2VLImageProcessor()
     video_processor = Qwen2VLVideoProcessor()
     return Qwen2_5_VLProcessor(
@@ -1006,7 +1018,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = Qwen2VLImageProcessor(patch_size=16, temporal_patch_size=2, merge_size=2)
     video_processor = Qwen3VLVideoProcessor()
     return Qwen3VLProcessor(
@@ -1063,7 +1075,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    qwen_tokenizer = Qwen2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    qwen_tokenizer = Qwen2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = GotOcr2ImageProcessorFast(
         crop_to_patches=False, min_patches=1, max_patches=12, size={"height": 448, "width": 448}
     )
@@ -1087,7 +1099,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    gpt2_tokenizer = GPT2TokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    gpt2_tokenizer = GPT2Tokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = SmolVLMImageProcessor(size={"longest_edge": 512})
     video_processor = SmolVLMVideoProcessor()
 
@@ -1157,7 +1169,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = SiglipImageProcessor(size={"height": 224, "width": 224}, image_seq_length=256)
     return PaliGemmaProcessor(image_processor=image_processor, tokenizer=fast_tokenizer)
 
@@ -1177,7 +1189,7 @@ def create_processor(model_name: str):
             )
         ]
     )
-    fast_tokenizer = GemmaTokenizerFast(tokenizer_object=tokenizer_base, **tokenizer_config)
+    fast_tokenizer = GemmaTokenizer(tokenizer_object=tokenizer_base, **tokenizer_config)
     image_processor = Gemma3ImageProcessor()
     return Gemma3Processor(image_processor=image_processor, tokenizer=fast_tokenizer)
 
diff --git a/test/convergence/fp32/test_mini_models_with_logits.py b/test/convergence/fp32/test_mini_models_with_logits.py
index ace68f8ab..6822ffdda 100644
--- a/test/convergence/fp32/test_mini_models_with_logits.py
+++ b/test/convergence/fp32/test_mini_models_with_logits.py
@@ -335,8 +335,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -364,7 +365,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -393,7 +396,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=None,  # defaults to num_attention_heads
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=None,
         tie_word_embeddings=False,
         use_cache=True,
@@ -418,7 +423,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -443,7 +450,9 @@
         num_hidden_layers=4,  # 32
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         sliding_window=4096,
         tie_word_embeddings=False,
         use_cache=True,
@@ -478,7 +487,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -506,7 +517,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
         ),
@@ -534,7 +547,9 @@
             bos_token_id=1,  # 128000
             eos_token_id=2,  # 128001
             tie_word_embeddings=True,
-            rope_theta=10000.0,
+            rope_parameters=dict(
+                rope_theta=10000.0,
+            ),
             attention_bias=False,
             attention_dropout=0.0,
             attn_implementation="eager",
@@ -562,8 +577,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -590,7 +606,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-6,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         sliding_window=131072,
         tie_word_embeddings=True,
         use_cache=True,
@@ -616,8 +634,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -656,7 +675,16 @@
         bos_token_id=2,
         eos_token_id=1,
         tie_word_embeddings=True,
-        rope_theta=10000.0,  # 1000000
+        rope_parameters=dict(
+            full_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+            sliding_attention=dict(
+                rope_theta=10000.0,
+                rope_type="default",
+            ),
+        ),
         attention_bias=False,
         attention_dropout=0.0,
         attn_implementation="eager",
@@ -683,14 +711,14 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=dict(
+        rope_parameters=dict(
+            rope_theta=500_000,
             factor=8.0,
             high_freq_factor=4.0,
             low_freq_factor=1.0,
             original_max_position_embeddings=8192,
             rope_type="llama3",
         ),
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -724,9 +752,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -776,9 +803,8 @@
         num_hidden_layers=4,  # 80
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-6,  # 1e-5
-        rope_theta=1000000.0,
-        rope_scaling=dict(
-            type="mrope",
+        rope_parameters=dict(
+            rope_theta=1000000.0,
             mrope_section=[16, 24, 24],  # (temporal, height, width)
         ),
         sliding_window=4096,
@@ -833,9 +859,8 @@
             num_key_value_heads=2,
             pad_token_id=2,
             rms_norm_eps=1e-6,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             sliding_window=131072,
@@ -888,9 +913,8 @@
             num_key_value_heads=2,
             pad_token_id=2,
             rms_norm_eps=1e-6,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             sliding_window=131072,
@@ -942,8 +966,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -976,8 +1001,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         pretraining_tp=1,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         max_position_embeddings=4096,  # llava-1.5-7b-hf
@@ -1035,8 +1061,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1064,8 +1091,9 @@
         num_hidden_layers=4,  # 40
         num_key_value_heads=2,  # 8
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1094,8 +1122,9 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
+        rope_parameters=dict(
+            rope_theta=500_000,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1131,8 +1160,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1148,11 +1175,10 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
-                "type": "default",
-                "mrope_section": [8, 12, 12],  # (temporal, height, width)
-            },
-            "rope_theta": 500_000,
+            "rope_parameters": dict(
+                rope_theta=500_000,
+                mrope_section=[8, 12, 12],  # (temporal, height, width)
+            ),
             "vocab_size": 32000,
             "attention_bias": True,
         },
@@ -1197,8 +1223,6 @@
         num_hidden_layers=4,  # 61
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500_000,
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 151552
@@ -1214,11 +1238,10 @@
             "num_hidden_layers": 4,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-5,
-            "rope_scaling": {
-                "type": "default",
-                "mrope_section": [8, 12, 12],  # (temporal, height, width)
-            },
-            "rope_theta": 500_000,
+            "rope_parameters": dict(
+                rope_theta=500_000,
+                mrope_section=[8, 12, 12],  # (temporal, height, width)
+            ),
             "vocab_size": 32000,
             "attention_bias": True,
             "attention_dropout": 0.0,
@@ -1268,8 +1291,9 @@
         num_key_value_heads=2,  # 8
         pretraining_tp=1,
         rms_norm_eps=1e-5,
-        rope_scaling=None,
-        rope_theta=500000.0,
+        rope_parameters=dict(
+            rope_theta=500000.0,
+        ),
         tie_word_embeddings=False,
         use_cache=True,
         vocab_size=32000,  # 128256,
@@ -1360,8 +1384,9 @@
         rms_norm_eps=1e-6,
         use_cache=True,
         tie_word_embeddings=False,
-        rope_theta=10000.0,
-        rope_scaling=None,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         attention_bias=False,
         use_sliding_window=False,
         sliding_window=4096,
@@ -1402,7 +1427,9 @@
         initializer_range=0.02,
         norm_eps=1e-6,
         num_key_value_heads=2,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         partial_rotary_factor=1.0,
         vocab_size=32000,
         use_cache=True,
@@ -1428,7 +1455,9 @@
         initializer_range=0.02,
         norm_eps=1e-6,
         num_key_value_heads=2,
-        rope_theta=10000.0,
+        rope_parameters=dict(
+            rope_theta=10000.0,
+        ),
         partial_rotary_factor=1.0,
         vocab_size=32000,
         num_experts=8,
@@ -1456,7 +1485,9 @@
         num_hidden_layers=4,
         num_key_value_heads=2,
         rms_norm_eps=1e-5,
-        rope_theta=1000000.0,
+        rope_parameters=dict(
+            rope_theta=1000000.0,
+        ),
         tie_word_embeddings=True,
         use_cache=True,
         vocab_size=32000,
diff --git a/test/transformers/test_monkey_patch.py b/test/transformers/test_monkey_patch.py
index 71ebf592a..d9682fe00 100755
--- a/test/transformers/test_monkey_patch.py
+++ b/test/transformers/test_monkey_patch.py
@@ -497,9 +497,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_for_conditional_generation(
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -598,9 +597,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl():
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -675,9 +673,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_text():
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -771,9 +768,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_moe_for_conditional_generat
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -877,9 +873,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_moe():
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -959,9 +954,8 @@ def test_apply_liger_kernel_to_instance_for_qwen3_vl_moe_text():
             rms_norm_eps=1e-6,
             use_cache=False,
             tie_word_embeddings=True,
-            rope_theta=1000000.0,
-            rope_scaling=dict(
-                type="mrope",
+            rope_parameters=dict(
+                rope_theta=1000000.0,
                 mrope_section=[16, 24, 24],
             ),
             attention_dropout=0.0,
@@ -1107,6 +1101,13 @@ def test_apply_liger_kernel_to_instance_for_mllama_for_conditional_generation():
             intermediate_size=64,
             hidden_act="silu",
             num_hidden_layers=2,
+            rope_parameters=dict(
+                factor=8.0,
+                high_freq_factor=4.0,
+                low_freq_factor=1.0,
+                max_position_embeddings=8192,
+                rope_type="llama3",
+            ),
             rope_scaling=dict(
                 factor=8.0,
                 high_freq_factor=4.0,
diff --git a/test/transformers/test_rope.py b/test/transformers/test_rope.py
index a7623a236..4df7da938 100644
--- a/test/transformers/test_rope.py
+++ b/test/transformers/test_rope.py
@@ -83,7 +83,7 @@ def test_correctness(
     cos, sin = rotary_emb(k1, pos_ids)
 
     # validate forward pass
-    hf_q, hf_k = apply_rotary_pos_emb(q1, k1, cos, sin, pos_ids)
+    hf_q, hf_k = apply_rotary_pos_emb(q1, k1, cos, sin)
     tt_q, tt_k = liger_rotary_pos_emb(q2, k2, cos, sin)
     assert torch.allclose(hf_q, tt_q, atol=atol, rtol=rtol)
     assert torch.allclose(hf_k, tt_k, atol=atol, rtol=rtol)