From df5504fdf5d5ed0919fb5ca981934b544f25c59a Mon Sep 17 00:00:00 2001
From: Vishnu Raj
Date: Wed, 27 Aug 2025 07:36:36 +0000
Subject: [PATCH] get_image_processor fix; updates for 450M model

---
 nanoVLM.ipynb | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/nanoVLM.ipynb b/nanoVLM.ipynb
index c8b6b37d..3228c28a 100644
--- a/nanoVLM.ipynb
+++ b/nanoVLM.ipynb
@@ -169,7 +169,7 @@
    "source": [
     "def get_dataloaders(train_cfg, vlm_cfg):\n",
     "    # Create datasets\n",
-    "    image_processor = get_image_processor(vlm_cfg.vit_img_size)\n",
+    "    image_processor = get_image_processor(vlm_cfg.max_img_size, vlm_cfg.vit_img_size)\n",
     "    tokenizer = get_tokenizer(vlm_cfg.lm_tokenizer, vlm_cfg.vlm_extra_tokens, vlm_cfg.lm_chat_template)\n",
     "\n",
     "    # Load and combine all training datasets\n",
@@ -401,39 +401,44 @@
     "    vit_hidden_dim: int = 768\n",
     "    vit_inter_dim: int = 4 * vit_hidden_dim\n",
     "    vit_patch_size: int = 16\n",
-    "    vit_img_size: int = 224\n",
+    "    vit_img_size: int = 512\n",
     "    vit_n_heads: int = 12\n",
     "    vit_dropout: float = 0.0\n",
     "    vit_n_blocks: int = 12\n",
     "    vit_ln_eps: float = 1e-6\n",
     "    vit_cls_flag: bool = False\n",
-    "    vit_model_type: str = 'google/siglip-base-patch16-224'\n",
+    "    vit_model_type: str = 'google/siglip2-base-patch16-512'\n",
     "\n",
-    "    lm_hidden_dim: int = 576\n",
-    "    lm_inter_dim: int = 1536\n",
+    "    lm_hidden_dim: int = 960\n",
+    "    lm_inter_dim: int = 2560\n",
     "    lm_rms_eps: float = 1e-5\n",
     "    lm_re_base: int = 100000\n",
     "    lm_max_position_embeddings: int = 8192\n",
     "    lm_base_vocab_size: int = 49152\n",
-    "    extra_token_amount: int = 1 # Number of extra tokens for the VLM (image start, image end, image token)\n",
+    "    extra_token_amount: int = 17 # Number of extra tokens for the VLM (image start, image end, image token)\n",
     "    lm_vocab_size: int = lm_base_vocab_size + extra_token_amount # Not a great way to do this, but it works for now (vlm_extra_tokens cannot be a dict, since this is mutable, and a Field has no len() function)\n",
-    "    lm_n_heads: int = 9\n",
-    "    lm_n_kv_heads: int = 3\n",
+    "    lm_n_heads: int = 15\n",
+    "    lm_n_kv_heads: int = 5\n",
     "    lm_dropout: float = 0.0\n",
-    "    lm_n_blocks: int = 30\n",
+    "    lm_n_blocks: int = 32\n",
     "    lm_attn_scaling: float = 1.0\n",
-    "    lm_eos_token_id: int = 0\n",
-    "    lm_max_length: int = 128\n",
+    "    lm_max_length: int = 1024\n",
     "    lm_use_tokens: bool = False # Decide if the LM expects tokens or embeddings as input (if using as a backbone for the VLM, set to False)\n",
     "    lm_tie_weights: bool = True # Decide if you want to tie the LM Head weight to the token embedding weights\n",
-    "    lm_model_type: str = 'HuggingFaceTB/SmolLM2-135M'\n",
-    "    lm_tokenizer: str = 'HuggingFaceTB/cosmo2-tokenizer'\n",
+    "    lm_model_type: str = 'HuggingFaceTB/SmolLM2-360M-Instruct'\n",
+    "    lm_tokenizer: str = 'HuggingFaceTB/SmolLM2-360M-Instruct'\n",
     "    lm_chat_template: str = \"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}\"\n",
     "\n",
-    "    mp_pixel_shuffle_factor: int = 2\n",
-    "    mp_image_token_length: int = 49\n",
+    "    mp_pixel_shuffle_factor: int = 4\n",
+    "    mp_image_token_length: int = 64\n",
     "\n",
-    "    vlm_extra_tokens: dict[str, str] = field(default_factory=lambda: {\"image_token\": \"<|image|>\"})#, \"boi_token\": \"<|image_start|>\", \"eoi_token\": \"<|image_end|>\"})\n",
+    "    max_img_size: int = 1024\n",
+    "\n",
+    "    vlm_extra_tokens: dict[str, str] = field(default_factory=lambda: {\"image_token\": \"<|image|>\",\n",
+    "                                                                       \"r1c1\": \"<row_1_col_1>\", \"r1c2\": \"<row_1_col_2>\", \"r1c3\": \"<row_1_col_3>\", \"r1c4\": \"<row_1_col_4>\",\n",
+    "                                                                       \"r2c1\": \"<row_2_col_1>\", \"r2c2\": \"<row_2_col_2>\", \"r2c3\": \"<row_2_col_3>\", \"r2c4\": \"<row_2_col_4>\",\n",
+    "                                                                       \"r3c1\": \"<row_3_col_1>\", \"r3c2\": \"<row_3_col_2>\", \"r3c3\": \"<row_3_col_3>\", \"r3c4\": \"<row_3_col_4>\",\n",
+    "                                                                       \"r4c1\": \"<row_4_col_1>\", \"r4c2\": \"<row_4_col_2>\", \"r4c3\": \"<row_4_col_3>\", \"r4c4\": \"<row_4_col_4>\"})\n",
     "    vlm_load_backbone_weights: bool = True\n",
     "    vlm_checkpoint_path: str = 'checkpoints'\n",
     "    hf_repo_name: str = 'nanoVLM'\n",
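
Note, not part of the patch: a quick sanity check of how the new config values
fit together, written against the VLMConfig field names above. Each 512-px tile
fed through siglip2-base-patch16-512 yields 32x32 patch embeddings; a
pixel-shuffle factor of 4 folds every 4x4 block of patches into a single
embedding, leaving 8x8 = 64 tokens per tile, which matches
mp_image_token_length. The 17 extra tokens are presumably the <|image|>
placeholder plus the 16 rXcY grid markers used to tag tiles when an image is
split under the max_img_size = 1024 cap.

# Illustrative check only; the variable names mirror VLMConfig fields above.
vit_img_size = 512
vit_patch_size = 16
mp_pixel_shuffle_factor = 4
mp_image_token_length = 64
extra_token_amount = 17

patches_per_side = vit_img_size // vit_patch_size              # 512 // 16 = 32
tokens_per_side = patches_per_side // mp_pixel_shuffle_factor  # 32 // 4 = 8
assert tokens_per_side ** 2 == mp_image_token_length           # 8 * 8 = 64
assert 1 + 4 * 4 == extra_token_amount                         # <|image|> + r1c1..r4c4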