From df5504fdf5d5ed0919fb5ca981934b544f25c59a Mon Sep 17 00:00:00 2001
From: Vishnu Raj
Date: Wed, 27 Aug 2025 07:36:36 +0000
Subject: [PATCH] get_image_processor fix; updates for 450M model

---
 nanoVLM.ipynb | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/nanoVLM.ipynb b/nanoVLM.ipynb
index c8b6b37d..3228c28a 100644
--- a/nanoVLM.ipynb
+++ b/nanoVLM.ipynb
@@ -169,7 +169,7 @@
    "source": [
     "def get_dataloaders(train_cfg, vlm_cfg):\n",
     "    # Create datasets\n",
-    "    image_processor = get_image_processor(vlm_cfg.vit_img_size)\n",
+    "    image_processor = get_image_processor(vlm_cfg.max_img_size, vlm_cfg.vit_img_size)\n",
     "    tokenizer = get_tokenizer(vlm_cfg.lm_tokenizer, vlm_cfg.vlm_extra_tokens, vlm_cfg.lm_chat_template)\n",
     "\n",
     "    # Load and combine all training datasets\n",
@@ -401,39 +401,44 @@
     "    vit_hidden_dim: int = 768\n",
     "    vit_inter_dim: int = 4 * vit_hidden_dim\n",
     "    vit_patch_size: int = 16\n",
-    "    vit_img_size: int = 224\n",
+    "    vit_img_size: int = 512\n",
     "    vit_n_heads: int = 12\n",
     "    vit_dropout: float = 0.0\n",
     "    vit_n_blocks: int = 12\n",
     "    vit_ln_eps: float = 1e-6\n",
     "    vit_cls_flag: bool = False\n",
-    "    vit_model_type: str = 'google/siglip-base-patch16-224'\n",
+    "    vit_model_type: str = 'google/siglip2-base-patch16-512'\n",
     "\n",
-    "    lm_hidden_dim: int = 576\n",
-    "    lm_inter_dim: int = 1536\n",
+    "    lm_hidden_dim: int = 960\n",
+    "    lm_inter_dim: int = 2560\n",
     "    lm_rms_eps: float = 1e-5\n",
     "    lm_re_base: int = 100000\n",
     "    lm_max_position_embeddings: int = 8192\n",
     "    lm_base_vocab_size: int = 49152\n",
-    "    extra_token_amount: int = 1 # Number of extra tokens for the VLM (image start, image end, image token)\n",
+    "    extra_token_amount: int = 17 # Number of extra tokens for the VLM (image start, image end, image token)\n",
     "    lm_vocab_size: int = lm_base_vocab_size + extra_token_amount # Not a great way to do this, but it works for now (vlm_extra_tokens cannot be a dict, since this is mutable, and a Field has no len() function)\n",
-    "    lm_n_heads: int = 9\n",
-    "    lm_n_kv_heads: int = 3\n",
+    "    lm_n_heads: int = 15\n",
+    "    lm_n_kv_heads: int = 5\n",
     "    lm_dropout: float = 0.0\n",
-    "    lm_n_blocks: int = 30\n",
+    "    lm_n_blocks: int = 32\n",
     "    lm_attn_scaling: float = 1.0\n",
-    "    lm_eos_token_id: int = 0\n",
-    "    lm_max_length: int = 128\n",
+    "    lm_max_length: int = 1024\n",
     "    lm_use_tokens: bool = False # Decide if the LM expects tokens or embeddings as input (if using as a backbone for the VLM, set to False)\n",
     "    lm_tie_weights: bool = True # Decide if you want to tie the LM Head weight to the token embedding weights\n",
-    "    lm_model_type: str = 'HuggingFaceTB/SmolLM2-135M'\n",
-    "    lm_tokenizer: str = 'HuggingFaceTB/cosmo2-tokenizer'\n",
+    "    lm_model_type: str = 'HuggingFaceTB/SmolLM2-360M-Instruct'\n",
+    "    lm_tokenizer: str = 'HuggingFaceTB/SmolLM2-360M-Instruct'\n",
     "    lm_chat_template: str = \"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}\"\n",
     "\n",
-    "    mp_pixel_shuffle_factor: int = 2\n",
-    "    mp_image_token_length: int = 49\n",
+    "    mp_pixel_shuffle_factor: int = 4\n",
+    "    mp_image_token_length: int = 64\n",
     "\n",
-    "    vlm_extra_tokens: dict[str, str] = field(default_factory=lambda: {\"image_token\": \"<|image|>\"})#, \"boi_token\": \"<|image_start|>\", \"eoi_token\": \"<|image_end|>\"})\n",
+    "    max_img_size: int = 1024\n",
+    "\n",
+    "    vlm_extra_tokens: dict[str, str] = field(default_factory=lambda: {\"image_token\": \"<|image|>\",\n",
+    "                                                                       \"r1c1\": \"<row_1_col_1>\", \"r1c2\": \"<row_1_col_2>\", \"r1c3\": \"<row_1_col_3>\", \"r1c4\": \"<row_1_col_4>\",\n",
+    "                                                                       \"r2c1\": \"<row_2_col_1>\", \"r2c2\": \"<row_2_col_2>\", \"r2c3\": \"<row_2_col_3>\", \"r2c4\": \"<row_2_col_4>\",\n",
+    "                                                                       \"r3c1\": \"<row_3_col_1>\", \"r3c2\": \"<row_3_col_2>\", \"r3c3\": \"<row_3_col_3>\", \"r3c4\": \"<row_3_col_4>\",\n",
+    "                                                                       \"r4c1\": \"<row_4_col_1>\", \"r4c2\": \"<row_4_col_2>\", \"r4c3\": \"<row_4_col_3>\", \"r4c4\": \"<row_4_col_4>\"})\n",
     "    vlm_load_backbone_weights: bool = True\n",
     "    vlm_checkpoint_path: str = 'checkpoints'\n",
     "    hf_repo_name: str = 'nanoVLM'\n",
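
Note, not part of the patch: a quick sanity check of how the new config values
fit together, written against the VLMConfig field names above. Each 512-px tile
fed through siglip2-base-patch16-512 yields 32x32 patch embeddings; a
pixel-shuffle factor of 4 folds every 4x4 block of patches into a single
embedding, leaving 8x8 = 64 tokens per tile, which matches
mp_image_token_length. The 17 extra tokens are presumably the <|image|>
placeholder plus the 16 rXcY grid markers used to tag tiles when an image is
split under the max_img_size = 1024 cap.

# Illustrative check only; the variable names mirror VLMConfig fields above.
vit_img_size = 512
vit_patch_size = 16
mp_pixel_shuffle_factor = 4
mp_image_token_length = 64
extra_token_amount = 17

patches_per_side = vit_img_size // vit_patch_size              # 512 // 16 = 32
tokens_per_side = patches_per_side // mp_pixel_shuffle_factor  # 32 // 4 = 8
assert tokens_per_side ** 2 == mp_image_token_length           # 8 * 8 = 64
assert 1 + 4 * 4 == extra_token_amount                         # <|image|> + r1c1..r4c4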