mlfoundations · rwightman · Feb 21, 2025 · Feb 20, 2025 · Feb 20, 2025 · Feb 21, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -115,7 +115,7 @@ jobs:
           tests
         jq -s -S 'add' durations_* > .test_durations
     - name: Collect pytest durations
-      uses: actions/upload-artifact@v3
+      uses: actions/upload-artifact@v4
       with:
         name: pytest_durations_${{ matrix.os }}-${{ matrix.python }}-${{ matrix.job }}
         path: .test_durations
diff --git a/src/open_clip/convert.py b/src/open_clip/convert.py
@@ -13,7 +13,7 @@
 def load_big_vision_weights(model: CustomTextCLIP, checkpoint_path: str):
     """ Load weights from .npz checkpoints for official Google big_vision image-text models
 
-    Currently the SigLIP source models are supported and a CustomTextCLIP destination model
+    Currently, the SigLIP source models are supported and a CustomTextCLIP destination model
     w/ timm image encoder.
     """
     from timm.layers import resample_patch_embed, resample_abs_pos_embed
@@ -114,22 +114,27 @@ def _convert_timm_img(module, prefix):
 
     def _convert_openclip_transformer(module: Transformer, prefix):
         for i, block in enumerate(module.resblocks.children()):
-            block_prefix = f'{prefix}encoderblock_{i}/'
+            if f'{prefix}encoderblock/LayerNorm_0/scale' in w:
+                block_prefix = f'{prefix}encoderblock/'
+                idx = i
+            else:
+                block_prefix = f'{prefix}encoderblock_{i}/'
+                idx = None
             mha_prefix = block_prefix + f'MultiHeadDotProductAttention_0/'
-            block.ln_1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale']))
-            block.ln_1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias']))
+            block.ln_1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'], idx=idx))
+            block.ln_1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'], idx=idx))
             block.attn.in_proj_weight.copy_(torch.cat([
-                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')]))
+                _n2p(w[f'{mha_prefix}{n}/kernel'], t=False, idx=idx).flatten(1).T for n in ('query', 'key', 'value')]))
             block.attn.in_proj_bias.copy_(torch.cat([
-                _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')]))
-            block.attn.out_proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1))
-            block.attn.out_proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias']))
-            block.ln_2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/scale']))
-            block.ln_2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/bias']))
-            block.mlp.c_fc.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/kernel']))
-            block.mlp.c_fc.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/bias']))
-            block.mlp.c_proj.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/kernel']))
-            block.mlp.c_proj.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/bias']))
+                _n2p(w[f'{mha_prefix}{n}/bias'], t=False, idx=idx).reshape(-1) for n in ('query', 'key', 'value')]))
+            block.attn.out_proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel'], idx=idx).flatten(1))
+            block.attn.out_proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'], idx=idx))
+            block.ln_2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/scale'], idx=idx))
+            block.ln_2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_1/bias'], idx=idx))
+            block.mlp.c_fc.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/kernel'], idx=idx))
+            block.mlp.c_fc.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_0/bias'], idx=idx))
+            block.mlp.c_proj.weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/kernel'], idx=idx))
+            block.mlp.c_proj.bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_0/Dense_1/bias'], idx=idx))
 
     def _convert_openclip_txt(module: TextTransformer, prefix):
         module.token_embedding.weight.copy_(_n2p(w[f'{prefix}Embed_0/embedding'], t=False))
@@ -142,10 +147,11 @@ def _convert_openclip_txt(module: TextTransformer, prefix):
             module.text_projection.weight.copy_(_n2p(w[f'{prefix}head/kernel']))
             module.text_projection.bias.copy_(_n2p(w[f'{prefix}head/bias']))
 
-    _convert_timm_img(model.visual.trunk, 'img/')
-    _convert_openclip_txt(model.text, 'txt/')
-    model.logit_bias.copy_(_n2p(w['b'])[0])
-    model.logit_scale.copy_(_n2p(w['t'])[0])
+    root_prefix = 'params/' if 'params/b' in w else ''
+    _convert_timm_img(model.visual.trunk, f'{root_prefix}img/')
+    _convert_openclip_txt(model.text, f'{root_prefix}txt/')
+    model.logit_bias.copy_(_n2p(w[f'{root_prefix}b'])[0])
+    model.logit_scale.copy_(_n2p(w[f'{root_prefix}t'])[0])
 
 
 @torch.no_grad()

diff --git a/src/open_clip/factory.py b/src/open_clip/factory.py
@@ -18,7 +18,7 @@
 from .pretrained import is_pretrained_cfg, get_pretrained_cfg, download_pretrained,\
     list_pretrained_tags_by_model, download_pretrained_from_hf
 from .transform import image_transform_v2, AugmentationCfg, PreprocessCfg, merge_preprocess_dict, merge_preprocess_kwargs
-from .tokenizer import HFTokenizer, SimpleTokenizer, DEFAULT_CONTEXT_LENGTH
+from .tokenizer import HFTokenizer, SimpleTokenizer, SigLipTokenizer, DEFAULT_CONTEXT_LENGTH
 
 HF_HUB_PREFIX = 'hf-hub:'
 _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"]
@@ -122,13 +122,21 @@ def get_tokenizer(
     if context_length is None:
         context_length = text_config.get('context_length', DEFAULT_CONTEXT_LENGTH)
 
-    if 'hf_tokenizer_name' in text_config:
+    model_name = model_name.lower()
+    if text_config.get('hf_tokenizer_name', ''):
         tokenizer = HFTokenizer(
             text_config['hf_tokenizer_name'],
             context_length=context_length,
             cache_dir=cache_dir,
             **tokenizer_kwargs,
         )
+    elif 'siglip' in model_name:
+        tn = 'gemma' if 'siglip2'  in model_name else 'mc4' if 'i18n' in model_name else 'c4-en'
+        tokenizer = SigLipTokenizer(
+            tn,
+            context_length=context_length,
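+            # TODO: tokenizer_kwargs is not forwarded to SigLipTokenizer here (was: **tokenizer_kwargs) - confirm whether it should be.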
+        )
     else:
         tokenizer = SimpleTokenizer(
             context_length=context_length,

diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP2-256.json b/src/open_clip/model_configs/ViT-B-16-SigLIP2-256.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 768,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 256,
+        "timm_model_name": "vit_base_patch16_siglip_256",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-B-16-SigLIP2-256",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 768,
+        "heads": 12,
+        "layers": 12,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}
diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP2-384.json b/src/open_clip/model_configs/ViT-B-16-SigLIP2-384.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 768,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 384,
+        "timm_model_name": "vit_base_patch16_siglip_384",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-B-16-SigLIP2-384",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 768,
+        "heads": 12,
+        "layers": 12,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}
diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP2-512.json b/src/open_clip/model_configs/ViT-B-16-SigLIP2-512.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 768,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 512,
+        "timm_model_name": "vit_base_patch16_siglip_512",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-B-16-SigLIP2-512",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 768,
+        "heads": 12,
+        "layers": 12,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}
diff --git a/src/open_clip/model_configs/ViT-B-16-SigLIP2.json b/src/open_clip/model_configs/ViT-B-16-SigLIP2.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 768,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 224,
+        "timm_model_name": "vit_base_patch16_siglip_224",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-B-16-SigLIP2",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 768,
+        "heads": 12,
+        "layers": 12,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}
diff --git a/src/open_clip/model_configs/ViT-B-32-SigLIP2-256.json b/src/open_clip/model_configs/ViT-B-32-SigLIP2-256.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 768,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 256,
+        "timm_model_name": "vit_base_patch32_siglip_256",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-B-32-SigLIP2-256",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 768,
+        "heads": 12,
+        "layers": 12,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}
diff --git a/src/open_clip/model_configs/ViT-L-16-SigLIP2-256.json b/src/open_clip/model_configs/ViT-L-16-SigLIP2-256.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 1024,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 256,
+        "timm_model_name": "vit_large_patch16_siglip_256",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-L-16-SigLIP2-256",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 1024,
+        "heads": 16,
+        "layers": 24,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}
diff --git a/src/open_clip/model_configs/ViT-L-16-SigLIP2-384.json b/src/open_clip/model_configs/ViT-L-16-SigLIP2-384.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 1024,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 384,
+        "timm_model_name": "vit_large_patch16_siglip_384",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-L-16-SigLIP2-384",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 1024,
+        "heads": 16,
+        "layers": 24,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}
diff --git a/src/open_clip/model_configs/ViT-L-16-SigLIP2-512.json b/src/open_clip/model_configs/ViT-L-16-SigLIP2-512.json
@@ -0,0 +1,32 @@
+{
+    "embed_dim": 1024,
+    "init_logit_bias": -10,
+    "custom_text": true,
+    "vision_cfg": {
+        "image_size": 512,
+        "timm_model_name": "vit_large_patch16_siglip_512",
+        "timm_model_pretrained": false,
+        "timm_pool": "map",
+        "timm_proj": "none"
+    },
+    "text_cfg": {
+        "context_length": 64,
+        "vocab_size": 256000,
+        "hf_tokenizer_name": "timm/ViT-L-16-SigLIP2-512",
+        "tokenizer_kwargs": {
+            "clean": "canonicalize"
+        },
+        "width": 1024,
+        "heads": 16,
+        "layers": 24,
+        "no_causal_mask": true,
+        "proj_bias": true,
+        "pool_type": "last",
+        "norm_kwargs":{
+            "eps": 1e-6
+        },
+        "act_kwargs": {
+            "approximate": "tanh"
+        }
+    }
+}