@@ -281,7 +281,6 @@ class SiglipMLP(nn.Layer):
     def __init__(self, config):
         super().__init__()
         self.config = config
-        self.activation_fn = get_activation_fn(config.hidden_act)
         self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
         self.fc1.weight.weight_loader = self.weight_loader
         self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
@@ -304,7 +303,7 @@ def weight_loader(self, param, loaded_weight, loaded_shard_id: Optional[str] = N

     def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
         hidden_states = self.fc1(hidden_states)
-        hidden_states = self.activation_fn(hidden_states[0])
+        hidden_states = get_activation_fn(self.config.hidden_act)(hidden_states[0])
         hidden_states = self.fc2(hidden_states)
         return hidden_states

@@ -318,7 +317,6 @@ def __init__(self, config):
         self.layer_norm2 = paddle.nn.LayerNorm(self.embed_dim, epsilon=config.layer_norm_eps)
         self.mlp = SiglipMLP(config)

-    # @paddle.jit.to_static
     def forward(
         self,
         hidden_states,
@@ -527,7 +525,37 @@ def forward(
         else:
             attn_cu_seqlens = cu_seqlens

-        max_seqlen = (attn_cu_seqlens[1:] - attn_cu_seqlens[:-1]).max().item()
+        return self._run_encoder_layer(
+            encoder_states=encoder_states,
+            all_attentions=all_attentions,
+            attn_cu_seqlens=attn_cu_seqlens,
+            output_hidden_states=output_hidden_states,
+            reversed_window_indices=reversed_window_indices if output_hidden_states else None,
+            use_window_attn=use_window_attn,
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            cos_emb=cos_emb,
+            sin_emb=sin_emb,
+        )
+
+    # This function will be compiled with CINN when graph_opt_level >= 2
+    # TODO(SigureMo): Use a new decorator to mark the function for CINN compilation
+    def _run_encoder_layer(
+        self,
+        encoder_states: Optional[Tuple[()]],
+        all_attentions: Optional[Tuple[()]],
+        attn_cu_seqlens: Optional[paddle.Tensor],
+        output_hidden_states: Optional[bool],
+        reversed_window_indices: paddle.Tensor,
+        use_window_attn: bool,
+        hidden_states: paddle.Tensor,
+        attention_mask: Optional[paddle.Tensor],
+        output_attentions: bool,
+        cos_emb: Optional[paddle.Tensor],
+        sin_emb: Optional[paddle.Tensor],
+    ) -> paddle.Tensor:
+        max_seqlen = (attn_cu_seqlens[1:] - attn_cu_seqlens[:-1]).max().cpu()

         for encoder_layer in self.layers:
             if output_hidden_states:
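
Below is a minimal, self-contained sketch (not part of this diff) of the pattern introduced in the last hunk: Python-side bookkeeping stays in forward, while the pure-tensor body is factored into a separate method that a graph compiler such as CINN could capture. ToyEncoder, its sizes, and the helper name are illustrative assumptions, not the SiglipEncoder code above.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F


class ToyEncoder(nn.Layer):
    # Illustrative module only; stands in for an encoder whose forward is split
    # into a thin dispatcher plus a compiler-friendly body.
    def __init__(self, hidden_size: int = 8):
        super().__init__()
        self.fc = nn.Linear(hidden_size, hidden_size)

    def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
        # Python-level branching and argument preparation stay here,
        # outside the region intended for whole-graph compilation.
        return self._run_body(hidden_states)

    # Candidate for compilation: only tensor ops, no data-dependent Python control flow.
    def _run_body(self, hidden_states: paddle.Tensor) -> paddle.Tensor:
        return F.gelu(self.fc(hidden_states))


if __name__ == "__main__":
    out = ToyEncoder()(paddle.randn([2, 8]))
    print(out.shape)  # [2, 8]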