Commit a09b230

Add Phi model (#356)
Co-authored-by: Jonatan Kłosko <jonatanklosko@gmail.com>
seanmor5 and jonatanklosko authored Mar 1, 2024
1 parent 9d7ce31 commit a09b230
Showing 10 changed files with 622 additions and 11 deletions.
5 changes: 5 additions & 0 deletions lib/bumblebee.ex
@@ -164,6 +164,10 @@ defmodule Bumblebee do
"MistralModel" => {Bumblebee.Text.Mistral, :base},
"MistralForCausalLM" => {Bumblebee.Text.Mistral, :for_causal_language_modeling},
"MistralForSequenceClassification" => {Bumblebee.Text.Mistral, :for_sequence_classification},
"PhiModel" => {Bumblebee.Text.Phi, :base},
"PhiForCausalLM" => {Bumblebee.Text.Phi, :for_causal_language_modeling},
"PhiForSequenceClassification" => {Bumblebee.Text.Phi, :for_sequence_classification},
"PhiForTokenClassification" => {Bumblebee.Text.Phi, :for_token_classification},
"ResNetForImageClassification" => {Bumblebee.Vision.ResNet, :for_image_classification},
"ResNetModel" => {Bumblebee.Vision.ResNet, :base},
"RobertaForMaskedLM" => {Bumblebee.Text.Roberta, :for_masked_language_modeling},
@@ -234,6 +238,7 @@ defmodule Bumblebee do
"llama" => :llama,
"mistral" => :llama,
"mbart" => :mbart,
"phi" => :code_gen,
"roberta" => :roberta,
"t5" => :t5,
"whisper" => :whisper,
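With these mappings in place, Phi checkpoints resolve to the new `Bumblebee.Text.Phi` module and to the CodeGen tokenizer. A minimal usage sketch, assuming a Hugging Face repository id of `microsoft/phi-2` (the id is illustrative, not part of this diff):

```elixir
# Sketch only: "microsoft/phi-2" is an assumed checkpoint id.
{:ok, model_info} = Bumblebee.load_model({:hf, "microsoft/phi-2"})
{:ok, tokenizer} = Bumblebee.load_tokenizer({:hf, "microsoft/phi-2"})
{:ok, generation_config} = Bumblebee.load_generation_config({:hf, "microsoft/phi-2"})

serving = Bumblebee.Text.generation(model_info, tokenizer, generation_config)
Nx.Serving.run(serving, "def fibonacci(n) do")
```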
33 changes: 28 additions & 5 deletions lib/bumblebee/layers.ex
@@ -382,9 +382,21 @@ defmodule Bumblebee.Layers do
  * `:kernel_initializer` - initializer for `kernel` weights.
    Defaults to `:glorot_uniform`
+ * `:bias_initializer` - initializer for `bias` weights. Defaults
+   to `:zeros`.
+ * `:use_bias` - whether the layer should add bias to the output.
+   Defaults to `false`
  """
  def dense_transposed(%Axon{} = x, units, opts \\ []) do
-   opts = Keyword.validate!(opts, [:name, kernel_initializer: :glorot_uniform])
+   opts =
+     Keyword.validate!(opts, [
+       :name,
+       kernel_initializer: :glorot_uniform,
+       bias_initializer: :zeros,
+       use_bias: false
+     ])

    kernel_shape = fn input_shape ->
      kernel_shape = Axon.Shape.dense_kernel(input_shape, units)
@@ -396,13 +408,24 @@ defmodule Bumblebee.Layers do
      |> List.to_tuple()
    end

+   bias_shape = &Axon.Shape.dense_bias(&1, units)
+
    kernel = Axon.param("kernel", kernel_shape, initializer: opts[:kernel_initializer])

-   op = fn x, kernel, _opts ->
-     Nx.dot(x, [-1], kernel, [1])
-   end
+   {inputs, op} =
+     if opts[:use_bias] do
+       bias = Axon.param("bias", bias_shape, initializer: opts[:bias_initializer])
+       {[x, kernel, bias], &dense_transposed_impl/4}
+     else
+       {[x, kernel], &dense_transposed_impl/3}
+     end

-   Axon.layer(op, [x, kernel], name: opts[:name], op_name: :dense_transposed)
+   Axon.layer(op, inputs, name: opts[:name], op_name: :dense_transposed)
  end

+ deftransformp dense_transposed_impl(x, kernel, bias \\ 0, _opts) do
+   Nx.dot(x, [-1], kernel, [1])
+   |> Nx.add(bias)
+ end

  @doc """
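For reference, a hedged sketch of how a caller might opt into the new bias. The `"lm_head"` name and the sizes are made up for illustration:

```elixir
# Hypothetical caller: "lm_head" and the sizes are illustrative.
# With use_bias: true the layer gains a "bias" parameter and adds it
# to the transposed-kernel dot product via dense_transposed_impl/4.
input = Axon.input("hidden_state", shape: {nil, 768})

logits =
  Bumblebee.Layers.dense_transposed(input, 50_257,
    use_bias: true,
    bias_initializer: :zeros,
    name: "lm_head"
  )
```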
8 changes: 4 additions & 4 deletions lib/bumblebee/layers/transformer.ex
@@ -285,11 +285,11 @@ defmodule Bumblebee.Layers.Transformer do
  * `:max_positions` - the maximum number of distinct positions
- * `:rotary_embedding_base` - base for computing rotary embedding frequency. Defaults
-   to `10_000`.
+ * `:base` - base for computing rotary embedding frequency. Defaults
+   to `10_000`.
- * `:rotary_percentage` - percentage of hidden dimensions to allocate to rotary embeddings.
-   Defaults to `1.0`.
+ * `:percentage` - percentage of hidden dimensions to allocate to rotary embeddings.
+   Defaults to `1.0`.
  * `:name` - the prefix for layer names
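Dropping the `rotary_` prefix suggests these options are now read from a nested `:rotary_embedding` keyword on the attention/block layers; a hedged sketch of the resulting shape, with illustrative values only (the exact nesting is an assumption, not confirmed by this hunk):

```elixir
# Assumed nesting under a :rotary_embedding keyword; values illustrative.
rotary_embedding = [
  max_positions: 4096,
  base: 10_000,
  # a partial-rotary model (such as Phi) applies rotary embedding to
  # only a fraction of each attention head's dimensions
  percentage: 0.4
]
```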
2 changes: 1 addition & 1 deletion lib/bumblebee/text/mistral.ex
@@ -362,7 +362,7 @@ defmodule Bumblebee.Text.Mistral do

gate = Axon.dense(hidden_state, intermediate_size, name: join(name, "gate"), use_bias: false)

- hidden_state = Axon.multiply(intermediate, Axon.activation(gate, activation))
+ hidden_state = Axon.multiply(intermediate, Layers.activation(gate, activation))

Axon.dense(hidden_state, output_size, name: join(name, "output"), use_bias: false)
end
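The fixed line routes the gate through `Layers.activation/2` rather than `Axon.activation/2`, presumably so activation names defined only by Bumblebee also resolve here. A standalone sketch of the gated feed-forward pattern this code implements, with illustrative sizes and plain `Axon.activation` with `:silu` standing in for the configurable activation:

```elixir
# Gated MLP: output(intermediate(x) * act(gate(x))); sizes are illustrative.
x = Axon.input("hidden_state", shape: {nil, nil, 512})

intermediate = Axon.dense(x, 2048, use_bias: false, name: "intermediate")
gate = Axon.dense(x, 2048, use_bias: false, name: "gate")

hidden = Axon.multiply(intermediate, Axon.activation(gate, :silu))
output = Axon.dense(hidden, 512, use_bias: false, name: "output")
```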