From 52ddc53e013c3bfc6b64e8f46f0206030c797b80 Mon Sep 17 00:00:00 2001 From: Ayman Osman Date: Mon, 12 Aug 2024 12:05:56 +0100 Subject: [PATCH 1/6] Add M2M100 & NLLB model --- lib/bumblebee.ex | 5 +- lib/bumblebee/text/m2m100.ex | 516 ++++++++++++++++++++ lib/bumblebee/text/pre_trained_tokenizer.ex | 10 + test/bumblebee/text/m2m100_test.exs | 68 +++ test/bumblebee/text/nllb_test.exs | 38 ++ 5 files changed, 636 insertions(+), 1 deletion(-) create mode 100644 lib/bumblebee/text/m2m100.ex create mode 100644 test/bumblebee/text/m2m100_test.exs create mode 100644 test/bumblebee/text/nllb_test.exs diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex index 0f22e9f0..8661dacc 100644 --- a/lib/bumblebee.ex +++ b/lib/bumblebee.ex @@ -165,6 +165,8 @@ defmodule Bumblebee do "MBartForQuestionAnswering" => {Bumblebee.Text.Mbart, :for_question_answering}, "MBartForSequenceClassification" => {Bumblebee.Text.Mbart, :for_sequence_classification}, "MBartModel" => {Bumblebee.Text.Mbart, :base}, + "M2M100Model" => {Bumblebee.Text.M2m100, :base}, + "M2M100ForConditionalGeneration" => {Bumblebee.Text.M2m100, :for_conditional_generation}, "MistralModel" => {Bumblebee.Text.Mistral, :base}, "MistralForCausalLM" => {Bumblebee.Text.Mistral, :for_causal_language_modeling}, "MistralForSequenceClassification" => {Bumblebee.Text.Mistral, :for_sequence_classification}, @@ -252,7 +254,8 @@ defmodule Bumblebee do "roberta" => :roberta, "t5" => :t5, "whisper" => :whisper, - "xlm-roberta" => :xlm_roberta + "xlm-roberta" => :xlm_roberta, + "m2m_100" => :m2m_100 } @diffusers_class_to_scheduler %{ diff --git a/lib/bumblebee/text/m2m100.ex b/lib/bumblebee/text/m2m100.ex new file mode 100644 index 00000000..b860d4e3 --- /dev/null +++ b/lib/bumblebee/text/m2m100.ex @@ -0,0 +1,516 @@ +defmodule Bumblebee.Text.M2m100 do + alias Bumblebee.Shared + + options = + [ + vocab_size: [ + default: 128_112, + doc: """ + the vocabulary size of the token embedding. This corresponds to the number of distinct + tokens that can be represented in model input and output + """ + ], + max_positions: [ + default: 1024, + doc: """ + the vocabulary size of the position embedding. This corresponds to the maximum sequence + length that this model can process. 
Typically this is set to a large value just in case, + such as 512, 1024 or 2048 + """ + ], + hidden_size: [ + default: 1024, + doc: "the dimensionality of hidden layers" + ], + encoder_num_blocks: [ + default: 12, + doc: "the number of Transformer blocks in the encoder" + ], + decoder_num_blocks: [ + default: 12, + doc: "the number of Transformer blocks in the decoder" + ], + encoder_num_attention_heads: [ + default: 16, + doc: "the number of attention heads for each attention layer in the encoder" + ], + decoder_num_attention_heads: [ + default: 16, + doc: "the number of attention heads for each attention layer in the decoder" + ], + encoder_intermediate_size: [ + default: 4096, + doc: + "the dimensionality of the intermediate layer in the transformer feed-forward network (FFN) in the encoder" + ], + decoder_intermediate_size: [ + default: 4096, + doc: + "the dimensionality of the intermediate layer in the transformer feed-forward network (FFN) in the decoder" + ], + scale_embedding: [ + default: true, + doc: "scale embeddings by dividing by sqrt(hidden_size)" + ], + activation: [ + default: :relu, + doc: "the activation function" + ], + dropout_rate: [ + default: 0.1, + doc: "the dropout rate for encoder and decoder" + ], + attention_dropout_rate: [ + default: 0.1, + doc: "the dropout rate for attention weights" + ], + activation_dropout_rate: [ + default: 0.0, + doc: "the dropout rate for activations inside fully connected layers" + ], + initializer_scale: [ + default: 0.02, + doc: + "the standard deviation of the normal initializer used for initializing kernel parameters" + ] + ] ++ + Shared.common_options([:num_labels, :id_to_label]) ++ + Shared.token_options(pad_token_id: 1, eos_token_id: 2, decoder_start_token_id: 2) + + @moduledoc """ + M2M100 model family. 
+ """ + + defstruct [architecture: :base] ++ Shared.option_defaults(options) + + @behaviour Bumblebee.ModelSpec + @behaviour Bumblebee.Configurable + @behaviour Bumblebee.Text.Generation + + import Bumblebee.Utils.Model, only: [join: 2] + + alias Bumblebee.Layers + + @impl true + def architectures(), + do: [ + :base, + :for_conditional_generation + ] + + @impl true + def config(spec, opts) do + spec + |> Shared.put_config_attrs(opts) + |> Shared.validate_label_options() + end + + @impl true + def input_template(_spec) do + %{ + "input_ids" => Nx.template({1, 1}, :s64), + "decoder_input_ids" => Nx.template({1, 1}, :s64) + } + end + + @impl true + def model(%__MODULE__{architecture: :base} = spec) do + inputs = encoder_decoder_inputs(spec) + + inputs + |> core(spec) + |> Layers.output() + end + + def model(%__MODULE__{architecture: :for_conditional_generation} = spec) do + inputs = encoder_decoder_inputs(spec) + outputs = core(inputs, spec) + + logits = + outputs.hidden_state + |> language_modeling_head(spec, name: "language_modeling_head") + |> Axon.bias(name: "language_modeling_head.logits_bias", bias_initializer: :zeros) + + Layers.output(%{ + logits: logits, + decoder_hidden_states: outputs.decoder_hidden_states, + decoder_attentions: outputs.decoder_attentions, + cross_attentions: outputs.cross_attentions, + encoder_hidden_state: outputs.encoder_hidden_state, + encoder_hidden_states: outputs.encoder_hidden_states, + encoder_attentions: outputs.encoder_attentions, + cache: outputs.cache + }) + end + + defp encoder_decoder_inputs(spec) do + shape = {nil, nil} + hidden_shape = {nil, nil, spec.hidden_size} + + encoder_attention_head_mask_shape = + {spec.encoder_num_blocks, spec.encoder_num_attention_heads} + + decoder_attention_head_mask_shape = + {spec.decoder_num_blocks, spec.decoder_num_attention_heads} + + Bumblebee.Utils.Model.inputs_to_map([ + Axon.input("input_ids", optional: true, shape: shape), + Axon.input("attention_mask", optional: true, shape: shape), + Axon.input("position_ids", optional: true, shape: shape), + Axon.input("attention_head_mask", optional: true, shape: encoder_attention_head_mask_shape), + Axon.input("input_embeddings", optional: true, shape: hidden_shape), + Axon.input("decoder_input_ids", optional: true, shape: shape), + Axon.input("decoder_attention_mask", optional: true, shape: shape), + Axon.input("decoder_position_ids", optional: true, shape: shape), + Axon.input("decoder_attention_head_mask", + optional: true, + shape: decoder_attention_head_mask_shape + ), + Axon.input("decoder_input_embeddings", optional: true, shape: hidden_shape), + Axon.input("encoder_hidden_state", optional: true, shape: hidden_shape), + Axon.input("cross_attention_head_mask", + optional: true, + shape: decoder_attention_head_mask_shape + ), + Axon.input("cache", optional: true) + ]) + end + + @impl true + def init_cache(spec, batch_size, max_length, inputs) do + encoder_sequence_length = + if encoder_hidden_state = inputs["encoder_hidden_state"] do + Nx.axis_size(encoder_hidden_state, 1) + end + + Layers.Decoder.init_cache(batch_size, max_length, + hidden_size: spec.hidden_size, + decoder_num_attention_heads: spec.decoder_num_attention_heads, + encoder_num_attention_heads: spec.encoder_num_attention_heads, + decoder_num_blocks: spec.decoder_num_blocks, + encoder_sequence_length: encoder_sequence_length + ) + end + + @impl true + def traverse_cache(_spec, cache, fun) do + Layers.Decoder.traverse_cache(cache, fun) + end + + defp core(inputs, spec) do + encoder_outputs = + 
Layers.if_present inputs["encoder_hidden_state"] do + %{ + hidden_state: inputs["encoder_hidden_state"], + hidden_states: Layers.none(), + attentions: Layers.none() + } + else + embeddings = + embedder(inputs["input_ids"], inputs["position_ids"], inputs["input_embeddings"], spec, + name: "encoder_embedder" + ) + + embeddings + |> encoder(inputs["attention_mask"], inputs["attention_head_mask"], spec, name: "encoder") + |> Map.take([:hidden_state, :hidden_states, :attentions]) + end + + decoder_input_ids = + Layers.default inputs["decoder_input_ids"] do + Layers.shift_tokens_right(inputs["input_ids"], spec.decoder_start_token_id) + end + + embeddings = + embedder( + decoder_input_ids, + inputs["decoder_position_ids"], + inputs["decoder_input_embeddings"], + spec, + name: "decoder_embedder" + ) + + decoder_outputs = + decoder( + embeddings, + inputs["decoder_attention_mask"], + inputs["decoder_attention_head_mask"], + encoder_outputs.hidden_state, + inputs["attention_mask"], + inputs["cross_attention_head_mask"], + inputs["cache"], + spec, + name: "decoder" + ) + + %{ + hidden_state: decoder_outputs.hidden_state, + decoder_hidden_states: decoder_outputs.hidden_states, + decoder_attentions: decoder_outputs.attentions, + cross_attentions: decoder_outputs.cross_attentions, + cache: decoder_outputs.cache, + encoder_hidden_state: encoder_outputs.hidden_state, + encoder_hidden_states: encoder_outputs.hidden_states, + encoder_attentions: encoder_outputs.attentions + } + end + + defp encoder(hidden_state, attention_mask, attention_head_mask, spec, opts) do + name = opts[:name] + + encoder_outputs = + Layers.Transformer.blocks(hidden_state, + attention_mask: attention_mask, + attention_head_mask: attention_head_mask, + num_blocks: spec.encoder_num_blocks, + num_attention_heads: spec.encoder_num_attention_heads, + hidden_size: spec.hidden_size, + kernel_initializer: kernel_initializer(spec), + dropout_rate: spec.dropout_rate, + attention_dropout_rate: spec.attention_dropout_rate, + layer_norm: [ + epsilon: 1.0e-5 + ], + block_type: :norm_first, + ffn: [ + intermediate_size: spec.encoder_intermediate_size, + activation: spec.activation + ], + name: join(name, "blocks") + ) + + hidden_state = Axon.layer_norm(encoder_outputs.hidden_state, name: join(name, "norm")) + + %{ + hidden_state: hidden_state, + hidden_states: Layers.replace(encoder_outputs.hidden_states, -1, hidden_state), + attentions: encoder_outputs.attentions + } + end + + defp embedder(input_ids, position_ids, input_embeddings, spec, opts) do + name = opts[:name] + + input_embeddings = + Layers.default input_embeddings do + token_embedding(input_ids, spec, name: join(name, "token_embedding")) + end + + position_ids = + Layers.default position_ids do + Axon.nx(input_ids, fn input_ids -> + mask = Nx.not_equal(input_ids, spec.pad_token_id) + + mask + |> Nx.cumulative_sum(axis: 1) + |> Nx.multiply(mask) + |> Nx.add(spec.pad_token_id) + end) + end + + position_embeddings = + position_embedding(position_ids, spec, name: join(name, "position_embedding")) + + Axon.add([input_embeddings, position_embeddings]) + |> Axon.dropout(rate: spec.dropout_rate) + end + + defp token_embedding(input_ids, spec, opts) do + name = opts[:name] + + input_embeddings = + Axon.embedding(input_ids, spec.vocab_size, spec.hidden_size, + kernel_initializer: kernel_initializer(spec), + name: name + ) + + if spec.scale_embedding do + Axon.nx(input_embeddings, fn x -> Nx.multiply(x, Nx.sqrt(spec.hidden_size)) end) + else + input_embeddings + end + end + + defp 
position_embedding(position_ids, spec, opts) do + name = opts[:name] + + offset = 2 + embedding_dim = spec.hidden_size + num_embeddings = spec.max_positions + offset + padding_idx = spec.pad_token_id + half_dim = div(embedding_dim, 2) + + position_ids + |> Axon.nx( + fn position_ids -> + emb = Nx.log(10_000) + emb = Nx.divide(emb, half_dim - 1) + emb = Nx.exp(Nx.multiply(Nx.iota({half_dim}), Nx.negate(emb))) + emb = Nx.multiply(Nx.new_axis(Nx.iota({num_embeddings}), 1), Nx.new_axis(emb, 0)) + emb = Nx.concatenate([Nx.sin(emb), Nx.cos(emb)], axis: 1) + emb = Nx.reshape(emb, {num_embeddings, :auto}) + + emb = + if rem(embedding_dim, 2) == 1 do + Nx.concatenate([emb, Nx.broadcast(0, {num_embeddings, 1})], axis: 1) + else + emb + end + + zero_pad_slice = Nx.broadcast(0.0, {1, embedding_dim}) + emb = Nx.put_slice(emb, [padding_idx, 0], zero_pad_slice) + + Nx.take(emb, Nx.as_type(position_ids, {:s, 64})) + end, + name: join(name, "sinusoidal_position_embedding") + ) + end + + defp decoder( + hidden_state, + attention_mask, + attention_head_mask, + encoder_hidden_state, + encoder_attention_mask, + cross_attention_head_mask, + cache, + spec, + opts + ) do + name = opts[:name] + + decoder_outputs = + Layers.Transformer.blocks(hidden_state, + attention_mask: attention_mask, + attention_head_mask: attention_head_mask, + cross_hidden_state: encoder_hidden_state, + cross_attention_mask: encoder_attention_mask, + cross_attention_head_mask: cross_attention_head_mask, + cache: cache, + causal: true, + num_blocks: spec.decoder_num_blocks, + num_attention_heads: spec.decoder_num_attention_heads, + hidden_size: spec.hidden_size, + kernel_initializer: kernel_initializer(spec), + dropout_rate: spec.dropout_rate, + attention_dropout_rate: spec.attention_dropout_rate, + layer_norm: [ + epsilon: 1.0e-5 + ], + block_type: :norm_first, + ffn: [ + intermediate_size: spec.decoder_intermediate_size, + activation: spec.activation + ], + name: join(name, "blocks") + ) + + hidden_state = Axon.layer_norm(decoder_outputs.hidden_state, name: join(name, "norm")) + + %{ + cache: decoder_outputs.cache, + hidden_state: hidden_state, + hidden_states: Layers.replace(decoder_outputs.hidden_states, -1, hidden_state), + attentions: decoder_outputs.attentions, + cross_attentions: decoder_outputs.cross_attentions + } + end + + defp language_modeling_head(hidden_state, spec, opts) do + name = opts[:name] + + # TODO: Tie lm-head to word embedding as a spec option + Layers.dense_transposed(hidden_state, spec.vocab_size, + kernel_initializer: kernel_initializer(spec), + name: join(name, "output") + ) + end + + defp kernel_initializer(spec) do + Axon.Initializers.normal(scale: spec.initializer_scale) + end + + defimpl Bumblebee.HuggingFace.Transformers.Config do + def load(spec, data) do + import Shared.Converters + + opts = + convert!(data, + vocab_size: {"vocab_size", number()}, + max_positions: {"max_position_embeddings", number()}, + hidden_size: {"d_model", number()}, + encoder_num_blocks: {"encoder_layers", number()}, + decoder_num_blocks: {"decoder_layers", number()}, + encoder_num_attention_heads: {"encoder_attention_heads", number()}, + decoder_num_attention_heads: {"decoder_attention_heads", number()}, + encoder_intermediate_size: {"encoder_ffn_dim", number()}, + decoder_intermediate_size: {"decoder_ffn_dim", number()}, + scale_embedding: {"scale_embedding", boolean()}, + activation: {"activation_function", activation()}, + dropout_rate: {"dropout", number()}, + attention_dropout_rate: {"attention_dropout", number()}, + 
activation_dropout_rate: {"activation_dropout", number()}, + classifier_dropout_rate: {"classifier_dropout", number()}, + initializer_scale: {"init_std", number()} + ) ++ Shared.common_options_from_transformers(data, spec) + + @for.config(spec, opts) + end + end + + defimpl Bumblebee.HuggingFace.Transformers.Model do + def params_mapping(spec) do + %{ + "encoder_embedder.token_embedding" => "model.encoder.embed_tokens", + "encoder_embedder.position_embedding" => "model.encoder.embed_positions", + "encoder_embedder.norm" => "model.encoder.layernorm_embedding", + "encoder.blocks.{n}.self_attention.query" => "model.encoder.layers.{n}.self_attn.q_proj", + "encoder.blocks.{n}.self_attention.key" => "model.encoder.layers.{n}.self_attn.k_proj", + "encoder.blocks.{n}.self_attention.value" => "model.encoder.layers.{n}.self_attn.v_proj", + "encoder.blocks.{n}.self_attention.output" => + "model.encoder.layers.{n}.self_attn.out_proj", + "encoder.blocks.{n}.self_attention_norm" => + "model.encoder.layers.{n}.self_attn_layer_norm", + "encoder.blocks.{n}.ffn.intermediate" => "model.encoder.layers.{n}.fc1", + "encoder.blocks.{n}.ffn.output" => "model.encoder.layers.{n}.fc2", + "encoder.blocks.{n}.output_norm" => "model.encoder.layers.{n}.final_layer_norm", + "encoder.norm" => "model.encoder.layer_norm", + "decoder_embedder.token_embedding" => "model.decoder.embed_tokens", + "decoder_embedder.position_embedding" => "model.decoder.embed_positions", + "decoder_embedder.norm" => "model.decoder.layernorm_embedding", + "decoder.blocks.{n}.self_attention.query" => "model.decoder.layers.{n}.self_attn.q_proj", + "decoder.blocks.{n}.self_attention.key" => "model.decoder.layers.{n}.self_attn.k_proj", + "decoder.blocks.{n}.self_attention.value" => "model.decoder.layers.{n}.self_attn.v_proj", + "decoder.blocks.{n}.self_attention.output" => + "model.decoder.layers.{n}.self_attn.out_proj", + "decoder.blocks.{n}.self_attention_norm" => + "model.decoder.layers.{n}.self_attn_layer_norm", + "decoder.blocks.{n}.cross_attention.query" => + "model.decoder.layers.{n}.encoder_attn.q_proj", + "decoder.blocks.{n}.cross_attention.key" => + "model.decoder.layers.{n}.encoder_attn.k_proj", + "decoder.blocks.{n}.cross_attention.value" => + "model.decoder.layers.{n}.encoder_attn.v_proj", + "decoder.blocks.{n}.cross_attention.output" => + "model.decoder.layers.{n}.encoder_attn.out_proj", + "decoder.blocks.{n}.cross_attention_norm" => + "model.decoder.layers.{n}.encoder_attn_layer_norm", + "decoder.blocks.{n}.ffn.intermediate" => "model.decoder.layers.{n}.fc1", + "decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2", + "decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm", + "decoder.norm" => "model.decoder.layer_norm", + "language_modeling_head.output" => + case spec.architecture do + :for_causal_language_modeling -> "lm_head" + _other -> "model.shared" + end, + "language_modeling_head.logits_bias" => %{ + "bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end} + }, + "sequence_classification_head.dense" => "classification_head.dense", + "sequence_classification_head.output" => "classification_head.out_proj", + "question_answering_head.output" => "qa_outputs" + } + end + end +end diff --git a/lib/bumblebee/text/pre_trained_tokenizer.ex b/lib/bumblebee/text/pre_trained_tokenizer.ex index f4bd809a..7cb39700 100644 --- a/lib/bumblebee/text/pre_trained_tokenizer.ex +++ b/lib/bumblebee/text/pre_trained_tokenizer.ex @@ -179,6 +179,16 @@ defmodule Bumblebee.Text.PreTrainedTokenizer 
do mask: "" } }, + m2m_100: %{ + special_tokens: %{ + eos: "", + unk: "", + sep: "", + pad: "", + cls: "", + mask: "" + } + }, roberta: %{ special_tokens: %{ bos: "", diff --git a/test/bumblebee/text/m2m100_test.exs b/test/bumblebee/text/m2m100_test.exs new file mode 100644 index 00000000..9aae34ca --- /dev/null +++ b/test/bumblebee/text/m2m100_test.exs @@ -0,0 +1,68 @@ +defmodule Bumblebee.Text.M2m100Test do + use ExUnit.Case, async: true + + import Bumblebee.TestHelpers + + @moduletag model_test_tags() + + test ":base" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-M2M100Model"}, + architecture: :base + ) + + assert %Bumblebee.Text.M2m100{architecture: :base} = spec + + input = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + output = Axon.predict(model, params, input) + + assert Nx.shape(output.hidden_state) == {1, 8, 16} + + assert_all_close( + output.hidden_state[[.., 1..3, 1..3]], + Nx.tensor([ + [ + [0.7856, -0.3174, -0.4792], + [0.7265, -0.2752, -0.4823], + [1.0580, -0.3263, -0.7994] + ] + ]) + ) + end + + test ":for_conditional_generation" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model( + {:hf, "hf-internal-testing/tiny-random-M2M100ForConditionalGeneration"}, + architecture: :for_conditional_generation + ) + + assert %Bumblebee.Text.M2m100{architecture: :for_conditional_generation} = spec + + input = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + output = Axon.predict(model, params, input) + + assert Nx.shape(output.logits) == {1, 8, 128_112} + + assert_all_close( + output.logits[[.., 1..3, 1..3]], + Nx.tensor([ + [ + [0.0000, -0.0323, 0.0527], + [0.0000, -0.0404, 0.0713], + [0.0000, -0.0660, 0.0758] + ] + ]) + ) + end +end diff --git a/test/bumblebee/text/nllb_test.exs b/test/bumblebee/text/nllb_test.exs new file mode 100644 index 00000000..5ae22f53 --- /dev/null +++ b/test/bumblebee/text/nllb_test.exs @@ -0,0 +1,38 @@ +defmodule Bumblebee.Text.NllbTest do + use ExUnit.Case, async: true + + import Bumblebee.TestHelpers + + @moduletag model_test_tags() + + test ":for_conditional_generation" do + assert {:ok, %{model: model, params: params, spec: spec}} = + Bumblebee.load_model({:hf, "hf-internal-testing/tiny-random-nllb"}, + module: Bumblebee.Text.M2m100, + architecture: :for_conditional_generation + ) + + assert %Bumblebee.Text.M2m100{architecture: :for_conditional_generation} = spec + + input = %{ + "input_ids" => Nx.tensor([[10, 20, 30, 40, 50, 60, 70, 80, 0, 0]]), + "decoder_input_ids" => Nx.tensor([[15, 25, 35, 45, 55, 65, 0, 0]]), + "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]) + } + + output = Axon.predict(model, params, input) + + assert Nx.shape(output.logits) == {1, 8, 128_112} + + assert_all_close( + output.logits[[.., 1..3, 1..3]], + Nx.tensor([ + [ + [0.0000, 0.0169, -0.0698], + [0.0000, 0.0525, -0.1042], + [0.0000, 0.0667, -0.1078] + ] + ]) + ) + end +end From 170962675c88f133e9798c1b3acbe030b5062d12 Mon Sep 17 00:00:00 2001 From: Ayman Osman Date: Thu, 15 Aug 2024 23:41:10 +0100 Subject: [PATCH 2/6] remove unused options and parameters --- lib/bumblebee/text/m2m100.ex | 21 ++++----------------- 1 
file changed, 4 insertions(+), 17 deletions(-) diff --git a/lib/bumblebee/text/m2m100.ex b/lib/bumblebee/text/m2m100.ex index b860d4e3..084848f3 100644 --- a/lib/bumblebee/text/m2m100.ex +++ b/lib/bumblebee/text/m2m100.ex @@ -73,9 +73,7 @@ defmodule Bumblebee.Text.M2m100 do doc: "the standard deviation of the normal initializer used for initializing kernel parameters" ] - ] ++ - Shared.common_options([:num_labels, :id_to_label]) ++ - Shared.token_options(pad_token_id: 1, eos_token_id: 2, decoder_start_token_id: 2) + ] ++ Shared.token_options(pad_token_id: 1, eos_token_id: 2, decoder_start_token_id: 2) @moduledoc """ M2M100 model family. @@ -102,7 +100,6 @@ defmodule Bumblebee.Text.M2m100 do def config(spec, opts) do spec |> Shared.put_config_attrs(opts) - |> Shared.validate_label_options() end @impl true @@ -126,10 +123,7 @@ defmodule Bumblebee.Text.M2m100 do inputs = encoder_decoder_inputs(spec) outputs = core(inputs, spec) - logits = - outputs.hidden_state - |> language_modeling_head(spec, name: "language_modeling_head") - |> Axon.bias(name: "language_modeling_head.logits_bias", bias_initializer: :zeros) + logits = language_modeling_head(outputs.hidden_state, spec, name: "language_modeling_head") Layers.output(%{ logits: logits, @@ -459,7 +453,7 @@ defmodule Bumblebee.Text.M2m100 do end defimpl Bumblebee.HuggingFace.Transformers.Model do - def params_mapping(spec) do + def params_mapping(_spec) do %{ "encoder_embedder.token_embedding" => "model.encoder.embed_tokens", "encoder_embedder.position_embedding" => "model.encoder.embed_positions", @@ -499,14 +493,7 @@ defmodule Bumblebee.Text.M2m100 do "decoder.blocks.{n}.ffn.output" => "model.decoder.layers.{n}.fc2", "decoder.blocks.{n}.output_norm" => "model.decoder.layers.{n}.final_layer_norm", "decoder.norm" => "model.decoder.layer_norm", - "language_modeling_head.output" => - case spec.architecture do - :for_causal_language_modeling -> "lm_head" - _other -> "model.shared" - end, - "language_modeling_head.logits_bias" => %{ - "bias" => {[{"model", "final_logits_bias"}], fn [value] -> Nx.squeeze(value) end} - }, + "language_modeling_head.output" => "lm_head", "sequence_classification_head.dense" => "classification_head.dense", "sequence_classification_head.output" => "classification_head.out_proj", "question_answering_head.output" => "qa_outputs" From cc644a5241fcd37b3d78399649ed8b7f5f780ff8 Mon Sep 17 00:00:00 2001 From: Ayman Osman Date: Thu, 15 Aug 2024 23:53:02 +0100 Subject: [PATCH 3/6] add moduledoc --- lib/bumblebee/text/m2m100.ex | 95 ++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) diff --git a/lib/bumblebee/text/m2m100.ex b/lib/bumblebee/text/m2m100.ex index 084848f3..9d09ca2a 100644 --- a/lib/bumblebee/text/m2m100.ex +++ b/lib/bumblebee/text/m2m100.ex @@ -77,6 +77,101 @@ defmodule Bumblebee.Text.M2m100 do @moduledoc """ M2M100 model family. + + ## Architectures + + * `:base` - plain M2M100 without any head on top + + * `:for_conditional_generation` - M2M100 with a language modeling + head. The head returns logits for each token in the original + sequence + + ## Inputs + + * `"input_ids"` - `{batch_size, sequence_length}` + + Indices of input sequence tokens in the vocabulary. + + * `"attention_mask"` - `{batch_size, sequence_length}` + + Mask indicating which tokens to attend to. This is used to ignore + padding tokens, which are added when processing a batch of sequences + with different length. 
+ + * `"position_ids"` - `{batch_size, sequence_length}` + + Indices of positions of each input sequence tokens in the position + embeddings. + + * `"attention_head_mask"` - `{encoder_num_blocks, encoder_num_attention_heads}` + + Mask to nullify selected heads of the self-attention blocks in + the encoder. + + * `"input_embeddings"` - `{batch_size, sequence_length, hidden_size}` + + Embedded representation of `"input_ids"`, which can be specified + for more control over how `"input_ids"` are embedded than the + model's internal embedding lookup. If `"input_embeddings"` are present, + then `"input_ids"` will be ignored. + + * `"decoder_input_ids"` - `{batch_size, target_sequence_length}` + + Indices of decoder input sequence tokens in the vocabulary. If not + present and `"input_ids"` is, it will be generated by shifting + each token in `"input_ids"` to the right once. + + * `"decoder_attention_mask"` - `{batch_size, target_sequence_length}` + + Mask indicating which decoder tokens to attend to. This is used + to ignore padding tokens, which are added when processing a batch + of sequences with different length. + + * `"decoder_position_ids"` - `{batch_size, target_sequence_length}` + + Indices of positions of each decoder input sequence tokens in + the position embeddings. + + * `"decoder_attention_head_mask"` - `{decoder_num_blocks, decoder_num_attention_heads}` + + Mask to nullify selected heads of the self-attention blocks in + the decoder. + + * `"decoder_input_embeddings"` - `{batch_size, sequence_length, hidden_size}` + + Embedded representation of `"decoder_input_ids"`, which can be + specified for more control over how `"decoder_input_ids"` are + embedded than the model's internal embedding lookup. If + `"decoder_input_embeddings"` are present, then `"decoder_input_ids"` + will be ignored. + + * `"encoder_hidden_state"` - `{batch_size, sequence_length, hidden_size}` + + Last hidden state output from the encoder. This hidden state is + used in cross-attention blocks in the decoder. If specified, the + model will skip the encoding process and use this value directly + for cross-attentions in the decoder. + + * `"cross_attention_head_mask"` - `{decoder_num_blocks, decoder_num_attention_heads}` + + Mask to nullify selected heads of the cross-attention blocks in + the decoder with shape. + + * `"cache"` + + A container with cached layer results used to speed up sequential + decoding (autoregression). With cache, certain hidden states are + taken from the cache, rather than recomputed on every decoding + pass. The cache should be treated as opaque and initialized with + `Bumblebee.Text.Generation.init_cache/4`. 
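
A minimal sketch of how the inputs documented above fit together, using the tiny random checkpoint that this PR's own tests rely on (the token ids below are arbitrary; 1 and 2 are the spec's default pad and decoder-start token ids):

```elixir
# Sketch only: run one forward pass with explicitly constructed inputs.
{:ok, %{model: model, params: params}} =
  Bumblebee.load_model(
    {:hf, "hf-internal-testing/tiny-random-M2M100ForConditionalGeneration"},
    architecture: :for_conditional_generation
  )

inputs = %{
  # {batch_size, sequence_length}; trailing 1s are padding
  "input_ids" => Nx.tensor([[10, 20, 30, 40, 2, 1, 1]]),
  "attention_mask" => Nx.tensor([[1, 1, 1, 1, 1, 0, 0]]),
  # {batch_size, target_sequence_length}; when omitted, it is derived by
  # shifting "input_ids" right with the decoder start token (2)
  "decoder_input_ids" => Nx.tensor([[2, 10, 20, 30]])
}

outputs = Axon.predict(model, params, inputs)

# Logits over the vocabulary at every decoder position
Nx.shape(outputs.logits)
#=> {1, 4, 128_112}
```
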
+ + ## Global layer options + + #{Shared.global_layer_options_doc([:output_hidden_states, :output_attentions])} + + ## Configuration + + #{Shared.options_doc(options)} """ defstruct [architecture: :base] ++ Shared.option_defaults(options) From 86c2436db6a0e007e555fbeb13b339e92946aeb8 Mon Sep 17 00:00:00 2001 From: Ayman Osman Date: Fri, 16 Aug 2024 00:42:11 +0100 Subject: [PATCH 4/6] use defnp for readability for position embedding --- lib/bumblebee/text/m2m100.ex | 56 ++++++++++++++++++++++-------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/lib/bumblebee/text/m2m100.ex b/lib/bumblebee/text/m2m100.ex index 9d09ca2a..631029ea 100644 --- a/lib/bumblebee/text/m2m100.ex +++ b/lib/bumblebee/text/m2m100.ex @@ -1,6 +1,8 @@ defmodule Bumblebee.Text.M2m100 do alias Bumblebee.Shared + import Nx.Defn + options = [ vocab_size: [ @@ -429,32 +431,42 @@ defmodule Bumblebee.Text.M2m100 do padding_idx = spec.pad_token_id half_dim = div(embedding_dim, 2) - position_ids - |> Axon.nx( - fn position_ids -> - emb = Nx.log(10_000) - emb = Nx.divide(emb, half_dim - 1) - emb = Nx.exp(Nx.multiply(Nx.iota({half_dim}), Nx.negate(emb))) - emb = Nx.multiply(Nx.new_axis(Nx.iota({num_embeddings}), 1), Nx.new_axis(emb, 0)) - emb = Nx.concatenate([Nx.sin(emb), Nx.cos(emb)], axis: 1) - emb = Nx.reshape(emb, {num_embeddings, :auto}) - - emb = - if rem(embedding_dim, 2) == 1 do - Nx.concatenate([emb, Nx.broadcast(0, {num_embeddings, 1})], axis: 1) - else - emb - end - - zero_pad_slice = Nx.broadcast(0.0, {1, embedding_dim}) - emb = Nx.put_slice(emb, [padding_idx, 0], zero_pad_slice) - - Nx.take(emb, Nx.as_type(position_ids, {:s, 64})) - end, + Axon.nx( + position_ids, + &position_embedding_impl(&1, embedding_dim, half_dim, num_embeddings, padding_idx), name: join(name, "sinusoidal_position_embedding") ) end + defnp position_embedding_impl( + position_ids, + embedding_dim, + half_dim, + num_embeddings, + padding_idx + ) do + zero_pad_slice = Nx.broadcast(0.0, {1, embedding_dim}) + + Nx.log(10_000) + |> Nx.divide(half_dim - 1) + |> Nx.negate() + |> Nx.multiply(Nx.iota({half_dim})) + |> Nx.exp() + |> Nx.new_axis(0) + |> Nx.multiply(Nx.new_axis(Nx.iota({num_embeddings}), 1)) + |> then(&Nx.concatenate([Nx.sin(&1), Nx.cos(&1)], axis: 1)) + |> Nx.reshape({num_embeddings, :auto}) + |> then(fn emb -> + if rem(embedding_dim, 2) == 1 do + Nx.concatenate([emb, Nx.broadcast(0, {num_embeddings, 1})], axis: 1) + else + emb + end + end) + |> Nx.put_slice([padding_idx, 0], zero_pad_slice) + |> Nx.take(Nx.as_type(position_ids, {:s, 64})) + end + defp decoder( hidden_state, attention_mask, From 1229a9d06032007fff322dc0b87ba4f8f2990f51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20K=C5=82osko?= Date: Mon, 19 Aug 2024 19:32:47 +0200 Subject: [PATCH 5/6] Apply suggestions from code review --- lib/bumblebee.ex | 7 ++++++- lib/bumblebee/text/pre_trained_tokenizer.ex | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex index 8661dacc..16088e8c 100644 --- a/lib/bumblebee.ex +++ b/lib/bumblebee.ex @@ -255,7 +255,12 @@ defmodule Bumblebee do "t5" => :t5, "whisper" => :whisper, "xlm-roberta" => :xlm_roberta, - "m2m_100" => :m2m_100 + # Both M2M100 and NLLB model checkpoints use the M2M100 model, + # but have distinct tokenizers. Consequently, model type is + # "m2m_100" in both cases. 
Currently only NLLB has fast tokenizer + # implementation, so if we load the tokenizer correctly, it is + # safe to assume it's NLLB + "m2m_100" => :nllb } @diffusers_class_to_scheduler %{ diff --git a/lib/bumblebee/text/pre_trained_tokenizer.ex b/lib/bumblebee/text/pre_trained_tokenizer.ex index 7cb39700..fa07ee97 100644 --- a/lib/bumblebee/text/pre_trained_tokenizer.ex +++ b/lib/bumblebee/text/pre_trained_tokenizer.ex @@ -179,7 +179,7 @@ defmodule Bumblebee.Text.PreTrainedTokenizer do mask: "" } }, - m2m_100: %{ + nllb: %{ special_tokens: %{ eos: "", unk: "", From 85950c66db23ac71292b0404009d095b7a9ad404 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonatan=20K=C5=82osko?= Date: Tue, 20 Aug 2024 02:33:18 +0900 Subject: [PATCH 6/6] Simplify sinusoidal position embedding --- lib/bumblebee/text/m2m100.ex | 54 ++++++++++++------------------------ 1 file changed, 17 insertions(+), 37 deletions(-) diff --git a/lib/bumblebee/text/m2m100.ex b/lib/bumblebee/text/m2m100.ex index 631029ea..98503abf 100644 --- a/lib/bumblebee/text/m2m100.ex +++ b/lib/bumblebee/text/m2m100.ex @@ -395,7 +395,7 @@ defmodule Bumblebee.Text.M2m100 do mask |> Nx.cumulative_sum(axis: 1) |> Nx.multiply(mask) - |> Nx.add(spec.pad_token_id) + |> Nx.subtract(1) end) end @@ -425,46 +425,26 @@ defmodule Bumblebee.Text.M2m100 do defp position_embedding(position_ids, spec, opts) do name = opts[:name] + # For M2M100 we need to offset the embeddings offset = 2 - embedding_dim = spec.hidden_size - num_embeddings = spec.max_positions + offset - padding_idx = spec.pad_token_id - half_dim = div(embedding_dim, 2) - - Axon.nx( - position_ids, - &position_embedding_impl(&1, embedding_dim, half_dim, num_embeddings, padding_idx), - name: join(name, "sinusoidal_position_embedding") + + position_ids = Axon.add(position_ids, Axon.constant(Nx.tensor(offset))) + + Axon.layer(&sinusoidal_position_embedding_impl/2, [position_ids], + size: spec.hidden_size, + name: name ) end - defnp position_embedding_impl( - position_ids, - embedding_dim, - half_dim, - num_embeddings, - padding_idx - ) do - zero_pad_slice = Nx.broadcast(0.0, {1, embedding_dim}) - - Nx.log(10_000) - |> Nx.divide(half_dim - 1) - |> Nx.negate() - |> Nx.multiply(Nx.iota({half_dim})) - |> Nx.exp() - |> Nx.new_axis(0) - |> Nx.multiply(Nx.new_axis(Nx.iota({num_embeddings}), 1)) - |> then(&Nx.concatenate([Nx.sin(&1), Nx.cos(&1)], axis: 1)) - |> Nx.reshape({num_embeddings, :auto}) - |> then(fn emb -> - if rem(embedding_dim, 2) == 1 do - Nx.concatenate([emb, Nx.broadcast(0, {num_embeddings, 1})], axis: 1) - else - emb - end - end) - |> Nx.put_slice([padding_idx, 0], zero_pad_slice) - |> Nx.take(Nx.as_type(position_ids, {:s, 64})) + defnp sinusoidal_position_embedding_impl(position_ids, opts \\ []) do + size = opts[:size] + + half_size = div(size, 2) + base = 10_000 + range = Nx.iota({half_size}) / (half_size - 1) + inv_frequency = 1 / Nx.pow(base, range) + angle = Nx.outer(position_ids, inv_frequency) + Nx.concatenate([Nx.sin(angle), Nx.cos(angle)], axis: -1) end defp decoder(
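
The final commit's `sinusoidal_position_embedding_impl/2` is compact; as a sanity check, the same math can be written outside `defn` with explicit `Nx` calls. This is a sketch only — the base of 10,000, the `half_size` split, and the sin/cos concatenation mirror the implementation above, and the positions already include the offset of 2 applied in `position_embedding/3`:

```elixir
# Sketch: sinusoidal embeddings for positions [2, 3, 4] with hidden size 8.
# Each row is [sin(p * w_0), ..., sin(p * w_3), cos(p * w_0), ..., cos(p * w_3)].
position_ids = Nx.tensor([2, 3, 4])
size = 8
half_size = div(size, 2)

range = Nx.divide(Nx.iota({half_size}), half_size - 1)
inv_frequency = Nx.divide(1, Nx.pow(10_000, range))
angle = Nx.outer(position_ids, inv_frequency)
embedding = Nx.concatenate([Nx.sin(angle), Nx.cos(angle)], axis: -1)

Nx.shape(embedding)
#=> {3, 8}
```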