From 908be577730fc4f3855a4e644cf69b8defd8e607 Mon Sep 17 00:00:00 2001
From: Tom Fogal <60981+tfogal@users.noreply.github.com>
Date: Thu, 31 Oct 2024 11:15:06 -0700
Subject: [PATCH] Add a test for Mistral-NeMo. (#1340)

---
 thunder/tests/test_networks.py | 51 ++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/thunder/tests/test_networks.py b/thunder/tests/test_networks.py
index c5a44a0cbb..6076b681a4 100644
--- a/thunder/tests/test_networks.py
+++ b/thunder/tests/test_networks.py
@@ -359,3 +359,54 @@ def test_quantization():
     assert len(sd) == len(sd2)
     for k, v in sd.items():
         assert_close(v, sd2[k])
+
+
+@thunder.tests.framework.requiresCUDA
+def test_thunderfx_mistral_nemo_small():
+    """
+    Runs a small version of Mistral-NeMo.
+
+    This is largely based on code from Alexandros Koumparoulis.
+    """
+    import transformers
+
+    model_id = "mistralai/Mistral-Nemo-Base-2407"
+
+    # Set up a "small" version of NeMo-Mistral that does not require downloading
+    # weights. This is not a configuration that is worth benchmarking.
+    # This was created by using
+    #   MistralConfig(num_hidden_layers=1, max_position_embeddings=1024)
+    # and then manually diffing that returned object with:
+    #   transformers.AutoConfig.from_pretrained(model_id)
+    # until they lined up, apart from the hidden-layer and max-position-embeddings changes above.
+    config = transformers.models.mistral.configuration_mistral.MistralConfig(
+        num_hidden_layers=1,
+        torch_dtype=torch.bfloat16,
+        max_position_embeddings=1024,
+        architectures=["MistralForCausalLM"],
+        hidden_size=5120,
+        rms_norm_eps=1e-05,
+        rope_theta=1000000.0,
+        sliding_window=None,
+        vocab_size=131072,
+        head_dim=128,
+        _name_or_path=model_id,
+    )
+    model = transformers.AutoModelForCausalLM.from_config(config, trust_remote_code=False)
+    device = torch.device("cuda")
+    model.to(device)
+    model.train()
+    th_backend = thunder.dynamo.ThunderCompiler()
+    mdl = torch.compile(model, backend=th_backend)
+
+    batch_size = 1
+    iid_size = (batch_size, config.max_position_embeddings)
+    input_ids = torch.randint(0, config.vocab_size, iid_size, device=device)
+    attention_mask = torch.ones_like(input_ids)
+
+    output = mdl(input_ids=input_ids, attention_mask=attention_mask, labels=input_ids)
+    logits = output.logits
+    grad_logits = torch.randn_like(logits)
+    logits.backward(grad_logits)
+
+    assert th_backend.subgraph_infos, "Should have at least 1 subgraph"
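
For context, a minimal sketch of the thunderfx pattern the test above exercises, assuming thunder is installed and a CUDA device is available. thunder.dynamo.ThunderCompiler, torch.compile(backend=...), and the subgraph_infos check are taken directly from the test; the toy nn.Linear module and its shapes are illustrative assumptions standing in for the one-layer Mistral-NeMo model.

    import torch
    import thunder.dynamo

    # Toy module standing in for the Mistral-NeMo model (illustrative assumption).
    model = torch.nn.Linear(8, 8).to("cuda")

    # Route torch.compile through Thunder via the dynamo backend, as in the test above.
    backend = thunder.dynamo.ThunderCompiler()
    compiled = torch.compile(model, backend=backend)

    # One forward/backward pass so dynamo hands FX subgraphs to the backend.
    x = torch.randn(2, 8, device="cuda")
    out = compiled(x)
    out.sum().backward()

    # The backend records one entry per FX subgraph it compiled.
    assert backend.subgraph_infos, "Should have at least 1 subgraph"

The full test follows the same shape: build the model, wrap it with torch.compile using the ThunderCompiler backend, run forward and backward once, then assert that subgraph_infos is non-empty.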