diff --git a/tests/test_llama.py b/tests/test_llama.py
deleted file mode 100644
index 0deee15..0000000
--- a/tests/test_llama.py
+++ /dev/null
@@ -1,76 +0,0 @@
-import unittest
-
-import torch
-
-from mixlora.model import LoraLinear, MixLoraConfig, MixLoraSparseMoe
-
-
-class DummyLlamaMLP(torch.nn.Module):
-    def __init__(self, hidden_size: int, intermediate_size: int):
-        super().__init__()
-        self.gate_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
-        self.up_proj = torch.nn.Linear(hidden_size, intermediate_size, bias=False)
-        self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
-        self.act_fn = torch.nn.SiLU()
-
-
-config = MixLoraConfig.from_config(
-    {
-        "bias": "none",
-        "peft_type": "MIXLORA",
-        "r": 8,
-        "lora_alpha": 16,
-        "lora_dropout": 0.05,
-        "target_modules": [
-            "q_proj",
-            "k_proj",
-            "v_proj",
-            "o_proj",
-            "gate_proj",
-            "down_proj",
-            "up_proj",
-        ],
-        "routing_strategy": "mixtral",
-        "num_experts": 8,
-        "act_fn": "silu",
-        "top_k": 2,
-        "base_model_name_or_path": "DUMMY",
-        "task_type": "CAUSAL_LM",
-    }
-)
-
-config.model_type_ = "llama"
-
-hidden_size = 8
-intermediate_size = hidden_size * 2
-dummy_mlp = DummyLlamaMLP(hidden_size, intermediate_size)
-moe_layer = MixLoraSparseMoe(dummy_mlp, config)
-gate_layer = torch.nn.Linear(hidden_size, config.num_experts_, bias=False)
-moe_layer.gate_ = gate_layer.weight
-mlp_projections = ["gate_proj", "down_proj", "up_proj"]
-for proj_name in mlp_projections:
-    base_layer: torch.nn.Linear = getattr(dummy_mlp, proj_name)
-    torch.nn.init.zeros_(base_layer.weight)
-    for expert_idx in range(config.num_experts_):
-        moe_layer.experts_[f"experts.{expert_idx}.{proj_name}"] = LoraLinear(
-            base_layer, config
-        )
-
-
-class LlamaInputShapeTest(unittest.TestCase):
-    def test_forward_with_different_shape(self):
-        input_shapes = [
-            (2, 8, hidden_size),
-            (1, 16, hidden_size),
-            (4, 4, hidden_size)
-        ]
-
-        for shape in input_shapes:
-            with self.subTest(shape=shape):
-                input = torch.zeros(shape)
-                output: torch.Tensor = moe_layer(input)
-                self.assertEqual(output.shape, shape)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/tests/test_moe_layer.py b/tests/test_moe_layer.py
new file mode 100644
index 0000000..4cd596a
--- /dev/null
+++ b/tests/test_moe_layer.py
@@ -0,0 +1,97 @@
+import unittest
+from typing import List
+
+import torch
+from transformers.models.llama.modeling_llama import LlamaConfig, LlamaMLP
+from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP
+
+from mixlora.model import LoraLinear, MixLoraConfig, MixLoraSparseMoe
+
+
+def dummy_moe_layer(
+    model_type: str,
+    mlp_layer: torch.nn.Module,
+    hidden_size: int,
+    mlp_projections: List[str],
+):
+    config = MixLoraConfig.from_config(
+        {
+            "bias": "none",
+            "peft_type": "MIXLORA",
+            "r": 8,
+            "lora_alpha": 16,
+            "lora_dropout": 0.05,
+            "target_modules": [],
+            "routing_strategy": "mixtral",
+            "num_experts": 8,
+            "act_fn": "silu",
+            "top_k": 2,
+            "base_model_name_or_path": "DUMMY",
+            "task_type": "CAUSAL_LM",
+        }
+    )
+    config.model_type_ = model_type
+    moe_layer = MixLoraSparseMoe(mlp_layer, config)
+    gate_layer = torch.nn.Linear(hidden_size, config.num_experts_, bias=False)
+    torch.nn.init.normal_(gate_layer.weight)
+    moe_layer.gate_ = gate_layer.weight
+    for proj_name in mlp_projections:
+        base_layer: torch.nn.Linear = getattr(mlp_layer, proj_name)
+        torch.nn.init.normal_(base_layer.weight)
+        for expert_idx in range(config.num_experts_):
+            moe_layer.experts_[f"experts.{expert_idx}.{proj_name}"] = LoraLinear(
+                base_layer, config
+            )
+
+    return moe_layer
+
+
+def dummy_test_shapes(hidden_size: int):
+    return [(2, 8, hidden_size), (1, 16, hidden_size), (4, 4, hidden_size)]
+
+
+hidden_size = 16
+
+
+class MoeLayerTestCase(unittest.TestCase):
+    def test_llama_forward(self):
+        mlp_layer = LlamaMLP(
+            LlamaConfig(
+                vocab_size=128,
+                hidden_size=hidden_size,
+                intermediate_size=hidden_size * 2,
+                num_hidden_layers=8,
+                num_attention_heads=2,
+            )
+        )
+        moe_layer = dummy_moe_layer(
+            "llama", mlp_layer, hidden_size, ["gate_proj", "down_proj", "up_proj"]
+        )
+        for shape in dummy_test_shapes(hidden_size):
+            with self.subTest(f"test for shape = {shape}"):
+                input = torch.zeros(shape)
+                output: torch.Tensor = moe_layer(input)
+                self.assertEqual(output.shape, shape)
+
+    def test_phi3_forward(self):
+        mlp_layer = Phi3MLP(
+            Phi3Config(
+                vocab_size=128,
+                hidden_size=hidden_size,
+                intermediate_size=hidden_size * 2,
+                num_hidden_layers=8,
+                num_attention_heads=2,
+            )
+        )
+        moe_layer = dummy_moe_layer(
+            "phi3", mlp_layer, hidden_size, ["gate_up_proj", "down_proj"]
+        )
+        for shape in dummy_test_shapes(hidden_size):
+            with self.subTest(f"test for shape = {shape}"):
+                input = torch.zeros(shape)
+                output: torch.Tensor = moe_layer(input)
+                self.assertEqual(output.shape, shape)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_phi3.py b/tests/test_phi3.py
deleted file mode 100644
index 11fe89c..0000000
--- a/tests/test_phi3.py
+++ /dev/null
@@ -1,61 +0,0 @@
-import unittest
-
-import torch
-
-from mixlora.model import LoraLinear, MixLoraConfig, MixLoraSparseMoe
-
-
-class DummyPhi3MLP(torch.nn.Module):
-    def __init__(self, hidden_size: int, intermediate_size: int):
-        super().__init__()
-        self.gate_up_proj = torch.nn.Linear(
-            hidden_size, 2 * intermediate_size, bias=False
-        )
-        self.down_proj = torch.nn.Linear(intermediate_size, hidden_size, bias=False)
-        self.act_fn = torch.nn.SiLU()
-
-
-config = MixLoraConfig.from_config(
-    {
-        "bias": "none",
-        "peft_type": "MIXLORA",
-        "r": 8,
-        "lora_alpha": 16,
-        "lora_dropout": 0.05,
-        "target_modules": ["qkv_proj" "o_proj", "gate_up_proj", "down_proj"],
-        "routing_strategy": "mixtral",
-        "num_experts": 8,
-        "act_fn": "silu",
-        "top_k": 2,
-        "base_model_name_or_path": "DUMMY",
-        "task_type": "CAUSAL_LM",
-    }
-)
-
-config.model_type_ = "phi3"
-
-hidden_size = 8
-intermediate_size = hidden_size * 2
-dummy_mlp = DummyPhi3MLP(hidden_size, intermediate_size)
-moe_layer = MixLoraSparseMoe(dummy_mlp, config)
-gate_layer = torch.nn.Linear(hidden_size, config.num_experts_, bias=False)
-moe_layer.gate_ = gate_layer.weight
-mlp_projections = ["gate_up_proj", "down_proj"]
-for proj_name in mlp_projections:
-    base_layer: torch.nn.Linear = getattr(dummy_mlp, proj_name)
-    torch.nn.init.zeros_(base_layer.weight)
-    for expert_idx in range(config.num_experts_):
-        moe_layer.experts_[f"experts.{expert_idx}.{proj_name}"] = LoraLinear(
-            base_layer, config
-        )
-
-
-class Phi3TestCase(unittest.TestCase):
-    def test_forward(self):
-        input = torch.zeros((1, 8, hidden_size))
-        output: torch.Tensor = moe_layer(input)
-        self.assertEqual(output.shape, (1, 8, hidden_size))
-
-
-if __name__ == "__main__":
-    unittest.main()