
Moe converters #5

Merged · 4 commits · Oct 19, 2024
5 changes: 5 additions & 0 deletions examples/xglm/README.md
@@ -25,3 +25,8 @@ cd examples/xglm
torchrun --nproc-per-node=1 convert_dense2moe.py --checkpoint-path=checkpoints/xglm-564M --save-path=$SCRATCH/checkpoints/xglm-8x564M --num-experts=8
```
Note that this upcycling _drops_ the bias parameters of the MLP because the MegaBlocks implementation does not support bias parameters. While this is a limitation of the current implementation, the performance is quickly recovered after a few training steps.

To save back to the HuggingFace format, use:
```bash
torchrun examples/xglm/convert_ntmoe2hf.py --checkpoint-path=$SCRATCH/checkpoints/xglm-8x564M --save-path=$SCRATCH/checkpoints/huggingface/xglm-8x564M
```
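
The saved checkpoint can then be loaded through the usual HuggingFace API. Below is a minimal sketch, assuming the MoE-aware `XGLMForCausalLM` from `examples/xglm/transformers_impl/xglm_model.py` and that the tokenizer was saved alongside the weights (the conversion script does this whenever `--tokenizer-name` is given, which it is by default):
```python
import os

import torch
from transformers import AutoTokenizer

from examples.xglm.transformers_impl.xglm_model import XGLMForCausalLM

# Same path as --save-path above; adjust as needed.
save_path = os.path.join(os.environ["SCRATCH"], "checkpoints/huggingface/xglm-8x564M")

model = XGLMForCausalLM.from_pretrained(save_path).eval()
tokenizer = AutoTokenizer.from_pretrained(save_path)

inputs = tokenizer("Hello, this is a test.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # (batch_size, sequence_length, vocab_size)
```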
140 changes: 140 additions & 0 deletions examples/xglm/convert_ntmoe2hf.py
@@ -0,0 +1,140 @@
"""
Converts a nanotron MoE model to HF format
Command:
torchrun --nproc-per-node=1 convert_ntmoe2hf.py --checkpoint-path=nanotron_weights --save-path=hf_weights
"""

import warnings
from argparse import ArgumentParser
from pathlib import Path
from typing import Optional

import torch
from transformers import AutoTokenizer
from tqdm import tqdm

from nanotron.config.models_config import GPT3MoEConfig
from nanotron.models.gpt3_moe import GPT3MoEForTraining, GPT3MoEBlock
from nanotron.models.moe import dMoE, SparseMLP, LearnedRouter

from examples.xglm.convert_dense2moe import create_nt_moe_model
from examples.xglm.convert_nt2hf import convert_attention
from examples.xglm.convert_utils import convert_generic
from examples.xglm.transformers_impl.xglm_model import XGLMForCausalLM, XGLMDecoderLayer, XGLMmoeConfig, XGLMSparseMoeBlock, XGLMMLP
from examples.xglm.transformers_impl.gating import BasicGate


def convert_config(config: GPT3MoEConfig) -> XGLMmoeConfig:
if config.embd_pdrop != config.resid_pdrop:
warnings.warn(
f"nanotron.embd_pdrop = {config.embd_pdrop} does not match with "
f"nanotron.resid_pdrop = {config.resid_pdrop}. "
"XGLM implementation needs these two values to be equal "
"for correct conversion."
)
if config.layer_norm_epsilon != 1e-5:
warnings.warn(f"nanotron.layer_norm_epsilon must be 1e-5, not {config.layer_norm_epsilon}")
if config.moe_z_loss_weight != 0:
warnings.warn(f"transformer implementation does not support z loss")
assert not config.moe_glu, "Transformer implementation does not support glu MLP layers"

return XGLMmoeConfig(
# Regular xglm config.
activation_function=config.activation_function,
attention_dropout=config.attn_pdrop,
dropout=config.embd_pdrop,
eos_token_id=config.eos_token_id,
d_model=config.hidden_size,
ffn_dim=config.intermediate_size,
max_position_embeddings=config.max_position_embeddings,
attention_heads=config.num_attention_heads,
num_layers=config.num_hidden_layers,
vocab_size=config.vocab_size,
decoder_start_token_id=config.position_embedding_offset,
activation_dropout=config.act_pdrop,
scale_embedding=config.scale_embedding,
# Moe specifics.
num_local_experts=config.moe_num_experts,
num_experts_per_tok=config.num_experts_per_tok,
gate_type="linear",
gate_depth=1,
router_aux_loss_coef=config.moe_loss_weight,
)


def convert_mlp(mlp_hf: XGLMMLP, mlp_nt: SparseMLP):
convert_generic(mlp_hf.fc1, mlp_nt.w1.module)
convert_generic(mlp_hf.fc2, mlp_nt.w2.module)


def convert_gate(gate_hf: BasicGate, gate_nt: LearnedRouter):
convert_generic(gate_hf.gate, gate_nt.layer)


def convert_ff(ff_hf: XGLMSparseMoeBlock, ff_nt: dMoE):
convert_gate(ff_hf.gate, ff_nt.gate)
int_size = ff_nt.config.intermediate_size
if len(ff_hf.experts) == 1:
assert ff_nt.experts.mlp.w1.module.weight.shape == (int_size*len(ff_hf.experts), ff_nt.config.hidden_size)
assert ff_nt.experts.mlp.w2.module.weight.shape == (ff_nt.config.hidden_size, int_size*len(ff_hf.experts))
else:
assert ff_nt.experts.mlp.w1.module.weight.T.shape == (int_size*len(ff_hf.experts), ff_nt.config.hidden_size)
assert ff_nt.experts.mlp.w2.module.weight.shape == (int_size*len(ff_hf.experts), ff_nt.config.hidden_size)

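    # MegaBlocks stores all experts concatenated along the ffn dimension (with w1 kept
    # transposed in the grouped, multi-expert layout, as asserted above); slice out
    # expert i's block and copy it into the per-expert HF linear layers.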
for i, expert_hf in enumerate(ff_hf.experts):
i0 = i*int_size
i1 = (i + 1)*int_size
with torch.no_grad():
if len(ff_hf.experts) == 1:
expert_hf.fc1.weight.copy_(ff_nt.experts.mlp.w1.module.weight[i0:i1, :].clone())
expert_hf.fc2.weight.copy_(ff_nt.experts.mlp.w2.module.weight[:, i0:i1].clone())
else:
expert_hf.fc1.weight.copy_(ff_nt.experts.mlp.w1.module.weight.T[i0:i1, :].clone())
expert_hf.fc2.weight.copy_(ff_nt.experts.mlp.w2.module.weight[i0:i1, :].T.clone())

def convert_decoder(block_hf: XGLMDecoderLayer, block_nt: GPT3MoEBlock):
convert_generic(block_hf.self_attn_layer_norm, block_nt.ln_1)
convert_attention(block_hf.self_attn, block_nt.attn)
convert_generic(block_hf.final_layer_norm, block_nt.ln_2)
convert_ff(block_hf.block_sparse_moe, block_nt.ff)


def convert(model_hf: XGLMForCausalLM, model_nt: GPT3MoEForTraining):
convert_generic(model_hf.model.embed_tokens, model_nt.model.token_embeddings.pp_block.token_embedding)
for layer_hf, layer_nt in tqdm(zip(model_hf.model.layers, model_nt.model.decoder), desc="Converting layers",
total=model_nt.config.num_hidden_layers):
convert_decoder(layer_hf, layer_nt.pp_block)
convert_generic(model_hf.model.layer_norm, model_nt.model.final_layer_norm.pp_block)
convert_generic(model_hf.lm_head, model_nt.model.lm_head.pp_block)


def main(checkpoint_path: Path, save_path: Path, tokenizer_name: Optional[str]):
# Load nanotron model.
model_nt = create_nt_moe_model(checkpoint_path=checkpoint_path)

# Init huggingface model.
model_config_hf = convert_config(model_nt.config)
model_hf = XGLMForCausalLM._from_config(model_config_hf)

# Copy weights, initialize tokenizer and save model.
if tokenizer_name is not None:
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
tokenizer.save_pretrained(save_path)
    convert(model_hf, model_nt)
print("Saving...")
model_hf.save_pretrained(save_path)
print(f"Model saved to {save_path}")


if __name__ == "__main__":
    parser = ArgumentParser(description="Convert nanotron MoE weights to HF format")
parser.add_argument(
"--checkpoint-path", type=Path, default="checkpoints/xglm-7.5B", help="Path to the nanotron checkpoint"
)
parser.add_argument(
"--save-path", type=Path, default="facebook/xglm-7.5B", help="Path to save the huggingface model"
)
parser.add_argument("--tokenizer-name", type=str, default="facebook/xglm-7.5B")
args = parser.parse_args()
    main(args.checkpoint_path, args.save_path, args.tokenizer_name)
182 changes: 182 additions & 0 deletions examples/xglm/tests/test_moe.py
@@ -0,0 +1,182 @@
import torch
import pytest

import nanotron
from nanotron.config.parallelism_config import ParallelismArgs
from nanotron.config.models_config import GPT3MoEConfig
from nanotron.parallel import ParallelContext
from nanotron.parallel.pipeline_parallel.tensor_pointer import TensorPointer
from nanotron.trainer import mark_tied_parameters
from nanotron.models.gpt3_moe import GPT3MoEBlock, GPT3MoEForTraining
from nanotron.models.moe import LearnedRouter, dMoE

from tests.helpers.utils import init_distributed

from examples.xglm.convert_ntmoe2hf import convert_config, convert_gate, convert_ff, convert
from examples.xglm.tests.test_implementation import almost_close
from examples.xglm.transformers_impl.xglm_model import XGLMSparseMoeBlock, XGLMForCausalLM
from examples.xglm.transformers_impl.gating import BasicGate


MAX_SEQUENCE_LENGTH = 2048
TEST_SEQUENCE_LENGTH = 128  # With very long sequences, precision errors grow even when the conversion is correct.
#TEST_SEQUENCE_LENGTH = MAX_SEQUENCE_LENGTH
BATCH_SIZE = 4
HIDDEN_SIZE = 1024
#DTYPE = torch.bfloat16
DTYPE = torch.float32
TEXT = "Hello. This is a relatively long text. I will use this text to test the conversion scripts. Let's finish this text soon because I don't have much more to say. Final note:"

CONFIG = GPT3MoEConfig(
attn_pdrop=0.0,
embd_pdrop=0.0,
resid_pdrop=0.0,
act_pdrop=0.0,
eos_token_id=2,
hidden_size=HIDDEN_SIZE,
intermediate_size=4096,
layer_norm_epsilon=1e-05,
max_position_embeddings=MAX_SEQUENCE_LENGTH,
num_attention_heads=16,
num_hidden_layers=24,
scale_attn_weights=True,
vocab_size=256008,
sinusoidal_position_embedding=True,
position_embedding_offset=2,
use_spda=DTYPE is not torch.bfloat16,
# vvv moe vvv
is_moe=True,
moe_num_experts=8,
num_experts_per_tok=2,
moe_loss_weight=0.01,
moe_z_loss_weight=0.0,
moe_glu=False,
)
PARALLEL_CONFIG = ParallelismArgs(dp=1, pp=1, tp=1, expert_parallel_size=1)  # alternatively: expert_parallel_size=CONFIG.moe_num_experts


@pytest.fixture
def hidden_states() -> torch.Tensor:
return torch.randn(TEST_SEQUENCE_LENGTH, BATCH_SIZE, HIDDEN_SIZE, dtype=DTYPE)


@pytest.fixture
def input_mask() -> torch.Tensor:
return torch.ones(BATCH_SIZE, TEST_SEQUENCE_LENGTH, dtype=torch.bool)


@pytest.fixture
def input_ids() -> torch.Tensor:
return torch.randint(0, CONFIG.vocab_size, (BATCH_SIZE, TEST_SEQUENCE_LENGTH))


def _test_nt2hf_gate(parallel_context: ParallelContext, hidden_states: torch.Tensor):
hidden_states = hidden_states.cuda()

config_hf = convert_config(CONFIG)
gate_nt = LearnedRouter(CONFIG).cuda().to(DTYPE)
gate_hf = BasicGate(config_hf).cuda().to(DTYPE)
convert_gate(gate_hf, gate_nt)

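    # nanotron activations are (sequence, batch, hidden) while the HF implementation
    # expects (batch, sequence, hidden), hence the permute/reshape bookkeeping below.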
router_logits_nt, _, _ = gate_nt(hidden_states.view(-1, HIDDEN_SIZE))
router_logits_hf = gate_hf(hidden_states.permute(1, 0, 2).reshape(-1, HIDDEN_SIZE), "")

router_logits_nt = router_logits_nt.view(TEST_SEQUENCE_LENGTH, BATCH_SIZE, -1)
router_logits_hf = router_logits_hf.view(BATCH_SIZE, TEST_SEQUENCE_LENGTH, -1).permute(1, 0, 2)

assert router_logits_nt.size() == router_logits_hf.size()
torch.testing.assert_close(router_logits_nt, router_logits_hf)


def test_nt2hf_gate(hidden_states: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_gate)(hidden_states=hidden_states)


def _test_nt2hf_ff(parallel_context: ParallelContext, hidden_states: torch.Tensor,
num_experts: int, num_experts_per_tok: int):
hidden_states = hidden_states.cuda()

config = {**vars(CONFIG)}
config.update({"moe_num_experts": num_experts, "num_experts_per_tok": num_experts_per_tok})
config = GPT3MoEConfig(**config)
config_hf = convert_config(config)
ff_nt = dMoE(config, parallel_context, PARALLEL_CONFIG).cuda().to(DTYPE)
ff_hf = XGLMSparseMoeBlock(config_hf).cuda().to(DTYPE)
convert_ff(ff_hf, ff_nt)

out_nt = ff_nt(hidden_states)["hidden_states"]
out_hf, _ = ff_hf(hidden_states.permute(1, 0, 2).contiguous(), "")
out_hf = out_hf.permute(1, 0, 2)

assert out_nt.size() == out_hf.size()
almost_close(out_nt, out_hf, max_far=0.05, far_atol=0.003)


@pytest.mark.parametrize("num_experts,num_experts_per_tok", [(1, 1), (2, 1), (4, 1), (4, 2), (8, 1), (8, 2), (8, 4)])
def test_nt2hf_ff(hidden_states: torch.Tensor, num_experts: int, num_experts_per_tok: int):
init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_ff)(hidden_states=hidden_states, num_experts=num_experts, num_experts_per_tok=num_experts_per_tok)


def _test_nt2hf_model(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
random_states = nanotron.random.RandomStates({"tp_synced": nanotron.random.get_current_random_state()})
input_ids = input_ids.cuda()
input_mask = input_mask.cuda()

# unfortunately, we can't use float64 with huggingface xglm.
new_dtype = torch.float32 if DTYPE == torch.float64 else DTYPE

# Get nanotron model.
config_nt = GPT3MoEConfig(**vars(CONFIG))
if new_dtype not in {torch.bfloat16, torch.float16}:
config_nt.use_spda = True
model_nt = nanotron.models.build_model(
model_builder=lambda: GPT3MoEForTraining(
config=config_nt,
parallel_context=parallel_context,
parallel_config=None,
random_states=random_states,
),
parallel_context=parallel_context,
dtype=new_dtype,
device="cuda",
).eval()
mark_tied_parameters(model=model_nt, parallel_context=parallel_context)

# Create empty model_hf and make conversion.
model_hf = XGLMForCausalLM(convert_config(config_nt)).cuda().to(new_dtype).eval()
convert(model_hf, model_nt)

    # The nanotron forward pass expects accumulators for the MoE auxiliary losses.
    aux_losses = {
        "load_balancing_loss": torch.zeros(1, device=input_ids.device),
        "z_loss": torch.zeros(1, device=input_ids.device),
    }

# Get outputs and assert.
with torch.no_grad():
out_nt = model_nt.model(input_ids, input_mask, aux_losses)["sharded_logits"].to(new_dtype)
del model_nt
torch.cuda.empty_cache()
out_hf = model_hf(input_ids=input_ids, attention_mask=input_mask, output_router_logits=False).logits.permute(1, 0, 2)
del model_hf
torch.cuda.empty_cache()
assert out_nt.size() == out_hf.size(), f"{out_nt.size()}, {out_hf.size()}"
return out_nt.cpu(), out_hf.cpu()


def _test_nt2hf_dummy_xglm(parallel_context: ParallelContext, input_ids: torch.Tensor, input_mask: torch.Tensor):
out_nt, out_hf = _test_nt2hf_model(parallel_context, input_ids, input_mask)
almost_close(out_nt, out_hf, max_far=0.01, far_atol=2.0) # We allow for less than 1% errors, but some of these are very large!
#torch.testing.assert_close(out_nt.bfloat16(), out_hf.bfloat16())


def test_nt2hf_dummy_xglm(input_ids: torch.Tensor, input_mask: torch.Tensor):
init_distributed(tp=1, dp=1, pp=1)(_test_nt2hf_dummy_xglm)(input_ids=input_ids, input_mask=input_mask)