Merge pull request #25 from s-JoL/dev
v2 release
s-JoL authored Apr 28, 2023
2 parents 92af968 + c890bce commit f3c664b
Showing 49 changed files with 1,185 additions and 1,439 deletions.
179 changes: 107 additions & 72 deletions README.md

Large diffs are not rendered by default.

307 changes: 189 additions & 118 deletions README_en.md

Large diffs are not rendered by default.

Binary file removed assets/chinese.JPG
Binary file not shown.
Binary file removed assets/code.JPG
Binary file not shown.
Binary file modified assets/instruct_loss.png
Binary file removed assets/paper.JPG
Binary file not shown.
Binary file modified assets/pretrain_loss.png
35 changes: 22 additions & 13 deletions chat_server.py
@@ -2,28 +2,29 @@
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-04-06 22:30:10
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-07 23:03:31
LastEditTime: 2023-04-27 20:34:58
FilePath: /Open-Llama/chat_server.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import torch
import gradio as gr
import sentencepiece as spm
from dataset.tokenizer import Tokenizer
from transformers import LlamaForCausalLM, LlamaConfig
from transformers import OpenLlamaForCausalLM, OpenLlamaConfig, LlamaTokenizer


sp_model = spm.SentencePieceProcessor(
model_file="configs/10w_vocab_wudao5_pile10.model"
tokenizer = LlamaTokenizer(
"configs/10w_vocab_wudao5_pile10.model",
pad_token="<pad>",
add_bos_token=False,
add_eos_token=True,
)
tokenizer = Tokenizer(sp_model)
raw_model = LlamaForCausalLM(
LlamaConfig(

raw_model = OpenLlamaForCausalLM(
OpenLlamaConfig(
vocab_size=tokenizer.vocab_size,
initializer_range=0.01,
pad_token_id=tokenizer.pad_id,
pad_token_id=tokenizer.pad_token_id,
rms_norm_eps=1e-5,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
@@ -80,20 +81,28 @@ def bot(history):
if completion is None:
inputs = "user:{}\nsystem:".format(prompt)
inputs = tokenizer(
inputs, return_tensors=True, add_special_tokens=False
inputs,
return_tensors="pt",
add_special_tokens=False,
return_attention_mask=False,
)
context.append(inputs["input_ids"])
else:
inputs = "user:{}\nsystem:{}".format(prompt, completion)
inputs = tokenizer(inputs, return_tensors=True, add_special_tokens=True)
inputs = tokenizer(
inputs,
return_tensors="pt",
add_special_tokens=True,
return_attention_mask=False,
)
context.append(inputs["input_ids"])
context = torch.cat(context, dim=-1)
context = context[:, -1024:]
inputs_len = context.shape[1]
context = context.cuda()
pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
pred = pred[:, inputs_len:]
pred = tokenizer.decode(pred.cpu())[0]
pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
print(pred)
bot_message = parse_codeblock(pred)
history[-1][1] = bot_message
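The main change to chat_server.py is that the hand-rolled SentencePiece wrapper (dataset.tokenizer.Tokenizer) is replaced by transformers.LlamaTokenizer, and the model class moves from LlamaForCausalLM to OpenLlamaForCausalLM. A minimal sketch of the new tokenize, generate, decode path, pieced together from this diff (the tiny randomly initialized config, and skipping the GPU move and checkpoint loading, are illustration-only assumptions; the real server builds the 7B config and loads trained weights):

    import torch
    from transformers import LlamaTokenizer, OpenLlamaForCausalLM, OpenLlamaConfig

    # Tokenizer built straight from the SentencePiece model file, as in the diff.
    tokenizer = LlamaTokenizer(
        "configs/10w_vocab_wudao5_pile10.model",
        pad_token="<pad>",
        add_bos_token=False,
        add_eos_token=True,
    )

    model = OpenLlamaForCausalLM(
        OpenLlamaConfig(
            vocab_size=tokenizer.vocab_size,
            pad_token_id=tokenizer.pad_token_id,
            hidden_size=256,          # tiny stand-in so the sketch runs on CPU;
            num_hidden_layers=2,      # not the configuration used by the server
            num_attention_heads=4,
            intermediate_size=688,
        )
    ).eval()

    prompt = "user:Hello\nsystem:"
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False, return_attention_mask=False
    )
    context = inputs["input_ids"][:, -1024:]      # keep at most 1024 tokens of history
    pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
    pred = pred[:, context.shape[1]:]             # strip the prompt tokens from the output
    print(tokenizer.decode(pred[0], skip_special_tokens=True))

Decoding a single sequence (pred[0]) with skip_special_tokens=True replaces the old tokenizer.decode(pred.cpu())[0] call and drops the <pad> and end-of-sequence markers from the reply.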
Binary file added configs/4w_cn_vocab_wudao15.model
Binary file not shown.
Binary file removed configs/6w_vocab_wudao5_pile10.model
Binary file not shown.
14 changes: 1 addition & 13 deletions configs/default_config.yaml
@@ -1,30 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 12
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 1
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'no'
# dynamo_config:
# dynamo_backend: INDUCTOR
# dynamo_mode: default
# dynamo_use_dynamic: true
# dynamo_use_fullgraph: false
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
use_cpu: false
33 changes: 33 additions & 0 deletions configs/instruct_config.yaml
@@ -0,0 +1,33 @@
data:
mode: "instruct"
data:
mixed: "data/instruction_data/part-*.jsonl.zst"
pad_to_max: False
sequence_sample_mode: "none"
concat_multiple_sequence: True
num_sequences: 50
seq_length: 2048
tokenizer_model_path: "configs/llama_tokenizer_extended.model"
model:
initializer_range: 1.0e-2
hidden_dropout_prob: 0.1
attention_dropout_prob: 0.1
use_stable_embedding: False
shared_input_output_embedding: False
train:
train_batch_size: 2
num_training_steps: 1000000
num_warmup_steps: 2000
initializer_range: 1.0e-2
lr: 2.0e-4
weight_decay: 1.0e-1
ckpt: "data/llama_raw_ckpt/7B/extended.pth"
train_num_workers: 16
gradient_accumulation_steps: 1
prefetch_factor: 100
# global step
log_interval: 50
eval_interval: 500
save_interval: 1000
work_dir: "data/saved_ckpt/7B"
project_name: "Llama Instruction"
25 changes: 0 additions & 25 deletions configs/instruction_tuning_config.py

This file was deleted.

Binary file added configs/llama_tokenizer_extended.model
Binary file not shown.
14 changes: 0 additions & 14 deletions configs/pretrain_config.py

This file was deleted.

36 changes: 36 additions & 0 deletions configs/pretrain_config.yaml
@@ -0,0 +1,36 @@
data:
mode: "pretrain"
data:
wudao: "data/pretrain_data/part-wudao*.jsonl.zst"
# Since the Llama checkpoint is loaded, only a small amount of English data is used
the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst"
pad_to_max: False
sequence_sample_mode: "none"
concat_multiple_sequence: True
num_sequences: 10
seq_length: 2048
tokenizer_model_path: "configs/llama_tokenizer_extended.model"
model:
initializer_range: 1.0e-2
hidden_dropout_prob: 0.1
attention_dropout_prob: 0.1
use_stable_embedding: False
shared_input_output_embedding: False
train:
train_batch_size: 2
num_training_steps: 500000
num_warmup_steps: 2000
initializer_range: 1.0e-2
lr: 2.0e-4
weight_decay: 1.0e-1
# Load pretrained weights; set to null to train from scratch
ckpt: "data/llama_raw_ckpt/7B/extended.pth"
train_num_workers: 16
gradient_accumulation_steps: 12
prefetch_factor: 100
# global step
log_interval: 5
eval_interval: 500
save_interval: 1000
work_dir: "data/saved_ckpt/7B"
project_name: "Llama Pretrain"
14 changes: 7 additions & 7 deletions data/preprocess_instruction.py
@@ -18,7 +18,7 @@
dataset = load_dataset("yizhongw/self_instruct")
write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -39,7 +39,7 @@
dataset = load_dataset("BelleGroup/train_0.5M_CN")
write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -60,7 +60,7 @@
dataset = load_dataset("BelleGroup/train_1M_CN")
write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -81,7 +81,7 @@
dataset = load_dataset("BelleGroup/school_math_0.25M")
write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -102,7 +102,7 @@
dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -123,7 +123,7 @@
dataset = load_dataset("Graverman/Instruct-to-Code")
write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -143,7 +143,7 @@

write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
with open("data/sg_90k_part1.json", "r") as fp:
data1 = json.load(fp)
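Every dataset in data/preprocess_instruction.py follows the same write loop, and the only change in the visible hunks is that shard numbering now starts at 1 instead of 0. The shared pattern looks roughly like the sketch below; the rotation threshold is outside the visible hunks, so the value used here is a placeholder:

    import json
    from datasets import load_dataset
    import zstandard as zstd

    SAMPLES_PER_SHARD = 10000  # placeholder; the real cutoff is not visible in this diff
    write_path = "data/instruction_data/part-self_instruct-{}.jsonl.zst"

    dataset = load_dataset("yizhongw/self_instruct")
    total_num = 0
    file_num = 1  # shards are now numbered from 1, per this commit
    wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
    for line in dataset["train"]:
        wfp.write(json.dumps(line) + "\n")
        total_num += 1
        if total_num % SAMPLES_PER_SHARD == 0:
            # Rotate to a new shard once the current one is full.
            wfp.close()
            file_num += 1
            wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
    wfp.close()

The same rotation applies to data/preprocess_the_pile.py and data/preprocess_wudao.py below, whose hunks change the identical file_num line.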
2 changes: 1 addition & 1 deletion data/preprocess_the_pile.py
@@ -17,7 +17,7 @@
paths = glob("data/the_pile/*.jsonl.zst")
write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for path in tqdm(paths, total=len(paths)):
with zstd.open(path, "r", encoding="utf-8") as fp:
2 changes: 1 addition & 1 deletion data/preprocess_wudao.py
@@ -17,7 +17,7 @@
paths = glob("data/WuDaoCorpus2.0_base_200G/part*")
write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for path in tqdm(paths, total=len(paths)):
with open(path, "r") as fp:
69 changes: 0 additions & 69 deletions dataset/collate_fn.py

This file was deleted.

