Merge pull request #25 from s-JoL/dev
v2 release
s-JoL authored Apr 28, 2023
2 parents 92af968 + c890bce commit f3c664b
Showing 49 changed files with 1,185 additions and 1,439 deletions.
179 changes: 107 additions & 72 deletions README.md

Large diffs are not rendered by default.

307 changes: 189 additions & 118 deletions README_en.md

Large diffs are not rendered by default.

Binary file removed assets/chinese.JPG
Binary file not shown.
Binary file removed assets/code.JPG
Binary file not shown.
Binary file modified assets/instruct_loss.png
Binary file removed assets/paper.JPG
Binary file not shown.
Binary file modified assets/pretrain_loss.png
35 changes: 22 additions & 13 deletions chat_server.py
@@ -2,28 +2,29 @@
Author: LiangSong(sl12160010@gmail.com)
Date: 2023-04-06 22:30:10
LastEditors: LiangSong(sl12160010@gmail.com)
LastEditTime: 2023-04-07 23:03:31
LastEditTime: 2023-04-27 20:34:58
FilePath: /Open-Llama/chat_server.py
Description:
Copyright (c) 2023 by LiangSong(sl12160010@gmail.com), All Rights Reserved.
"""
import torch
import gradio as gr
import sentencepiece as spm
from dataset.tokenizer import Tokenizer
from transformers import LlamaForCausalLM, LlamaConfig
from transformers import OpenLlamaForCausalLM, OpenLlamaConfig, LlamaTokenizer


sp_model = spm.SentencePieceProcessor(
model_file="configs/10w_vocab_wudao5_pile10.model"
tokenizer = LlamaTokenizer(
"configs/10w_vocab_wudao5_pile10.model",
pad_token="<pad>",
add_bos_token=False,
add_eos_token=True,
)
tokenizer = Tokenizer(sp_model)
raw_model = LlamaForCausalLM(
LlamaConfig(

raw_model = OpenLlamaForCausalLM(
OpenLlamaConfig(
vocab_size=tokenizer.vocab_size,
initializer_range=0.01,
pad_token_id=tokenizer.pad_id,
pad_token_id=tokenizer.pad_token_id,
rms_norm_eps=1e-5,
hidden_dropout_prob=0.1,
attention_dropout_prob=0.1,
@@ -80,20 +81,28 @@ def bot(history):
if completion is None:
inputs = "user:{}\nsystem:".format(prompt)
inputs = tokenizer(
inputs, return_tensors=True, add_special_tokens=False
inputs,
return_tensors="pt",
add_special_tokens=False,
return_attention_mask=False,
)
context.append(inputs["input_ids"])
else:
inputs = "user:{}\nsystem:{}".format(prompt, completion)
inputs = tokenizer(inputs, return_tensors=True, add_special_tokens=True)
inputs = tokenizer(
inputs,
return_tensors="pt",
add_special_tokens=True,
return_attention_mask=False,
)
context.append(inputs["input_ids"])
context = torch.cat(context, dim=-1)
context = context[:, -1024:]
inputs_len = context.shape[1]
context = context.cuda()
pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
pred = pred[:, inputs_len:]
pred = tokenizer.decode(pred.cpu())[0]
pred = tokenizer.decode(pred.cpu()[0], skip_special_tokens=True)
print(pred)
bot_message = parse_codeblock(pred)
history[-1][1] = bot_message
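The main change to chat_server.py is that the hand-rolled SentencePiece wrapper (dataset.tokenizer.Tokenizer) is replaced by transformers.LlamaTokenizer, and the model class moves from LlamaForCausalLM to OpenLlamaForCausalLM. A minimal sketch of the new tokenize, generate, decode path, pieced together from this diff (the tiny randomly initialized config, and skipping the GPU move and checkpoint loading, are illustration-only assumptions; the real server builds the 7B config and loads trained weights):

    import torch
    from transformers import LlamaTokenizer, OpenLlamaForCausalLM, OpenLlamaConfig

    # Tokenizer built straight from the SentencePiece model file, as in the diff.
    tokenizer = LlamaTokenizer(
        "configs/10w_vocab_wudao5_pile10.model",
        pad_token="<pad>",
        add_bos_token=False,
        add_eos_token=True,
    )

    model = OpenLlamaForCausalLM(
        OpenLlamaConfig(
            vocab_size=tokenizer.vocab_size,
            pad_token_id=tokenizer.pad_token_id,
            hidden_size=256,          # tiny stand-in so the sketch runs on CPU;
            num_hidden_layers=2,      # not the configuration used by the server
            num_attention_heads=4,
            intermediate_size=688,
        )
    ).eval()

    prompt = "user:Hello\nsystem:"
    inputs = tokenizer(
        prompt, return_tensors="pt", add_special_tokens=False, return_attention_mask=False
    )
    context = inputs["input_ids"][:, -1024:]      # keep at most 1024 tokens of history
    pred = model.generate(input_ids=context, max_new_tokens=512, do_sample=True)
    pred = pred[:, context.shape[1]:]             # strip the prompt tokens from the output
    print(tokenizer.decode(pred[0], skip_special_tokens=True))

Decoding a single sequence (pred[0]) with skip_special_tokens=True replaces the old tokenizer.decode(pred.cpu())[0] call and drops the <pad> and end-of-sequence markers from the reply.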
Binary file added configs/4w_cn_vocab_wudao15.model
Binary file not shown.
Binary file removed configs/6w_vocab_wudao5_pile10.model
Binary file not shown.
14 changes: 1 addition & 13 deletions configs/default_config.yaml
@@ -1,30 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
deepspeed_multinode_launcher: standard
gradient_accumulation_steps: 12
gradient_clipping: 1.0
offload_optimizer_device: none
offload_param_device: none
zero3_init_flag: false
zero_stage: 1
distributed_type: DEEPSPEED
downcast_bf16: 'no'
dynamo_backend: 'no'
# dynamo_config:
# dynamo_backend: INDUCTOR
# dynamo_mode: default
# dynamo_use_dynamic: true
# dynamo_use_fullgraph: false
fsdp_config: {}
machine_rank: 0
main_training_function: main
megatron_lm_config: {}
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
use_cpu: false
33 changes: 33 additions & 0 deletions configs/instruct_config.yaml
@@ -0,0 +1,33 @@
data:
mode: "instruct"
data:
mixed: "data/instruction_data/part-*.jsonl.zst"
pad_to_max: False
sequence_sample_mode: "none"
concat_multiple_sequence: True
num_sequences: 50
seq_length: 2048
tokenizer_model_path: "configs/llama_tokenizer_extended.model"
model:
initializer_range: 1.0e-2
hidden_dropout_prob: 0.1
attention_dropout_prob: 0.1
use_stable_embedding: False
shared_input_output_embedding: False
train:
train_batch_size: 2
num_training_steps: 1000000
num_warmup_steps: 2000
initializer_range: 1.0e-2
lr: 2.0e-4
weight_decay: 1.0e-1
ckpt: "data/llama_raw_ckpt/7B/extended.pth"
train_num_workers: 16
gradient_accumulation_steps: 1
prefetch_factor: 100
# global step
log_interval: 50
eval_interval: 500
save_interval: 1000
work_dir: "data/saved_ckpt/7B"
project_name: "Llama Instruction"
25 changes: 0 additions & 25 deletions configs/instruction_tuning_config.py

This file was deleted.

Binary file added configs/llama_tokenizer_extended.model
Binary file not shown.
14 changes: 0 additions & 14 deletions configs/pretrain_config.py

This file was deleted.

36 changes: 36 additions & 0 deletions configs/pretrain_config.yaml
@@ -0,0 +1,36 @@
data:
mode: "pretrain"
data:
wudao: "data/pretrain_data/part-wudao*.jsonl.zst"
# Since the Llama checkpoint is loaded, only a small amount of English data is used
the_pile: "data/pretrain_data/part-pile-1*.jsonl.zst"
pad_to_max: False
sequence_sample_mode: "none"
concat_multiple_sequence: True
num_sequences: 10
seq_length: 2048
tokenizer_model_path: "configs/llama_tokenizer_extended.model"
model:
initializer_range: 1.0e-2
hidden_dropout_prob: 0.1
attention_dropout_prob: 0.1
use_stable_embedding: False
shared_input_output_embedding: False
train:
train_batch_size: 2
num_training_steps: 500000
num_warmup_steps: 2000
initializer_range: 1.0e-2
lr: 2.0e-4
weight_decay: 1.0e-1
# Load pretrained weights; set to null to train from scratch
ckpt: "data/llama_raw_ckpt/7B/extended.pth"
train_num_workers: 16
gradient_accumulation_steps: 12
prefetch_factor: 100
# global step
log_interval: 5
eval_interval: 500
save_interval: 1000
work_dir: "data/saved_ckpt/7B"
project_name: "Llama Pretrain"
14 changes: 7 additions & 7 deletions data/preprocess_instruction.py
@@ -18,7 +18,7 @@
dataset = load_dataset("yizhongw/self_instruct")
write_path = root_dir + "/instruction_data/part-self_instruct-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -39,7 +39,7 @@
dataset = load_dataset("BelleGroup/train_0.5M_CN")
write_path = root_dir + "/instruction_data/part-belle_0.5M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -60,7 +60,7 @@
dataset = load_dataset("BelleGroup/train_1M_CN")
write_path = root_dir + "/instruction_data/part-belle_1M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -81,7 +81,7 @@
dataset = load_dataset("BelleGroup/school_math_0.25M")
write_path = root_dir + "/instruction_data/part-belle_school_math_0.25M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -102,7 +102,7 @@
dataset = load_dataset("BelleGroup/multiturn_chat_0.8M")
write_path = root_dir + "/instruction_data/part-belle_multiturn_chat_0.8M-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -123,7 +123,7 @@
dataset = load_dataset("Graverman/Instruct-to-Code")
write_path = root_dir + "/instruction_data/part-instruct_to_code-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for line in dataset["train"]:
line = json.dumps(line)
@@ -143,7 +143,7 @@

write_path = root_dir + "/instruction_data/part-sharegpt_90K-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
with open("data/sg_90k_part1.json", "r") as fp:
data1 = json.load(fp)
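Every dataset in data/preprocess_instruction.py follows the same write loop, and the only change in the visible hunks is that shard numbering now starts at 1 instead of 0. The shared pattern looks roughly like the sketch below; the rotation threshold is outside the visible hunks, so the value used here is a placeholder:

    import json
    from datasets import load_dataset
    import zstandard as zstd

    SAMPLES_PER_SHARD = 10000  # placeholder; the real cutoff is not visible in this diff
    write_path = "data/instruction_data/part-self_instruct-{}.jsonl.zst"

    dataset = load_dataset("yizhongw/self_instruct")
    total_num = 0
    file_num = 1  # shards are now numbered from 1, per this commit
    wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
    for line in dataset["train"]:
        wfp.write(json.dumps(line) + "\n")
        total_num += 1
        if total_num % SAMPLES_PER_SHARD == 0:
            # Rotate to a new shard once the current one is full.
            wfp.close()
            file_num += 1
            wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
    wfp.close()

The same rotation applies to data/preprocess_the_pile.py and data/preprocess_wudao.py below, whose hunks change the identical file_num line.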
2 changes: 1 addition & 1 deletion data/preprocess_the_pile.py
@@ -17,7 +17,7 @@
paths = glob("data/the_pile/*.jsonl.zst")
write_path = "data/pretrain_data/part-pile-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for path in tqdm(paths, total=len(paths)):
with zstd.open(path, "r", encoding="utf-8") as fp:
2 changes: 1 addition & 1 deletion data/preprocess_wudao.py
@@ -17,7 +17,7 @@
paths = glob("data/WuDaoCorpus2.0_base_200G/part*")
write_path = "data/pretrain_data/part-wudao-{}.jsonl.zst"
total_num = 0
file_num = 0
file_num = 1
wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
for path in tqdm(paths, total=len(paths)):
with open(path, "r") as fp:
69 changes: 0 additions & 69 deletions dataset/collate_fn.py

This file was deleted.

