diff --git a/base_test/matmul_test/README.md b/base_test/matmul_test/README.md
new file mode 100644
index 0000000..59ed02f
--- /dev/null
+++ b/base_test/matmul_test/README.md
@@ -0,0 +1,68 @@
+Matmul 自动化测试
+# 1. 脚本说明
+matmul 放置位置：
+```shell
+# mudnn_bench 默认存放在 /usr/local/musa/ 下
+mudnn_bench
+├── bench_test_matmul.sh
+├── bin
+│   ├── mudnn_bench -> mudnn_bench-x.x.x
+│   └── mudnn_bench-x.x.x
+└── matmul_test
+```
+mudnn_bench 示例：  
+**部分旧版本的 mudnn_bench 和 mudnn 不支持混合精度测试，使用前需要与开发者确认。**
+```shell
+
+# 示例 1：单卡，大矩阵，f32
+MUSA_VISIBLE_DEVICES=4 ./bin/mudnn_bench -m --mm_m 6144 --mm_n 3584 --mm_k 6144 --warmup 30 --tm i --tmv 1000 -p -t f32
+
+# 示例 2：多卡，标准尺寸，bf16
+MUSA_VISIBLE_DEVICES=0,1 ./bin/mudnn_bench -m --mm_m 4096 --mm_n 4096 --mm_k 4096 --warmup 30 --tm i --tmv 1000 -p -t bf16
+
+# 示例 3：单卡，特殊组合，int8
+MUSA_VISIBLE_DEVICES=2 ./bin/mudnn_bench -m --mm_m 8192 --mm_n 8192 --mm_k 768 --warmup 30 --tm i --tmv 1000 -p -t int8
+
+# 示例 4：使用混合精度格式
+MUSA_VISIBLE_DEVICES=3 ./bin/mudnn_bench -m --mm_m 2048 --mm_n 2048 --mm_k 2048 --warmup 30 --tm i --tmv 1000 -p -t bf16:q4:bf16:bf16
+```
+
+# 2. 测试
+可在测试脚本中自行批量配置测试MNK，warmup，iter等。
+## 2.1 fp64, tf32 测试
+注意：fp64 和 tf32 不通过 mudnn_bench 测试，而是使用 fp64_tf32_src 下单独编译的程序（fp64 链接 mublas，tf32 链接 mudnn）
+```shell
+# 1. 编译
+cd ./fp64_tf32_src
+
+bash build_gemm_tf32.sh
+bash build_gemm_fp64.sh
+
+# 2. 测试
+bash test_gemm_fp64_tf32.sh
+```
+
+## 2.2 f32_f16_bf16_q8_fp8 测试
+mudnn_bench 测试矩阵元素的默认取值范围：
+- 浮点：-0.5~0.5  
+- fp8：整型 -10~10 转浮点  
+- qint4：-7~7  
+- 整型：-127~127  
+> 部分版本 mudnn_bench 工具支持全 0 测试(参数 `-z` 实现)，需要和开发者确认
+```shell
+bash test_gemm_f32_f16_bf16_q8_fp8.sh
+```
+
+## 2.3 混合精度测试
+```shell
+# A,B: fp16, C,D: f32: "f16:f16:f32:f32"
+# A,B: bf16, C,D: f32: "bf16:bf16:f32:f32"
+# A,B: tf32, C,D: f32: "f32"
+# A,B: int8, C,D: int32: "int8"
+# W8A8: "q8:q8:f32:f32"
+# W4A16: "bf16:q4:bf16:bf16"
+# A,B: fp8, C,D: fp16: "float8_e4m3:float8_e4m3:f16:f16"
+
+bash test_gemm_mixed.sh
+```
+
diff --git a/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py b/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py
new file mode 100644
index 0000000..95a2a28
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py
@@ -0,0 +1,122 @@
+import re
+import os
+import sys
+from typing import List, Dict, Optional
+
+def extract_matmul_data(log_path: str) -> List[Dict[str, str]]:
+    """Parse a mudnn_bench log into one CSV-ready record per test block.
+
+    A separator line of '=' characters (or end of file) closes a block. Blocks
+    missing the data type, m/n/k, elapsed time or throughput are dropped, and
+    throughput is converted from GOPS to TOPS rounded to four decimals.
+
+    Returns:
+        Records keyed "datatype", "shape" ("m-n-k"), "Throughput(TOPS)" and
+        "AverageElapsedTime(ms)"; [] if the file cannot be read or decoded.
+    """
+    patterns = {
+        "datatype": re.compile(r"DataType (\w+)"),
+        "mat_params": re.compile(r"m (\d+), n (\d+), k (\d+)"),
+        "elapsed_time": re.compile(r"AverageElapsedTime\(ms\) : (\d+\.\d+)"),
+        "throughput_gops": re.compile(r"Throughput (\d+\.\d+) GOPS")
+    }
+    required = ("datatype", "m", "n", "k", "elapsed_time", "throughput_tops")
+    extracted = []
+    current_block = {}
+
+    def flush_block() -> None:
+        # One emit path keeps "shape" as m-n-k; the end-of-file block used "×".
+        if all(key in current_block for key in required):
+            extracted.append({
+                "datatype": current_block["datatype"],
+                "shape": "-".join(current_block[dim] for dim in ("m", "n", "k")),
+                "Throughput(TOPS)": current_block["throughput_tops"],
+                "AverageElapsedTime(ms)": current_block["elapsed_time"],
+            })
+        current_block.clear()
+
+    try:
+        with open(log_path, 'r', encoding='utf-8') as f:
+            for line in f:
+                line = line.strip()
+
+                dt_match = patterns["datatype"].search(line)
+                if dt_match:
+                    current_block["datatype"] = dt_match.group(1)
+
+                mp_match = patterns["mat_params"].search(line)
+                if mp_match:
+                    current_block["m"], current_block["n"], current_block["k"] = mp_match.groups()
+
+                et_match = patterns["elapsed_time"].search(line)
+                if et_match:
+                    current_block["elapsed_time"] = et_match.group(1)
+
+                tp_match = patterns["throughput_gops"].search(line)
+                if tp_match:
+                    tops = round(float(tp_match.group(1)) / 1000, 4)
+                    current_block["throughput_tops"] = str(tops)
+
+                if line == "==============================" and current_block:
+                    flush_block()
+        flush_block()
+    except (OSError, ValueError) as e:
+        print(f"❌ 读取日志失败：{str(e)}")
+        return []
+
+    return extracted
+
+def generate_csv(data: List[Dict[str, str]], output_path: str) -> bool:
+    """Write the records to ``output_path`` as ", "-separated text lines.
+
+    Returns True on success; False (after printing why) for empty data or errors.
+    """
+    columns = ["datatype", "shape", "Throughput(TOPS)", "AverageElapsedTime(ms)"]
+    if not data:
+        print("⚠️  未提取到有效数据，跳过CSV生成")
+        return False
+    try:
+        with open(output_path, 'w', encoding='utf-8') as out:
+            out.write(", ".join(columns) + "\n")
+            out.writelines(", ".join(rec[col] for col in columns) + "\n" for rec in data)
+        print(f"✅ CSV生成成功：{output_path}")
+        return True
+    except Exception as err:
+        print(f"❌ 生成CSV失败：{str(err)}")
+        return False
+
+def main(input_log: str, output_csv: Optional[str] = None):
+    """Summarize one benchmark log into a CSV file, printing progress.
+
+    A falsy ``output_csv`` defaults to "<log stem>_summary.csv" beside the log.
+    """
+    if not os.path.isfile(input_log):
+        print(f"❌ 输入日志文件不存在：{input_log}")
+        return
+
+    if not output_csv:
+        stem = os.path.splitext(os.path.basename(input_log))[0]
+        output_csv = os.path.join(os.path.dirname(input_log), f"{stem}_summary.csv")
+
+    print(f"📊 开始提取日志数据：{input_log}")
+    records = extract_matmul_data(input_log)
+    if not records:
+        print("❌ 未提取到任何有效测试数据")
+        return
+    print(f"✅ 成功提取 {len(records)} 条测试记录")
+    generate_csv(records, output_csv)
+    print("🎯 所有操作完成！")
+
+if __name__ == "__main__":
+    # sys.argv[0] is the script name, so one log path must follow it.
+    if len(sys.argv) < 2:
+        print("用法：")
+        print("  python summarize_f32_f16_bf16_q8_fp8_log.py <输入日志文件路径>")
+        print("示例：")
+        print("  python summarize_f32_f16_bf16_q8_fp8_log.py bench.log")
+        sys.exit(1)
+    # Swap only the extension; str.replace('.log', '.csv') returned other paths unchanged, overwriting the log.
+    input_path = sys.argv[1]
+    output_path = os.path.splitext(input_path)[0] + ".csv"
+    main(input_path, None if output_path == input_path else output_path)
+
diff --git a/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py b/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py
new file mode 100644
index 0000000..31dacbd
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summarize_fp64_tf32_log.py
@@ -0,0 +1,120 @@
+import re
+import sys
+import os
+from typing import List, Dict, Optional
+
+def extract_matmul_data(log_path: str) -> List[Dict[str, str]]:
+    """Collect one record per test block from a gemm_fp64 / gemm_tf32 log.
+
+    A separator line of '=' characters (or end of file) closes the pending
+    block; blocks missing the data type, m/n/k, duration or TFLOPS value are
+    dropped. Durations are converted from microseconds to milliseconds.
+
+    Returns:
+        Records keyed "DataType", "shape" ("m-n-k"), "Compute_ability(TFLOPS)"
+        and "AverageElapsedTime(ms)"; [] if reading or number parsing fails.
+    """
+    patterns = {
+        "datatype": re.compile(r"MatMul (\w+) Test \(MUSA\)"),
+        "mat_params": re.compile(r"m = (\d+), n = (\d+), k = (\d+)"),
+        "duration_us": re.compile(r"Duration:(\s*[\d\.]+) us"),
+        "tflops": re.compile(r"computation-\w+=(\s*[\d\.]+)")
+    }
+    separator = "========================================"
+    needed = ("datatype", "m", "n", "k", "duration_ms", "tflops")
+    extracted = []
+    pending = {}
+
+    def emit_pending() -> None:
+        # Shared by the separator path and the end-of-file path.
+        if all(field in pending for field in needed):
+            extracted.append({
+                "DataType": pending["datatype"],
+                "shape": f"{pending['m']}-{pending['n']}-{pending['k']}",
+                "Compute_ability(TFLOPS)": pending["tflops"],
+                "AverageElapsedTime(ms)": pending["duration_ms"]
+            })
+
+    try:
+        with open(log_path, 'r', encoding='utf-8') as f:
+            for raw in f:
+                line = raw.strip()
+
+                found = patterns["datatype"].search(line)
+                if found:
+                    pending["datatype"] = found.group(1)
+
+                found = patterns["mat_params"].search(line)
+                if found:
+                    pending["m"], pending["n"], pending["k"] = found.groups()
+
+                found = patterns["duration_us"].search(line)
+                if found:
+                    # Logged in microseconds; stored as milliseconds.
+                    microseconds = float(found.group(1).strip())
+                    pending["duration_ms"] = str(round(microseconds / 1000, 6))
+
+                found = patterns["tflops"].search(line)
+                if found:
+                    pending["tflops"] = str(round(float(found.group(1).strip()), 6))
+
+                if line == separator and pending:
+                    emit_pending()
+                    pending.clear()
+        emit_pending()
+    except Exception as e:
+        print(f"❌ 读取日志失败：{str(e)}")
+        return []
+
+    return extracted
+
+def generate_csv(data: List[Dict[str, str]], output_path: str) -> bool:
+    """Write records to ``output_path`` as ","-separated text; return True on success."""
+    if not data:
+        print("⚠️  未提取到有效数据，跳过CSV生成")
+        return False
+    headers = ["DataType", "shape", "Compute_ability(TFLOPS)", "AverageElapsedTime(ms)"]
+    try:
+        with open(output_path, 'w', encoding='utf-8') as f:
+            # Header now uses the same "," delimiter as the rows (it used ", ").
+            f.write(",".join(headers) + "\n")
+            for item in data:
+                f.write(",".join(item[h] for h in headers) + "\n")
+        print(f"✅ CSV生成成功：{output_path}")
+        return True
+    except Exception as e:
+        print(f"❌ 生成CSV失败：{str(e)}")
+        return False
+
+def main(input_log: str, output_csv: Optional[str] = None):
+    """Parse ``input_log`` and write its summary CSV, printing progress.
+
+    A falsy ``output_csv`` defaults to "<log stem>_summary.csv" beside the log.
+    """
+    if not os.path.isfile(input_log):
+        print(f"❌ 输入日志文件不存在：{input_log}")
+        return
+    if not output_csv:
+        stem = os.path.splitext(os.path.basename(input_log))[0]
+        output_csv = os.path.join(os.path.dirname(input_log), f"{stem}_summary.csv")
+
+    print(f"📊 开始提取日志数据：{input_log}")
+    rows = extract_matmul_data(input_log)
+    if not rows:
+        print("❌ 未提取到任何有效测试数据")
+        return
+    print(f"✅ 成功提取 {len(rows)} 条测试记录")
+    generate_csv(rows, output_csv)
+    print("🎯 所有操作完成！")
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("用法：")
+        print("  python summarize_fp64_tf32_log.py <输入日志文件路径>")
+        print("示例：")
+        print("  python summarize_fp64_tf32_log.py bench.log")
+        sys.exit(1)
+    # Swap only the extension; str.replace('.log', '.csv') returned other paths unchanged, overwriting the log.
+    input_path = sys.argv[1]
+    output_path = os.path.splitext(input_path)[0] + ".csv"
+    main(input_path, None if output_path == input_path else output_path)
diff --git a/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py b/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py
new file mode 100644
index 0000000..5edfa15
--- /dev/null
+++ b/base_test/matmul_test/exetrct_log_tools/summary_mixed_data.py
@@ -0,0 +1,64 @@
+import re
+import sys
+import csv
+import os
+
+# Exactly one log path is required; the usage text names this script itself.
+if len(sys.argv) < 2:
+    print("Usage: python summary_mixed_data.py <log_file>")
+    sys.exit(1)
+log_file = sys.argv[1]
+print(f"📊 正在读取并解析日志：{log_file}")
+
+if not os.path.isfile(log_file):  # a directory passes os.path.exists but cannot be opened
+    print("❌ 日志文件不存在")
+    sys.exit(1)
+
+# Rows for the CSV, one dict per test that produced a result line.
+records = []
+
+# A "测试:" line announces a test's M/N/K and type; the AverageElapsedTime
+# line that follows carries its timing and throughput.
+re_start = re.compile(r"测试:\s*M=(\d+),\s*N=(\d+),\s*K=(\d+),\s*Type=([\w:]+)")
+re_result = re.compile(r"AverageElapsedTime\(ms\)\s*:\s*([\d\.]+)\s*,\s*Throughput\s*([\d\.]+)\s*GOPS")
+
+# Key order of every record.
+columns = ("M", "N", "K", "Type", "AvgTime(ms)", "GOPS")
+
+# (M, N, K, Type) of the announced test still waiting for its result, if any.
+pending = None
+
+with open(log_file, "r", encoding="utf-8") as f:
+    for line in f:
+        line = line.strip()
+
+        # A new announcement replaces any test that never reported a result.
+        started = re_start.search(line)
+        if started:
+            pending = started.groups()
+            continue
+
+        # Result lines without a preceding announcement are ignored.
+        finished = re_result.search(line)
+        if finished is None or pending is None:
+            continue
+
+        # All values are kept as the matched strings.
+        records.append(dict(zip(columns, pending + finished.groups())))
+        # Forget the shape so a later stray result line cannot reuse it.
+        pending = None
+
+# Write the CSV beside the log; nothing is written when no record was parsed.
+if not records:
+    print("⚠️ 未提取到任何有效数据")
+    sys.exit(0)
+
+# Swap only the extension; str.replace(".log", ".csv") returned other paths unchanged, overwriting the log.
+csv_path = os.path.splitext(log_file)[0] + ("_summary.csv" if log_file.endswith(".csv") else ".csv")
+with open(csv_path, "w", newline="", encoding="utf-8") as csvfile:
+    writer = csv.DictWriter(csvfile, fieldnames=records[0].keys())
+    writer.writeheader()
+    writer.writerows(records)
+print(f"✅ 解析完成，共 {len(records)} 条数据")
+print(f"📄 CSV 已生成：{csv_path}")
+
diff --git a/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh b/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh
new file mode 100644
index 0000000..4d33fd1
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/build_gemm_fp64.sh
@@ -0,0 +1 @@
+mcc gemm_fp64.mu -lmusart -lmublas -o gemm_fp64 --offload-arch=mp_31  # builds ./gemm_fp64; NOTE(review): mp_31 is hard-coded, confirm it matches the target GPU
diff --git a/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh b/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh
new file mode 100644
index 0000000..83b5acb
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/build_gemm_tf32.sh
@@ -0,0 +1 @@
+g++ gemm_tf32.cpp -std=c++17 -I/usr/local/musa/include -L /usr/local/musa/lib/ -fopenmp -lmudnn -lmusart -o gemm_tf32 -O2  # builds ./gemm_tf32 against the MUSA install under /usr/local/musa
diff --git a/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu b/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu
new file mode 100644
index 0000000..ac62c9e
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/gemm_fp64.mu
@@ -0,0 +1,122 @@
+#include <chrono>
+#include <iostream>
+#include <mublas.h>
+#include <musa_runtime.h>
+#include <vector>
+
+size_t M = 16384;  // GEMM dimensions read by test(); main() overwrites all three from argv
+size_t N = 16384;
+size_t K = 16384;
+
+struct PrecisionConfig  // per-run settings consumed by test()
+{
+  int bytesPerElement;  // multiplied into every device allocation and copy size in test()
+  const char *name;  // label printed in the average-duration line
+  int NUM_ITERATIONS;  // number of timed mublasDgemm calls
+  int WARMUP_ITERATIONS = 10;  // untimed calls issued first; main() passes 40
+};
+
+// Times config.NUM_ITERATIONS mublasDgemm calls (after the untimed warm-up
+// calls) and prints the average call duration in us and the achieved TFLOPS.
+void test(const PrecisionConfig &config)
+{
+  // The average below divides by NUM_ITERATIONS, so reject a zero/negative count.
+  if (config.NUM_ITERATIONS <= 0)
+  {
+    std::cerr << "NUM_ITERATIONS must be positive, got " << config.NUM_ITERATIONS << std::endl;
+    return;
+  }
+  double *d_A, *d_B, *d_C;
+  std::vector<double> h_A(M * K, double(0.9f));
+  std::vector<double> h_B(K * N, double(1.2f));
+  std::vector<double> h_C(M * N);
+
+  // NOTE(review): allocation, copy and mublas return codes are not checked.
+  musaMalloc(&d_A, M * K * config.bytesPerElement);
+  musaMalloc(&d_B, K * N * config.bytesPerElement);
+  musaMalloc(&d_C, M * N * config.bytesPerElement);
+  musaMemcpy(d_A, h_A.data(), M * K * config.bytesPerElement, musaMemcpyHostToDevice);
+  musaMemcpy(d_B, h_B.data(), K * N * config.bytesPerElement, musaMemcpyHostToDevice);
+  mublasHandle_t handle;
+  mublasCreate(&handle);
+  double alpha = 1.0f;
+  double beta = 0.0f;
+
+  for (int i = 0; i < config.WARMUP_ITERATIONS; ++i)
+  {
+    mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_T,
+                M, N, K, &alpha,
+                d_A, M,
+                d_B, N,
+                &beta,
+                d_C, M);
+  }
+
+  musaError_t syncError = musaDeviceSynchronize();
+  auto start = std::chrono::high_resolution_clock::now();
+  if (syncError != musaSuccess)
+  {
+    std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl;
+  }
+
+  for (int i = 0; i < config.NUM_ITERATIONS; ++i)
+  {
+    mublasDgemm(handle, MUBLAS_OP_N, MUBLAS_OP_T,
+                M, N, K, &alpha,
+                d_A, M,
+                d_B, N,
+                &beta,
+                d_C, M);
+  }
+  syncError = musaDeviceSynchronize();
+  auto end = std::chrono::high_resolution_clock::now();
+  if (syncError != musaSuccess)
+  {
+    std::cout << "MUSA error: " << musaGetErrorString(syncError) << std::endl;
+  }
+  auto duration =
+      std::chrono::duration_cast<std::chrono::microseconds>(end - start);
+  std::cout << "Average " << config.name << " Single Op Duration: "
+            << duration.count() / config.NUM_ITERATIONS << " us" << std::endl;
+  double time_second = duration.count() / 1.0e6;
+  double flops = 2.0 * M * N * K * config.NUM_ITERATIONS;
+  double FLOPS = flops / time_second;
+  double TFLOPS = FLOPS / 1.0e12;
+  std::cout << "[FlagPerf Result]" << "computation-FP64=" << TFLOPS << "TFLOPS"
+            << std::endl;
+
+  musaMemcpy(h_C.data(), d_C, M * N * config.bytesPerElement, musaMemcpyDeviceToHost);
+  musaFree(d_A);
+  musaFree(d_B);
+  musaFree(d_C);
+  mublasDestroy(handle);
+}
+
+// Reads <m> <n> <k> <iter> from argv and runs the FP64 GEMM benchmark on device 0.
+int main(int argc, char* argv[]) {
+  if (argc != 5) {
+      std::cerr << "Usage: " << argv[0] << " <m> <n> <k> <iter>" << std::endl;
+      std::cerr << "Example: " << argv[0] << " 128 128 128 10" << std::endl;
+      return EXIT_FAILURE;
+  }
+
+  int m = std::atoi(argv[1]);
+  int n = std::atoi(argv[2]);
+  int k = std::atoi(argv[3]);
+  int iter = std::atoi(argv[4]);
+  // atoi returns 0 for non-numeric text; negative sizes would wrap in the size_t globals.
+  if (m <= 0 || n <= 0 || k <= 0 || iter <= 0) {
+      std::cerr << "Error: <m> <n> <k> <iter> must all be positive integers" << std::endl;
+      return EXIT_FAILURE;
+  }
+
+  std::cout << "========================================" << std::endl;
+  std::cout << "MatMul FP64 Test (MUSA)" << std::endl;
+  std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl;
+  std::cout << "Test Iterations = " << iter << std::endl;
+  M = m; N = n; K = k;
+  musaSetDevice(0);
+  PrecisionConfig fp64_PrecisionConfig = {sizeof(double), "FP64", iter, 40};
+  test(fp64_PrecisionConfig);
+  return 0;
+}
diff --git a/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp b/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp
new file mode 100644
index 0000000..6221eed
--- /dev/null
+++ b/base_test/matmul_test/fp64_tf32_src/gemm_tf32.cpp
@@ -0,0 +1,678 @@
+/* Copyright @2020-2024 Moore Threads Technology Co., Ltd("Moore Threads"). All
+ * rights reserved.
+ *
+ * This software ("this software and its documentations" or "the software") is
+ * protected by Copyright and the information contained herein is confidential.
+ *
+ * The software contained herein is PROPRIETARY to Moore Threads and is being
+ * provided under the terms and conditions of a form of Moore Threads software
+ * license agreement by and between Moore Threads and Licensee ("License
+ * Agreement") or electronically accepted by Licensee. Notwithstanding any
+ * terms or conditions to the contrary in the License Agreement, copy or
+ * disclosure of the software to any third party without the express written
+ * consent of Moore Threads is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE LICENSE
+ * AGREEMENT, MOORE THREADS MAKES NO REPRESENTATION ABOUT ANY WARRANTIES,
+ * INCLUDING BUT NOT LIMITED TO THE SUITABILITY OF THE SOFTWARE FOR ANY
+ * PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF
+ * ANY KIND. MOORE THREADS DISCLAIMS ALL WARRANTIES WITH REGARD TO THE
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL MOORE THREADS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THE SOFTWARE.
+ */
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+#include <chrono>
+#include <map>
+#include <thread>
+#include <type_traits>
+#include <typeinfo>
+
+#include <iostream>
+#include <mudnn.h>
+#include <cstring>
+#include <random>
+
+#include <eigen3/Eigen/Core>
+
+using qint8 = int8_t;
+
+#define SHOW printf
+
+namespace Eigen {
+    struct half;
+    struct bfloat16;
+}
+using Eigen::bfloat16;
+using Eigen::half;
+
+
+struct MatMulParam {  // problem description copied field by field by the TestMatMul constructor
+    bool split_k{ false };
+    bool trans_a{ false };  // presumably "A is transposed" — confirm against the muDNN MatMul setup
+    bool trans_b{ true };
+    int batch{ 1 };
+    int m{ 6144 };  // CheckParams() raises non-positive m/n/k to 1
+    int n{ 8192 };
+    int k{ 19200 };
+    double alpha{ 1.0 };
+    double beta{ 0.0 };  // CheckParams() resets a non-zero beta to 0 when mode == 0
+    double gamma{ 0.0 };  // CheckParams() resets any non-zero gamma to 0
+    int mode{ 0 }; // 0 tensor, 1 scalar
+};
+
+#define CHECK_MUSA(...) /* run a MUSA call; exit(1) if it reports an error */  \
+  do {                                                                         \
+    int err = CheckMusaError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__);   \
+    if (err)                                                                   \
+      exit(err);                                                               \
+  } while (0)
+
+#define CHECK_ERR(...) /* exit(1) when the boolean failure flag is true */     \
+  do {                                                                         \
+    int err = CheckError(__VA_ARGS__, #__VA_ARGS__, __FILE__, __LINE__);       \
+    if (err)                                                                   \
+      exit(err);                                                               \
+  } while (0)
+
+int CheckMusaError(musaError_t code, const char* expr, const char* file,
+    int line) {
+    // A zero (success) code returns 0; anything else is printed with its
+    // error string and source location, and 1 is returned.
+    if (!code) return 0;
+    printf("MUSA error at %s:%d, code=%d (%s) in '%s'", file, line, (int)code,
+        musaGetErrorString(code), expr);
+    return 1;
+}
+
+int CheckError(bool code, const char* expr, const char* file, int line) {
+    // `code` is a failure flag: false returns 0 silently, true prints the
+    // failing expression with its location and returns 1.
+    if (!code) return 0;
+    printf("General error at %s:%d, code=%d (%s) in '%s'", file, line,
+        (int)code, "general error", expr);
+    return 1;
+}
+
+// Fills data[0..size) using an engine seeded with `seed`; both distributions
+// span [0, 0], so every element is written as zero.
+template <typename Type, typename RandomType>
+void GenerateRandom(Type* data, int64_t size, uint seed = 2333) {
+    std::default_random_engine engine(seed);
+    // if constexpr compiles only the branch that matches RandomType.
+    if constexpr (std::is_floating_point_v<RandomType>) {
+        std::uniform_real_distribution<float> dist(0, 0);
+        for (int64_t i = 0; i < size; i++) {  // int64_t index matches `size`
+            data[i] = (Type)(dist(engine));
+        }
+    }
+    else {
+        // int8_t is not a permitted IntType for uniform_int_distribution.
+        std::uniform_int_distribution<int> dist(0, 0);
+        for (int64_t i = 0; i < size; i++) {
+            data[i] = (Type)(dist(engine));
+        }
+    }
+}
+
+void MemFree(void* ptr) {
+    // Deleter handed to MemoryHandler; a null pointer is left alone.
+    if (ptr == nullptr) return;
+    musaFree(ptr);
+}
+
+::musa::dnn::MemoryHandler MemoryFunc(size_t size) {  // zero-filled device buffer of `size` bytes, released through MemFree
+    void* data = nullptr;
+    if (size) {
+        CHECK_MUSA(musaMalloc(&data, size));  // exit on failure instead of memset-ing and returning a bad pointer
+        CHECK_MUSA(musaMemset(data, 0, size));
+    }
+    return ::musa::dnn::MemoryHandler(data, MemFree);
+}
+
+enum DType {  // element types the TestMatMul constructor maps to a name and byte size
+    f32,  // "float32", 4 bytes
+    f16,  // "float16", 2 bytes
+    q8,  // "qint8", 1 byte
+    bf16,  // "bfloat16", 2 bytes
+};
+
+
+class TestMatMul {
+public:
+    inline float F32MaskFormatTF32(float f) {
+        // Zero the low 13 bits of the float's bit pattern (mask 0xffffe000).
+        unsigned int bits = 0;
+        std::memcpy(&bits, &f, sizeof(f));
+        bits &= 0xffffe000;
+        std::memcpy(&f, &bits, sizeof(f));
+        return f;
+    }
+
+    // Random num generator
+
+
+    TestMatMul(const musaStream_t& _stream, const int _device_id, const DType _dtype, const MatMulParam _param, const int _iters)
+    {  // stores the run configuration and creates the muDNN handle bound to `_stream`
+        stream = _stream;
+        device_id = _device_id;
+        dtype = _dtype;
+        dtype_size = 4;
+
+        switch (dtype) {  // derive the type name and the element size in bytes
+            case DType::f32:
+                dtype_str = "float32";
+                dtype_size = 4;
+                break;
+            case DType::f16:
+                dtype_str = "float16";
+                dtype_size = 2;
+                break;
+            case DType::bf16:
+                dtype_str = "bfloat16";
+                dtype_size = 2;
+                break;
+            case DType::q8:
+                dtype_str = "qint8";
+                dtype_size = 1;
+                break;
+            default:
+                bool DType_Not_Suppoted = true;
+                CHECK_ERR(DType_Not_Suppoted);  // CHECK_ERR exits the process when its argument is true
+                break;
+        }
+        split_k = _param.split_k;  // copy the problem description field by field
+        trans_a = _param.trans_a;
+        trans_b = _param.trans_b;
+        batch = _param.batch;
+        m = _param.m;
+        n = _param.n;
+        k = _param.k;
+        alpha = _param.alpha;
+        beta = _param.beta;
+        gamma = _param.gamma;
+        mode = _param.mode;
+
+        iters = _iters;
+
+        handle = new ::musa::dnn::Handle(device_id);  // deleted in the destructor
+        handle->SetStream(stream);
+    };
+    ~TestMatMul() {  // frees whichever host/device buffers are non-null, then the handle
+#define FREE_H(_PTR)                                                           \
+  if (_PTR != nullptr) {                                                       \
+    operator delete(_PTR);                                                     \
+  }
+#define FREE_D(_PTR)                                                           \
+  if (_PTR != nullptr) {                                                       \
+    CHECK_MUSA(musaFree(_PTR));                                                \
+  }
+
+        FREE_H(h_buf_a);  // host buffers go through operator delete
+        FREE_H(h_buf_b);
+        FREE_H(h_buf_c);
+        FREE_H(h_buf_o);
+        FREE_H(h_buf_z);
+
+        FREE_D(d_a);  // device buffers go through musaFree; CHECK_MUSA exits on failure
+        FREE_D(d_b);
+        FREE_D(d_c);
+        FREE_D(d_z);
+
+        FREE_D(d_base);
+        FREE_D(d_bool);
+        FREE_D(d_nonz);
+        FREE_H(h_nonz);
+
+#undef FREE_H
+#undef FREE_D
+
+        if (handle) {
+            delete handle;
+        }
+    };
+
+    bool Test() {  // runs warm-up plus the timed loop and prints the result; always returns true
+        // check parameters
+        CheckParams();  // NOTE(review): return value is ignored
+        // initial memory && dnn tensor op
+        Init();  // NOTE(review): return value is ignored
+        // warm up && prepare base golden
+        int warmup_iters = 40;
+        for (int i = 0; i < warmup_iters; i++) {
+            Exec();
+        }
+        // main loop
+        float elapsed_ms = 0.f;
+        musaEvent_t start, stop;  // only created and used when `performance` is set
+        if (performance) {
+            CHECK_MUSA(musaEventCreate(&start));
+            CHECK_MUSA(musaEventCreate(&stop));
+            CHECK_MUSA(musaEventRecord(start, stream));
+        }
+
+        std::chrono::milliseconds bubble_time(bubble);
+        std::chrono::milliseconds duration_time(duration);
+        std::chrono::milliseconds show_gap_time(60000);  // progress message interval: 60 s
+        int show_gap_count = 0;
+        auto start_time = std::chrono::steady_clock::now();
+        auto current_time = start_time;
+        const bool blocking = (bubble > 0) || (iters == 0 && duration > 0);
+        int stable_check_gap_count = 1;
+        int run_iters_count = 0;
+        int i = 0;
+        while ((iters > 0 && i < iters) ||  // fixed iteration count, or (iters == 0) run until `duration` ms have elapsed
+            (iters == 0 && (current_time - start_time) <= duration_time)) {
+            // operator running
+            Exec(blocking);
+
+            if (bubble > 0) {
+                // SHOW("sleeping %d ms\n", bubble);
+                std::this_thread::sleep_for(bubble_time);
+            }
+            current_time = std::chrono::steady_clock::now();
+            if ((iters == 0 && duration > 0) &&
+                (current_time - start_time) > show_gap_time * show_gap_count) {
+                std::cout << "--- now execution time passed "
+                    << (show_gap_time * show_gap_count).count() << std::endl;
+                show_gap_count++;
+            }
+            // SHOW("run loop %d\n", run_iters_count);
+            i++, stable_check_gap_count++, run_iters_count++;
+        }
+        // performance testing and stability checking are mutually exclusive
+        if (performance) {
+            CHECK_MUSA(musaEventRecord(stop, stream));
+            CHECK_MUSA(musaEventSynchronize(stop));
+            CHECK_MUSA(musaEventElapsedTime(&elapsed_ms, start, stop));
+            elapsed_ms = elapsed_ms / run_iters_count;  // NOTE(review): run_iters_count stays 0 when iters < 0
+            ShowPerformance(elapsed_ms, (size_t)m * n * k * 2 / elapsed_ms * 1e-6,  // 2*m*n*k operations per call, scaled to GOPS
+                !stable_check);
+            CHECK_MUSA(musaEventDestroy(start));
+            CHECK_MUSA(musaEventDestroy(stop));
+        }
+        return true;
+    }
+
+    void ShowPerformance(float t, float gops, bool credible) {
+        // Prints the two result lines of a run. Test() passes the average
+        // milliseconds per call as `t` and the throughput in GOPS as `gops`;
+        // `credible` is accepted but not used.
+        const double duration_us = t * 1.0e3;
+        const double tflops = gops / 1.0e3;
+        SHOW("Average TF32 Single Op Duration:%f us\n", duration_us);
+        SHOW("[FlagPerf Result]computation-TF32=%f TFLOPS\n", tflops);
+    }
+
+private:
+    void* h_buf_a = nullptr;
+    void* h_buf_b = nullptr;
+    void* h_buf_c = nullptr;
+    void* h_buf_o = nullptr;
+    void* h_buf_z = nullptr;
+
+    void* d_a = nullptr;
+    void* d_b = nullptr;
+    void* d_c = nullptr;
+    void* d_z = nullptr;
+
+    void* d_base = nullptr;
+    void* d_bool = nullptr;
+    void* d_nonz = nullptr;
+    int64_t* h_nonz = nullptr;
+
+    bool result_check = false;
+    bool stable_check = false;
+    bool stable_check_gpu = false;
+    bool performance = true;
+    bool verbose = false;
+    int iters = 1;
+    int duration = 0;
+    int bubble = 0;
+    int gap = 1;
+    uint seed = 2333;
+
+    DType dtype = DType::f32;
+    std::string dtype_str = "float32";
+    size_t dtype_size = 4;
+    bool split_k = false;
+    bool trans_a = false;
+    bool trans_b = false;
+    int batch = 1;
+    int m = 1;
+    int n = 1;
+    int k = 1;
+    double alpha = 1.0;
+    double beta = 0.0;
+    double gamma = 0.0;
+    int mode = 0;
+
+    // qint8 variables
+    const float scale_a = 1.f / 32.f;
+    const float scale_b = 1.f / 32.f;
+    const float scale_c = 32.f;
+
+    // mudnn variables
+    musaStream_t stream;
+    int device_id;
+    ::musa::dnn::Handle* handle;
+    ::musa::dnn::MatMul op;
+
+    ::musa::dnn::Tensor tensor_a;
+    ::musa::dnn::Tensor tensor_b;
+    ::musa::dnn::Tensor tensor_c;
+    ::musa::dnn::Tensor tensor_z;
+    ::musa::dnn::Tensor tensor_base;
+    ::musa::dnn::Tensor tensor_bool;
+    ::musa::dnn::Tensor tensor_nonz;
+
+private:
+
+
+    ::musa::dnn::Tensor::Type GetmuDNNType(const std::string& dtype) {
+        // Map a dtype name to its muDNN tensor type; unknown names log to stderr and yield FLOAT.
+        using T = ::musa::dnn::Tensor::Type;
+        static std::map<std::string, T> type_mapping = {
+            {"int8", T::INT8},
+            {"int16", T::INT16},
+            {"int32", T::INT32},
+
+            {"int", T::INT64},
+            {"int64", T::INT64},
+
+            {"uint8", T::UINT8},
+            {"uint16", T::UINT16},
+            {"uint32", T::UINT32},
+
+            {"uint", T::UINT64},
+            {"uint64", T::UINT64},
+
+            {"half", T::HALF},
+            {"float16", T::HALF},
+            {"bfloat16", T::BFLOAT16},
+
+            {"float32", T::FLOAT},
+            {"qint8", T::QINT8},
+
+            {"float", T::FLOAT},
+            {"float64", T::DOUBLE},
+            {"double", T::DOUBLE},
+
+            {"bool", T::BOOL},
+        };
+        const auto entry = type_mapping.find(dtype);
+        if (entry == type_mapping.end()) {
+            std::cerr << "GetmuDNNType error : " << dtype << std::endl;
+            return type_mapping.at("float");
+        }
+        return entry->second;
+    }
+    bool CheckParams() {  // replaces unsupported settings with fallbacks, reporting each one on stderr
+        bool pass = true;  // never set to false below, so this function always returns true
+        // param checking
+        if (mode != 0 && mode != 1) {
+            std::cerr << "MatMul mode setting error, fallback 0" << std::endl;
+            mode = 0;
+        }
+        if (m <= 0 || n <= 0 || k <= 0) {
+            std::cerr << "MatMul param setting error, fallback 1" << std::endl;
+            m = m > 0 ? m : 1;  // only the non-positive dimensions are replaced
+            n = n > 0 ? n : 1;
+            k = k > 0 ? k : 1;
+        }
+        if (gamma != 0) {
+            std::cerr << "MatMul unsupported gamma != 0 temporarily, fallback 0"
+                << std::endl;
+            gamma = 0;
+        }
+        if (beta != 0) {
+            if (mode == 0) {  // a non-zero beta is kept only when mode == 1
+                std::cerr << "MatMul unsupported beta != 0 when mode == 0, fallback 0"
+                    << std::endl;
+                beta = 0;
+            }
+
+        }
+        if (dtype == DType::q8) {
+            // To be removed when binary supports QINT8
+            if (stable_check_gpu) {
+                std::cerr
+                    << "MatMul unsupported qint8 for stable_check_gpu, fallback cpu "
+                    << std::endl;
+                stable_check_gpu = false;
+            }
+            if (mode != 0) {
+                std::cerr << "MatMul mode must be 0 when qint8, fallback 0"
+                    << std::endl;
+                mode = 0;
+            }
+        }
+
+        return pass;
+    }
+
+    // Allocate and fill the host/device buffers for A (m*k), B (k*n) and C (m*n)
+    // elements, bind them to tensor_a/b/c/z and configure `op`. Returns true when
+    // setup completes; error handling is delegated to CHECK_MUSA / CHECK_ERR.
+    bool Init() {
+        size_t nr_elem_a = (size_t)(m)*k;
+        size_t nr_elem_b = (size_t)(k)*n;
+        size_t nr_elem_c = (size_t)(m)*n;
+        size_t nr_elem_z = (size_t)(n);
+        size_t size_a = nr_elem_a * dtype_size;
+        size_t size_b = nr_elem_b * dtype_size;
+        size_t size_c = nr_elem_c * dtype_size;
+        size_t size_z = nr_elem_z * dtype_size;
+
+        // Compare the device memory this test needs against what is currently free.
+        size_t mem_total, mem_free;
+        CHECK_MUSA(musaMemGetInfo(&mem_free, &mem_total));
+        size_t available_gpu_mem = mem_free;
+        size_t total_gpu_mem = mem_total;
+        size_t need_gpu_mem = size_a + size_b + size_c;
+        if (gamma != 0) {
+            need_gpu_mem += size_z;
+        }
+        if (stable_check && stable_check_gpu) {
+            need_gpu_mem +=
+                size_c + sizeof(bool) * nr_elem_c + sizeof(int64_t) * m * n * 2;
+        }
+        if ((need_gpu_mem > available_gpu_mem) || verbose) {
+            SHOW("%s : Need Device Memory %.2f GiB, Available Device Memory %.2f GiB "
+                "(Total %.2f GiB)\n",
+                (need_gpu_mem > available_gpu_mem) ? "Error" : "Verbose",
+                need_gpu_mem / 1024.f / 1024 / 1024,
+                available_gpu_mem / 1024.f / 1024 / 1024,
+                total_gpu_mem / 1024.f / 1024 / 1024);
+        }
+        CHECK_ERR(need_gpu_mem > available_gpu_mem);
+
+        // host buffer
+        h_buf_a = operator new(size_a); // new char[size_a]();
+        h_buf_b = operator new(size_b); // new char[size_b]();
+        h_buf_c = operator new(size_c); // new char[size_c]();
+        h_buf_o = operator new(size_c); // new char[size_c]();
+
+        // host data initialization
+        if (dtype == DType::f16) {
+            GenerateRandom<half, float>((half*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<half, float>((half*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<half, float>((half*)(h_buf_c), nr_elem_c, seed);
+        }
+        else if (dtype == DType::bf16) {
+            GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_c), nr_elem_c, seed);
+        }
+        else if (dtype == DType::q8) {
+            GenerateRandom<qint8, qint8>((qint8*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<qint8, qint8>((qint8*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<qint8, qint8>((qint8*)(h_buf_c), nr_elem_c, seed);
+        }
+        else {
+            GenerateRandom<float, float>((float*)(h_buf_a), nr_elem_a, seed);
+            GenerateRandom<float, float>((float*)(h_buf_b), nr_elem_b, seed);
+            GenerateRandom<float, float>((float*)(h_buf_c), nr_elem_c, seed);
+        }
+
+        // tensor float 32 format
+        if ((dtype == DType::f32) && mode == 0) {
+            for (size_t i = 0; i < nr_elem_a; i++) {
+                ((float*)h_buf_a)[i] = (float)F32MaskFormatTF32(((float*)h_buf_a)[i]);
+            }
+            for (size_t i = 0; i < nr_elem_b; i++) {
+                ((float*)h_buf_b)[i] = (float)F32MaskFormatTF32(((float*)h_buf_b)[i]);
+            }
+            for (size_t i = 0; i < nr_elem_c; i++) {
+                ((float*)h_buf_c)[i] = (float)F32MaskFormatTF32(((float*)h_buf_c)[i]);
+            }
+        }
+
+        // device buffer
+        CHECK_MUSA(musaMalloc(&d_a, size_a));
+        CHECK_MUSA(musaMalloc(&d_b, size_b));
+        CHECK_MUSA(musaMalloc(&d_c, size_c));
+        // transfer host data to device
+        CHECK_MUSA(musaMemcpy(d_a, h_buf_a, size_a, musaMemcpyHostToDevice));
+        CHECK_MUSA(musaMemcpy(d_b, h_buf_b, size_b, musaMemcpyHostToDevice));
+        CHECK_MUSA(musaMemcpy(d_c, h_buf_c, size_c, musaMemcpyHostToDevice));
+
+        // host and device buffer for gamma
+        if (gamma != 0) {
+            h_buf_z = new char[size_z]();
+            // Fill the host buffer first so that the upload below carries the
+            // random values rather than the zero-initialised contents.
+            if (dtype == DType::f16) {
+                GenerateRandom<half, float>((half*)(h_buf_z), nr_elem_z, seed);
+            }
+            else if (dtype == DType::bf16) {
+                GenerateRandom<bfloat16, float>((bfloat16*)(h_buf_z), nr_elem_z, seed);
+            }
+            else if (dtype == DType::q8) {
+                GenerateRandom<qint8, qint8>((qint8*)(h_buf_z), nr_elem_z, seed);
+            }
+            else {
+                GenerateRandom<float, float>((float*)(h_buf_z), nr_elem_z, seed);
+            }
+            CHECK_MUSA(musaMalloc(&d_z, size_z));
+            CHECK_MUSA(musaMemcpy(d_z, h_buf_z, size_z, musaMemcpyHostToDevice));
+        }
+
+        // Describe the device buffers to muDNN; qint8 tensors a/b/c also get a scale.
+        ::musa::dnn::Tensor::Type ttype = GetmuDNNType(dtype_str);
+        tensor_a.SetAddr(d_a);
+        tensor_a.SetType(ttype);
+        if (DType::q8 == dtype) {
+            tensor_a.SetQuantizationInfo(scale_a);
+        }
+        if (trans_a) {
+            tensor_a.SetNdInfo({ k, m });
+        }
+        else {
+            tensor_a.SetNdInfo({ m, k });
+        }
+
+        tensor_b.SetAddr(d_b);
+        tensor_b.SetType(ttype);
+        if (DType::q8 == dtype) {
+            tensor_b.SetQuantizationInfo(scale_b);
+        }
+        if (trans_b) {
+            tensor_b.SetNdInfo({ n, k });
+        }
+        else {
+            tensor_b.SetNdInfo({ k, n });
+        }
+
+        tensor_c.SetAddr(d_c);
+        tensor_c.SetType(ttype);
+        tensor_c.SetNdInfo({ m, n });
+        if (DType::q8 == dtype) {
+            tensor_c.SetQuantizationInfo(scale_c);
+        }
+
+        tensor_z.SetAddr(d_z);
+        tensor_z.SetType(ttype);
+        tensor_z.SetNdInfo({ n });
+
+        CHECK_MUSA(musaStreamSynchronize(stream));
+        CHECK_MUSA(musaDeviceSynchronize());
+
+        op.SetTranspose(trans_a, trans_b);
+        // op.SetSplitK(split_k);
+        op.SetAlpha(alpha);
+        op.SetBeta(beta);
+        op.SetGamma(gamma);
+        op.SetComputeMode(static_cast<::musa::dnn::MatMul::ComputeMode>(mode));
+        return true;
+    }
+
+    // Issue one op.RunWithBiasAdd call on tensor_c/a/b/z; with sync == true, also
+    // wait for `stream` to finish before returning.
+    void Exec(bool sync = false) {
+        const auto status = op.RunWithBiasAdd(*handle, tensor_c, tensor_a, tensor_b, tensor_z, MemoryFunc);
+        CHECK_ERR(::musa::dnn::Status::SUCCESS != status);
+        CHECK_MUSA(musaGetLastError());
+        if (sync) { CHECK_MUSA(musaStreamSynchronize(stream)); }
+    }
+};
+
+// Run the DType::f32 matmul test with a default MatMulParam and 42000 iterations
+// on the current device; returns Test()'s bool result converted to int.
+int RunMatMul() {
+    // Placeholder value only: musaGetDevice() overwrites it with the current device.
+    int device_id = 5;
+    CHECK_MUSA(musaGetDevice(&device_id));
+    MatMulParam param;
+    const int kIterations = 42000;
+    musaStream_t stream;
+    CHECK_MUSA(musaStreamCreate(&stream));
+    TestMatMul test_mm(stream, device_id, DType::f32, param, kIterations);
+    const bool outcome = test_mm.Test();
+    CHECK_MUSA(musaStreamDestroy(stream));
+    return outcome;
+}
+
+
+// Entry point: `<prog> <m> <n> <k> <iter>` runs the DType::f32 matmul test on the
+// current device and maps Test()'s result to the process exit status.
+int main(int argc, char* argv[]) {
+    if (argc != 5) {
+        std::cerr << "Usage: " << argv[0] << " <m> <n> <k> <iter>" << std::endl;
+        std::cerr << "Example: " << argv[0] << " 128 128 128 10" << std::endl;
+        return EXIT_FAILURE;
+    }
+    // std::atoi yields 0 for non-numeric text; no further validation happens here.
+    int m = std::atoi(argv[1]);
+    int n = std::atoi(argv[2]);
+    int k = std::atoi(argv[3]);
+    int iter = std::atoi(argv[4]);
+
+    std::cout << "========================================" << std::endl;
+    std::cout << "MatMul TF32 Test (MUSA)" << std::endl;
+    std::cout << "m = " << m << ", n = " << n << ", k = " << k << std::endl;
+    std::cout << "Test Iterations = " << iter << std::endl;
+
+    int device_id = 0;
+    CHECK_MUSA(musaGetDevice(&device_id));
+    MatMulParam param;
+    param.m = m;
+    param.n = n;
+    param.k = k;
+    musaStream_t stream;
+    CHECK_MUSA(musaStreamCreate(&stream));
+    TestMatMul test_mm(stream, device_id, DType::f32, param, iter);
+    bool ret = test_mm.Test();
+    CHECK_MUSA(musaStreamDestroy(stream));
+    // Assumes Test() returns true on success (as Init() does); a process reports success with 0.
+    return ret ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh b/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh
new file mode 100644
index 0000000..736c237
--- /dev/null
+++ b/base_test/matmul_test/test_gemm_f32_f16_bf16_q8_fp8.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+# Benchmark mudnn_bench matmul for every data type in TEST_TYPES over a list of
+# M N K sizes; all output goes to one log file, which is then summarized.
+# Paths are relative to the current working directory.
+input_data=$(cat <<'EOF'
+128	128	128
+256	256	256
+512	512	512
+1024	1024	1024
+2048	2048	2048
+4096	4096	4096
+8192	8192	8192
+4098	4098	4098
+8190	8190	8190
+EOF
+)
+test_iter=1000
+
+TEST_TYPES=("f32" "f16" "bf16" "q8" "float8_e4m3" "float8_e5m2")
+BENCH="../bin/mudnn_bench"
+LOG_DIR="mudnn_bench_logs"
+mkdir -p "$LOG_DIR"
+log_file="${LOG_DIR}/bench_f32_f16_bf16_q8_fp8.log"
+> "$log_file"
+
+# Fail fast instead of logging one "not found" error per size and type.
+if [ ! -f "$BENCH" ]; then
+    echo "错误：未找到 $BENCH 可执行文件" | tee -a "$log_file"
+    exit 1
+fi
+
+for type in "${TEST_TYPES[@]}"; do
+    echo "开始测试数据类型：$type"
+    # Default IFS splits on tabs and spaces, so rows are not silently skipped
+    # when the size list is re-indented with spaces.
+    while read -r m n k; do
+        echo "$m $n $k"
+        if [[ -n "$m" && -n "$n" && -n "$k" ]]; then
+            MUSA_VISIBLE_DEVICES=7 "$BENCH" -m -t "$type" \
+                --mm_m="$m" --mm_n="$n" --mm_k="$k" --mm_mode=0 \
+                --tm i --tmv "$test_iter" -p \
+                >> "$log_file" 2>&1
+            sleep 2
+        fi
+    done < <(echo "$input_data")
+done
+python exetrct_log_tools/summarize_f32_f16_bf16_q8_fp8_log.py "$log_file"
+echo "所有测试完成！日志目录：$LOG_DIR"
diff --git a/base_test/matmul_test/test_gemm_fp64_tf32.sh b/base_test/matmul_test/test_gemm_fp64_tf32.sh
new file mode 100644
index 0000000..924c556
--- /dev/null
+++ b/base_test/matmul_test/test_gemm_fp64_tf32.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Benchmark the standalone fp64 / tf32 GEMM executables over a list of matrix
+# sizes, append their output to one log file and summarize it with Python.
+
+# Matrix sizes to test, one "M N K" triple per line.
+input_data=$(cat <<'EOF'
+128 128 128
+256 256 256
+512 512 512
+1024 1024 1024
+2048 2048 2048
+4096 4096 4096
+8192 8192 8192
+4098 4098 4098
+8190 8190 8190
+8192 768 8192
+EOF
+)
+
+# Iterations per test case.
+test_iter=1000
+
+# Data types to test.
+TEST_TYPES=("fp64" "tf32")
+
+# Directory that holds the GEMM executables.
+EXE_DIR="./fp64_tf32_src"
+
+# Log directory, resolved to an absolute path for the log file and final message.
+LOG_DIR="mudnn_bench_logs"
+mkdir -p "$LOG_DIR"
+ABS_LOG_DIR=$(realpath "$LOG_DIR")
+log_file="${ABS_LOG_DIR}/bench_fp64_tf32_types.log"
+: > "$log_file"
+
+# Path of the Python summary script.
+PYTHON_SUMMARIZE="exetrct_log_tools/summarize_fp64_tf32_log.py"
+
+banner="=============================="
+for type in "${TEST_TYPES[@]}"; do
+    echo "$banner"
+    echo "开始测试：$type"
+    echo "$banner"
+
+    # Pick the executable that matches the data type.
+    case "$type" in
+        fp64) exe="${EXE_DIR}/gemm_fp64" ;;
+        tf32) exe="${EXE_DIR}/gemm_tf32" ;;
+        *)
+            echo "未知类型: $type"
+            continue
+            ;;
+    esac
+
+    # Skip this type when its executable is missing.
+    if [[ ! -f "$exe" ]]; then
+        echo "错误：找不到可执行文件 $exe"
+        continue
+    fi
+
+    # Iterate over the matrix sizes.
+    while read -r m n k; do
+        # Strip any stray spaces.
+        m=${m// /}
+        n=${n// /}
+        k=${k// /}
+
+        echo "矩阵大小: M=$m, N=$n, K=$k"
+        # Run the GEMM test and log its output only when all three values are present.
+        [[ -z "$m" || -z "$n" || -z "$k" ]] && continue
+        MUSA_VISIBLE_DEVICES=7 "$exe" "$m" "$n" "$k" "$test_iter" >> "$log_file" 2>&1
+        sleep 1
+    done <<< "$input_data"
+done
+
+# Run the Python summary script when it exists.
+if [[ -f "$PYTHON_SUMMARIZE" ]]; then
+    python "$PYTHON_SUMMARIZE" "$log_file"
+else
+    echo "警告：Python 分析脚本不存在: $PYTHON_SUMMARIZE"
+fi
+
+echo "所有测试完成！日志目录：$ABS_LOG_DIR"
+
diff --git a/base_test/matmul_test/test_gemm_mixed.sh b/base_test/matmul_test/test_gemm_mixed.sh
new file mode 100644
index 0000000..076a95d
--- /dev/null
+++ b/base_test/matmul_test/test_gemm_mixed.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+# Benchmark mudnn_bench matmul for each mixed-precision entry in TEST_TYPES over
+# a list of M N K sizes, log every command with its result, then summarize the log.
+
+set -e
+
+input_data=$(cat <<'EOF'
+128     128     128
+256     256     256
+512     512     512
+1024    1024    1024
+2048    2048    2048
+4096    4096    4096
+8192    8192    8192
+4098    4098    4098
+8190    8190    8190
+8192    768     8192
+EOF
+)
+test_iter=1000
+
+# Each entry is handed to mudnn_bench through -t.
+TEST_TYPES=("f16:f16:f32:f32" "bf16:bf16:f32:f32" "f32" "int8" "q8:q8:f32:f32" "bf16:q4:bf16:bf16" "float8_e4m3:float8_e4m3:f16:f16")
+# TEST_TYPES=("f32")
+LOG_DIR="mudnn_bench_logs"
+mkdir -p "$LOG_DIR"
+log_file="${LOG_DIR}/bench_fix_matmul.log"
+> "$log_file"
+
+# Make sure the benchmark binary exists first.
+if [ ! -f "../bin/mudnn_bench" ]; then
+    echo "错误：未找到 ../bin/mudnn_bench 可执行文件" | tee -a "$log_file"
+    exit 1
+fi
+
+echo "开始测试，日志文件：$log_file"
+
+for type in "${TEST_TYPES[@]}"; do
+    echo "开始测试数据类型：$type" | tee -a "$log_file"
+
+    # Read the size list line by line.
+    echo "$input_data" | while IFS= read -r line; do
+        # Skip empty lines.
+        [ -z "$line" ] && continue
+
+        # Split the line into its three whitespace-separated values.
+        read -r m n k <<< "$line"
+        echo "测试: M=$m, N=$n, K=$k, Type=$type" | tee -a "$log_file"
+
+        # Reject rows whose values are not all unsigned integers.
+        if ! [[ "$m" =~ ^[0-9]+$ ]] || ! [[ "$n" =~ ^[0-9]+$ ]] || ! [[ "$k" =~ ^[0-9]+$ ]]; then
+            echo "错误：参数不是数字: m=$m, n=$n, k=$k" | tee -a "$log_file"
+            continue
+        fi
+
+        # Record the command line in the log for reference.
+        cmd="MUSA_VISIBLE_DEVICES=7 ../bin/mudnn_bench -m --mm_m=\"$m\" --mm_n=\"$n\" --mm_k=\"$k\" --warmup 30 --tm i --tmv \"$test_iter\" -p -c -t \"$type\""
+        echo "执行命令: $cmd" >> "$log_file"
+
+        # Run the benchmark; a failure is logged but does not stop the sweep.
+        if MUSA_VISIBLE_DEVICES=7 ../bin/mudnn_bench -m \
+            --mm_m="$m" --mm_n="$n" --mm_k="$k" \
+            --warmup 30 --tm i --tmv "$test_iter" \
+            -p -c \
+            -t "$type" >> "$log_file" 2>&1; then
+            echo "测试成功: M=$m, N=$n, K=$k, Type=$type" | tee -a "$log_file"
+        else
+            exit_code=$?
+            echo "测试失败: M=$m, N=$n, K=$k, Type=$type, 退出码: $exit_code" | tee -a "$log_file"
+        fi
+
+        echo "----------------------------------------" >> "$log_file"
+        sleep 2
+    done
+done
+
+# Summarize the log. The summarizers live in exetrct_log_tools/; a missing
+# script only produces a warning instead of aborting under `set -e`.
+summary_script="exetrct_log_tools/summary_mixed_data.py"
+if [[ -f "$summary_script" ]]; then
+    python "$summary_script" "$log_file"
+else
+    echo "警告：Python 分析脚本不存在: $summary_script"
+fi
+echo "所有测试完成！日志目录：$LOG_DIR"
+echo "查看日志：cat $log_file"
diff --git a/script/monitor/README.md b/base_test/monitor/README.md
similarity index 100%
rename from script/monitor/README.md
rename to base_test/monitor/README.md
diff --git a/script/monitor/monitor_gpu.sh b/base_test/monitor/monitor_gpu.sh
old mode 100755
new mode 100644
similarity index 100%
rename from script/monitor/monitor_gpu.sh
rename to base_test/monitor/monitor_gpu.sh