From 30ea092342c404d38bf50a2c3999afefd845967e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=94=E7=8E=89sealofyou?= <1554948966@qq.com>
Date: Sun, 24 Aug 2025 00:44:15 +0000
Subject: [PATCH 1/2] =?UTF-8?q?chore:=20=E6=9B=B4=E6=96=B0=20.gitignore=20?=
 =?UTF-8?q?=E6=96=87=E4=BB=B6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 添加 __pycache__ 目录到忽略列表
- 添加 temp 目录到忽略列表
- 添加 venv 目录到忽略列表
- 添加 *.pyc 文件到忽略列表
---
 .gitignore | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.gitignore b/.gitignore
index 79cb12d..8c8df3a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,3 +2,9 @@
 */*/__pycache__/
 */*/*/*/__pycache__/
 */*/*/*/*/__pycache__/
+__pycache__/*
+temp
+
+venv/*
+
+*.pyc
\ No newline at end of file

From ef3752918b0f22b6d4596a32f8025842af62e923 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=94=E7=8E=89sealofyou?= <1554948966@qq.com>
Date: Sun, 24 Aug 2025 08:27:58 +0000
Subject: [PATCH 2/2] =?UTF-8?q?feat(agent):=20=E5=A2=9E=E5=8A=A0=E6=97=A5?=
 =?UTF-8?q?=E5=BF=97=E8=AE=B0=E5=BD=95=E5=8A=9F=E8=83=BD=E5=B9=B6=E4=BC=98?=
 =?UTF-8?q?=E5=8C=96=E6=A8=A1=E5=9E=8B=E7=94=9F=E6=88=90=E6=B5=81=E7=A8=8B?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 在 Reflexion_Oneshot 类中添加了日志记录初始化和调用过程的日志记录
- 在 KimiK2Model 中增加了请求发送和响应接收的日志记录
- 优化了模型生成流程，增加了错误处理和重试机制
- 调整了 prompt_for_generation 中的提示内容，增加了 AMD GPU 优化指南
---
 src/agents/reflexion_oneshot.py               |  35 ++-
 .../performance_metrics/performance_utils.py  |  21 +-
 src/main_reflexion_oneshot.py                 |  15 +-
 src/models/KimiK2.py                          |  19 +-
 src/prompts/prompt_for_generation.py          | 106 +++++---
 src/prompts/prompt_for_reflection.py          | 247 +++++++++++-------
 src/temp-1/embedding_triton_kernel.py         | 167 ++++++++++++
 7 files changed, 455 insertions(+), 155 deletions(-)
 create mode 100644 src/temp-1/embedding_triton_kernel.py

diff --git a/src/agents/reflexion_oneshot.py b/src/agents/reflexion_oneshot.py
index d10a345..790f538 100644
--- a/src/agents/reflexion_oneshot.py
+++ b/src/agents/reflexion_oneshot.py
@@ -23,16 +23,29 @@ class Reflexion_Oneshot(Reflexion):
     """
 
     def __init__(self, model: BaseModel, dataset, corpus_path, mem_file=None):
+        logger.info("Initializing Reflexion_Oneshot agent")
+        logger.info(f"Model: {model.__class__.__name__}")
+        logger.info(f"Dataset size: {len(dataset)}")
+        logger.info(f"Corpus path: {corpus_path}")
+        
         self.model = model
         self.dataset = dataset
         self.memories = []
 
+        logger.info("Initializing instruction retriever")
         self.instruction_retriever = BM25Retriever()
         self.instruction_retriever.process(content_input_path=corpus_path)
+        logger.info("Instruction retriever initialized successfully")
+        
+        logger.info("Initializing code retriever")
         self.code_retriever = BM25Retriever(mode="code")
         self.code_retriever.process(content_input_path=corpus_path)
+        logger.info("Code retriever initialized successfully")
 
+        logger.info("Initializing memories")
         self.memory_init(mem_file)
+        logger.info(f"Memories initialized successfully, count: {len(self.memories)}")
+        logger.info("Reflexion_Oneshot agent initialization completed")
 
     def memory_init(self, mem_file=None):
         class Memory(metaclass=MemoryClassMeta, field_names=["ps", 
@@ -199,8 +212,16 @@ def generate_solution(self, mem, temperature=0):
         msg = [
             {"role": "user", "content": text},
         ]
-        response = self.model.generate(msg, temperature=temperature)
-        mem.ps.solution = clear_code(response)
+        
+        # 添加模型调用前的日志
+        logger.info(f"Calling model {self.model.__class__.__name__} for {mem.ps.filename}")
+        try:
+            response = self.model.generate(msg, temperature=temperature)
+            mem.ps.solution = clear_code(response)
+            logger.info(f"Successfully generated solution for {mem.ps.filename}")
+        except Exception as e:
+            logger.error(f"Failed to generate solution for {mem.ps.filename}: {str(e)}")
+            raise
 
         return
 
@@ -220,4 +241,12 @@ def generate_reflexion(self, mem, temperature):
                 "content": reflect_txt
             }
         ]
-        mem.reflection = self.model.generate(reflect_msg, temperature=temperature)
\ No newline at end of file
+        
+        # 添加模型调用前的日志
+        logger.info(f"Calling model {self.model.__class__.__name__} for reflection on {mem.ps.filename}")
+        try:
+            mem.reflection = self.model.generate(reflect_msg, temperature=temperature)
+            logger.info(f"Successfully generated reflection for {mem.ps.filename}")
+        except Exception as e:
+            logger.error(f"Failed to generate reflection for {mem.ps.filename}: {str(e)}")
+            raise
diff --git a/src/dataloaders/TB_eval/data/performance_metrics/performance_utils.py b/src/dataloaders/TB_eval/data/performance_metrics/performance_utils.py
index 1c77ba1..a3f7d32 100644
--- a/src/dataloaders/TB_eval/data/performance_metrics/performance_utils.py
+++ b/src/dataloaders/TB_eval/data/performance_metrics/performance_utils.py
@@ -37,12 +37,14 @@ def __init__(
 
         self.input_tensors = []
         self.do_bench_config = do_bench_config()
+        # 添加folder_path属性，默认值会在run_benchmark中被替换
+        self.folder_path = "../outputs/optimagent_gpt41_tritonbench_perf_results"
 
     def get_input_tensors(self):
         raise NotImplementedError("You must implement this method to get input tensors")
 
     def to_cuda(self, input_tensor):
-        raise NotImplementedError("You must implement this method to get input tensors")
+        raise NotImplementedError("You must implement this method to move tensors to CUDA")
     
     def call_op(self, input_tensor):
         raise NotImplementedError("You must implement this method to call the op")
@@ -97,7 +99,9 @@ def get_do_bench_config(self, warmup=None, rep=None):
             previous_ms = ms
         
         print("MS did not stabilize. Returning default config.")
-        raise NotImplementedError("You must implement this method to make the runtime stable")
+        # Instead of raising an error, set default config
+        self.do_bench_config = do_bench_config()
+        return
 
     def get_runtime(self, op: Callable):
         ms, min_ms, max_ms = triton.testing.do_bench(
@@ -121,7 +125,8 @@ def run_benchmark(self):
             try:
                 input_tensor = self.to_cuda(input_tensor_)
                 # print(input_tensor)
-                op = lambda : self.call_op(input_tensor)            
+                input_tensor_clone = input_tensor  # Create a copy to avoid late binding issues
+                op = lambda : self.call_op(input_tensor_clone)            
                 ms = self.get_runtime(op)
                 gbps = self.get_gbps(input_tensor, ms)
                 tflops = self.get_tflops(input_tensor, ms)
@@ -136,8 +141,12 @@ def run_benchmark(self):
             except Exception as e:
                 print(f"Failed to run benchmark for input tensor. Error: {e}")
             input_tensor = None
-        folder_path = "../outputs/optimagent_gpt41_tritonbench_perf_results"
+            
+        # 确保folder_path目录存在
+        if not os.path.exists(self.folder_path):
+            os.makedirs(self.folder_path, exist_ok=True)
+            
         file_name = self.op_name + ".json"
-        file_path = os.path.join(folder_path, file_name)
+        file_path = os.path.join(self.folder_path, file_name)
         with open(file_path, 'w', encoding='utf8') as f:
-            json.dump(results, f, indent=4)
+            json.dump(results, f, indent=4)
\ No newline at end of file
diff --git a/src/main_reflexion_oneshot.py b/src/main_reflexion_oneshot.py
index b5d4f9f..f00f4ac 100644
--- a/src/main_reflexion_oneshot.py
+++ b/src/main_reflexion_oneshot.py
@@ -1,17 +1,24 @@
-
 import os
+import logging
 from agents.reflexion_oneshot import Reflexion_Oneshot
 from models.KimiK2 import KimiK2Model
 from dataloaders.TritonBench import TritonBench
 from args_config import load_config
 
+# 配置日志
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 
 def main():
-    args = load_config("configs/tritonbench_oneshot_config.yaml")
+    args = load_config("/hackathon-agent/src/configs/tritonbench_oneshot_config.yaml")
 
     # setup LLM model
     #model = OpenAIModel(api_key=args.api_key, model_id=args.model_id)
+    logger.info("Initializing KimiK2Model...")
     model = KimiK2Model(api_key=args.api_key, model_id=args.model_id)
+    logger.info("KimiK2Model initialized successfully")
+    
     # setup dataset
     result_path = None
     dataset = TritonBench(statis_path=args.statis_path, 
@@ -25,11 +32,13 @@ def main():
                           target_kernels=args.target_kernels)
 
     # setup agent
+    logger.info("Initializing Reflexion_Oneshot agent...")
     agent = Reflexion_Oneshot(model=model, dataset=dataset, corpus_path=args.corpus_path)
+    logger.info("Reflexion_Oneshot agent initialized successfully")
 
     # run the agent
     agent.run(output_path=args.output_path, multi_thread=args.multi_thread, iteration_num=args.max_iteration, temperature=args.temperature, datalen=None)
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file
diff --git a/src/models/KimiK2.py b/src/models/KimiK2.py
index 9728de8..46970b8 100644
--- a/src/models/KimiK2.py
+++ b/src/models/KimiK2.py
@@ -2,9 +2,12 @@
 from typing import List
 import openai
 from tenacity import retry, stop_after_attempt, wait_random_exponential
+import logging
 
 from models.Base import BaseModel
 
+# 创建日志记录器
+logger = logging.getLogger(__name__)
 
 class KimiK2Model(BaseModel):
     def __init__(self, 
@@ -25,6 +28,7 @@ def __init__(self,
             #api_key = "wisemodel-xxvqzbsnecjtoxufxodx",
             api_key=api_key,
             base_url = "https://laiyeapi.aifoundrys.com:7443/v1",
+            # base_url = "https://api.moonshot.cn/v1",
             default_headers = headers
         )
         
@@ -35,6 +39,9 @@ def generate(self,
                  presence_penalty=0, 
                  frequency_penalty=0, 
                  max_tokens=5000) -> str:
+        logger.info(f"Sending request to model {self.model_id} with {len(messages)} messages")
+        logger.debug(f"Messages content: {messages}")
+        
         response = self.client.chat.completions.create(
             model=self.model_id,
             messages=messages,
@@ -44,8 +51,12 @@ def generate(self,
         )
         
         if not response or not hasattr(response, 'choices') or len(response.choices) == 0:
-            raise ValueError("No response choices returned from the API.")
-
-        return response.choices[0].message.content
-   
+            error_msg = "No response choices returned from the API."
+            logger.error(error_msg)
+            raise ValueError(error_msg)
 
+        result = response.choices[0].message.content
+        logger.info(f"Received response from model {self.model_id}, response length: {len(result)} characters")
+        logger.debug(f"Response content: {result[:200]}..." if len(result) > 200 else f"Response content: {result}")
+        
+        return result
\ No newline at end of file
diff --git a/src/prompts/prompt_for_generation.py b/src/prompts/prompt_for_generation.py
index 6d07461..18c3684 100644
--- a/src/prompts/prompt_for_generation.py
+++ b/src/prompts/prompt_for_generation.py
@@ -1,7 +1,6 @@
-
 prompt = """
-You are an expert Python programmer specializing in NVIDIA Triton kernels, specifically targeting **AMD GPUs using the ROCm environment**.
-Your task is to generate a Python code snippet containing a Triton kernel based on the following request:
+You are an expert Python programmer specializing in Triton kernels for high-performance computing, with deep expertise in AMD GPU optimization using the ROCm environment.
+Your task is to generate a Python code snippet containing a high-performance Triton kernel based on the following request, specifically optimized for AMD GPUs:
 
 **Target Platform:** AMD GPU (ROCm)
 
@@ -38,6 +37,19 @@
     *   **`tl.arange`:** Arguments `start` and `end` **must be `tl.constexpr`**.
     *   **Math:** Use functions from `tl.math` where available (e.g., `tl.math.exp`, `tl.math.sqrt`). Check function existence; avoid assuming functions like `tanh` or `log1p` exist if they don't in `tl.math`.
 8.  **Triton Version:** Assume Triton version 3.1.0 or later.
+9.  **AMD GPU Optimization Guidelines:**
+    *   Consider wavefront size of 64 threads for AMD GPUs (different from NVIDIA's 32 threads).
+    *   Optimize memory access patterns for AMD's memory hierarchy to ensure coalesced access.
+    *   Pay attention to shared memory bank conflicts which are more critical on AMD GPUs - try to access shared memory in a strided pattern that avoids conflicts.
+    *   Use appropriate block sizes that align with AMD GPU architecture (e.g., multiple of 64 for wavefront efficiency).
+    *   Consider using `tl.inline_asm_elementwise` for AMD-specific intrinsics if needed.
+    *   Minimize register pressure to avoid spills which significantly impact performance on AMD GPUs.
+10. **Performance Optimization:**
+    *   Implement autotuning when possible with sensible default values for BLOCK_M, BLOCK_N, BLOCK_K, num_warps, and num_stages.
+    *   Consider memory coalescing for global memory accesses.
+    *   Minimize divergent branching within wavefronts.
+    *   Optimize data reuse in shared memory.
+    *   Consider using tensor cores (MFMA instructions) on AMD GPUs when applicable.
 
 **FINAL VERIFICATION:**
 Before completing, verify:
@@ -45,13 +57,15 @@
 2. ALL function calls exactly match their definitions in terms of parameter counts and names.
 3. No functions are called without being defined.
 4. No parameters are missing from your implementations.
+5. The code follows AMD GPU optimization guidelines.
+6. Autotuning configurations are properly set up if applicable.
 
 **Generated AMD ROCm Compatible Triton Kernel Code:**
 """
 
 prompt_rocm = """
-You are an expert Python programmer specializing in NVIDIA Triton kernels, specifically targeting **AMD GPUs using the ROCm environment**.
-Your task is to generate a Python code snippet containing a Triton kernel based on the following request:
+You are an expert Python programmer specializing in Triton kernels for high-performance computing, with deep expertise in AMD GPU optimization using the ROCm environment.
+Your task is to generate a Python code snippet containing a high-performance Triton kernel based on the following request, specifically optimized for AMD GPUs:
 
 **Target Platform:** AMD GPU (ROCm)
 
@@ -88,44 +102,58 @@
     *   **`tl.arange`:** Arguments `start` and `end` **must be `tl.constexpr`**.
     *   **Math:** Use functions from `tl.math` where available (e.g., `tl.math.exp`, `tl.math.sqrt`). Check function existence; avoid assuming functions like `tanh` or `log1p` exist if they don't in `tl.math`.
 8.  **Triton Version:** Assume Triton version 3.2.0 or later.
-9.  Maximize performance by exploring the following:
-i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps. 
-ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability. 
-iii. exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
-Primary Autotuning Fields (Mandatory)
-1. BLOCK_M, BLOCK_N, BLOCK_K
-   * Tile sizes for GEMM or other tensor contractions.
-   * Larger blocks improve compute density, but reduce grid-level parallelism.
-   * Explore wide range of values like:
-     * BLOCK: [32, ..., 128, ..., 2048, ...] 
-   * Adjust based on memory reuse and L2 cache locality.
-2. num_stages=n
-   * Controls pipeline depth for kernel execution.
-   * Rules for setting this:
-     * 1 if no GEMM.
-     * 2 if a single GEMM (e.g., GEMM + ReLU).
-     * 1 if two GEMMs are fused (e.g., Flash Attention).
-   * Optimize for latency and execution overlap.
-3. num_warps
-    * Controls number of warps (groups of 64 threads) to launch per block.
-    * If it is too low then underutilization -> kernel runs slow.
-    * If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
-    * You must choose a sweet spot by trying out integer range of 1 to 16.
-    * You MUST NOT try the range beyond 16, it is NOT VALID. 
-Examples of Autotuning Setup
-Here's how Triton kernels should be decorated to allow autotuning:
+9.  **Performance Optimization Strategy:**
+    Maximize performance by exploring the following:
+    i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps.
+    ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability.
+    iii. Exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
+    
+    **Primary Autotuning Fields (Mandatory)**
+    1. BLOCK_M, BLOCK_N, BLOCK_K
+       * Tile sizes for GEMM or other tensor contractions.
+       * Larger blocks improve compute density, but reduce grid-level parallelism.
+       * Explore wide range of values like:
+         * BLOCK: [32, 64, 128, 256, 512] - optimal values for AMD GPU wavefront efficiency
+       * Adjust based on memory reuse and L2 cache locality.
+    2. num_stages=n
+       * Controls pipeline depth for kernel execution.
+       * Rules for setting this:
+         * 1 if no GEMM.
+         * 2 if a single GEMM (e.g., GEMM + ReLU).
+         * 1 if two GEMMs are fused (e.g., Flash Attention).
+       * Optimize for latency and execution overlap.
+    3. num_warps
+       * Controls number of warps (groups of 64 threads) to launch per block.
+       * If it is too low then underutilization -> kernel runs slow.
+       * If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
+       * You must choose a sweet spot by trying out integer range of 1 to 16.
+       * You MUST NOT try the range beyond 16, it is NOT VALID.
+       
+    **Examples of Autotuning Setup**
+    Here's how Triton kernels should be decorated to allow autotuning:
     * key argument indicates the variables that change and trigger autotune to re-run. This is a must argument and you must not miss this.
     * BLOCK_M refers to the chunk of variable M that will be used for compute by a thread at a time.
     * You must ensure that variables used in the triton.Config should not be passed as arguments to the triton kernel.
-For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
+    For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
+
+    NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.
 
-NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.
+    ```python
+    def grid(args: dict[str, Any]) -> tuple[int]:
+        # This creates a 1D grid of size (C * D, )
+        return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
+    ```
 
-def grid(args: dict[str, Any]) -> tuple[int]:
-    # This creates a 1D grid of size (C * D, )
-    return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
+    The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
+    pid_m = tl.program_id(0)  # Gets the ID for the first dimension
+    pid_n = tl.program_id(1)  # Tries to get ID for a non-existent second dimension
 
-The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
-pid_m = tl.program_id(0)  # Gets the ID for the first dimension
-pid_n = tl.program_id(1)  # Tries to get ID for a non-existent second dimension
+10. **AMD GPU Specific Optimization Considerations:**
+    When implementing and optimizing the kernel, consider these critical AMD GPU characteristics:
+    *   AMD GPU wavefront size of 64 threads (different from NVIDIA's 32 threads) - ensure your block sizes are multiples of 64 for optimal occupancy
+    *   Memory coalescing patterns optimal for AMD architecture - sequential threads should access sequential memory locations
+    *   Shared memory bank conflicts which are more critical on AMD GPUs - use appropriate access patterns to avoid conflicts
+    *   Register usage optimization to avoid spills - keep register usage low to prevent performance degradation
+    *   Appropriate block sizes that align with AMD GPU architecture for better occupancy
+    *   Consider using AMD-specific intrinsics through `tl.inline_asm_elementwise` for maximum performance
 """
\ No newline at end of file
diff --git a/src/prompts/prompt_for_reflection.py b/src/prompts/prompt_for_reflection.py
index fe3f936..4cddd3d 100644
--- a/src/prompts/prompt_for_reflection.py
+++ b/src/prompts/prompt_for_reflection.py
@@ -1,6 +1,5 @@
 prompt = """
-You are an expert in writing Triton operators for efficient GPU programming. Analyze the failed test cases and provide insights 
-on why the solution failed and how it could be improved. Be specific about the issues found.
+You are an expert in writing and optimizing Triton operators for high-performance GPU programming, especially targeting AMD GPUs with ROCm. Analyze the failed test cases and provide detailed insights on why the solution failed and how it could be improved. Be specific about the issues found and provide actionable recommendations.
 
 **Original problem:**
 
@@ -14,19 +13,36 @@
 
 {test_result}
 
+**Thinking Process:**
+Before providing your reflection, think through the following steps:
+1. Identify the type of failure (syntax error, runtime error, correctness issue, performance issue)
+2. Locate the specific part of the code causing the failure
+3. Analyze why this part is problematic in the context of AMD GPU architecture
+4. Propose specific fixes or improvements
+
 **Important Instructions:**
-- Think before writing the reflection and no more explanation is required after the reflection.
+- Think carefully and thoroughly before writing the reflection. No additional explanation is required after the reflection.
 - You should not suggest changes to the name of the function.
-- generate the reflection wrapped in a code block with the tag `reflection`, e.g.
+- Generate the reflection wrapped in a code block with the tag `reflection`, e.g.
 "```markdown<your reflections>```"
 
+**AMD GPU and Performance Optimization Focus:**
+When analyzing the code, pay special attention to these key areas:
+1. AMD GPU compatibility issues (e.g., CUDA-specific code, incorrect memory access patterns)
+2. Wavefront efficiency (64-thread wavefronts on AMD vs 32-thread warps on NVIDIA)
+3. Memory coalescing and bank conflict issues
+4. Shared memory usage optimization
+5. Register pressure and spillage
+6. Autotuning parameters (BLOCK_M, BLOCK_N, BLOCK_K, num_warps, num_stages) for AMD architecture
+7. Divergent branching within wavefronts
 """
 
 prompt_exe = """
-You are an expert in writing Triton operators for efficient GPU programming. Analyze the failed test cases and provide insights 
-on why the solution failed and how it could be improved. Be specific about the issues found.
-Runnable test is used to test if the code can be successfully executed.
-Correctness test is used to test if the output of the code is correct, i.e. if the code does implement the functionality required in the original problem.
+You are an expert in writing and optimizing Triton operators for high-performance GPU programming, especially targeting AMD GPUs with ROCm. Analyze the failed test cases and provide detailed insights on why the solution failed and how it could be improved. Be specific about the issues found and provide actionable recommendations.
+
+Different types of tests have been run:
+- Runnable test: Checks if the code can be successfully executed (compiles and runs without crashing)
+- Correctness test: Checks if the output of the code is correct (implements the required functionality)
 
 **Original problem:**
 
@@ -44,20 +60,34 @@
 
 {exe_test_result}
 
+**Thinking Process:**
+Before providing your reflection, think through the following steps:
+1. Identify the type of failure (syntax error, runtime error, correctness issue, performance issue)
+2. Locate the specific part of the code causing the failure
+3. Analyze why this part is problematic in the context of AMD GPU architecture
+4. Propose specific fixes or improvements
+
 **Important Instructions:**
-- Think before writing the reflection and no more explanation is required after the reflection.
+- Think carefully and thoroughly before writing the reflection. No additional explanation is required after the reflection.
 - You should not suggest changes to the name of the function.
-- generate the reflection wrapped in a code block with the tag `reflection`, e.g.
+- Generate the reflection wrapped in a code block with the tag `reflection`, e.g.
 "```markdown<your reflections>```"
 
+**AMD GPU and Performance Optimization Focus:**
+When analyzing the code, pay special attention to these key areas:
+1. AMD GPU compatibility issues (e.g., CUDA-specific code, incorrect memory access patterns)
+2. Wavefront efficiency (64-thread wavefronts on AMD vs 32-thread warps on NVIDIA)
+3. Memory coalescing and bank conflict issues
+4. Shared memory usage optimization
+5. Register pressure and spillage
+6. Autotuning parameters (BLOCK_M, BLOCK_N, BLOCK_K, num_warps, num_stages) for AMD architecture
+7. Divergent branching within wavefronts
 """
 
 prompt_ga = """
-You are an expert in writing Triton operators for efficient GPU programming. 
-Analyze this Triton code and its performance(latency in ms and efficiency in TFLOPS or GB/s), and give a summary about the optimization strategy that the code uses.
-Provide insights on how to generate a new code with better performance. 
-You can use optimization strategies such as Memory access efficiency, Hardware resource utilization, IR analysis, Assembly analysis, Kernel occupancy, 
-TorchInductor with Triton tuning knobs and Auto-tunable kernel configurations and environment variables.
+You are an expert in writing and optimizing Triton operators for high-performance GPU programming, especially targeting AMD GPUs with ROCm. 
+Analyze this Triton code and its performance (latency in ms and efficiency in TFLOPS or GB/s), and provide a detailed summary of the optimization strategy that the code uses.
+Provide specific insights on how to generate a new code with better performance. 
 
 **Original problem:**
 
@@ -73,17 +103,32 @@
 
 efficiency(TFLOPS, GB/s): {efficiency}
 
+**Thinking Process:**
+Before providing your optimization insights, think through the following steps:
+1. Analyze the current performance bottlenecks
+2. Identify which parts of the code contribute most to the latency
+3. Evaluate how well the code utilizes AMD GPU resources
+4. Suggest specific optimizations that could improve performance
+
 **Important Instructions:**
-- Think before writing the optimization and no more explanation is required after the reflection.
+- Think carefully and thoroughly before writing the optimization insights. No additional explanation is required after the reflection.
 - You should not suggest changes to the name of the function and parameter names, counts, or order.
-- generate the reflection wrapped in a code block with the tag `reflection`, e.g.
+- Generate the reflection wrapped in a code block with the tag `reflection`, e.g.
 "```markdown<your reflections>```"
 
+**AMD GPU Optimization Focus:**
+When analyzing the code, pay special attention to these key optimization areas:
+1. Wavefront efficiency (64-thread wavefronts on AMD) - ensure high occupancy with proper block sizes
+2. Memory coalescing and bank conflict issues specific to AMD GPUs
+3. Shared memory usage optimization - minimize conflicts and maximize reuse
+4. Register pressure and spillage - keep register usage low to prevent performance degradation
+5. Autotuning parameters (BLOCK_M, BLOCK_N, BLOCK_K, num_warps, num_stages) for AMD architecture
+6. Divergent branching within wavefronts - minimize conditional execution paths
+7. Memory hierarchy utilization - optimize for L1/L2 cache and global memory access patterns
 """
 
 prompt_rocm = """
-You are an expert in writing Triton operators for efficient GPU programming. Analyze the failed test cases and provide insights 
-on why the solution failed and how it could be improved. Be specific about the issues found.
+You are an expert in writing and optimizing Triton operators for high-performance GPU programming, especially targeting AMD GPUs with ROCm. Analyze the failed test cases and provide detailed insights on why the solution failed and how it could be improved. Be specific about the issues found and provide actionable recommendations.
 
 **Original problem:**
 
@@ -97,22 +142,31 @@
 
 {test_result}
 
+**Thinking Process:**
+Before providing your reflection, think through the following steps:
+1. Identify the type of failure (syntax error, runtime error, correctness issue, performance issue)
+2. Locate the specific part of the code causing the failure
+3. Analyze why this part is problematic in the context of AMD GPU architecture
+4. Propose specific fixes or improvements
+
 **Important Instructions:**
-- Think before writing the reflection and no more explanation is required after the reflection.
+- Think carefully and thoroughly before writing the reflection. No additional explanation is required after the reflection.
 - You should not suggest changes to the name of the function.
-- generate the reflection wrapped in a code block with the tag `reflection`, e.g.
+- Generate the reflection wrapped in a code block with the tag `reflection`, e.g.
 "```markdown<your reflections>```"
 
-Maximize performance by exploring the following:
-i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps. 
-ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability. 
-iii. exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
-Primary Autotuning Fields (Mandatory)
+**Performance Optimization Strategy:**
+Maximize performance by exploring the following areas:
+i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps - find optimal values for AMD GPU architecture.
+ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability.
+iii. Exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
+
+**Primary Autotuning Fields (Mandatory)**
 1. BLOCK_M, BLOCK_N, BLOCK_K
    * Tile sizes for GEMM or other tensor contractions.
    * Larger blocks improve compute density, but reduce grid-level parallelism.
    * Explore wide range of values like:
-     * BLOCK: [32, ..., 128, ..., 2048, ...] 
+     * BLOCK: [32, 64, 128, 256, 512] - optimal values for AMD GPU wavefront efficiency
    * Adjust based on memory reuse and L2 cache locality.
 2. num_stages=n
    * Controls pipeline depth for kernel execution.
@@ -127,29 +181,22 @@
     * If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
     * You must choose a sweet spot by trying out integer range of 1 to 16.
     * You MUST NOT try the range beyond 16, it is NOT VALID. 
-Examples of Autotuning Setup
-Here's how Triton kernels should be decorated to allow autotuning:
-    * key argument indicates the variables that change and trigger autotune to re-run. This is a must argument and you must not miss this.
-    * BLOCK_M refers to the chunk of variable M that will be used for compute by a thread at a time.
-    * You must ensure that variables used in the triton.Config should not be passed as arguments to the triton kernel.
-For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
-
-NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.
-
-def grid(args: dict[str, Any]) -> tuple[int]:
-    # This creates a 1D grid of size (C * D, )
-    return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
-
-The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
-pid_m = tl.program_id(0)  # Gets the ID for the first dimension
-pid_n = tl.program_id(1)  # Tries to get ID for a non-existent second dimension
+
+**AMD GPU Specific Optimization Considerations:**
+When analyzing and providing optimization suggestions, consider these critical AMD GPU characteristics:
+1. AMD GPU wavefront size of 64 threads (different from NVIDIA's 32 threads) - ensure your block sizes are multiples of 64 for optimal occupancy
+2. Memory coalescing patterns optimal for AMD architecture - sequential threads should access sequential memory locations
+3. Shared memory bank conflicts which are more critical on AMD GPUs - use appropriate access patterns to avoid conflicts
+4. Register usage optimization to avoid spills - keep register usage low to prevent performance degradation
+5. Appropriate block sizes that align with AMD GPU architecture for better occupancy
 """
 
 prompt_exe_rocm = """
-You are an expert in writing Triton operators for efficient GPU programming. Analyze the failed test cases and provide insights 
-on why the solution failed and how it could be improved. Be specific about the issues found.
-Runnable test is used to test if the code can be successfully executed.
-Correctness test is used to test if the output of the code is correct, i.e. if the code does implement the functionality required in the original problem.
+You are an expert in writing and optimizing Triton operators for high-performance GPU programming, especially targeting AMD GPUs with ROCm. Analyze the failed test cases and provide detailed insights on why the solution failed and how it could be improved. Be specific about the issues found and provide actionable recommendations.
+
+Different types of tests have been run:
+- Runnable test: Checks if the code can be successfully executed (compiles and runs without crashing)
+- Correctness test: Checks if the output of the code is correct (implements the required functionality)
 
 **Original problem:**
 
@@ -167,22 +214,31 @@ def grid(args: dict[str, Any]) -> tuple[int]:
 
 {exe_test_result}
 
+**Thinking Process:**
+Before providing your reflection, think through the following steps:
+1. Identify the type of failure (syntax error, runtime error, correctness issue, performance issue)
+2. Locate the specific part of the code causing the failure
+3. Analyze why this part is problematic in the context of AMD GPU architecture
+4. Propose specific fixes or improvements
+
 **Important Instructions:**
-- Think before writing the reflection and no more explanation is required after the reflection.
+- Think carefully and thoroughly before writing the reflection. No additional explanation is required after the reflection.
 - You should not suggest changes to the name of the function.
-- generate the reflection wrapped in a code block with the tag `reflection`, e.g.
+- Generate the reflection wrapped in a code block with the tag `reflection`, e.g.
 "```markdown<your reflections>```"
 
-Maximize performance by exploring the following:
-i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps. 
-ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability. 
-iii. exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
-Primary Autotuning Fields (Mandatory)
+**Performance Optimization Strategy:**
+Maximize performance by exploring the following areas:
+i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps - find optimal values for AMD GPU architecture.
+ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability.
+iii. Exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
+
+**Primary Autotuning Fields (Mandatory)**
 1. BLOCK_M, BLOCK_N, BLOCK_K
    * Tile sizes for GEMM or other tensor contractions.
    * Larger blocks improve compute density, but reduce grid-level parallelism.
    * Explore wide range of values like:
-     * BLOCK: [32, ..., 128, ..., 2048, ...] 
+     * BLOCK: [32, 64, 128, 256, 512] - optimal values for AMD GPU wavefront efficiency
    * Adjust based on memory reuse and L2 cache locality.
 2. num_stages=n
    * Controls pipeline depth for kernel execution.
@@ -197,30 +253,20 @@ def grid(args: dict[str, Any]) -> tuple[int]:
     * If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
     * You must choose a sweet spot by trying out integer range of 1 to 16.
     * You MUST NOT try the range beyond 16, it is NOT VALID. 
-Examples of Autotuning Setup
-Here's how Triton kernels should be decorated to allow autotuning:
-    * key argument indicates the variables that change and trigger autotune to re-run. This is a must argument and you must not miss this.
-    * BLOCK_M refers to the chunk of variable M that will be used for compute by a thread at a time.
-    * You must ensure that variables used in the triton.Config should not be passed as arguments to the triton kernel.
-For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
-
-NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.
-
-def grid(args: dict[str, Any]) -> tuple[int]:
-    # This creates a 1D grid of size (C * D, )
-    return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
-
-The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
-pid_m = tl.program_id(0)  # Gets the ID for the first dimension
-pid_n = tl.program_id(1)  # Tries to get ID for a non-existent second dimension
+
+**AMD GPU Specific Optimization Considerations:**
+When analyzing and providing optimization suggestions, consider these critical AMD GPU characteristics:
+1. AMD GPU wavefront size of 64 threads (different from NVIDIA's 32 threads) - ensure your block sizes are multiples of 64 for optimal occupancy
+2. Memory coalescing patterns optimal for AMD architecture - sequential threads should access sequential memory locations
+3. Shared memory bank conflicts which are more critical on AMD GPUs - use appropriate access patterns to avoid conflicts
+4. Register usage optimization to avoid spills - keep register usage low to prevent performance degradation
+5. Appropriate block sizes that align with AMD GPU architecture for better occupancy
 """
 
 prompt_ga_rocm = """
-You are an expert in writing Triton operators for efficient GPU programming. 
-Analyze this Triton code and its performance(speedup[vs reference kernel] for e.g. (1.6x) and efficiency in TFLOPS or GB/s), and give a summary about the optimization strategy that the code uses.
-Provide insights on how to generate a new code with better performance. 
-You can use optimization strategies such as Memory access efficiency, Hardware resource utilization, IR analysis, Assembly analysis, Kernel occupancy, 
-TorchInductor with Triton tuning knobs and Auto-tunable kernel configurations and environment variables.
+You are an expert in writing and optimizing Triton operators for high-performance GPU programming, especially targeting AMD GPUs with ROCm. 
+Analyze this Triton code and its performance (speedup vs reference kernel, e.g. 1.6x and efficiency in TFLOPS or GB/s), and provide a detailed summary of the optimization strategy that the code uses.
+Provide specific insights on how to generate a new code with better performance. 
 
 **Original problem:**
 
@@ -236,22 +282,31 @@ def grid(args: dict[str, Any]) -> tuple[int]:
 
 efficiency(TFLOPS, GB/s): {efficiency}
 
+**Thinking Process:**
+Before providing your optimization insights, think through the following steps:
+1. Analyze the current performance bottlenecks
+2. Identify which parts of the code contribute most to the latency
+3. Evaluate how well the code utilizes AMD GPU resources
+4. Suggest specific optimizations that could improve performance
+
 **Important Instructions:**
-- Think before writing the optimization and no more explanation is required after the reflection.
+- Think carefully and thoroughly before writing the optimization insights. No additional explanation is required after the reflection.
 - You should not suggest changes to the name of the function and parameter names, counts, or order.
-- generate the reflection wrapped in a code block with the tag `reflection`, e.g.
+- Generate the reflection wrapped in a code block with the tag `reflection`, e.g.
 "```markdown<your reflections>```"
 
-Maximize performance by exploring the following:
-i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps. 
-ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability. 
-iii. exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
-Primary Autotuning Fields (Mandatory)
+**Performance Optimization Strategy:**
+Maximize performance by exploring the following areas:
+i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps - find optimal values for AMD GPU architecture.
+ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability.
+iii. Exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
+
+**Primary Autotuning Fields (Mandatory)**
 1. BLOCK_M, BLOCK_N, BLOCK_K
    * Tile sizes for GEMM or other tensor contractions.
    * Larger blocks improve compute density, but reduce grid-level parallelism.
    * Explore wide range of values like:
-     * BLOCK: [32, ..., 128, ..., 2048, ...] 
+     * BLOCK: [32, 64, 128, 256, 512] - optimal values for AMD GPU wavefront efficiency
    * Adjust based on memory reuse and L2 cache locality.
 2. num_stages=n
    * Controls pipeline depth for kernel execution.
@@ -266,20 +321,12 @@ def grid(args: dict[str, Any]) -> tuple[int]:
     * If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
     * You must choose a sweet spot by trying out integer range of 1 to 16.
     * You MUST NOT try the range beyond 16, it is NOT VALID. 
-Examples of Autotuning Setup
-Here's how Triton kernels should be decorated to allow autotuning:
-    * key argument indicates the variables that change and trigger autotune to re-run. This is a must argument and you must not miss this.
-    * BLOCK_M refers to the chunk of variable M that will be used for compute by a thread at a time.
-    * You must ensure that variables used in the triton.Config should not be passed as arguments to the triton kernel.
-For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
-
-NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.
-
-def grid(args: dict[str, Any]) -> tuple[int]:
-    # This creates a 1D grid of size (C * D, )
-    return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
-
-The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
-pid_m = tl.program_id(0)  # Gets the ID for the first dimension
-pid_n = tl.program_id(1)  # Tries to get ID for a non-existent second dimension
+
+**AMD GPU Specific Optimization Considerations:**
+When analyzing and providing optimization suggestions, consider these critical AMD GPU characteristics:
+1. AMD GPU wavefront size of 64 threads (different from NVIDIA's 32 threads) - ensure your block sizes are multiples of 64 for optimal occupancy
+2. Memory coalescing patterns optimal for AMD architecture - sequential threads should access sequential memory locations
+3. Shared memory bank conflicts which are more critical on AMD GPUs - use appropriate access patterns to avoid conflicts
+4. Register usage optimization to avoid spills - keep register usage low to prevent performance degradation
+5. Appropriate block sizes that align with AMD GPU architecture for better occupancy
 """
\ No newline at end of file
diff --git a/src/temp-1/embedding_triton_kernel.py b/src/temp-1/embedding_triton_kernel.py
new file mode 100644
index 0000000..4673edf
--- /dev/null
+++ b/src/temp-1/embedding_triton_kernel.py
@@ -0,0 +1,167 @@
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def embedding_kernel(
+    token_ids_ptr,
+    weights_ptr,
+    out_ptr,
+    vob_start_id,
+    vob_end_id,
+    stride_b,            # token_ids dim-0 stride
+    stride_t,            # token_ids dim-1 stride
+    stride_weights_v,    # weights dim-0 stride
+    stride_weights_h,    # weights dim-1 stride
+    stride_out_b,        # out dim-0 stride
+    stride_out_t,        # out dim-1 stride
+    stride_out_d,        # out dim-2 stride
+    HID,                 # total hidden size
+    HID_DMODEL_TILE: tl.constexpr
+):
+    pid_b = tl.program_id(0)   # batch
+    pid_t = tl.program_id(1)   # sequence time
+    pid_h = tl.program_id(2)   # hidden tile
+
+    seq_off = pid_b * stride_b + pid_t * stride_t
+    tid = tl.load(token_ids_ptr + seq_off).to(tl.int32)
+    tid = tl.where(tid < vob_start_id, vob_start_id,
+                   tl.where(tid >= vob_end_id, vob_end_id - 1, tid))
+
+    offs_hd = pid_h * HID_DMODEL_TILE + tl.arange(0, HID_DMODEL_TILE)
+    mask_hd = offs_hd < HID
+
+    w_ptrs = weights_ptr + tid * stride_weights_v + offs_hd * stride_weights_h
+    vec = tl.load(w_ptrs, mask=mask_hd, other=0.0)
+
+    o_ptrs = out_ptr + pid_b * stride_out_b + pid_t * stride_out_t + offs_hd * stride_out_d
+    tl.store(o_ptrs, vec, mask=mask_hd)
+
+@triton.autotune(
+    configs=[
+        triton.Config({'HID_DMODEL_TILE': 64},  num_warps=2, num_stages=2),
+        triton.Config({'HID_DMODEL_TILE': 128}, num_warps=4, num_stages=2),
+        triton.Config({'HID_DMODEL_TILE': 256}, num_warps=8, num_stages=4),
+    ],
+    key=['HID']
+)
+@triton.jit
+def embedding_kernel_autotuned(
+    token_ids_ptr,
+    weights_ptr,
+    out_ptr,
+    vob_start_id,
+    vob_end_id,
+    stride_b,
+    stride_t,
+    stride_weights_v,
+    stride_weights_h,
+    stride_out_b,
+    stride_out_t,
+    stride_out_d,
+    HID,
+    HID_DMODEL_TILE: tl.constexpr,
+):
+    embedding_kernel(
+        token_ids_ptr,
+        weights_ptr,
+        out_ptr,
+        vob_start_id,
+        vob_end_id,
+        stride_b,
+        stride_t,
+        stride_weights_v,
+        stride_weights_h,
+        stride_out_b,
+        stride_out_t,
+        stride_out_d,
+        HID,
+        HID_DMODEL_TILE=HID_DMODEL_TILE,
+    )
+
+def embedding(token_ids: torch.Tensor, weights: torch.Tensor, vob_start_id: int, vob_end_id: int, out: torch.Tensor = None):
+    if token_ids.dim() == 1:
+        token_ids = token_ids.unsqueeze(0)
+    batch, seq = token_ids.shape
+    vocab, hidden = weights.shape
+
+    if out is None:
+        out = torch.empty((batch, seq, hidden), dtype=weights.dtype, device=weights.device)
+    else:
+        assert out.shape == (batch, seq, hidden), "output tensor shape mismatch"
+
+    grid = lambda META: (batch, seq, triton.cdiv(hidden, META['HID_DMODEL_TILE']))
+
+    embedding_kernel_autotuned[grid](
+        token_ids, weights, out,
+        vob_start_id, vob_end_id,
+        token_ids.stride(0), token_ids.stride(1),
+        weights.stride(0),   weights.stride(1),
+        out.stride(0),       out.stride(1),     out.stride(2),
+        hidden,
+    )
+    return out
+
+##################################################################################################################################################
+
+
+
+import torch
+
+def test_embedding():
+    # 参数定义
+    vocab_size = 1000         # 词汇表大小
+    embedding_dim = 512       # 嵌入维度
+    sequence_length = 128     # 输入序列长度
+    vob_start_id = 10         # 词汇表起始 ID
+    vob_end_id = 1000         # 词汇表结束 ID
+
+    # 创建测试输入张量
+    input_ids = torch.randint(
+        vob_start_id, vob_end_id, (sequence_length,), dtype=torch.int32, device='cuda'
+    )
+    weight = torch.randn(
+        vocab_size, embedding_dim, dtype=torch.float32, device='cuda'
+    )
+    out = torch.zeros(
+        sequence_length, embedding_dim, dtype=torch.float32, device='cuda'
+    )
+
+    # 调用嵌入函数
+    embedding(input_ids, weight, vob_start_id, vob_end_id, out)
+
+    # 保存结果
+    results = {}
+    results['test_case_1'] = out.clone()
+
+    # 测试不同的输入
+    input_ids = torch.randint(
+        vob_start_id, vob_end_id, (sequence_length,), dtype=torch.int32, device='cuda'
+    )
+    embedding(input_ids, weight, vob_start_id, vob_end_id, out)
+    results['test_case_2'] = out.clone()
+
+    # 测试不同的词汇表范围
+    vob_start_id = 0
+    vob_end_id = 500
+    input_ids = torch.randint(
+        vob_start_id, vob_end_id, (sequence_length,), dtype=torch.int32, device='cuda'
+    )
+    embedding(input_ids, weight, vob_start_id, vob_end_id, out)
+    results['test_case_3'] = out.clone()
+
+    # 测试不同的嵌入维度
+    embedding_dim = 256
+    weight = torch.randn(
+        vocab_size, embedding_dim, dtype=torch.float32, device='cuda'
+    )
+    out = torch.zeros(
+        sequence_length, embedding_dim, dtype=torch.float32, device='cuda'
+    )
+    embedding(input_ids, weight, vob_start_id, vob_end_id, out)
+    results['test_case_4'] = out.clone()
+
+    return results
+
+result_gold = test_embedding()