danielhua23 · sealofyou · Aug 24, 2025 · Aug 24, 2025
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,9 @@
 */*/__pycache__/
 */*/*/*/__pycache__/
 */*/*/*/*/__pycache__/
+__pycache__/*
+temp
+
+venv/*
+
+*.pyc
diff --git a/src/agents/reflexion_oneshot.py b/src/agents/reflexion_oneshot.py
@@ -23,16 +23,29 @@ class Reflexion_Oneshot(Reflexion):
     """
 
     def __init__(self, model: BaseModel, dataset, corpus_path, mem_file=None):
+        logger.info("Initializing Reflexion_Oneshot agent")
+        logger.info(f"Model: {model.__class__.__name__}")
+        logger.info(f"Dataset size: {len(dataset)}")
+        logger.info(f"Corpus path: {corpus_path}")
+
         self.model = model
         self.dataset = dataset
         self.memories = []
 
+        logger.info("Initializing instruction retriever")
         self.instruction_retriever = BM25Retriever()
         self.instruction_retriever.process(content_input_path=corpus_path)
+        logger.info("Instruction retriever initialized successfully")
+
+        logger.info("Initializing code retriever")
         self.code_retriever = BM25Retriever(mode="code")
         self.code_retriever.process(content_input_path=corpus_path)
+        logger.info("Code retriever initialized successfully")
 
+        logger.info("Initializing memories")
         self.memory_init(mem_file)
+        logger.info(f"Memories initialized successfully, count: {len(self.memories)}")
+        logger.info("Reflexion_Oneshot agent initialization completed")
 
     def memory_init(self, mem_file=None):
         class Memory(metaclass=MemoryClassMeta, field_names=["ps", 
@@ -199,8 +212,16 @@ def generate_solution(self, mem, temperature=0):
         msg = [
             {"role": "user", "content": text},
         ]
-        response = self.model.generate(msg, temperature=temperature)
-        mem.ps.solution = clear_code(response)
+
+        # Log around the model call; on failure keep the traceback (logger.exception) and re-raise
+        logger.info(f"Calling model {self.model.__class__.__name__} for {mem.ps.filename}")
+        try:
+            response = self.model.generate(msg, temperature=temperature)
+            mem.ps.solution = clear_code(response)
+            logger.info(f"Successfully generated solution for {mem.ps.filename}")
+        except Exception:
+            logger.exception(f"Failed to generate solution for {mem.ps.filename}")
+            raise
 
         return
 
@@ -220,4 +241,12 @@ def generate_reflexion(self, mem, temperature):
                 "content": reflect_txt
             }
         ]
-        mem.reflection = self.model.generate(reflect_msg, temperature=temperature)
+
+        # Log around the model call; on failure keep the traceback (logger.exception) and re-raise
+        logger.info(f"Calling model {self.model.__class__.__name__} for reflection on {mem.ps.filename}")
+        try:
+            mem.reflection = self.model.generate(reflect_msg, temperature=temperature)
+            logger.info(f"Successfully generated reflection for {mem.ps.filename}")
+        except Exception:
+            logger.exception(f"Failed to generate reflection for {mem.ps.filename}")
+            raise
diff --git a/src/dataloaders/TB_eval/data/performance_metrics/performance_utils.py b/src/dataloaders/TB_eval/data/performance_metrics/performance_utils.py
@@ -37,12 +37,14 @@ def __init__(
 
         self.input_tensors = []
         self.do_bench_config = do_bench_config()
+        # Default output directory for benchmark results; run_benchmark() writes <op_name>.json into it
+        self.folder_path = "../outputs/optimagent_gpt41_tritonbench_perf_results"
 
     def get_input_tensors(self):
         raise NotImplementedError("You must implement this method to get input tensors")
 
     def to_cuda(self, input_tensor):
-        raise NotImplementedError("You must implement this method to get input tensors")
+        raise NotImplementedError("You must implement this method to move tensors to CUDA")
 
     def call_op(self, input_tensor):
         raise NotImplementedError("You must implement this method to call the op")
@@ -97,7 +99,9 @@ def get_do_bench_config(self, warmup=None, rep=None):
             previous_ms = ms
 
         print("MS did not stabilize. Returning default config.")
-        raise NotImplementedError("You must implement this method to make the runtime stable")
+        # Instead of raising an error, set default config
+        self.do_bench_config = do_bench_config()
+        return
 
     def get_runtime(self, op: Callable):
         ms, min_ms, max_ms = triton.testing.do_bench(
@@ -121,7 +125,8 @@ def run_benchmark(self):
             try:
                 input_tensor = self.to_cuda(input_tensor_)
                 # print(input_tensor)
-                op = lambda : self.call_op(input_tensor)            
+                # Bind the tensor as a default argument so the closure keeps this iteration's value
+                op = lambda tensor=input_tensor: self.call_op(tensor)
                 ms = self.get_runtime(op)
                 gbps = self.get_gbps(input_tensor, ms)
                 tflops = self.get_tflops(input_tensor, ms)
@@ -136,8 +141,12 @@ def run_benchmark(self):
             except Exception as e:
                 print(f"Failed to run benchmark for input tensor. Error: {e}")
             input_tensor = None
-        folder_path = "../outputs/optimagent_gpt41_tritonbench_perf_results"
+
+        # Ensure the output directory exists. exist_ok=True already makes this call
+        # idempotent, so a separate (race-prone) os.path.exists() check is unnecessary.
+        os.makedirs(self.folder_path, exist_ok=True)
+
         file_name = self.op_name + ".json"
-        file_path = os.path.join(folder_path, file_name)
+        file_path = os.path.join(self.folder_path, file_name)
         with open(file_path, 'w', encoding='utf8') as f:
-            json.dump(results, f, indent=4)
+            json.dump(results, f, indent=4)
diff --git a/src/main_reflexion_oneshot.py b/src/main_reflexion_oneshot.py
@@ -1,17 +1,24 @@
-
 import os
+import logging
 from agents.reflexion_oneshot import Reflexion_Oneshot
 from models.KimiK2 import KimiK2Model
 from dataloaders.TritonBench import TritonBench
 from args_config import load_config
 
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
 
 def main():
-    args = load_config("configs/tritonbench_oneshot_config.yaml")
+    args = load_config(os.path.join(os.path.dirname(os.path.abspath(__file__)), "configs", "tritonbench_oneshot_config.yaml"))
 
     # setup LLM model
     #model = OpenAIModel(api_key=args.api_key, model_id=args.model_id)
+    logger.info("Initializing KimiK2Model...")
     model = KimiK2Model(api_key=args.api_key, model_id=args.model_id)
+    logger.info("KimiK2Model initialized successfully")
+
     # setup dataset
     result_path = None
     dataset = TritonBench(statis_path=args.statis_path, 
@@ -25,11 +32,13 @@ def main():
                           target_kernels=args.target_kernels)
 
     # setup agent
+    logger.info("Initializing Reflexion_Oneshot agent...")
     agent = Reflexion_Oneshot(model=model, dataset=dataset, corpus_path=args.corpus_path)
+    logger.info("Reflexion_Oneshot agent initialized successfully")
 
     # run the agent
     agent.run(output_path=args.output_path, multi_thread=args.multi_thread, iteration_num=args.max_iteration, temperature=args.temperature, datalen=None)
 
 
 if __name__ == "__main__":
-    main()
+    main()
diff --git a/src/models/KimiK2.py b/src/models/KimiK2.py
@@ -2,9 +2,12 @@
 from typing import List
 import openai
 from tenacity import retry, stop_after_attempt, wait_random_exponential
+import logging
 
 from models.Base import BaseModel
 
+# Module-level logger
+logger = logging.getLogger(__name__)
 
 class KimiK2Model(BaseModel):
     def __init__(self, 
@@ -25,6 +28,7 @@ def __init__(self,
             #api_key = "wisemodel-xxvqzbsnecjtoxufxodx",
             api_key=api_key,
             base_url = "https://laiyeapi.aifoundrys.com:7443/v1",
+            # base_url = "https://api.moonshot.cn/v1",
             default_headers = headers
         )
 
@@ -35,6 +39,9 @@ def generate(self,
                  presence_penalty=0, 
                  frequency_penalty=0, 
                  max_tokens=5000) -> str:
+        logger.info(f"Sending request to model {self.model_id} with {len(messages)} messages")
+        logger.debug(f"Messages content: {messages}")
+
         response = self.client.chat.completions.create(
             model=self.model_id,
             messages=messages,
@@ -44,8 +51,12 @@ def generate(self,
         )
 
         if not response or not hasattr(response, 'choices') or len(response.choices) == 0:
-            raise ValueError("No response choices returned from the API.")
-
-        return response.choices[0].message.content
-
+            error_msg = "No response choices returned from the API."
+            logger.error(error_msg)
+            raise ValueError(error_msg)
 
+        result = response.choices[0].message.content or ""  # content may be None; keep the -> str contract so len() below is safe
+        logger.info(f"Received response from model {self.model_id}, response length: {len(result)} characters")
+        logger.debug(f"Response content: {result[:200]}..." if len(result) > 200 else f"Response content: {result}")
+
+        return result
diff --git a/src/prompts/prompt_for_generation.py b/src/prompts/prompt_for_generation.py
@@ -1,7 +1,6 @@
-
 prompt = """
-You are an expert Python programmer specializing in NVIDIA Triton kernels, specifically targeting **AMD GPUs using the ROCm environment**.
-Your task is to generate a Python code snippet containing a Triton kernel based on the following request:
+You are an expert Python programmer specializing in Triton kernels for high-performance computing, with deep expertise in AMD GPU optimization using the ROCm environment.
+Your task is to generate a Python code snippet containing a high-performance Triton kernel based on the following request, specifically optimized for AMD GPUs:
 
 **Target Platform:** AMD GPU (ROCm)
 
@@ -38,20 +37,35 @@
     *   **`tl.arange`:** Arguments `start` and `end` **must be `tl.constexpr`**.
     *   **Math:** Use functions from `tl.math` where available (e.g., `tl.math.exp`, `tl.math.sqrt`). Check function existence; avoid assuming functions like `tanh` or `log1p` exist if they don't in `tl.math`.
 8.  **Triton Version:** Assume Triton version 3.1.0 or later.
+9.  **AMD GPU Optimization Guidelines:**
+    *   Consider wavefront size of 64 threads for AMD GPUs (different from NVIDIA's 32 threads).
+    *   Optimize memory access patterns for AMD's memory hierarchy to ensure coalesced access.
+    *   Pay attention to shared memory bank conflicts which are more critical on AMD GPUs - try to access shared memory in a strided pattern that avoids conflicts.
+    *   Use appropriate block sizes that align with AMD GPU architecture (e.g., multiple of 64 for wavefront efficiency).
+    *   Consider using `tl.inline_asm_elementwise` for AMD-specific intrinsics if needed.
+    *   Minimize register pressure to avoid spills which significantly impact performance on AMD GPUs.
+10. **Performance Optimization:**
+    *   Implement autotuning when possible with sensible default values for BLOCK_M, BLOCK_N, BLOCK_K, num_warps, and num_stages.
+    *   Consider memory coalescing for global memory accesses.
+    *   Minimize divergent branching within wavefronts.
+    *   Optimize data reuse in shared memory.
+    *   Consider using AMD matrix cores (MFMA instructions) when applicable.
 
 **FINAL VERIFICATION:**
 Before completing, verify:
 1. ALL functions defined in the code have EXACT signatures matching the required function signatures above.
 2. ALL function calls exactly match their definitions in terms of parameter counts and names.
 3. No functions are called without being defined.
 4. No parameters are missing from your implementations.
+5. The code follows AMD GPU optimization guidelines.
+6. Autotuning configurations are properly set up if applicable.
 
 **Generated AMD ROCm Compatible Triton Kernel Code:**
 """
 
 prompt_rocm = """
-You are an expert Python programmer specializing in NVIDIA Triton kernels, specifically targeting **AMD GPUs using the ROCm environment**.
-Your task is to generate a Python code snippet containing a Triton kernel based on the following request:
+You are an expert Python programmer specializing in Triton kernels for high-performance computing, with deep expertise in AMD GPU optimization using the ROCm environment.
+Your task is to generate a Python code snippet containing a high-performance Triton kernel based on the following request, specifically optimized for AMD GPUs:
 
 **Target Platform:** AMD GPU (ROCm)
 
@@ -88,44 +102,58 @@
     *   **`tl.arange`:** Arguments `start` and `end` **must be `tl.constexpr`**.
     *   **Math:** Use functions from `tl.math` where available (e.g., `tl.math.exp`, `tl.math.sqrt`). Check function existence; avoid assuming functions like `tanh` or `log1p` exist if they don't in `tl.math`.
 8.  **Triton Version:** Assume Triton version 3.2.0 or later.
-9.  Maximize performance by exploring the following:
-i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps. 
-ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability. 
-iii. exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
-Primary Autotuning Fields (Mandatory)
-1. BLOCK_M, BLOCK_N, BLOCK_K
-   * Tile sizes for GEMM or other tensor contractions.
-   * Larger blocks improve compute density, but reduce grid-level parallelism.
-   * Explore wide range of values like:
-     * BLOCK: [32, ..., 128, ..., 2048, ...] 
-   * Adjust based on memory reuse and L2 cache locality.
-2. num_stages=n
-   * Controls pipeline depth for kernel execution.
-   * Rules for setting this:
-     * 1 if no GEMM.
-     * 2 if a single GEMM (e.g., GEMM + ReLU).
-     * 1 if two GEMMs are fused (e.g., Flash Attention).
-   * Optimize for latency and execution overlap.
-3. num_warps
-    * Controls number of warps (groups of 64 threads) to launch per block.
-    * If it is too low then underutilization -> kernel runs slow.
-    * If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
-    * You must choose a sweet spot by trying out integer range of 1 to 16.
-    * You MUST NOT try the range beyond 16, it is NOT VALID. 
-Examples of Autotuning Setup
-Here's how Triton kernels should be decorated to allow autotuning:
+9.  **Performance Optimization Strategy:**
+    Maximize performance by exploring the following:
+    i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps.
+    ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability.
+    iii. Exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
+
+    **Primary Autotuning Fields (Mandatory)**
+    1. BLOCK_M, BLOCK_N, BLOCK_K
+       * Tile sizes for GEMM or other tensor contractions.
+       * Larger blocks improve compute density, but reduce grid-level parallelism.
+       * Explore wide range of values like:
+         * BLOCK: [32, 64, 128, 256, 512] - prefer multiples of 64 to match the AMD GPU wavefront size
+       * Adjust based on memory reuse and L2 cache locality.
+    2. num_stages=n
+       * Controls pipeline depth for kernel execution.
+       * Rules for setting this:
+         * 1 if no GEMM.
+         * 2 if a single GEMM (e.g., GEMM + ReLU).
+         * 1 if two GEMMs are fused (e.g., Flash Attention).
+       * Optimize for latency and execution overlap.
+    3. num_warps
+       * Controls number of warps (groups of 64 threads) to launch per block.
+       * If it is too low then underutilization -> kernel runs slow.
+       * If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
+       * You must choose a sweet spot by trying out integer range of 1 to 16.
+       * You MUST NOT try the range beyond 16, it is NOT VALID.
+
+    **Examples of Autotuning Setup**
+    Here's how Triton kernels should be decorated to allow autotuning:
     * key argument indicates the variables that change and trigger autotune to re-run. This is a must argument and you must not miss this.
     * BLOCK_M refers to the chunk of variable M that will be used for compute by a thread at a time.
     * You must ensure that variables used in the triton.Config should not be passed as arguments to the triton kernel.
-For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
+    For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
+
+    NOTE: If you face kernel timeout issues, check whether a grid / program ID mismatch exists. For example, the kernel is launched with a 1-dimensional (1D) grid, but inside the kernel it attempts to read program IDs from a 2-dimensional (2D) grid.
 
-NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.
+    ```python
+    def grid(args: dict[str, Any]) -> tuple[int]:
+        # This creates a 1D grid of size (cdiv(M, BLOCK_SIZE_M) * cdiv(N, BLOCK_SIZE_N), )
+        return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
+    ```
 
-def grid(args: dict[str, Any]) -> tuple[int]:
-    # This creates a 1D grid of size (C * D, )
-    return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
+    The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
+    pid_m = tl.program_id(0)  # Gets the ID for the first dimension
+    pid_n = tl.program_id(1)  # Tries to get ID for a non-existent second dimension
 
-The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
-pid_m = tl.program_id(0)  # Gets the ID for the first dimension
-pid_n = tl.program_id(1)  # Tries to get ID for a non-existent second dimension
+10. **AMD GPU Specific Optimization Considerations:**
+    When implementing and optimizing the kernel, consider these critical AMD GPU characteristics:
+    *   AMD GPU wavefront size of 64 threads (different from NVIDIA's 32 threads) - ensure your block sizes are multiples of 64 for optimal occupancy
+    *   Memory coalescing patterns optimal for AMD architecture - sequential threads should access sequential memory locations
+    *   Shared memory bank conflicts which are more critical on AMD GPUs - use appropriate access patterns to avoid conflicts
+    *   Register usage optimization to avoid spills - keep register usage low to prevent performance degradation
+    *   Appropriate block sizes that align with AMD GPU architecture for better occupancy
+    *   Consider using AMD-specific intrinsics through `tl.inline_asm_elementwise` for maximum performance
 """