Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,9 @@
*/*/__pycache__/
*/*/*/*/__pycache__/
*/*/*/*/*/__pycache__/
__pycache__/*
temp

venv/*

*.pyc
35 changes: 32 additions & 3 deletions src/agents/reflexion_oneshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,29 @@ class Reflexion_Oneshot(Reflexion):
"""

def __init__(self, model: BaseModel, dataset, corpus_path, mem_file=None):
    """Store the inputs, build both BM25 retrievers, and initialise memories.

    Progress is reported through the module-level ``logger`` at INFO level
    before and after each stage.

    Args:
        model: Model wrapper kept on ``self.model``; its class name is logged.
        dataset: Kept on ``self.dataset``; must support ``len()`` because its
            size is logged.
        corpus_path: Passed as ``content_input_path`` to both retrievers.
        mem_file: Optional value forwarded unchanged to ``memory_init``.
    """
    # --- Describe the incoming configuration -------------------------------
    logger.info("Initializing Reflexion_Oneshot agent")
    logger.info(f"Model: {model.__class__.__name__}")
    logger.info(f"Dataset size: {len(dataset)}")
    logger.info(f"Corpus path: {corpus_path}")

    # --- Plain state --------------------------------------------------------
    # ``memories`` starts empty; ``memory_init`` runs last and the resulting
    # count is logged afterwards.
    self.model, self.dataset, self.memories = model, dataset, []

    # --- Retriever built with the default mode ------------------------------
    logger.info("Initializing instruction retriever")
    instruction_retriever = self.instruction_retriever = BM25Retriever()
    instruction_retriever.process(content_input_path=corpus_path)
    logger.info("Instruction retriever initialized successfully")

    # --- Retriever built with mode="code" over the same corpus path ---------
    logger.info("Initializing code retriever")
    code_retriever = self.code_retriever = BM25Retriever(mode="code")
    code_retriever.process(content_input_path=corpus_path)
    logger.info("Code retriever initialized successfully")

    # --- Memories -----------------------------------------------------------
    logger.info("Initializing memories")
    self.memory_init(mem_file)
    logger.info(f"Memories initialized successfully, count: {len(self.memories)}")
    logger.info("Reflexion_Oneshot agent initialization completed")

def memory_init(self, mem_file=None):
class Memory(metaclass=MemoryClassMeta, field_names=["ps",
Expand Down Expand Up @@ -199,8 +212,16 @@ def generate_solution(self, mem, temperature=0):
msg = [
{"role": "user", "content": text},
]
response = self.model.generate(msg, temperature=temperature)
mem.ps.solution = clear_code(response)

# Log before calling the model
logger.info(f"Calling model {self.model.__class__.__name__} for {mem.ps.filename}")
try:
response = self.model.generate(msg, temperature=temperature)
mem.ps.solution = clear_code(response)
logger.info(f"Successfully generated solution for {mem.ps.filename}")
except Exception as e:
logger.error(f"Failed to generate solution for {mem.ps.filename}: {str(e)}")
raise

return

Expand All @@ -220,4 +241,12 @@ def generate_reflexion(self, mem, temperature):
"content": reflect_txt
}
]
mem.reflection = self.model.generate(reflect_msg, temperature=temperature)

# Log before calling the model
logger.info(f"Calling model {self.model.__class__.__name__} for reflection on {mem.ps.filename}")
try:
mem.reflection = self.model.generate(reflect_msg, temperature=temperature)
logger.info(f"Successfully generated reflection for {mem.ps.filename}")
except Exception as e:
logger.error(f"Failed to generate reflection for {mem.ps.filename}: {str(e)}")
raise
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,14 @@ def __init__(

self.input_tensors = []
self.do_bench_config = do_bench_config()
# Add the folder_path attribute; the default value will be replaced in run_benchmark
self.folder_path = "../outputs/optimagent_gpt41_tritonbench_perf_results"

def get_input_tensors(self):
    """Placeholder hook that a subclass is expected to override.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    message = "You must implement this method to get input tensors"
    raise NotImplementedError(message)

def to_cuda(self, input_tensor):
    """Placeholder hook that a subclass is expected to override.

    Args:
        input_tensor: The value to be moved to CUDA; unused in this base
            implementation.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    # The message names this hook (not get_input_tensors) so that a missing
    # override is easy to identify from the traceback.
    raise NotImplementedError("You must implement this method to move tensors to CUDA")

def call_op(self, input_tensor):
    """Placeholder hook that a subclass is expected to override.

    Args:
        input_tensor: The value the operator would be called with; unused in
            this base implementation.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    message = "You must implement this method to call the op"
    raise NotImplementedError(message)
Expand Down Expand Up @@ -97,7 +99,9 @@ def get_do_bench_config(self, warmup=None, rep=None):
previous_ms = ms

print("MS did not stabilize. Returning default config.")
raise NotImplementedError("You must implement this method to make the runtime stable")
# Instead of raising an error, set default config
self.do_bench_config = do_bench_config()
return

def get_runtime(self, op: Callable):
ms, min_ms, max_ms = triton.testing.do_bench(
Expand All @@ -121,7 +125,8 @@ def run_benchmark(self):
try:
input_tensor = self.to_cuda(input_tensor_)
# print(input_tensor)
op = lambda : self.call_op(input_tensor)
input_tensor_clone = input_tensor # Second name for the same object (no copy is made), so the lambda does not depend on later rebinding of input_tensor
op = lambda : self.call_op(input_tensor_clone)
ms = self.get_runtime(op)
gbps = self.get_gbps(input_tensor, ms)
tflops = self.get_tflops(input_tensor, ms)
Expand All @@ -136,8 +141,12 @@ def run_benchmark(self):
except Exception as e:
print(f"Failed to run benchmark for input tensor. Error: {e}")
input_tensor = None
folder_path = "../outputs/optimagent_gpt41_tritonbench_perf_results"

# Ensure the folder_path directory exists
if not os.path.exists(self.folder_path):
os.makedirs(self.folder_path, exist_ok=True)

file_name = self.op_name + ".json"
file_path = os.path.join(folder_path, file_name)
file_path = os.path.join(self.folder_path, file_name)
with open(file_path, 'w', encoding='utf8') as f:
json.dump(results, f, indent=4)
json.dump(results, f, indent=4)
15 changes: 12 additions & 3 deletions src/main_reflexion_oneshot.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,24 @@

import os
import logging
from agents.reflexion_oneshot import Reflexion_Oneshot
from models.KimiK2 import KimiK2Model
from dataloaders.TritonBench import TritonBench
from args_config import load_config

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def main():
args = load_config("configs/tritonbench_oneshot_config.yaml")
args = load_config("/hackathon-agent/src/configs/tritonbench_oneshot_config.yaml")

# setup LLM model
#model = OpenAIModel(api_key=args.api_key, model_id=args.model_id)
logger.info("Initializing KimiK2Model...")
model = KimiK2Model(api_key=args.api_key, model_id=args.model_id)
logger.info("KimiK2Model initialized successfully")

# setup dataset
result_path = None
dataset = TritonBench(statis_path=args.statis_path,
Expand All @@ -25,11 +32,13 @@ def main():
target_kernels=args.target_kernels)

# setup agent
logger.info("Initializing Reflexion_Oneshot agent...")
agent = Reflexion_Oneshot(model=model, dataset=dataset, corpus_path=args.corpus_path)
logger.info("Reflexion_Oneshot agent initialized successfully")

# run the agent
agent.run(output_path=args.output_path, multi_thread=args.multi_thread, iteration_num=args.max_iteration, temperature=args.temperature, datalen=None)


if __name__ == "__main__":
main()
main()
19 changes: 15 additions & 4 deletions src/models/KimiK2.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@
from typing import List
import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential
import logging

from models.Base import BaseModel

# Create the module-level logger
logger = logging.getLogger(__name__)

class KimiK2Model(BaseModel):
def __init__(self,
Expand All @@ -25,6 +28,7 @@ def __init__(self,
#api_key = "wisemodel-xxvqzbsnecjtoxufxodx",
api_key=api_key,
base_url = "https://laiyeapi.aifoundrys.com:7443/v1",
# base_url = "https://api.moonshot.cn/v1",
default_headers = headers
)

Expand All @@ -35,6 +39,9 @@ def generate(self,
presence_penalty=0,
frequency_penalty=0,
max_tokens=5000) -> str:
logger.info(f"Sending request to model {self.model_id} with {len(messages)} messages")
logger.debug(f"Messages content: {messages}")

response = self.client.chat.completions.create(
model=self.model_id,
messages=messages,
Expand All @@ -44,8 +51,12 @@ def generate(self,
)

if not response or not hasattr(response, 'choices') or len(response.choices) == 0:
raise ValueError("No response choices returned from the API.")

return response.choices[0].message.content

error_msg = "No response choices returned from the API."
logger.error(error_msg)
raise ValueError(error_msg)

result = response.choices[0].message.content
logger.info(f"Received response from model {self.model_id}, response length: {len(result)} characters")
logger.debug(f"Response content: {result[:200]}..." if len(result) > 200 else f"Response content: {result}")

return result
106 changes: 67 additions & 39 deletions src/prompts/prompt_for_generation.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@

prompt = """
You are an expert Python programmer specializing in NVIDIA Triton kernels, specifically targeting **AMD GPUs using the ROCm environment**.
Your task is to generate a Python code snippet containing a Triton kernel based on the following request:
You are an expert Python programmer specializing in Triton kernels for high-performance computing, with deep expertise in AMD GPU optimization using the ROCm environment.
Your task is to generate a Python code snippet containing a high-performance Triton kernel based on the following request, specifically optimized for AMD GPUs:

**Target Platform:** AMD GPU (ROCm)

Expand Down Expand Up @@ -38,20 +37,35 @@
* **`tl.arange`:** Arguments `start` and `end` **must be `tl.constexpr`**.
* **Math:** Use functions from `tl.math` where available (e.g., `tl.math.exp`, `tl.math.sqrt`). Check function existence; avoid assuming functions like `tanh` or `log1p` exist if they don't in `tl.math`.
8. **Triton Version:** Assume Triton version 3.1.0 or later.
9. **AMD GPU Optimization Guidelines:**
* Consider wavefront size of 64 threads for AMD GPUs (different from NVIDIA's 32 threads).
* Optimize memory access patterns for AMD's memory hierarchy to ensure coalesced access.
* Pay attention to shared memory bank conflicts which are more critical on AMD GPUs - try to access shared memory in a strided pattern that avoids conflicts.
* Use appropriate block sizes that align with AMD GPU architecture (e.g., multiple of 64 for wavefront efficiency).
* Consider using `tl.inline_asm_elementwise` for AMD-specific intrinsics if needed.
* Minimize register pressure to avoid spills which significantly impact performance on AMD GPUs.
10. **Performance Optimization:**
* Implement autotuning when possible with sensible default values for BLOCK_M, BLOCK_N, BLOCK_K, num_warps, and num_stages.
* Consider memory coalescing for global memory accesses.
* Minimize divergent branching within wavefronts.
* Optimize data reuse in shared memory.
* Consider using tensor cores (MFMA instructions) on AMD GPUs when applicable.

**FINAL VERIFICATION:**
Before completing, verify:
1. ALL functions defined in the code have EXACT signatures matching the required function signatures above.
2. ALL function calls exactly match their definitions in terms of parameter counts and names.
3. No functions are called without being defined.
4. No parameters are missing from your implementations.
5. The code follows AMD GPU optimization guidelines.
6. Autotuning configurations are properly set up if applicable.

**Generated AMD ROCm Compatible Triton Kernel Code:**
"""

prompt_rocm = """
You are an expert Python programmer specializing in NVIDIA Triton kernels, specifically targeting **AMD GPUs using the ROCm environment**.
Your task is to generate a Python code snippet containing a Triton kernel based on the following request:
You are an expert Python programmer specializing in Triton kernels for high-performance computing, with deep expertise in AMD GPU optimization using the ROCm environment.
Your task is to generate a Python code snippet containing a high-performance Triton kernel based on the following request, specifically optimized for AMD GPUs:

**Target Platform:** AMD GPU (ROCm)

Expand Down Expand Up @@ -88,44 +102,58 @@
* **`tl.arange`:** Arguments `start` and `end` **must be `tl.constexpr`**.
* **Math:** Use functions from `tl.math` where available (e.g., `tl.math.exp`, `tl.math.sqrt`). Check function existence; avoid assuming functions like `tanh` or `log1p` exist if they don't in `tl.math`.
8. **Triton Version:** Assume Triton version 3.2.0 or later.
9. Maximize performance by exploring the following:
i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps.
ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability.
iii. exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.
Primary Autotuning Fields (Mandatory)
1. BLOCK_M, BLOCK_N, BLOCK_K
* Tile sizes for GEMM or other tensor contractions.
* Larger blocks improve compute density, but reduce grid-level parallelism.
* Explore wide range of values like:
* BLOCK: [32, ..., 128, ..., 2048, ...]
* Adjust based on memory reuse and L2 cache locality.
2. num_stages=n
* Controls pipeline depth for kernel execution.
* Rules for setting this:
* 1 if no GEMM.
* 2 if a single GEMM (e.g., GEMM + ReLU).
* 1 if two GEMMs are fused (e.g., Flash Attention).
* Optimize for latency and execution overlap.
3. num_warps
* Controls number of warps (groups of 64 threads) to launch per block.
* If it is too low then underutilization -> kernel runs slow.
* If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
* You must choose a sweet spot by trying out integer range of 1 to 16.
* You MUST NOT try the range beyond 16, it is NOT VALID.
Examples of Autotuning Setup
Here's how Triton kernels should be decorated to allow autotuning:
9. **Performance Optimization Strategy:**
Maximize performance by exploring the following:
i. Autotuning key parameters BLOCK_SIZE, num_stages, num_warps.
ii. Better algorithmic implementation (e.g., naive softmax vs online softmax vs fused softmax), better memory access patterns and numerical stability.
iii. Exploring all possible operator fusion strategies within the kernel while adhering to resource constraints.

**Primary Autotuning Fields (Mandatory)**
1. BLOCK_M, BLOCK_N, BLOCK_K
* Tile sizes for GEMM or other tensor contractions.
* Larger blocks improve compute density, but reduce grid-level parallelism.
* Explore wide range of values like:
* BLOCK: [32, 64, 128, 256, 512] - optimal values for AMD GPU wavefront efficiency
* Adjust based on memory reuse and L2 cache locality.
2. num_stages=n
* Controls pipeline depth for kernel execution.
* Rules for setting this:
* 1 if no GEMM.
* 2 if a single GEMM (e.g., GEMM + ReLU).
* 1 if two GEMMs are fused (e.g., Flash Attention).
* Optimize for latency and execution overlap.
3. num_warps
* Controls number of warps (groups of 64 threads) to launch per block.
* If it is too low then underutilization -> kernel runs slow.
* If it is too high then register spill happens and shared memory is overused -> kernel runs slow.
* You must choose a sweet spot by trying out integer range of 1 to 16.
* You MUST NOT try the range beyond 16, it is NOT VALID.

**Examples of Autotuning Setup**
Here's how Triton kernels should be decorated to allow autotuning:
* key argument indicates the variables that change and trigger autotune to re-run. This is a must argument and you must not miss this.
* BLOCK_M refers to the chunk of variable M that will be used for compute by a thread at a time.
* You must ensure that variables used in the triton.Config should not be passed as arguments to the triton kernel.
For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.
For example: the following autotune config receives BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K, GROUP_SIZE_M, num_warps, and num_stages as input arguments. Hence the triton kernel must not receive these arguments as inputs in the wrapper function. You must comment/delete any such instances.

NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.

NOTE: If you face kernel timeout issues, check if Grid and Program ID Mismatch exists or not for example The kernel is launched with a 1-dimensional (1D) grid, but inside the kernel, it attempts to read program IDs from a 2-dimensional (2D) grid etc.
```python
def grid(args: dict[str, Any]) -> tuple[int]:
# This creates a 1D grid of size (C * D, )
return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
```

def grid(args: dict[str, Any]) -> tuple[int]:
# This creates a 1D grid of size (C * D, )
return (triton.cdiv(M, args["BLOCK_SIZE_M"]) * triton.cdiv(N, args["BLOCK_SIZE_N"]), )
The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
pid_m = tl.program_id(0) # Gets the ID for the first dimension
pid_n = tl.program_id(1) # Tries to get ID for a non-existent second dimension

The grid is calculated as a single integer, creating a 1D grid, however the kernel might try to get two separate program IDs, pid_m and pid_n, as if it were a 2D grid:
pid_m = tl.program_id(0) # Gets the ID for the first dimension
pid_n = tl.program_id(1) # Tries to get ID for a non-existent second dimension
10. **AMD GPU Specific Optimization Considerations:**
When implementing and optimizing the kernel, consider these critical AMD GPU characteristics:
* AMD GPU wavefront size of 64 threads (different from NVIDIA's 32 threads) - ensure your block sizes are multiples of 64 for optimal occupancy
* Memory coalescing patterns optimal for AMD architecture - sequential threads should access sequential memory locations
* Shared memory bank conflicts which are more critical on AMD GPUs - use appropriate access patterns to avoid conflicts
* Register usage optimization to avoid spills - keep register usage low to prevent performance degradation
* Appropriate block sizes that align with AMD GPU architecture for better occupancy
* Consider using AMD-specific intrinsics through `tl.inline_asm_elementwise` for maximum performance
"""
Loading