Skip to content

Commit

Permalink
Refactor: Enhance PR diff retrieval to fetch overall and per-commit d…
Browse files Browse the repository at this point in the history
…iffs via GitHub API, enabling more detailed review.
  • Loading branch information
utsmannn committed Jan 25, 2025
1 parent 1b47247 commit 4b5d1bf
Showing 1 changed file with 161 additions and 72 deletions.
233 changes: 161 additions & 72 deletions src/review.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import json
import time
import html
import base64
from github import Github
import google.generativeai as genai
from google.ai.generativelanguage_v1beta.types import content
Expand Down Expand Up @@ -32,7 +33,11 @@ def wrapper(*args, **kwargs):


def get_pr_number() -> int:
pr_num = os.getenv('GITHUB_REF').split('/')[-2]
"""Retrieve the PR number from environment variables"""
pr_ref = os.getenv('GITHUB_REF', '')
if not pr_ref:
raise ValueError("Environment variable GITHUB_REF not found.")
pr_num = pr_ref.split('/')[-2]
return int(pr_num)


Expand All @@ -42,47 +47,74 @@ def sanitize_input(text: str, max_length=2000) -> str:


@retry(max_retries=3, delay=2)
def get_pr_diff() -> str:
"""Retrieve PR diff from GitHub API"""
def get_pr_diff() -> dict:
"""Retrieve PR diffs from GitHub API"""
try:
pr_number = get_pr_number()
repo_name = os.getenv('GITHUB_REPOSITORY')
github_token = os.getenv('GITHUB_TOKEN')

if not repo_name or not github_token:
raise ValueError("Environment variables GITHUB_REPOSITORY or GITHUB_TOKEN not found.")

headers = {
'Authorization': f'Bearer {github_token}',
'Accept': 'application/vnd.github.v3.diff'
}

response = requests.get(
# Fetch the overall diff between the target branch and the PR branch
overall_diff_response = requests.get(
f'https://api.github.com/repos/{repo_name}/pulls/{pr_number}',
headers=headers
)
response.raise_for_status()
overall_diff_response.raise_for_status()
overall_diff = overall_diff_response.text

# Fetch the list of commits in the PR
commits_response = requests.get(
f'https://api.github.com/repos/{repo_name}/pulls/{pr_number}/commits',
headers=headers
)
commits_response.raise_for_status()
commits = commits_response.json()

commit_diffs = []
for i in range(1, len(commits)):
current_commit = commits[i]
previous_commit = commits[i - 1]

current_sha = current_commit['sha']
previous_sha = previous_commit['sha']

# Fetch the diff for the current commit compared to the previous commit
commit_diff_response = requests.get(
f'https://api.github.com/repos/{repo_name}/compare/{previous_sha}...{current_sha}',
headers={'Authorization': f'Bearer {github_token}', 'Accept': 'application/vnd.github.v3.diff'}
)
commit_diff_response.raise_for_status()
commit_diff = commit_diff_response.text

commit_diffs.append({
'commit_sha': current_sha,
'commit_message': current_commit['commit']['message'].strip(),
'diff': commit_diff
})

return response.text
return {
'overall_diff': overall_diff,
'commit_diffs': commit_diffs
}

except requests.exceptions.RequestException as e:
raise RuntimeError(f"Failed to fetch PR diff: {str(e)}")
raise RuntimeError(f"Failed to fetch PR diffs: {str(e)}")

def find_line_number(file_path: str, line_str: str) -> int:
"""Cari line number berdasarkan konten line string"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
for line_num, line in enumerate(f, 1):
# Normalisasi whitespace untuk matching lebih akurat
if line.strip() == line_str.strip():
return line_num
return -1 # Jika tidak ditemukan
except Exception as e:
raise RuntimeError(f"Gagal baca file {file_path}: {str(e)}")

@retry(max_retries=3, delay=2)
def generate_review(diff: str, model_name: str, custom_instructions: str) -> dict:
"""Generate structured code review using Gemini"""
print("diff:")
print(diff)
print("diff===")
"""Generate structured code review using Gemini"""
try:
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))

Expand Down Expand Up @@ -126,7 +158,7 @@ def generate_review(diff: str, model_name: str, custom_instructions: str) -> dic
model = genai.GenerativeModel(
model_name=model_name,
generation_config=generation_config,
system_instruction="you are git diff analyzer for give me review line per line code"
system_instruction="You are a git diff analyzer that provides line-by-line code reviews."
)

safe_diff = sanitize_input(diff, 50000)
Expand All @@ -137,32 +169,34 @@ def generate_review(diff: str, model_name: str, custom_instructions: str) -> dic
{safe_diff}
**Requirements:**
- Berikan line string yang spesifik dari kode
- Contoh response:
{instructions}
- Example:
{{
"response": [
{{
"comment": "Potential SQL injection",
"file_path": "src/db.py",
"line_string": "query = f\"SELECT * FROM users WHERE id = user_id\""
}}
]
],
"summary_advice": "Overall recommendations"
}}"""
response = model.generate_content(prompt)
raw_json = response.text.strip().replace('```json', '').replace('```', '')

# Validate JSON structure
result = json.loads(response.text)
result = json.loads(raw_json)
print("result:")
print(result)
print("result====")
# if not all(key in result for key in ['response', 'summary_advice']):
# raise ValueError("Invalid response structure")
#
# for comment in result['response']:
# if not all(k in comment for k in ['comment', 'file_path']):
# raise ValueError("Invalid comment format")
# comment['line'] = comment.get('line', 0)

# Validate that required fields exist
if not all(key in result for key in ['response', 'summary_advice']):
raise ValueError("Invalid response structure")

for comment in result['response']:
if not all(k in comment for k in ['comment', 'file_path', 'line_string']):
raise ValueError("Invalid comment format")

return result

Expand All @@ -174,56 +208,99 @@ def generate_review(diff: str, model_name: str, custom_instructions: str) -> dic

@retry(max_retries=2, delay=3)
def post_comment(comments: list):
"""Post comments dengan offset +4 dan pengecekan batas"""
"""Post comments by finding accurate line numbers based on line_string"""
try:
pr_number = get_pr_number()
github_token = os.getenv('GITHUB_TOKEN')
repo_name = os.getenv('GITHUB_REPOSITORY')

if not github_token or not repo_name:
raise ValueError("Environment variables GITHUB_TOKEN or GITHUB_REPOSITORY not found.")

g = Github(github_token)
repo = g.get_repo(repo_name)
pr = repo.get_pull(pr_number)
files = pr.get_files() # Dapatkan list file yang diubah
files = pr.get_files() # Get list of changed files

comment_payload = []

for comment in comments:
# Adjust line number dengan offset +4
original_line = find_line_number(comment['file_path'], comment['line_string'])
adjusted_line = original_line + 3 # Offset disesuaikan dengan pola diff
file_path = comment['file_path']
line_string = comment['line_string']
issue_comment = comment['comment']

# Dapatkan diff untuk file yang bersangkutan
target_file = next((f for f in files if f.filename == comment['file_path']), None)
# Find the file in the PR
target_file = next((f for f in files if f.filename == file_path), None)

if not target_file or not target_file.patch:
print(f"File {comment['file_path']} tidak ditemukan dalam PR")
print(f"File {file_path} not found in PR or does not have a patch.")
continue

# Get the latest content of the file from the PR's head commit
file_contents_response = requests.get(
f"https://api.github.com/repos/{repo_name}/contents/{file_path}?ref={pr.head.sha}",
headers={'Authorization': f'Bearer {github_token}'}
)
if file_contents_response.status_code != 200:
print(f"Failed to fetch contents of file {file_path}: {file_contents_response.status_code}")
continue

# Hitung jumlah lines dalam diff
diff_lines = target_file.patch.split('\n')
max_position = len(diff_lines)
file_contents = file_contents_response.json()
file_text = base64.b64decode(file_contents['content']).decode('utf-8').splitlines()

# Pastikan adjusted_line tidak melebihi batas diff
safe_position = min(adjusted_line, max_position)
# Find the line number based on line_string
line_number = None
for idx, line in enumerate(file_text, start=1):
if line.strip() == line_string.strip():
line_number = idx
break

# Jika line asli 0, jangan gunakan offset
final_position = safe_position if original_line > 0 else original_line
if line_number is None:
print(f"Line string '{line_string}' not found in file {file_path}.")
continue

# Add comment with accurate line number
comment_payload.append({
"path": comment['file_path'],
"position": final_position,
"body": f"**Finding**: {comment['comment']}\n(Original line: {original_line})"
"path": file_path,
"position": line_number, # Accurate line number
"body": f"**Finding**: {issue_comment}\n(Line: {line_number})"
})

pr.create_review(
event="COMMENT",
comments=comment_payload
)
if comment_payload:
pr.create_review(
event="COMMENT",
comments=comment_payload
)
else:
print("No comments to post.")

except Exception as e:
raise RuntimeError(f"Failed to post comments: {str(e)}")


@retry(max_retries=2, delay=3)
def post_summary(summary: str, footer_text: str):
"""Post the summary advice as an issue comment"""
try:
pr_number = get_pr_number()
github_token = os.getenv('GITHUB_TOKEN')
repo_name = os.getenv('GITHUB_REPOSITORY')

if not github_token or not repo_name:
raise ValueError("Environment variables GITHUB_TOKEN or GITHUB_REPOSITORY not found.")

g = Github(github_token)
repo = g.get_repo(repo_name)
pr = repo.get_pull(pr_number)

pr.create_issue_comment(
f"## 📝 {footer_text}\n\n{summary}"
)

except Exception as e:
raise RuntimeError(f"Failed to post summary comment: {str(e)}")


def main():
"""Main execution workflow"""
try:
Expand All @@ -233,30 +310,42 @@ def main():
max_diff_size = int(os.getenv('INPUT_MAX_DIFF_SIZE', '100000'))
footer_text = os.getenv('INPUT_FOOTER_TEXT', 'AI Code Review Report')

# Get PR diff
diff_content = get_pr_diff()
if len(diff_content) > max_diff_size:
print(f"⚠️ Diff size ({len(diff_content)} bytes) exceeds limit")
# Get PR diffs
diffs = get_pr_diff()
overall_diff = diffs['overall_diff']
commit_diffs = diffs['commit_diffs']

if len(overall_diff) > max_diff_size:
print(f"⚠️ Diff size ({len(overall_diff)} bytes) exceeds limit")
return

# Generate review
review_data = generate_review(diff_content, model_name, custom_instructions)
# Generate review for overall diff
overall_review = generate_review(overall_diff, model_name, custom_instructions)

# Post individual comments
# Post summary for overall review
post_summary(overall_review['summary_advice'], footer_text)

print("Debug Overall Review Data:")
print(overall_review)
print("Debug Overall Review Data End")

# Post summary
g = Github(os.getenv('GITHUB_TOKEN'))
repo = g.get_repo(os.getenv('GITHUB_REPOSITORY'))
pr = repo.get_pull(get_pr_number())
pr.create_issue_comment(
f"## 📝 {footer_text}\n\n{review_data['summary_advice']}"
)
# Post individual comments for overall diff
post_comment(overall_review['response'])

# Generate and post reviews for each commit diff
for commit_diff in commit_diffs:
print(f"Processing commit: {commit_diff['commit_sha']} - {commit_diff['commit_message']}")
commit_review = generate_review(commit_diff['diff'], model_name, custom_instructions)

# Post summary for commit review
post_summary(commit_review['summary_advice'], footer_text)

print("Debug Commit Review Data:")
print(commit_review)
print("Debug Commit Review Data End")

print("asuuu..")
print(review_data)
print("asuuu..0")
post_comment(review_data['response'])
# Post individual comments for commit diff
post_comment(commit_review['response'])

print("✅ Review completed successfully")

Expand All @@ -266,4 +355,4 @@ def main():


if __name__ == "__main__":
main()
main()

0 comments on commit 4b5d1bf

Please sign in to comment.