Refactor: Enhance PR diff retrieval to fetch overall and per-commit d…

…iffs via GitHub API, enabling more detailed review.
utsmannn · Jan 25, 2025 · 4b5d1bf · 4b5d1bf
1 parent 1b47247
commit 4b5d1bf
Showing 1 changed file with 161 additions and 72 deletions.
diff --git a/src/review.py b/src/review.py
@@ -3,6 +3,7 @@
 import json
 import time
 import html
+import base64
 from github import Github
 import google.generativeai as genai
 from google.ai.generativelanguage_v1beta.types import content
@@ -32,7 +33,11 @@ def wrapper(*args, **kwargs):
 
 
 def get_pr_number() -> int:
-    pr_num = os.getenv('GITHUB_REF').split('/')[-2]
+    """Retrieve the PR number from environment variables"""
+    pr_ref = os.getenv('GITHUB_REF', '')
+    if not pr_ref:
+        raise ValueError("Environment variable GITHUB_REF not found.")
+    pr_num = pr_ref.split('/')[-2]
     return int(pr_num)
 
 
@@ -42,47 +47,74 @@ def sanitize_input(text: str, max_length=2000) -> str:
 
 
 @retry(max_retries=3, delay=2)
-def get_pr_diff() -> str:
-    """Retrieve PR diff from GitHub API"""
+def get_pr_diff() -> dict:
+    """Retrieve PR diffs from GitHub API"""
     try:
         pr_number = get_pr_number()
         repo_name = os.getenv('GITHUB_REPOSITORY')
         github_token = os.getenv('GITHUB_TOKEN')
 
+        if not repo_name or not github_token:
+            raise ValueError("Environment variables GITHUB_REPOSITORY or GITHUB_TOKEN not found.")
+
         headers = {
             'Authorization': f'Bearer {github_token}',
             'Accept': 'application/vnd.github.v3.diff'
         }
 
-        response = requests.get(
+        # Fetch the overall diff between the target branch and the PR branch
+        overall_diff_response = requests.get(
             f'https://api.github.com/repos/{repo_name}/pulls/{pr_number}',
             headers=headers
         )
-        response.raise_for_status()
+        overall_diff_response.raise_for_status()
+        overall_diff = overall_diff_response.text
+
+        # Fetch the list of commits in the PR
+        commits_response = requests.get(
+            f'https://api.github.com/repos/{repo_name}/pulls/{pr_number}/commits',
+            headers=headers
+        )
+        commits_response.raise_for_status()
+        commits = commits_response.json()
+
+        commit_diffs = []
+        for i in range(1, len(commits)):
+            current_commit = commits[i]
+            previous_commit = commits[i - 1]
+
+            current_sha = current_commit['sha']
+            previous_sha = previous_commit['sha']
+
+            # Fetch the diff for the current commit compared to the previous commit
+            commit_diff_response = requests.get(
+                f'https://api.github.com/repos/{repo_name}/compare/{previous_sha}...{current_sha}',
+                headers={'Authorization': f'Bearer {github_token}', 'Accept': 'application/vnd.github.v3.diff'}
+            )
+            commit_diff_response.raise_for_status()
+            commit_diff = commit_diff_response.text
+
+            commit_diffs.append({
+                'commit_sha': current_sha,
+                'commit_message': current_commit['commit']['message'].strip(),
+                'diff': commit_diff
+            })
 
-        return response.text
+        return {
+            'overall_diff': overall_diff,
+            'commit_diffs': commit_diffs
+        }
 
     except requests.exceptions.RequestException as e:
-        raise RuntimeError(f"Failed to fetch PR diff: {str(e)}")
+        raise RuntimeError(f"Failed to fetch PR diffs: {str(e)}")
 
-def find_line_number(file_path: str, line_str: str) -> int:
-    """Cari line number berdasarkan konten line string"""
-    try:
-        with open(file_path, 'r', encoding='utf-8') as f:
-            for line_num, line in enumerate(f, 1):
-                # Normalisasi whitespace untuk matching lebih akurat
-                if line.strip() == line_str.strip():
-                    return line_num
-        return -1  # Jika tidak ditemukan
-    except Exception as e:
-        raise RuntimeError(f"Gagal baca file {file_path}: {str(e)}")
 
 @retry(max_retries=3, delay=2)
 def generate_review(diff: str, model_name: str, custom_instructions: str) -> dict:
+    """Generate structured code review using Gemini"""
     print("diff:")
     print(diff)
     print("diff===")
-    """Generate structured code review using Gemini"""
     try:
         genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
 
@@ -126,7 +158,7 @@ def generate_review(diff: str, model_name: str, custom_instructions: str) -> dic
         model = genai.GenerativeModel(
             model_name=model_name,
             generation_config=generation_config,
-            system_instruction="you are git diff analyzer for give me review line per line code"
+            system_instruction="You are a git diff analyzer that provides line-by-line code reviews."
         )
 
         safe_diff = sanitize_input(diff, 50000)
@@ -137,32 +169,34 @@ def generate_review(diff: str, model_name: str, custom_instructions: str) -> dic
 {safe_diff}
 
 **Requirements:**
-- Berikan line string yang spesifik dari kode
-- Contoh response: 
+{instructions}
+- Example: 
 {{
   "response": [
     {{
       "comment": "Potential SQL injection",
       "file_path": "src/db.py",
       "line_string": "query = f\"SELECT * FROM users WHERE id = user_id\""
     }}
-  ]
+  ],
+  "summary_advice": "Overall recommendations"
 }}"""
         response = model.generate_content(prompt)
         raw_json = response.text.strip().replace('```json', '').replace('```', '')
 
         # Validate JSON structure
-        result = json.loads(response.text)
+        result = json.loads(raw_json)
         print("result:")
         print(result)
         print("result====")
-        # if not all(key in result for key in ['response', 'summary_advice']):
-        #     raise ValueError("Invalid response structure")
-        #
-        # for comment in result['response']:
-        #     if not all(k in comment for k in ['comment', 'file_path']):
-        #         raise ValueError("Invalid comment format")
-        #     comment['line'] = comment.get('line', 0)
+
+        # Validate that required fields exist
+        if not all(key in result for key in ['response', 'summary_advice']):
+            raise ValueError("Invalid response structure")
+
+        for comment in result['response']:
+            if not all(k in comment for k in ['comment', 'file_path', 'line_string']):
+                raise ValueError("Invalid comment format")
 
         return result
 
@@ -174,56 +208,99 @@ def generate_review(diff: str, model_name: str, custom_instructions: str) -> dic
 
 @retry(max_retries=2, delay=3)
 def post_comment(comments: list):
-    """Post comments dengan offset +4 dan pengecekan batas"""
+    """Post comments by finding accurate line numbers based on line_string"""
     try:
         pr_number = get_pr_number()
         github_token = os.getenv('GITHUB_TOKEN')
         repo_name = os.getenv('GITHUB_REPOSITORY')
 
+        if not github_token or not repo_name:
+            raise ValueError("Environment variables GITHUB_TOKEN or GITHUB_REPOSITORY not found.")
+
         g = Github(github_token)
         repo = g.get_repo(repo_name)
         pr = repo.get_pull(pr_number)
-        files = pr.get_files()  # Dapatkan list file yang diubah
+        files = pr.get_files()  # Get list of changed files
 
         comment_payload = []
 
         for comment in comments:
-            # Adjust line number dengan offset +4
-            original_line = find_line_number(comment['file_path'], comment['line_string'])
-            adjusted_line = original_line + 3  # Offset disesuaikan dengan pola diff
+            file_path = comment['file_path']
+            line_string = comment['line_string']
+            issue_comment = comment['comment']
 
-            # Dapatkan diff untuk file yang bersangkutan
-            target_file = next((f for f in files if f.filename == comment['file_path']), None)
+            # Find the file in the PR
+            target_file = next((f for f in files if f.filename == file_path), None)
 
             if not target_file or not target_file.patch:
-                print(f"File {comment['file_path']} tidak ditemukan dalam PR")
+                print(f"File {file_path} not found in PR or does not have a patch.")
+                continue
+
+            # Get the latest content of the file from the PR's head commit
+            file_contents_response = requests.get(
+                f"https://api.github.com/repos/{repo_name}/contents/{file_path}?ref={pr.head.sha}",
+                headers={'Authorization': f'Bearer {github_token}'}
+            )
+            if file_contents_response.status_code != 200:
+                print(f"Failed to fetch contents of file {file_path}: {file_contents_response.status_code}")
                 continue
 
-            # Hitung jumlah lines dalam diff
-            diff_lines = target_file.patch.split('\n')
-            max_position = len(diff_lines)
+            file_contents = file_contents_response.json()
+            file_text = base64.b64decode(file_contents['content']).decode('utf-8').splitlines()
 
-            # Pastikan adjusted_line tidak melebihi batas diff
-            safe_position = min(adjusted_line, max_position)
+            # Find the line number based on line_string
+            line_number = None
+            for idx, line in enumerate(file_text, start=1):
+                if line.strip() == line_string.strip():
+                    line_number = idx
+                    break
 
-            # Jika line asli 0, jangan gunakan offset
-            final_position = safe_position if original_line > 0 else original_line
+            if line_number is None:
+                print(f"Line string '{line_string}' not found in file {file_path}.")
+                continue
 
+            # Add comment with accurate line number
             comment_payload.append({
-                "path": comment['file_path'],
-                "position": final_position,
-                "body": f"**Finding**: {comment['comment']}\n(Original line: {original_line})"
+                "path": file_path,
+                "position": line_number,  # Accurate line number
+                "body": f"**Finding**: {issue_comment}\n(Line: {line_number})"
             })
 
-        pr.create_review(
-            event="COMMENT",
-            comments=comment_payload
-        )
+        if comment_payload:
+            pr.create_review(
+                event="COMMENT",
+                comments=comment_payload
+            )
+        else:
+            print("No comments to post.")
 
     except Exception as e:
         raise RuntimeError(f"Failed to post comments: {str(e)}")
 
 
+@retry(max_retries=2, delay=3)
+def post_summary(summary: str, footer_text: str):
+    """Post the summary advice as an issue comment"""
+    try:
+        pr_number = get_pr_number()
+        github_token = os.getenv('GITHUB_TOKEN')
+        repo_name = os.getenv('GITHUB_REPOSITORY')
+
+        if not github_token or not repo_name:
+            raise ValueError("Environment variables GITHUB_TOKEN or GITHUB_REPOSITORY not found.")
+
+        g = Github(github_token)
+        repo = g.get_repo(repo_name)
+        pr = repo.get_pull(pr_number)
+
+        pr.create_issue_comment(
+            f"## 📝 {footer_text}\n\n{summary}"
+        )
+
+    except Exception as e:
+        raise RuntimeError(f"Failed to post summary comment: {str(e)}")
+
+
 def main():
     """Main execution workflow"""
     try:
@@ -233,30 +310,42 @@ def main():
         max_diff_size = int(os.getenv('INPUT_MAX_DIFF_SIZE', '100000'))
         footer_text = os.getenv('INPUT_FOOTER_TEXT', 'AI Code Review Report')
 
-        # Get PR diff
-        diff_content = get_pr_diff()
-        if len(diff_content) > max_diff_size:
-            print(f"⚠️ Diff size ({len(diff_content)} bytes) exceeds limit")
+        # Get PR diffs
+        diffs = get_pr_diff()
+        overall_diff = diffs['overall_diff']
+        commit_diffs = diffs['commit_diffs']
+
+        if len(overall_diff) > max_diff_size:
+            print(f"⚠️ Diff size ({len(overall_diff)} bytes) exceeds limit")
             return
 
-        # Generate review
-        review_data = generate_review(diff_content, model_name, custom_instructions)
+        # Generate review for overall diff
+        overall_review = generate_review(overall_diff, model_name, custom_instructions)
 
-        # Post individual comments
+        # Post summary for overall review
+        post_summary(overall_review['summary_advice'], footer_text)
 
+        print("Debug Overall Review Data:")
+        print(overall_review)
+        print("Debug Overall Review Data End")
 
-        # Post summary
-        g = Github(os.getenv('GITHUB_TOKEN'))
-        repo = g.get_repo(os.getenv('GITHUB_REPOSITORY'))
-        pr = repo.get_pull(get_pr_number())
-        pr.create_issue_comment(
-            f"## 📝 {footer_text}\n\n{review_data['summary_advice']}"
-        )
+        # Post individual comments for overall diff
+        post_comment(overall_review['response'])
+
+        # Generate and post reviews for each commit diff
+        for commit_diff in commit_diffs:
+            print(f"Processing commit: {commit_diff['commit_sha']} - {commit_diff['commit_message']}")
+            commit_review = generate_review(commit_diff['diff'], model_name, custom_instructions)
+
+            # Post summary for commit review
+            post_summary(commit_review['summary_advice'], footer_text)
+
+            print("Debug Commit Review Data:")
+            print(commit_review)
+            print("Debug Commit Review Data End")
 
-        print("asuuu..")
-        print(review_data)
-        print("asuuu..0")
-        post_comment(review_data['response'])
+            # Post individual comments for commit diff
+            post_comment(commit_review['response'])
 
         print("✅ Review completed successfully")
 
@@ -266,4 +355,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()