# Patch context (reconstructed from the mangled diff header):
#   "Add initial improver for collect repo fix commits" — this patch also
#   registers collect_repo_fix_commits_v2.CollectRepoFixCommitPipeline in
#   vulnerabilities/improvers/__init__.py alongside the other v2 improvers.
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# VulnerableCode is a trademark of nexB Inc.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/aboutcode-org/vulnerablecode for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
import bisect
import re
from collections import defaultdict
from typing import List
from typing import Optional
from typing import Tuple

from git import Commit
from git import Repo

from vulnerabilities.models import AdvisoryV2
from vulnerabilities.models import CodeFixV2
from vulnerabilities.pipelines import VulnerableCodePipeline

# CVE identifiers such as CVE-2024-1234; compiled once instead of per commit.
CVE_REGEX = re.compile(r"\bcve-[0-9]{4}-[0-9]{4,19}\b", re.IGNORECASE)


class CollectRepoFixCommitPipeline(VulnerableCodePipeline):
    """
    Collect vulnerability-fix commits from a git repository and store them as
    CodeFixV2 entries attached to advisories that mention the same CVE id.
    """

    pipeline_id = "repo_fix_commit_pipeline"
    repositories_url = "git+https://github.com/the-tcpdump-group/tcpdump"
    # FIXME: the repository location is hard-coded; make these configurable
    # pipeline inputs before enabling this pipeline outside local testing.
    repo_url = "https://github.com/the-tcpdump-group/tcpdump"
    repo_path = "/home/ziad-hany/PycharmProjects/tcpdump"

    @classmethod
    def steps(cls):
        return (
            cls.collect_fix_commits,
            cls.store_fix_commits,
        )

    def classify_commit_type(self, commit) -> str:
        """Return "root" (no parents), "normal" (one parent) or "merge"."""
        parent_count = len(commit.parents)
        if parent_count == 0:
            return "root"
        if parent_count == 1:
            return "normal"
        return "merge"

    def detect_fix_commit(self, commit) -> str:
        """
        Return "vulnerability_fix" when the commit message references a CVE
        identifier, and "other" otherwise.
        """
        if CVE_REGEX.search(commit.message.lower()):
            return "vulnerability_fix"
        return "other"

    def extract_cves(self, text: str) -> List[str]:
        """Return the de-duplicated, upper-cased CVE ids found in ``text``."""
        if not text:
            return []
        found = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE)
        return list({cve.upper() for cve in found})

    def get_previous_releases(
        self,
        release_tags_sorted: List[Tuple[str, int]],
        dates: List[int],
        commit_date: int,
    ) -> List[str]:
        """
        Return the tags of all releases dated strictly before ``commit_date``.

        ``release_tags_sorted`` is a list of (tag_name, committed_date) sorted
        by date; ``dates`` is the parallel, ascending list of dates.
        """
        index = bisect.bisect_left(dates, commit_date)
        return [tag for tag, _ in release_tags_sorted[:index]]

    def get_current_or_next_release(
        self,
        release_tags_sorted: List[Tuple[str, int]],
        dates: List[int],
        commit_date: int,
    ) -> Optional[str]:
        """
        Return the release tagged exactly at ``commit_date`` when one exists,
        otherwise the first release after it, or None when there is none.
        """
        # bisect_left already lands on an exact match when present, so one
        # bounds check covers both the "current" and the "next" case.
        index = bisect.bisect_left(dates, commit_date)
        if index < len(dates):
            return release_tags_sorted[index][0]
        return None

    def get_current_release(
        self, repo: Repo, commit: Commit, prev_release_by_date: Optional[str]
    ) -> str:
        """
        Return a best-effort, non-null release tag for ``commit``:
        1) the exact tag when the commit itself is tagged,
        2) the nearest reachable tag along the first-parent chain,
        3) ``prev_release_by_date`` as a date-based fallback,
        4) "NO_TAGS_AVAILABLE" when the repository has no tags at all.
        """
        try:
            return repo.git.describe("--tags", "--exact-match", commit.hexsha)
        except Exception:
            pass
        try:
            return repo.git.describe("--tags", "--abbrev=0", "--first-parent", commit.hexsha)
        except Exception:
            pass
        if prev_release_by_date:
            return prev_release_by_date
        return "NO_TAGS_AVAILABLE"

    def collect_fix_commits(self):
        """
        Walk every commit of the repository and map each CVE id mentioned in
        a commit message to the set of fix-commit URLs. The result is saved
        on ``self.fix_commits`` for the next step.
        """
        self.log("Processing git repository fix commits.")
        repo = Repo(self.repo_path)
        cve_to_urls = defaultdict(set)

        for commit in repo.iter_commits("--all"):
            # Root commits cannot be fixes; only normal and merge commits count.
            if self.classify_commit_type(commit) == "root":
                continue
            if self.detect_fix_commit(commit) != "vulnerability_fix":
                continue
            commit_url = f"{self.repo_url}/commit/{commit.hexsha}"
            for cve_id in self.extract_cves(commit.message):
                cve_to_urls[cve_id].add(commit_url)

        # Save results into pipeline state; sorted for deterministic output.
        self.fix_commits = {cve: sorted(urls) for cve, urls in cve_to_urls.items()}
        self.log(f"Found {len(self.fix_commits)} unique CVEs with fix commits.")

    def store_fix_commits(self):
        """Create CodeFixV2 rows linking collected fix commits to advisories."""
        fix_commits = getattr(self, "fix_commits", None)
        if not fix_commits:
            self.log("No fix commits collected. Run collect_fix_commits() first.")
            return

        created_fix_count = 0

        # FIXME: matching advisories by advisory_id suffix is fragile; switch
        # to the advisory alias relation once one is available.
        for vulnerability_id, commit_urls in fix_commits.items():
            advisories = AdvisoryV2.objects.filter(advisory_id__iendswith=vulnerability_id)

            if not advisories.exists():
                self.log(f"No advisories found for vulnerability_id: {vulnerability_id}")
                continue

            for adv in advisories:
                for impact in adv.impacted_packages.all():
                    for package in impact.affecting_packages.all():
                        for vcs_url in commit_urls:
                            code_fix, created = CodeFixV2.objects.get_or_create(
                                commits=[vcs_url],
                                advisory=adv,
                                affected_package=package,
                            )
                            if created:
                                created_fix_count += 1

        self.log(f"Stored {created_fix_count} new CodeFixV2 entries.")
"""
Standalone script that scans a local git checkout for vulnerability-fix
commits, associates each fix commit with the release tags around it, and
prints a JSON map of CVE id -> fix-commit URLs.
"""
import bisect
import json
import os
import re
from collections import defaultdict
from typing import List
from typing import Optional
from typing import Tuple

from git import Commit
from git import Repo

# Security-relevant commit-message patterns. Compiled once at import time;
# the original compiled this alternation for every commit scanned.
_SECURITY_PATTERNS = [
    # CVE identifiers
    r"\bcve-\d{4}-\d{4,}\b",
    # Explicitly marked security fixes
    r"\bsecurity fix\b",
    r"\bfix security issue\b",
    r"\bfix(?:es)? for security\b",
    # Permission / privilege escalation
    r"\bprivilege escalation\b",
    r"\bprivesc\b",
    r"\bescalat(?:e|ion) of privilege\b",
    # No New Privileges / unsafe exec
    r"\bno[- ]new[- ]privs\b",
    r"\bunsafe exec\b",
    # Refcount / UAF (classic kernel vulns, almost always security)
    r"\buse[- ]after[- ]free\b",
    r"\buaf\b",
    r"\brefcount (?:leak|error|overflow|underflow)\b",
    r"\bdouble free\b",
    # Out-of-bounds (OOB)
    r"\bout[- ]of[- ]bounds\b",
    r"\boob\b",
    # Info leaks (security-relevant, not generic leaks)
    r"\binformation leak\b",
    r"\binfo leak\b",
    r"\bleak (?:kernel|userns|credentials?|mnt_idmap)\b",
    # Bypass
    r"\bsecurity bypass\b",
    r"\baccess control bypass\b",
    r"\bpermission check (?:bug|fix|error)\b",
]
SECURITY_REGEX = re.compile("|".join(_SECURITY_PATTERNS), re.IGNORECASE)


def clone_repo(repo_url: str, clone_dir: str) -> str:
    """Clone ``repo_url`` into ``clone_dir`` and return the working tree path,
    or "" when the clone fails (best-effort helper for ad-hoc runs)."""
    os.makedirs(clone_dir, exist_ok=True)
    try:
        print(f"Cloning {repo_url} into {clone_dir}...")
        repo = Repo.clone_from(repo_url, clone_dir)
        print("Clone successful.")
        return repo.working_tree_dir
    except Exception as e:
        print(f"Failed to clone repository: {e}")
        return ""


def classify_commit_type(commit) -> str:
    """Return "root" (no parents), "normal" (one parent) or "merge"."""
    num_parents = len(commit.parents)
    if num_parents == 0:
        return "root"  # never a fix
    elif num_parents == 1:
        return "normal"  # main source of fixes
    else:
        return "merge"  # usually not a fix


def detect_fix_commit(commit) -> str:
    """
    Detect whether a commit looks like a vulnerability fix.

    Returns: "vulnerability_fix" when the message matches any security
    pattern, "other" otherwise.
    """
    msg = commit.message.lower()
    if SECURITY_REGEX.search(msg):
        return "vulnerability_fix"
    return "other"


def extract_cves(text: str) -> List[str]:
    """Return the de-duplicated, upper-cased CVE ids found in ``text``."""
    if not text:
        return []
    cves = re.findall(r"cve-[0-9]{4}-[0-9]{4,19}", text, flags=re.IGNORECASE)
    return list({cve.upper() for cve in cves})


def get_previous_releases(
    release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int
) -> List[str]:
    """
    Get all release tags with commit dates strictly before the given commit date.

    release_tags_sorted: list of (tag_name, committed_date), sorted by committed_date
    dates: list of commit dates (parallel to release_tags_sorted, sorted ascending)
    """
    index = bisect.bisect_left(dates, commit_date)
    return [tag for tag, _ in release_tags_sorted[:index]]


def get_current_or_next_release(
    release_tags_sorted: List[Tuple[str, int]], dates: List[int], commit_date: int
) -> Optional[str]:
    """
    Get the current release if the commit matches a release date, otherwise
    the next release after the commit date, or None when none exists.
    """
    # bisect_left lands on the exact match when present, so a single bounds
    # check covers both the tagged-commit and the next-release case.
    index = bisect.bisect_left(dates, commit_date)
    if index < len(dates):
        return release_tags_sorted[index][0]
    return None


def get_current_release(repo: Repo, commit: Commit, prev_release_by_date: Optional[str]) -> str:
    """
    Return a non-null release tag for the given commit:
    1) exact tag if commit is tagged
    2) nearest reachable tag (fast, first-parent)
    3) latest prior tag by date (fallback)
    4) "NO_TAGS_AVAILABLE" if repo has no tags at all
    """
    # 1) Exact tag at this commit
    try:
        return repo.git.describe("--tags", "--exact-match", commit.hexsha)
    except Exception:
        pass

    # 2) Nearest reachable tag along first-parent
    try:
        return repo.git.describe("--tags", "--abbrev=0", "--first-parent", commit.hexsha)
    except Exception:
        pass

    # 3) Fallback: latest prior tag by date
    if prev_release_by_date:
        return prev_release_by_date

    # 4) No tags at all
    return "NO_TAGS_AVAILABLE"


if __name__ == "__main__":
    repo_url = "https://github.com/torvalds/linux"
    repo_path = "/home/ziad-hany/PycharmProjects/linux"

    repo = Repo(repo_path)
    commits_data = []
    cve_list = defaultdict(set)

    # Precompute (tag_name, committed_date) pairs sorted by date; the tag's
    # commit object itself is not needed, only its date.
    release_tags = []
    for tag in repo.tags:
        try:
            release_tags.append((tag.name, tag.commit.committed_date))
        except Exception:
            continue

    release_tags_for_previous = sorted(release_tags, key=lambda item: item[1])
    dates_array = [date for _, date in release_tags_for_previous]

    for commit in repo.iter_commits("--all"):
        commit_type = classify_commit_type(commit)
        fix_type = detect_fix_commit(commit)

        if fix_type == "vulnerability_fix" and commit_type in ["normal", "merge"]:
            # Compute "previous by date" first so we can feed it as a fallback
            prev_release_list = get_previous_releases(
                release_tags_for_previous, dates_array, commit.committed_date
            )
            prev_release_by_date = prev_release_list[-1] if prev_release_list else None

            curr_release = get_current_release(repo, commit, prev_release_by_date)

            commit_info = {
                "hash": commit.hexsha,
                "url": repo_url + "/commit/" + commit.hexsha,
                "message": commit.message.strip(),
                "curr_release": curr_release,
                "prev_release": prev_release_list,
                "fix_type": fix_type,
            }
            print(commit_info)
            commits_data.append(commit_info)

            # Optional CVE collection
            for cve_id in extract_cves(commit.message.strip()):
                cve_list[cve_id].add(repo_url + "/commit/" + commit.hexsha)

    result = {cve: list(commits) for cve, commits in cve_list.items()}
    print(f"Found {len(result)} unique CVEs")
    print(json.dumps(result, indent=2))
"""
Minimal issue-tracker client abstraction with a GitHub implementation.

``IssueTrackerFactory.create_client("github", token=...)`` returns a client
exposing ``get_issues``, ``get_pull_requests`` and ``get_comments``.
"""
from abc import ABC
from abc import abstractmethod
from typing import Dict
from typing import List
from typing import Optional

import requests

# Seconds before an HTTP request is aborted; without an explicit timeout,
# requests would wait indefinitely on a stalled connection.
DEFAULT_TIMEOUT = 30


class IssueTrackerClient(ABC):
    """Abstract interface for fetching issues, pull requests and comments."""

    @abstractmethod
    def get_issues(self, project: str, **kwargs) -> List[Dict]:
        pass

    @abstractmethod
    def get_pull_requests(self, project: str, **kwargs) -> List[Dict]:
        pass

    @abstractmethod
    def get_comments(self, project: str, **kwargs) -> List[Dict]:
        pass


class IssueTrackerFactory:
    """Build the platform-specific IssueTrackerClient."""

    @staticmethod
    def create_client(platform: str, token: Optional[str] = None, **kwargs) -> IssueTrackerClient:
        """
        Return a client for ``platform`` (currently only "github").

        Raises ValueError for unsupported platforms.
        """
        platform = platform.lower()

        GIT_PLATFORM_CLIENT = {
            "github": GitHubClient,
        }

        if platform not in GIT_PLATFORM_CLIENT:
            raise ValueError(f"Unsupported platform: {platform}")

        return GIT_PLATFORM_CLIENT[platform](token=token, **kwargs)


class GitHubClient(IssueTrackerClient):
    """GitHub REST v3 client; ``project`` is an "owner/repo" string."""

    API_BASE = "https://api.github.com"

    def __init__(self, token: Optional[str] = None):
        self.session = requests.Session()
        self.session.headers.update({"Accept": "application/vnd.github.v3+json"})
        if token:
            self.session.headers["Authorization"] = f"token {token}"

    def _paginate(self, url: str, params: dict = None) -> List[Dict]:
        """Fetch every page of ``url`` (100 items per page) until an empty
        page is returned, and return the concatenated results."""
        results = []
        # Copy so the caller's dict is never mutated by pagination keys.
        query = dict(params or {})
        page = 1
        while True:
            query.update({"per_page": 100, "page": page})
            response = self.session.get(url, params=query, timeout=DEFAULT_TIMEOUT)
            response.raise_for_status()
            data = response.json()
            if not data:
                break
            results.extend(data)
            page += 1
        return results

    def get_issues(self, project: str, state: str = "all") -> List[Dict]:
        """Return all issues for ``project``; pull requests are filtered out."""
        owner, repo = project.split("/")
        url = f"{self.API_BASE}/repos/{owner}/{repo}/issues"
        issues = self._paginate(url, {"state": state})
        # The issues endpoint also returns PRs; PRs carry a pull_request key.
        return [i for i in issues if "pull_request" not in i]

    def get_pull_requests(self, project: str, state: str = "all") -> List[Dict]:
        """Return all pull requests for ``project`` in the given ``state``."""
        owner, repo = project.split("/")
        url = f"{self.API_BASE}/repos/{owner}/{repo}/pulls"
        return self._paginate(url, {"state": state})

    def get_comments(self, project: str, issue_num: int) -> List[Dict]:
        """Return all comments on issue ``issue_num`` of ``project``."""
        owner, repo = project.split("/")
        url = f"{self.API_BASE}/repos/{owner}/{repo}/issues/{issue_num}/comments"
        return self._paginate(url)