From d8238d35d9cabc8d0db6c803144fdb8ef496d657 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 9 Aug 2025 13:40:58 +0000 Subject: [PATCH 1/2] Remove GitHub-specific token validation and generalize authentication Co-authored-by: nicoragne --- src/gitingest/clone.py | 4 +- src/gitingest/utils/auth.py | 8 +--- src/gitingest/utils/git_utils.py | 46 ++++++---------------- src/server/query_processor.py | 5 +-- tests/test_git_utils.py | 65 +++++++++----------------------- 5 files changed, 35 insertions(+), 93 deletions(-) diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index d05381b1..e4d11171 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -40,7 +40,7 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: config : CloneConfig The configuration for cloning the repository. token : str | None - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. Raises ------ @@ -84,7 +84,7 @@ async def clone_repo(config: CloneConfig, *, token: str | None = None) -> None: logger.debug("Resolved commit", extra={"commit": commit}) clone_cmd = ["git"] - if token and is_github_host(url): + if token: clone_cmd += ["-c", create_git_auth_header(token, url=url)] clone_cmd += ["clone", "--single-branch", "--no-checkout", "--depth=1"] diff --git a/src/gitingest/utils/auth.py b/src/gitingest/utils/auth.py index c2ff1328..aa0e3ca1 100644 --- a/src/gitingest/utils/auth.py +++ b/src/gitingest/utils/auth.py @@ -4,8 +4,6 @@ import os -from gitingest.utils.git_utils import validate_github_token - def resolve_token(token: str | None) -> str | None: """Resolve the token to use for the query. @@ -13,7 +11,7 @@ def resolve_token(token: str | None) -> str | None: Parameters ---------- token : str | None - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. 
Returns ------- @@ -21,7 +19,5 @@ def resolve_token(token: str | None) -> str | None: The resolved token. """ - token = token or os.getenv("GITHUB_TOKEN") - if token: - validate_github_token(token) + token = token or os.getenv("GITHUB_TOKEN") # Keep env var name for backward compatibility return token diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index daf4056d..a86a12e8 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -4,17 +4,16 @@ import asyncio import base64 -import re import sys from pathlib import Path -from typing import TYPE_CHECKING, Final, Iterable +from typing import TYPE_CHECKING, Iterable from urllib.parse import urlparse import httpx from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND from gitingest.utils.compat_func import removesuffix -from gitingest.utils.exceptions import InvalidGitHubTokenError + from gitingest.utils.logging_config import get_logger if TYPE_CHECKING: @@ -23,10 +22,7 @@ # Initialize logger for this module logger = get_logger(__name__) -# GitHub Personal-Access tokens (classic + fine-grained). -# - ghp_ / gho_ / ghu_ / ghs_ / ghr_ → 36 alphanumerics -# - github_pat_ → 22 alphanumerics + "_" + 59 alphanumerics -_GITHUB_PAT_PATTERN: Final[str] = r"^(?:gh[pousr]_[A-Za-z0-9]{36}|github_pat_[A-Za-z0-9]{22}_[A-Za-z0-9]{59})$" + def is_github_host(url: str) -> bool: @@ -263,9 +259,9 @@ def create_git_command(base_cmd: list[str], local_path: str, url: str, token: st local_path : str The local path where the git command should be executed. url : str - The repository URL to check if it's a GitHub repository. + The repository URL for authentication. token : str | None - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. 
Returns ------- @@ -274,21 +270,20 @@ def create_git_command(base_cmd: list[str], local_path: str, url: str, token: st """ cmd = [*base_cmd, "-C", local_path] - if token and is_github_host(url): + if token: cmd += ["-c", create_git_auth_header(token, url=url)] return cmd -def create_git_auth_header(token: str, url: str = "https://github.com") -> str: - """Create a Basic authentication header for GitHub git operations. +def create_git_auth_header(token: str, url: str) -> str: + """Create a Basic authentication header for git operations. Parameters ---------- token : str - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. url : str - The GitHub URL to create the authentication header for. - Defaults to "https://github.com" if not provided. + The repository URL to create the authentication header for. Returns ------- @@ -298,35 +293,18 @@ def create_git_auth_header(token: str, url: str = "https://github.com") -> str: Raises ------ ValueError - If the URL is not a valid GitHub repository URL. + If the URL is not a valid repository URL. """ hostname = urlparse(url).hostname if not hostname: - msg = f"Invalid GitHub URL: {url!r}" + msg = f"Invalid repository URL: {url!r}" raise ValueError(msg) basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() return f"http.https://{hostname}/.extraheader=Authorization: Basic {basic}" -def validate_github_token(token: str) -> None: - """Validate the format of a GitHub Personal Access Token. - - Parameters - ---------- - token : str - GitHub personal access token (PAT) for accessing private repositories. - - Raises - ------ - InvalidGitHubTokenError - If the token format is invalid. - - """ - if not re.fullmatch(_GITHUB_PAT_PATTERN, token): - raise InvalidGitHubTokenError - async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None: """Configure sparse-checkout for a partially cloned repository. 
diff --git a/src/server/query_processor.py b/src/server/query_processor.py index 03f52f16..3aa8667c 100644 --- a/src/server/query_processor.py +++ b/src/server/query_processor.py @@ -9,7 +9,7 @@ from gitingest.clone import clone_repo from gitingest.ingestion import ingest_query from gitingest.query_parser import parse_remote_repo -from gitingest.utils.git_utils import resolve_commit, validate_github_token +from gitingest.utils.git_utils import resolve_commit from gitingest.utils.logging_config import get_logger from gitingest.utils.pattern_utils import process_patterns from server.models import IngestErrorResponse, IngestResponse, IngestSuccessResponse, PatternType, S3Metadata @@ -262,9 +262,6 @@ async def process_query( If the commit hash is not found (should never happen). """ - if token: - validate_github_token(token) - try: query = await parse_remote_repo(input_text, token=token) except Exception as exc: diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index 48408130..a24eee24 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -1,18 +1,17 @@ """Tests for the ``git_utils`` module. -These tests validate the ``validate_github_token`` function, which ensures that -GitHub personal access tokens (PATs) are properly formatted. +These tests validate various git utility functions for repository operations. 
""" from __future__ import annotations import base64 from typing import TYPE_CHECKING +from urllib.parse import urlparse import pytest -from gitingest.utils.exceptions import InvalidGitHubTokenError -from gitingest.utils.git_utils import create_git_auth_header, create_git_command, is_github_host, validate_github_token +from gitingest.utils.git_utils import create_git_auth_header, create_git_command, is_github_host if TYPE_CHECKING: from pathlib import Path @@ -20,39 +19,6 @@ from pytest_mock import MockerFixture -@pytest.mark.parametrize( - "token", - [ - # Valid tokens: correct prefixes and at least 36 allowed characters afterwards - "github_pat_" + "a" * 22 + "_" + "b" * 59, - "ghp_" + "A" * 36, - "ghu_" + "B" * 36, - "ghs_" + "C" * 36, - "ghr_" + "D" * 36, - "gho_" + "E" * 36, - ], -) -def test_validate_github_token_valid(token: str) -> None: - """validate_github_token should accept properly-formatted tokens.""" - # Should not raise any exception - validate_github_token(token) - - -@pytest.mark.parametrize( - "token", - [ - "github_pat_short", # Too short after prefix - "ghp_" + "b" * 35, # one character short - "invalidprefix_" + "c" * 36, # Wrong prefix - "github_pat_" + "!" 
* 36, # Disallowed characters - "github_pat_" + "a" * 36, # Too short after 'github_pat_' prefix - "", # Empty string - ], -) -def test_validate_github_token_invalid(token: str) -> None: - """Test that ``validate_github_token`` raises ``InvalidGitHubTokenError`` on malformed tokens.""" - with pytest.raises(InvalidGitHubTokenError): - validate_github_token(token) @pytest.mark.parametrize( @@ -72,15 +38,18 @@ def test_validate_github_token_invalid(token: str) -> None: "ghp_" + "d" * 36, [ "-c", - create_git_auth_header("ghp_" + "d" * 36), - ], # Auth header expected for GitHub URL + token + create_git_auth_header("ghp_" + "d" * 36, "https://github.com/owner/repo.git"), + ], # Auth header expected when token is provided ), ( ["git", "clone"], "/some/path", "https://gitlab.com/owner/repo.git", "ghp_" + "e" * 36, - [], # No auth header for non-GitHub URL even if token provided + [ + "-c", + create_git_auth_header("ghp_" + "e" * 36, "https://gitlab.com/owner/repo.git"), + ], # Auth header expected for any URL when token is provided ), ], ) @@ -103,17 +72,19 @@ def test_create_git_command( @pytest.mark.parametrize( - "token", + ("token", "url"), [ - "ghp_abcdefghijklmnopqrstuvwxyz012345", # typical ghp_ token - "github_pat_1234567890abcdef1234567890abcdef1234", + ("ghp_abcdefghijklmnopqrstuvwxyz012345", "https://github.com/owner/repo.git"), # typical ghp_ token + ("github_pat_1234567890abcdef1234567890abcdef1234", "https://github.com/owner/repo.git"), + ("some_token", "https://gitlab.com/owner/repo.git"), # non-GitHub URL ], ) -def test_create_git_auth_header(token: str) -> None: +def test_create_git_auth_header(token: str, url: str) -> None: """Test that ``create_git_auth_header`` produces correct base64-encoded header.""" - header = create_git_auth_header(token) + header = create_git_auth_header(token, url) expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() - expected = f"http.https://github.com/.extraheader=Authorization: Basic 
{expected_basic}" + hostname = urlparse(url).hostname + expected = f"http.https://{hostname}/.extraheader=Authorization: Basic {expected_basic}" assert header == expected @@ -122,7 +93,7 @@ def test_create_git_auth_header(token: str) -> None: [ ("https://github.com/foo/bar.git", "ghp_" + "f" * 36, True), ("https://github.com/foo/bar.git", None, False), - ("https://gitlab.com/foo/bar.git", "ghp_" + "g" * 36, False), + ("https://gitlab.com/foo/bar.git", "ghp_" + "g" * 36, True), # Now called for all URLs with token ], ) def test_create_git_command_helper_calls( From 0a35f586367b38874debe995abfe500247873b65 Mon Sep 17 00:00:00 2001 From: Cursor Agent Date: Sat, 9 Aug 2025 13:52:43 +0000 Subject: [PATCH 2/2] Simplify GitHub-specific code and generalize Git repository handling Co-authored-by: nicoragne --- src/gitingest/clone.py | 1 - src/gitingest/utils/exceptions.py | 10 +-- src/gitingest/utils/git_utils.py | 105 +++++------------------------- tests/test_git_utils.py | 41 +++--------- 4 files changed, 26 insertions(+), 131 deletions(-) diff --git a/src/gitingest/clone.py b/src/gitingest/clone.py index e4d11171..b50c5e74 100644 --- a/src/gitingest/clone.py +++ b/src/gitingest/clone.py @@ -12,7 +12,6 @@ create_git_auth_header, create_git_command, ensure_git_installed, - is_github_host, resolve_commit, run_command, ) diff --git a/src/gitingest/utils/exceptions.py b/src/gitingest/utils/exceptions.py index b7d23e35..10bf942a 100644 --- a/src/gitingest/utils/exceptions.py +++ b/src/gitingest/utils/exceptions.py @@ -16,12 +16,4 @@ def __init__(self, message: str) -> None: super().__init__(message) -class InvalidGitHubTokenError(ValueError): - """Exception raised when a GitHub Personal Access Token is malformed.""" - - def __init__(self) -> None: - msg = ( - "Invalid GitHub token format. To generate a token, go to " - "https://github.com/settings/tokens/new?description=gitingest&scopes=repo." 
- ) - super().__init__(msg) + diff --git a/src/gitingest/utils/git_utils.py b/src/gitingest/utils/git_utils.py index a86a12e8..ad945e62 100644 --- a/src/gitingest/utils/git_utils.py +++ b/src/gitingest/utils/git_utils.py @@ -9,10 +9,7 @@ from typing import TYPE_CHECKING, Iterable from urllib.parse import urlparse -import httpx -from starlette.status import HTTP_200_OK, HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND -from gitingest.utils.compat_func import removesuffix from gitingest.utils.logging_config import get_logger @@ -25,23 +22,6 @@ -def is_github_host(url: str) -> bool: - """Check if a URL is from a GitHub host (github.com or GitHub Enterprise). - - Parameters - ---------- - url : str - The URL to check - - Returns - ------- - bool - True if the URL is from a GitHub host, False otherwise - - """ - hostname = urlparse(url).hostname or "" - return hostname.startswith("github.") - async def run_command(*args: str) -> tuple[bytes, bytes]: """Execute a shell command asynchronously and return (stdout, stderr) bytes. @@ -115,80 +95,27 @@ async def check_repo_exists(url: str, token: str | None = None) -> bool: url : str URL of the Git repository to check. token : str | None - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. Returns ------- bool ``True`` if the repository exists, ``False`` otherwise. - Raises - ------ - RuntimeError - If the host returns an unrecognised status code. - """ - headers = {} - - if token and is_github_host(url): - host, owner, repo = _parse_github_url(url) - # Public GitHub vs. 
GitHub Enterprise - base_api = "https://api.github.com" if host == "github.com" else f"https://{host}/api/v3" - url = f"{base_api}/repos/{owner}/{repo}" - headers["Authorization"] = f"Bearer {token}" - - async with httpx.AsyncClient(follow_redirects=True) as client: - try: - response = await client.head(url, headers=headers) - except httpx.RequestError: - return False - - status_code = response.status_code - - if status_code == HTTP_200_OK: + try: + # Use git ls-remote to check if repository exists + cmd = ["git"] + if token: + cmd += ["-c", create_git_auth_header(token, url=url)] + cmd += ["ls-remote", "--heads", url] + + await run_command(*cmd) return True - if status_code in {HTTP_401_UNAUTHORIZED, HTTP_403_FORBIDDEN, HTTP_404_NOT_FOUND}: + except Exception: return False - msg = f"Unexpected HTTP status {status_code} for {url}" - raise RuntimeError(msg) -def _parse_github_url(url: str) -> tuple[str, str, str]: - """Parse a GitHub URL and return (hostname, owner, repo). - - Parameters - ---------- - url : str - The URL of the GitHub repository to parse. - - Returns - ------- - tuple[str, str, str] - A tuple containing the hostname, owner, and repository name. - - Raises - ------ - ValueError - If the URL is not a valid GitHub repository URL. 
- - """ - parsed = urlparse(url) - if parsed.scheme not in {"http", "https"}: - msg = f"URL must start with http:// or https://: {url!r}" - raise ValueError(msg) - - if not parsed.hostname or not parsed.hostname.startswith("github."): - msg = f"Un-recognised GitHub hostname: {parsed.hostname!r}" - raise ValueError(msg) - - parts = removesuffix(parsed.path, ".git").strip("/").split("/") - expected_path_length = 2 - if len(parts) != expected_path_length: - msg = f"Path must look like //: {parsed.path!r}" - raise ValueError(msg) - - owner, repo = parts - return parsed.hostname, owner, repo async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str | None = None) -> list[str]: @@ -201,7 +128,7 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str | ref_type: str The type of reference to fetch. Can be "branches" or "tags". token : str | None - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. Returns ------- @@ -221,7 +148,7 @@ async def fetch_remote_branches_or_tags(url: str, *, ref_type: str, token: str | cmd = ["git"] # Add authentication if needed - if token and is_github_host(url): + if token: cmd += ["-c", create_git_auth_header(token, url=url)] cmd += ["ls-remote"] @@ -314,7 +241,7 @@ async def checkout_partial_clone(config: CloneConfig, token: str | None) -> None config : CloneConfig The configuration for cloning the repository, including subpath and blob flag. token : str | None - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. """ subpath = config.subpath.lstrip("/") @@ -333,7 +260,7 @@ async def resolve_commit(config: CloneConfig, token: str | None) -> str: config : CloneConfig The configuration for cloning the repository. token : str | None - GitHub personal access token (PAT) for accessing private repositories. 
+ Personal access token (PAT) for accessing private repositories. Returns ------- @@ -365,7 +292,7 @@ async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None) pattern : str The pattern to use to resolve the commit SHA. token : str | None - GitHub personal access token (PAT) for accessing private repositories. + Personal access token (PAT) for accessing private repositories. Returns ------- @@ -380,7 +307,7 @@ async def _resolve_ref_to_sha(url: str, pattern: str, token: str | None = None) """ # Build: git [-c http.<host>/.extraheader=Auth...] ls-remote <url> <pattern> cmd: list[str] = ["git"] - if token and is_github_host(url): + if token: cmd += ["-c", create_git_auth_header(token, url=url)] cmd += ["ls-remote", url, pattern] diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index a24eee24..c6e9d54f 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -11,7 +11,7 @@ import pytest -from gitingest.utils.git_utils import create_git_auth_header, create_git_command, is_github_host +from gitingest.utils.git_utils import create_git_auth_header, create_git_command if TYPE_CHECKING: from pathlib import Path @@ -118,50 +118,27 @@ def test_create_git_command_helper_calls( assert "HEADER" not in cmd -@pytest.mark.parametrize( - ("url", "expected"), - [ - # GitHub.com URLs - ("https://github.com/owner/repo.git", True), - ("http://github.com/owner/repo.git", True), - ("https://github.com/owner/repo", True), - # GitHub Enterprise URLs - ("https://github.company.com/owner/repo.git", True), - ("https://github.enterprise.org/owner/repo.git", True), - ("http://github.internal/owner/repo.git", True), - ("https://github.example.co.uk/owner/repo.git", True), - # Non-GitHub URLs - ("https://gitlab.com/owner/repo.git", False), - ("https://bitbucket.org/owner/repo.git", False), - ("https://git.example.com/owner/repo.git", False), - ("https://mygithub.com/owner/repo.git", False), # doesn't start with "github." 
- ("https://subgithub.com/owner/repo.git", False), - ("https://example.com/github/repo.git", False), - # Edge cases - ("", False), - ("not-a-url", False), - ("ftp://github.com/owner/repo.git", True), # Different protocol but still github.com - ], -) -def test_is_github_host(url: str, *, expected: bool) -> None: - """Test that ``is_github_host`` correctly identifies GitHub and GitHub Enterprise URLs.""" - assert is_github_host(url) == expected + @pytest.mark.parametrize( ("token", "url", "expected_hostname"), [ - # GitHub.com URLs (default) + # GitHub.com URLs ("ghp_" + "a" * 36, "https://github.com", "github.com"), ("ghp_" + "a" * 36, "https://github.com/owner/repo.git", "github.com"), # GitHub Enterprise URLs ("ghp_" + "b" * 36, "https://github.company.com", "github.company.com"), ("ghp_" + "c" * 36, "https://github.enterprise.org/owner/repo.git", "github.enterprise.org"), ("ghp_" + "d" * 36, "http://github.internal", "github.internal"), + # Other Git services + ("glpat-xxxxxxxxxxxxxxxxxxxx", "https://gitlab.com/owner/repo.git", "gitlab.com"), + ("some_token", "https://bitbucket.org/owner/repo.git", "bitbucket.org"), + ("custom_token", "https://git.example.com/owner/repo.git", "git.example.com"), ], ) -def test_create_git_auth_header_with_ghe_url(token: str, url: str, expected_hostname: str) -> None: - """Test that ``create_git_auth_header`` handles GitHub Enterprise URLs correctly.""" +def test_create_git_auth_header_with_different_hostnames(token: str, url: str, expected_hostname: str) -> None: + """Test that ``create_git_auth_header`` handles different Git service URLs correctly.""" header = create_git_auth_header(token, url=url) expected_basic = base64.b64encode(f"x-oauth-basic:{token}".encode()).decode() expected = f"http.https://{expected_hostname}/.extraheader=Authorization: Basic {expected_basic}"