Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,7 @@ cache and restore data. Strategies are not limited to using the Cache though; fo
keep a local bare checkout of upstream Git repositories and serve packs from the repo directly.

The codebase uses Hermit to manage toolchains. It is written in Go, and uses Just for running common tasks.

Only add comments for relatively large blocks of code (20 lines or more), and ONLY if it is not obvious what the code is
doing. ALWAYS add Go-style documentation comments for public variables/types/functions. If you do add comments, they
should explain WHY something is happening, not WHAT is happening.
47 changes: 7 additions & 40 deletions internal/strategy/git/backend.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"github.com/block/cachew/internal/logging"
)

// serveFromBackend serves a Git request using git http-backend.
func (s *Strategy) serveFromBackend(w http.ResponseWriter, r *http.Request, c *clone) {
ctx := r.Context()
logger := logging.FromContext(ctx)
Expand All @@ -37,12 +36,10 @@ func (s *Strategy) serveFromBackend(w http.ResponseWriter, r *http.Request, c *c
return
}

// Build the path that git http-backend expects
host := r.PathValue("host")
pathValue := r.PathValue("path")

// For regular clones, we need to insert /.git before the git protocol paths
// Find where the git operation starts (e.g., /info/refs, /git-upload-pack)
// Insert /.git before the git protocol paths to match the filesystem layout
var gitOperation string
var repoPathWithSuffix string

Expand All @@ -54,18 +51,14 @@ func (s *Strategy) serveFromBackend(w http.ResponseWriter, r *http.Request, c *c
}
}

// Remove .git suffix from repo path for the filesystem path
repoPath := strings.TrimSuffix(repoPathWithSuffix, ".git")

// Construct backend path with .git directory: /host/repo/.git/info/refs
backendPath := "/" + host + "/" + repoPath + "/.git" + gitOperation

logger.DebugContext(r.Context(), "Serving with git http-backend",
slog.String("original_path", r.URL.Path),
slog.String("backend_path", backendPath),
slog.String("clone_path", c.path))

// Capture stderr from git http-backend to log errors
var stderrBuf bytes.Buffer

handler := &cgi.Handler{
Expand All @@ -79,30 +72,27 @@ func (s *Strategy) serveFromBackend(w http.ResponseWriter, r *http.Request, c *c
},
}

// Modify request for http-backend
r2 := r.Clone(r.Context())
r2.URL.Path = backendPath

handler.ServeHTTP(w, r2)

// Log stderr if there was any output (indicates an error)
if stderrBuf.Len() > 0 {
logger.ErrorContext(r.Context(), "git http-backend error",
slog.String("stderr", stderrBuf.String()),
slog.String("path", backendPath))
}
}

// executeClone performs a git clone operation.
func (s *Strategy) executeClone(ctx context.Context, c *clone) error {
logger := logging.FromContext(ctx)

if err := os.MkdirAll(filepath.Dir(c.path), 0o750); err != nil {
return errors.Wrap(err, "create clone directory")
}

// #nosec G204 - c.upstreamURL and c.path are controlled by us
// Configure git for large repositories to avoid network buffer issues
// #nosec G204 - c.upstreamURL and c.path are controlled by us
args := []string{"clone"}
if s.config.CloneDepth > 0 {
args = append(args, "--depth", strconv.Itoa(s.config.CloneDepth))
Expand All @@ -124,9 +114,7 @@ func (s *Strategy) executeClone(ctx context.Context, c *clone) error {
return errors.Wrap(err, "git clone")
}

// Configure remote to fetch all branches, not just the default branch
// git clone sets fetch = +refs/heads/master:refs/remotes/origin/master by default
// We need to change it to fetch all branches
// git clone only sets up fetching for the default branch, change it to fetch all branches
// #nosec G204 - c.path is controlled by us
cmd = exec.CommandContext(ctx, "git", "-C", c.path, "config", "remote.origin.fetch", "+refs/heads/*:refs/remotes/origin/*")
output, err = cmd.CombinedOutput()
Expand All @@ -137,7 +125,6 @@ func (s *Strategy) executeClone(ctx context.Context, c *clone) error {
return errors.Wrap(err, "configure fetch refspec")
}

// Fetch all branches now that the refspec is configured
cmd, err = gitCommand(ctx, c.upstreamURL, "-C", c.path,
"-c", "http.postBuffer=524288000",
"-c", "http.lowSpeedLimit=1000",
Expand All @@ -157,36 +144,29 @@ func (s *Strategy) executeClone(ctx context.Context, c *clone) error {
return nil
}

// executeFetch performs a git remote update operation.
func (s *Strategy) executeFetch(ctx context.Context, c *clone) error {
logger := logging.FromContext(ctx)

// Try to acquire the semaphore
select {
case <-c.fetchSem:
// We acquired the semaphore, perform the fetch
defer func() {
// Release the semaphore
c.fetchSem <- struct{}{}
}()
case <-ctx.Done():
return errors.Wrap(ctx.Err(), "context cancelled before acquiring fetch semaphore")
default:
// Semaphore is held by another goroutine, wait for it
logger.DebugContext(ctx, "Fetch already in progress, waiting")
select {
case <-c.fetchSem:
// Fetch completed by another goroutine, release and return
c.fetchSem <- struct{}{}
return nil
case <-ctx.Done():
return errors.Wrap(ctx.Err(), "context cancelled while waiting for fetch")
}
}

// #nosec G204 - c.path is controlled by us
// Configure git for large repositories to avoid network buffer issues
// Use 'remote update' to properly handle ref updates and pruning
// #nosec G204 - c.path is controlled by us
cmd, err := gitCommand(ctx, c.upstreamURL, "-C", c.path,
"-c", "http.postBuffer=524288000", // 500MB buffer
"-c", "http.lowSpeedLimit=1000", // 1KB/s minimum speed
Expand All @@ -211,12 +191,11 @@ func (s *Strategy) executeFetch(ctx context.Context, c *clone) error {
}

// ensureRefsUpToDate checks if upstream has refs we don't have and fetches if needed.
// Uses a short-lived cache to avoid excessive ls-remote calls.
// Short-lived cache avoids excessive ls-remote calls.
func (s *Strategy) ensureRefsUpToDate(ctx context.Context, c *clone) error {
logger := logging.FromContext(ctx)

c.mu.Lock()
// Check if we've done a recent ref check
if c.refCheckValid && time.Since(c.lastRefCheck) < s.config.RefCheckInterval {
c.mu.Unlock()
logger.DebugContext(ctx, "Skipping ref check, recently checked",
Expand All @@ -230,32 +209,25 @@ func (s *Strategy) ensureRefsUpToDate(ctx context.Context, c *clone) error {
logger.DebugContext(ctx, "Checking upstream for new refs",
slog.String("upstream", c.upstreamURL))

// Get local refs
localRefs, err := s.getLocalRefs(ctx, c)
if err != nil {
return errors.Wrap(err, "get local refs")
}

// Get upstream refs
upstreamRefs, err := s.getUpstreamRefs(ctx, c)
if err != nil {
return errors.Wrap(err, "get upstream refs")
}

// Check if upstream has any refs we don't have or refs that have been updated
// Skip peeled refs (refs ending in ^{}) as they're not real refs
needsFetch := false
for ref, upstreamSHA := range upstreamRefs {
// Skip peeled tag refs like refs/tags/v1.0.0^{}
if strings.HasSuffix(ref, "^{}") {
continue
}
// Only check refs/heads/* from upstream since those are what we fetch
// (GitHub exposes refs/pull/* and other refs we don't fetch)
// Only check refs/heads/* since GitHub exposes refs/pull/* and other refs we don't fetch
if !strings.HasPrefix(ref, "refs/heads/") {
continue
}
// Convert refs/heads/X to refs/remotes/origin/X for local lookup
localRef := "refs/remotes/origin/" + strings.TrimPrefix(ref, "refs/heads/")
localSHA, exists := localRefs[localRef]
if !exists || localSHA != upstreamSHA {
Expand Down Expand Up @@ -287,11 +259,8 @@ func (s *Strategy) ensureRefsUpToDate(ctx context.Context, c *clone) error {
return err
}

// getLocalRefs returns a map of ref names to SHAs for the local clone.
func (s *Strategy) getLocalRefs(ctx context.Context, c *clone) (map[string]string, error) {
// #nosec G204 - c.path is controlled by us
// Use for-each-ref to get all refs including remote refs
// No need for insteadOf protection since this is purely local
cmd := exec.CommandContext(ctx, "git", "-C", c.path, "for-each-ref", "--format=%(objectname) %(refname)")
output, err := cmd.CombinedOutput()
if err != nil {
Expand All @@ -301,7 +270,6 @@ func (s *Strategy) getLocalRefs(ctx context.Context, c *clone) (map[string]strin
return ParseGitRefs(output), nil
}

// getUpstreamRefs returns a map of ref names to SHAs for the upstream repository.
func (s *Strategy) getUpstreamRefs(ctx context.Context, c *clone) (map[string]string, error) {
// #nosec G204 - c.upstreamURL is controlled by us
cmd, err := gitCommand(ctx, c.upstreamURL, "ls-remote", c.upstreamURL)
Expand All @@ -316,8 +284,7 @@ func (s *Strategy) getUpstreamRefs(ctx context.Context, c *clone) (map[string]st
return ParseGitRefs(output), nil
}

// ParseGitRefs parses the output of git show-ref or git ls-remote.
// Format: <SHA> <ref>.
// ParseGitRefs parses the output of git show-ref or git ls-remote (format: <SHA> <ref>).
func ParseGitRefs(output []byte) map[string]string {
refs := make(map[string]string)
scanner := bufio.NewScanner(strings.NewReader(string(output)))
Expand Down
15 changes: 2 additions & 13 deletions internal/strategy/git/bundle.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,9 @@ import (
"github.com/block/cachew/internal/logging"
)

// cloneBundleLoop generates bundles periodically for a single clone.
func (s *Strategy) cloneBundleLoop(ctx context.Context, c *clone) {
logger := logging.FromContext(ctx)

// Generate bundle immediately on start if one doesn't exist
s.generateAndUploadBundleIfMissing(ctx, c)

ticker := time.NewTicker(s.config.BundleInterval)
Expand All @@ -38,35 +36,29 @@ func (s *Strategy) cloneBundleLoop(ctx context.Context, c *clone) {
}
}

// generateAndUploadBundleIfMissing probes the cache for the clone's bundle and
// only triggers generation when the probe reports os.ErrNotExist. An existing
// bundle is left untouched, and any other probe error is logged without
// generating, so an unexpected cache failure does not start bundle generation.
func (s *Strategy) generateAndUploadBundleIfMissing(ctx context.Context, c *clone) {
	logger := logging.FromContext(ctx)
	bundleKey := cache.NewKey(c.upstreamURL + ".bundle")

	existing, _, openErr := s.cache.Open(ctx, bundleKey)
	switch {
	case openErr == nil:
		_ = existing.Close()
		logger.DebugContext(ctx, "Bundle already exists in cache, skipping generation",
			slog.String("upstream", c.upstreamURL))
	case errors.Is(openErr, os.ErrNotExist):
		s.generateAndUploadBundle(ctx, c)
	default:
		logger.ErrorContext(ctx, "Failed to check for existing bundle",
			slog.String("upstream", c.upstreamURL),
			slog.String("error", openErr.Error()))
	}
}

// generateAndUploadBundle generates a bundle and streams it directly to cache.
func (s *Strategy) generateAndUploadBundle(ctx context.Context, c *clone) {
logger := logging.FromContext(ctx)

Expand All @@ -75,7 +67,6 @@ func (s *Strategy) generateAndUploadBundle(ctx context.Context, c *clone) {

cacheKey := cache.NewKey(c.upstreamURL + ".bundle")

// Create cache writer
headers := textproto.MIMEHeader{
"Content-Type": []string{"application/x-git-bundle"},
}
Expand All @@ -89,9 +80,8 @@ func (s *Strategy) generateAndUploadBundle(ctx context.Context, c *clone) {
}
defer w.Close()

// Stream bundle directly to cache
// #nosec G204 - c.path is controlled by us
// Use --branches --remotes to include all branches but exclude tags (which can be massive)
// #nosec G204 - c.path is controlled by us
args := []string{"-C", c.path, "bundle", "create", "-", "--branches", "--remotes"}
cmd, err := gitCommand(ctx, "", args...)
if err != nil {
Expand All @@ -102,7 +92,6 @@ func (s *Strategy) generateAndUploadBundle(ctx context.Context, c *clone) {
}
cmd.Stdout = w

// Capture stderr for error reporting
stderrPipe, err := cmd.StderrPipe()
if err != nil {
logger.ErrorContext(ctx, "Failed to create stderr pipe",
Expand All @@ -122,7 +111,7 @@ func (s *Strategy) generateAndUploadBundle(ctx context.Context, c *clone) {
return
}

stderr, _ := io.ReadAll(stderrPipe) //nolint:errcheck // Only used for logging
stderr, _ := io.ReadAll(stderrPipe) //nolint:errcheck

if err := cmd.Wait(); err != nil {
logger.ErrorContext(ctx, "Failed to generate bundle",
Expand Down
22 changes: 5 additions & 17 deletions internal/strategy/git/command.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,18 +9,14 @@ import (
"github.com/alecthomas/errors"
)

// gitCommand creates a git command with insteadOf URL rewriting disabled for the given URL.
// This prevents git config rules like "url.X.insteadOf=Y" from rewriting the specific URL
// to point back through the proxy, which would cause infinite loops.
// Other insteadOf rules and all auth configuration are preserved.
// gitCommand creates a git command with insteadOf URL rewriting disabled for the given URL
// to prevent infinite loops where git config rules rewrite URLs to point back through the proxy.
func gitCommand(ctx context.Context, url string, args ...string) (*exec.Cmd, error) {
// Query for insteadOf rules that would affect this URL and build -c flags to disable them
configArgs, err := getInsteadOfDisableArgsForURL(ctx, url)
if err != nil {
return nil, errors.Wrap(err, "get insteadOf disable args")
}

// Prepend disable args to the git command arguments
var allArgs []string
if len(configArgs) > 0 {
allArgs = append(allArgs, configArgs...)
Expand All @@ -31,37 +27,29 @@ func gitCommand(ctx context.Context, url string, args ...string) (*exec.Cmd, err
return cmd, nil
}

// getInsteadOfDisableArgsForURL queries git config for insteadOf rules that would affect
// the given URL and returns arguments to disable only those specific rules.
// getInsteadOfDisableArgsForURL returns arguments to disable insteadOf rules that would affect the given URL.
func getInsteadOfDisableArgsForURL(ctx context.Context, targetURL string) ([]string, error) {
if targetURL == "" {
return nil, nil
}

// Query git config for all url.*.insteadOf and url.*.pushInsteadOf settings
cmd := exec.CommandContext(ctx, "git", "config", "--get-regexp", "^url\\..*\\.(insteadof|pushinsteadof)$")
output, err := cmd.CombinedOutput()
if err != nil {
// No insteadOf rules found (exit code 1) is expected and not an error
// Return empty args to continue without disabling any rules
return []string{}, nil //nolint:nilerr // Exit code 1 is expected when no rules exist
// Exit code 1 when no insteadOf rules exist is expected, not an error
return []string{}, nil //nolint:nilerr
}

// Parse output and check which rules would match our URL
// Output format: url.<base>.insteadof <pattern> or url.<base>.pushinsteadof <pattern>
var args []string
scanner := bufio.NewScanner(strings.NewReader(string(output)))
for scanner.Scan() {
line := scanner.Text()
// Split into config key and value
parts := strings.Fields(line)
if len(parts) >= 2 {
configKey := parts[0]
pattern := parts[1]

// Check if our target URL would match this insteadOf pattern
if strings.HasPrefix(targetURL, pattern) {
// This rule would affect our URL, so disable it
args = append(args, "-c", configKey+"=")
}
}
Expand Down
Loading