-
Notifications
You must be signed in to change notification settings - Fork 0
fix: address code audit findings (24 items) and review fixes #69
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,17 +1,17 @@ | ||
| -- Enable pg_trgm extension for trigram-based text search (CJK support). | ||
| CREATE EXTENSION IF NOT EXISTS pg_trgm; | ||
|
|
||
| -- CONCURRENTLY avoids locking the table during index creation. | ||
| -- NOTE: CONCURRENTLY cannot run inside a transaction block. | ||
| -- golang-migrate runs each file in a transaction by default; | ||
| -- the operator must run this migration manually with: | ||
| -- psql -f 000004_trigram_search.up.sql | ||
| -- or disable transactions in the migration tool. | ||
|
|
||
| -- GIN trigram index on messages.text_content for ILIKE fallback search. | ||
| CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_messages_text_content_trgm | ||
| -- NOTE: For large production tables with existing data, create these indexes | ||
| -- manually with CONCURRENTLY before running this migration (they will be | ||
| -- no-ops due to IF NOT EXISTS): | ||
| -- CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_messages_text_content_trgm | ||
| -- ON messages USING gin (text_content gin_trgm_ops); | ||
| -- CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_memories_content_trgm | ||
| -- ON memories USING gin (content gin_trgm_ops); | ||
| CREATE INDEX IF NOT EXISTS idx_messages_text_content_trgm | ||
| ON messages USING gin (text_content gin_trgm_ops); | ||
|
|
||
| -- GIN trigram index on memories.content for similarity() scoring. | ||
| CREATE INDEX CONCURRENTLY IF NOT EXISTS idx_memories_content_trgm | ||
| CREATE INDEX IF NOT EXISTS idx_memories_content_trgm | ||
| ON memories USING gin (content gin_trgm_ops); | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,2 @@ | ||
| ALTER TABLE sessions ALTER COLUMN owner_id SET DEFAULT ''; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Data Integrity Risk: Setting the default value of Recommendation: Consider using |
||
| ALTER TABLE sessions DROP CONSTRAINT IF EXISTS sessions_owner_id_not_empty; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,7 @@ | ||
| -- Backfill empty owner_id before adding constraint. | ||
| -- Sessions created before ownership was enforced may have owner_id = ''. | ||
| UPDATE sessions SET owner_id = 'anonymous' WHERE owner_id = ''; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potential Data Integrity Issue: Recommended Solution: UPDATE sessions SET owner_id = 'anonymous' WHERE owner_id IS NULL OR owner_id = ''; |
||
|
|
||
| -- Prevent empty owner_id in sessions (enforces NOT NULL + non-empty). | ||
| ALTER TABLE sessions ADD CONSTRAINT sessions_owner_id_not_empty CHECK (owner_id != ''); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Constraint Does Not Prevent NULL Values: Recommended Solution: ALTER TABLE sessions ADD CONSTRAINT sessions_owner_id_not_empty CHECK (owner_id IS NOT NULL AND owner_id != ''); |
||
| ALTER TABLE sessions ALTER COLUMN owner_id DROP DEFAULT; | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -45,7 +45,7 @@ func (h *searchHandler) searchMessages(w http.ResponseWriter, r *http.Request) { | |
|
|
||
| results, total, err := h.store.SearchMessages(r.Context(), userID, query, limit, offset) | ||
| if err != nil { | ||
| h.logger.Error("searching messages", "error", err, "user_id", userID, "query", query) | ||
| h.logger.Error("searching messages", "error", err, "user_id", userID, "query_len", len(query)) | ||
| WriteError(w, http.StatusInternalServerError, "search_failed", "failed to search messages", h.logger) | ||
| return | ||
| } | ||
|
Comment on lines
47
to
51
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Error Handling Granularity: Recommendation: if errors.Is(err, session.ErrInvalidQuery) {
WriteError(w, http.StatusBadRequest, "invalid_query", "query is malformed", h.logger)
return
}This improves maintainability and user experience.
Comment on lines
46
to
51
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Performance and Context Handling: Recommendation: |
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,23 +16,17 @@ import ( | |
| "time" | ||
|
|
||
| "github.com/google/uuid" | ||
|
|
||
| "github.com/koopa0/koopa/internal/session" | ||
| ) | ||
|
|
||
| // Sentinel errors for session/CSRF operations. | ||
| var ( | ||
| // ErrSessionCookieNotFound is returned when the session cookie is absent from the request. | ||
| ErrSessionCookieNotFound = errors.New("session cookie not found") | ||
| // ErrSessionInvalid is returned when the session cookie value is not a valid UUID. | ||
| ErrSessionInvalid = errors.New("session ID invalid") | ||
| // ErrCSRFRequired is returned when a state-changing request has no CSRF token. | ||
| ErrCSRFRequired = errors.New("csrf token required") | ||
| // ErrCSRFInvalid is returned when the CSRF token signature does not match. | ||
| ErrCSRFInvalid = errors.New("csrf token invalid") | ||
| // ErrCSRFExpired is returned when the CSRF token timestamp exceeds csrfTokenTTL. | ||
| ErrCSRFExpired = errors.New("csrf token expired") | ||
| // ErrCSRFMalformed is returned when the CSRF token format cannot be parsed. | ||
| ErrCSRFMalformed = errors.New("csrf token malformed") | ||
| errSessionCookieNotFound = errors.New("session cookie not found") | ||
| errSessionInvalid = errors.New("session ID invalid") | ||
| errCSRFRequired = errors.New("csrf token required") | ||
| errCSRFInvalid = errors.New("csrf token invalid") | ||
| errCSRFExpired = errors.New("csrf token expired") | ||
| errCSRFMalformed = errors.New("csrf token malformed") | ||
| ) | ||
|
|
||
| // Pre-session CSRF token prefix to distinguish from user-bound tokens. | ||
|
|
@@ -61,12 +55,12 @@ type sessionManager struct { | |
| func (*sessionManager) SessionID(r *http.Request) (uuid.UUID, error) { | ||
| cookie, err := r.Cookie(sessionCookieName) | ||
| if err != nil { | ||
| return uuid.Nil, ErrSessionCookieNotFound | ||
| return uuid.Nil, errSessionCookieNotFound | ||
| } | ||
|
|
||
| sessionID, err := uuid.Parse(cookie.Value) | ||
| if err != nil { | ||
| return uuid.Nil, ErrSessionInvalid | ||
| return uuid.Nil, errSessionInvalid | ||
| } | ||
|
|
||
| return sessionID, nil | ||
|
|
@@ -109,17 +103,17 @@ func (sm *sessionManager) NewCSRFToken(userID string) string { | |
| // CheckCSRF verifies a user-bound CSRF token. | ||
| func (sm *sessionManager) CheckCSRF(userID, token string) error { | ||
| if token == "" { | ||
| return ErrCSRFRequired | ||
| return errCSRFRequired | ||
| } | ||
|
|
||
| parts := strings.SplitN(token, ":", 2) | ||
| if len(parts) != 2 { | ||
| return ErrCSRFMalformed | ||
| return errCSRFMalformed | ||
| } | ||
|
|
||
| timestamp, err := strconv.ParseInt(parts[0], 10, 64) | ||
| if err != nil { | ||
| return ErrCSRFMalformed | ||
| return errCSRFMalformed | ||
| } | ||
|
|
||
| // SECURITY: Compute and verify HMAC BEFORE timestamp checks to prevent | ||
|
|
@@ -133,19 +127,19 @@ func (sm *sessionManager) CheckCSRF(userID, token string) error { | |
|
|
||
| actualSig, err := base64.URLEncoding.DecodeString(parts[1]) | ||
| if err != nil { | ||
| return ErrCSRFMalformed | ||
| return errCSRFMalformed | ||
| } | ||
|
|
||
| if subtle.ConstantTimeCompare(actualSig, expectedSig) != 1 { | ||
| return ErrCSRFInvalid | ||
| return errCSRFInvalid | ||
| } | ||
|
|
||
| age := time.Since(time.Unix(timestamp, 0)) | ||
| if age > csrfTokenTTL { | ||
| return ErrCSRFExpired | ||
| return errCSRFExpired | ||
| } | ||
| if age < -csrfClockSkew { | ||
| return ErrCSRFInvalid | ||
| return errCSRFInvalid | ||
| } | ||
|
Comment on lines
141
to
143
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ambiguous handling of future-dated CSRF tokens The check Recommended solution: if age < -csrfClockSkew {
return errCSRFMalformed // or errCSRFExpired, depending on intended semantics
} |
||
|
|
||
| return nil | ||
|
|
@@ -168,23 +162,23 @@ func (sm *sessionManager) NewPreSessionCSRFToken() string { | |
| // CheckPreSessionCSRF verifies a pre-session CSRF token. | ||
| func (sm *sessionManager) CheckPreSessionCSRF(token string) error { | ||
| if token == "" { | ||
| return ErrCSRFRequired | ||
| return errCSRFRequired | ||
| } | ||
|
|
||
| if !strings.HasPrefix(token, preSessionPrefix) { | ||
| return ErrCSRFMalformed | ||
| return errCSRFMalformed | ||
| } | ||
|
|
||
| tokenBody := strings.TrimPrefix(token, preSessionPrefix) | ||
| parts := strings.SplitN(tokenBody, ":", 3) | ||
| if len(parts) != 3 { | ||
| return ErrCSRFMalformed | ||
| return errCSRFMalformed | ||
| } | ||
|
|
||
| nonce := parts[0] | ||
| timestamp, err := strconv.ParseInt(parts[1], 10, 64) | ||
| if err != nil { | ||
| return ErrCSRFMalformed | ||
| return errCSRFMalformed | ||
| } | ||
|
|
||
| // SECURITY: Compute and verify HMAC BEFORE timestamp checks to prevent | ||
|
|
@@ -196,19 +190,19 @@ func (sm *sessionManager) CheckPreSessionCSRF(token string) error { | |
|
|
||
| actualSig, err := base64.URLEncoding.DecodeString(parts[2]) | ||
| if err != nil { | ||
| return ErrCSRFMalformed | ||
| return errCSRFMalformed | ||
| } | ||
|
|
||
| if subtle.ConstantTimeCompare(actualSig, expectedSig) != 1 { | ||
| return ErrCSRFInvalid | ||
| return errCSRFInvalid | ||
| } | ||
|
|
||
| age := time.Since(time.Unix(timestamp, 0)) | ||
| if age > csrfTokenTTL { | ||
| return ErrCSRFExpired | ||
| return errCSRFExpired | ||
| } | ||
| if age < -csrfClockSkew { | ||
| return ErrCSRFInvalid | ||
| return errCSRFInvalid | ||
| } | ||
|
Comment on lines
204
to
206
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ambiguous handling of future-dated pre-session CSRF tokens The check Recommended solution: if age < -csrfClockSkew {
return errCSRFMalformed // or errCSRFExpired, depending on intended semantics
} |
||
|
|
||
| return nil | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -103,7 +103,7 @@ func (cfg Config) validate() error { | |
| return errors.New("at least one tool is required") | ||
| } | ||
| if cfg.MemoryStore != nil && cfg.WG == nil { | ||
| return errors.New("WG is required when MemoryStore is set") | ||
| return errors.New("wg is required when memory store is set") | ||
| } | ||
|
Comment on lines
105
to
107
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Potential Incomplete Validation for Memory-Related Fields The current validation checks that Recommended solution: if cfg.MemoryStore != nil && cfg.BackgroundCtx == nil {
return errors.New("background context is required when memory store is set")
}Alternatively, document the defaulting behavior explicitly in the configuration struct. |
||
| return nil | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,8 +3,9 @@ | |
| import ( | ||
| "fmt" | ||
| "log/slog" | ||
| "os" | ||
| "slices" | ||
|
|
||
| "github.com/koopa0/koopa/internal/rag" | ||
| ) | ||
|
|
||
| // supportedProviders lists all valid AI provider values. | ||
|
|
@@ -87,7 +88,7 @@ | |
| return fmt.Errorf("%w: postgres_password must be set in config.yaml", | ||
| ErrInvalidPostgresPassword) | ||
| } | ||
| if c.PostgresPassword == "koopa_dev_password" { | ||
| slog.Warn("Using default development password for PostgreSQL", | ||
| "warning", "Change postgres_password in config.yaml for production deployments") | ||
| } | ||
|
|
@@ -127,7 +128,8 @@ | |
| } | ||
|
|
||
| // requiredVectorDimension must match the pgvector schema: embedding vector(768). | ||
| const requiredVectorDimension = 768 | ||
| // Canonical source: rag.VectorDimension. | ||
| var requiredVectorDimension = int(rag.VectorDimension) | ||
|
|
||
| // validateEmbedder checks that the configured embedder model produces vectors | ||
| // compatible with the database schema. For known models whose native dimension | ||
|
|
@@ -200,16 +202,17 @@ | |
| } | ||
|
|
||
| // validateProviderAPIKey checks that the required API key is set for the configured provider. | ||
| // API keys are captured from environment in Load() and stored as unexported fields. | ||
| func (c *Config) validateProviderAPIKey() error { | ||
| switch c.resolvedProvider() { | ||
| case ProviderGemini: | ||
| if os.Getenv("GEMINI_API_KEY") == "" { | ||
| if c.geminiAPIKey == "" { | ||
| return fmt.Errorf("%w: GEMINI_API_KEY environment variable is required for provider %q\n"+ | ||
| "Get your API key at: https://ai.google.dev/gemini-api/docs/api-key", | ||
| ErrMissingAPIKey, c.resolvedProvider()) | ||
| } | ||
| case ProviderOpenAI: | ||
| if os.Getenv("OPENAI_API_KEY") == "" { | ||
| if c.openaiAPIKey == "" { | ||
| return fmt.Errorf("%w: OPENAI_API_KEY environment variable is required for provider %q\n"+ | ||
| "Get your API key at: https://platform.openai.com/api-keys", | ||
| ErrMissingAPIKey, c.resolvedProvider()) | ||
|
Comment on lines
202
to
218
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing default case in provider API key validation The Recommended solution: default:
return fmt.Errorf("%w: unknown provider %q", ErrInvalidProvider, c.resolvedProvider()) |
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Risk of Table Locks and Downtime on Large Tables
Creating GIN indexes without the
CONCURRENTLYkeyword (lines 12-17) can lock writes to themessagesandmemoriestables, potentially causing downtime in production environments with large datasets. Although the comments warn about this, the migration itself does not enforce safe index creation. To mitigate this risk, consider splitting index creation into a separate migration usingCONCURRENTLY, or ensure that this migration is not run on large tables in production. Example:Recommendation:
CONCURRENTLYbefore running this migration, or modify the migration to useCONCURRENTLYif your migration framework supports it.