Skip to content

Commit

Permalink
only report expensive queries
Browse files Browse the repository at this point in the history
  • Loading branch information
yuchen-db committed Sep 11, 2024
1 parent aa63600 commit 3fa93a1
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 22 deletions.
3 changes: 2 additions & 1 deletion cmd/thanos/query_frontend.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,8 @@ func registerQueryFrontend(app *extkingpin.App) {
cmd.Flag("query-frontend.log-queries-longer-than", "Log queries that are slower than the specified duration. "+
"Set to 0 to disable. Set to < 0 to enable on all queries.").Default("0").DurationVar(&cfg.CortexHandlerConfig.LogQueriesLongerThan)

cmd.Flag("query-frontend.query-stats-enabled", "Enable query stats logging and metrics").Default("true").BoolVar(&cfg.CortexHandlerConfig.QueryStatsEnabled)
cmd.Flag("query-frontend.log-queries-more-expensive-than", "Log queries that fetch more data than the specified bytes. "+
"Set to 0 to disable. Otherwise must be a positive value.").Default("0").Uint64Var(&cfg.CortexHandlerConfig.LogQueriesMoreExpensiveThan)

cmd.Flag("query-frontend.log-failed-queries", "Log failed queries due to any reason").Default("true").BoolVar(&cfg.CortexHandlerConfig.LogFailedQueries)

Expand Down
68 changes: 53 additions & 15 deletions internal/cortex/frontend/transport/handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,13 @@ var (

// HandlerConfig Config for a Handler.
type HandlerConfig struct {
LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
MaxBodySize int64 `yaml:"max_body_size"`
QueryStatsEnabled bool `yaml:"query_stats_enabled"`
LogFailedQueries bool `yaml:"log_failed_queries"`
FailedQueryCacheCapacity int `yaml:"failed_query_cache_capacity"`
SlowQueryLogsUserHeader string `yaml:"slow_query_logs_user_header"`
LogQueriesLongerThan time.Duration `yaml:"log_queries_longer_than"`
MaxBodySize int64 `yaml:"max_body_size"`
QueryStatsEnabled bool `yaml:"query_stats_enabled"`
LogFailedQueries bool `yaml:"log_failed_queries"`
FailedQueryCacheCapacity int `yaml:"failed_query_cache_capacity"`
SlowQueryLogsUserHeader string `yaml:"slow_query_logs_user_header"`
LogQueriesMoreExpensiveThan uint64 `yaml:"log_queries_more_expensive_than"`
}

// Handler accepts queries and forwards them to RoundTripper. It can log slow queries,
Expand All @@ -61,12 +62,13 @@ type Handler struct {
failedQueryCache *utils.FailedQueryCache

// Metrics.
querySeconds *prometheus.CounterVec
querySeries *prometheus.CounterVec
queryBytes *prometheus.CounterVec
activeUsers *util.ActiveUsersCleanupService
slowQueryCount prometheus.Counter
failedQueryCount prometheus.Counter
querySeconds *prometheus.CounterVec
querySeries *prometheus.CounterVec
queryBytes *prometheus.CounterVec
activeUsers *util.ActiveUsersCleanupService
slowQueryCount prometheus.Counter
failedQueryCount prometheus.Counter
expensiveQueryCount prometheus.Counter
}

// NewHandler creates a new frontend handler.
Expand Down Expand Up @@ -120,6 +122,10 @@ func NewHandler(cfg HandlerConfig, roundTripper http.RoundTripper, log log.Logge
Name: "cortex_failed_query_total",
Help: "Total number of failed queries detected.",
})
h.expensiveQueryCount = promauto.With(reg).NewCounter(prometheus.CounterOpts{
Name: "cortex_expensive_query_total",
Help: "Total number of expensive queries detected.",
})
return h
}

Expand Down Expand Up @@ -205,16 +211,19 @@ func (f *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {

// Check whether we should parse the query string.
shouldReportSlowQuery := f.cfg.LogQueriesLongerThan != 0 && queryResponseTime > f.cfg.LogQueriesLongerThan
if shouldReportSlowQuery || f.cfg.QueryStatsEnabled {
queryBytesFetched := queryrange.GetQueryBytesFetchedFromHeader(resp.Header)
shouldReportExpensiveQuery := f.cfg.LogQueriesMoreExpensiveThan != 0 && queryBytesFetched > f.cfg.LogQueriesMoreExpensiveThan
if shouldReportSlowQuery || shouldReportExpensiveQuery || f.cfg.QueryStatsEnabled {
queryString = f.parseRequestQueryString(r, buf)
}

if shouldReportSlowQuery {
f.reportSlowQuery(r, hs, queryString, queryResponseTime)
}
if shouldReportExpensiveQuery {
f.reportExpensiveQuery(r, queryString, queryBytesFetched, queryResponseTime)
}
if f.cfg.QueryStatsEnabled {
stats.FetchedChunkBytes = queryrange.GetQueryBytesFetchedFromHeader(resp.Header)
stats.FetchedSeriesCount = queryrange.GetQuerySeriesFetchedFromHeader(resp.Header)
f.reportQueryStats(r, queryString, queryResponseTime, stats)
}
}
Expand Down Expand Up @@ -248,6 +257,35 @@ func (f *Handler) reportFailedQuery(r *http.Request, queryString url.Values, err
level.Error(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
}

func (f *Handler) reportExpensiveQuery(r *http.Request, queryString url.Values, queryBytesFetched uint64, queryResponseTime time.Duration) {
f.expensiveQueryCount.Inc()
// NOTE(GiedriusS): see https://github.com/grafana/grafana/pull/60301 for more info.
grafanaDashboardUID := "-"
if dashboardUID := r.Header.Get("X-Dashboard-Uid"); dashboardUID != "" {
grafanaDashboardUID = dashboardUID
}
grafanaPanelID := "-"
if panelID := r.Header.Get("X-Panel-Id"); panelID != "" {
grafanaPanelID = panelID
}
remoteUser, _, _ := r.BasicAuth()

logMessage := append([]interface{}{
"msg", "expensive query",
"method", r.Method,
"host", r.Host,
"path", r.URL.Path,
"remote_user", remoteUser,
"remote_addr", r.RemoteAddr,
"query_megabytes_fetched", queryBytesFetched / (1024 * 1024),
"grafana_dashboard_uid", grafanaDashboardUID,
"grafana_panel_id", grafanaPanelID,
"query_response_time", queryResponseTime.String(),
}, formatQueryString(queryString)...)

level.Error(util_log.WithContext(r.Context(), f.log)).Log(logMessage...)
}

// reportSlowQuery reports slow queries.
func (f *Handler) reportSlowQuery(r *http.Request, responseHeaders http.Header, queryString url.Values, queryResponseTime time.Duration) {
f.slowQueryCount.Inc()
Expand Down
5 changes: 0 additions & 5 deletions internal/cortex/querier/queryrange/query_bytes_fetched.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ import (
// QueryBytesFetchedHeaderName is the http header name of number of bytes fetched by a query from m3readcoord.
// This name is compatible with M3 and rule manager code
const QueryBytesFetchedHeaderName = "M3-Fetched-Bytes-Estimate"
const QuerySeriesFetchedHeaderName = "M3-Fetched-Series-Estimate"

func sumQueryBytesFetched(responses ...Response) uint64 {
var result uint64
Expand Down Expand Up @@ -72,7 +71,3 @@ func getHeaderValue(hdr http.Header, key string) uint64 {
func GetQueryBytesFetchedFromHeader(hdr http.Header) uint64 {
return getHeaderValue(hdr, QueryBytesFetchedHeaderName)
}

func GetQuerySeriesFetchedFromHeader(hdr http.Header) uint64 {
return getHeaderValue(hdr, QuerySeriesFetchedHeaderName)
}
1 change: 0 additions & 1 deletion internal/cortex/querier/queryrange/query_range.go
Original file line number Diff line number Diff line change
Expand Up @@ -483,7 +483,6 @@ func (prometheusCodec) EncodeResponse(ctx context.Context, res Response) (*http.
} else if res.(*PrometheusResponse).Data.SeriesStatsCounter != nil {
// Pantheon code path
httpHeader[QueryBytesFetchedHeaderName] = []string{strconv.FormatInt(res.(*PrometheusResponse).Data.SeriesStatsCounter.Bytes, 10)}
httpHeader[QuerySeriesFetchedHeaderName] = []string{strconv.FormatInt(res.(*PrometheusResponse).Data.SeriesStatsCounter.Series, 10)}
}
resp := http.Response{
Header: httpHeader,
Expand Down

0 comments on commit 3fa93a1

Please sign in to comment.