Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bugfix: When OOO native histograms are disabled it should be a client error #9567

Merged
merged 4 commits into from
Oct 8, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions pkg/ingester/ingester.go
Original file line number Diff line number Diff line change
Expand Up @@ -1344,6 +1344,18 @@ func (i *Ingester) pushSamplesToAppender(userID string, timeseries []mimirpb.Pre
return true

// Map TSDB native histogram validation errors to soft errors.
case errors.Is(err, storage.ErrOOONativeHistogramsDisabled):
stats.invalidNativeHistogramCount++
gotjosh marked this conversation as resolved.
Show resolved Hide resolved
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
return newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, err, model.Time(timestamp), labels)
})
return true
case errors.Is(err, storage.ErrNativeHistogramsDisabled):
gotjosh marked this conversation as resolved.
Show resolved Hide resolved
stats.invalidNativeHistogramCount++
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
return newNativeHistogramValidationError(globalerror.NativeHistogramDisabled, err, model.Time(timestamp), labels)
})
return true
case errors.Is(err, histogram.ErrHistogramCountMismatch):
stats.invalidNativeHistogramCount++
updateFirstPartial(i.errorSamplers.nativeHistogramValidationError, func() softError {
Expand Down
93 changes: 87 additions & 6 deletions pkg/ingester/ingester_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ func TestIngester_Push(t *testing.T) {
maxMetadataPerUser int
maxMetadataPerMetric int
nativeHistograms bool
oooNativeHistograms bool
ignoreOOOExemplars bool
}{
"should succeed on valid series and metadata": {
Expand Down Expand Up @@ -1915,6 +1916,77 @@ func TestIngester_Push(t *testing.T) {
cortex_ingester_tsdb_head_max_timestamp_seconds 0.01
`,
},
"should not fail if native histograms are disabled": {
gotjosh marked this conversation as resolved.
Show resolved Hide resolved
nativeHistograms: false,
reqs: []*mimirpb.WriteRequest{
mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries(
[][]mimirpb.LabelAdapter{metricLabelAdapters},
[]mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, util_test.GenerateTestHistogram(1))},
nil,
),
},
expectedErr: nil,
},
"should soft fail if OOO native histograms are disabled": {
nativeHistograms: true,
oooNativeHistograms: false,
reqs: []*mimirpb.WriteRequest{
mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries(
[][]mimirpb.LabelAdapter{metricLabelAdapters},
[]mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(10, util_test.GenerateTestHistogram(1))},
nil,
),
mimirpb.NewWriteRequest(nil, mimirpb.API).AddHistogramSeries(
[][]mimirpb.LabelAdapter{metricLabelAdapters},
[]mimirpb.Histogram{mimirpb.FromHistogramToHistogramProto(-10, util_test.GenerateTestHistogram(1))},
nil,
),
},
expectedErr: newErrorWithStatus(wrapOrAnnotateWithUser(newNativeHistogramValidationError(globalerror.NativeHistogramOOODisabled, fmt.Errorf("out-of-order native histogram ingestion is disabled"), model.Time(-10), []mimirpb.LabelAdapter{metricLabelAdapters[0]}), userID), codes.InvalidArgument),
expectedMetrics: `
# HELP cortex_ingester_ingested_samples_total The total number of samples ingested per user.
# TYPE cortex_ingester_ingested_samples_total counter
cortex_ingester_ingested_samples_total{user="test"} 1
# HELP cortex_discarded_samples_total The total number of samples that were discarded.
# TYPE cortex_discarded_samples_total counter
cortex_discarded_samples_total{group="",reason="invalid-native-histogram",user="test"} 1
# HELP cortex_ingester_ingested_samples_failures_total The total number of samples that errored on ingestion per user.
# TYPE cortex_ingester_ingested_samples_failures_total counter
cortex_ingester_ingested_samples_failures_total{user="test"} 1
# HELP cortex_ingester_memory_users The current number of users in memory.
# TYPE cortex_ingester_memory_users gauge
cortex_ingester_memory_users 1
# HELP cortex_ingester_memory_series The current number of series in memory.
# TYPE cortex_ingester_memory_series gauge
cortex_ingester_memory_series 1
# HELP cortex_ingester_memory_series_created_total The total number of series that were created per user.
# TYPE cortex_ingester_memory_series_created_total counter
cortex_ingester_memory_series_created_total{user="test"} 1
# HELP cortex_ingester_memory_series_removed_total The total number of series that were removed per user.
# TYPE cortex_ingester_memory_series_removed_total counter
cortex_ingester_memory_series_removed_total{user="test"} 0
# HELP cortex_ingester_tsdb_head_min_timestamp_seconds Minimum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_min_timestamp_seconds gauge
cortex_ingester_tsdb_head_min_timestamp_seconds 0.01
# HELP cortex_ingester_tsdb_head_max_timestamp_seconds Maximum timestamp of the head block across all tenants.
# TYPE cortex_ingester_tsdb_head_max_timestamp_seconds gauge
cortex_ingester_tsdb_head_max_timestamp_seconds 0.01
# HELP cortex_ingester_active_native_histogram_buckets Number of currently active native histogram buckets per user.
# TYPE cortex_ingester_active_native_histogram_buckets gauge
cortex_ingester_active_native_histogram_buckets{user="test"} 8
# HELP cortex_ingester_active_native_histogram_series Number of currently active native histogram series per user.
# TYPE cortex_ingester_active_native_histogram_series gauge
cortex_ingester_active_native_histogram_series{user="test"} 1
# HELP cortex_ingester_active_series Number of currently active series per user.
# TYPE cortex_ingester_active_series gauge
cortex_ingester_active_series{user="test"} 1
`,
expectedIngested: model.Matrix{
&model.SampleStream{Metric: metricLabelSet, Histograms: []model.SampleHistogramPair{
{Histogram: mimirpb.FromHistogramToPromHistogram(util_test.GenerateTestHistogram(1)), Timestamp: 10}},
},
},
},
"should soft fail if histogram has a negative bucket count": {
nativeHistograms: true,
reqs: []*mimirpb.WriteRequest{
Expand Down Expand Up @@ -3194,7 +3266,13 @@ func TestIngester_Push(t *testing.T) {
limits.MaxGlobalMetricsWithMetadataPerUser = testData.maxMetadataPerUser
limits.MaxGlobalMetadataPerMetric = testData.maxMetadataPerMetric
limits.NativeHistogramsIngestionEnabled = testData.nativeHistograms
limits.OOONativeHistogramsIngestionEnabled = testData.oooNativeHistograms
limits.IgnoreOOOExemplars = testData.ignoreOOOExemplars
var oooTimeWindow int64
if testData.nativeHistograms && !testData.oooNativeHistograms {
oooTimeWindow = int64(1 * time.Hour.Seconds())
limits.OutOfOrderTimeWindow = model.Duration(1 * time.Hour)
}

i, err := prepareIngesterWithBlocksStorageAndLimits(t, cfg, limits, nil, "", registry)
require.NoError(t, err)
Expand Down Expand Up @@ -3222,10 +3300,11 @@ func TestIngester_Push(t *testing.T) {
if testData.expectedErr == nil {
assert.NoError(t, err)
} else {
require.Error(t, err)
handledErr := mapPushErrorToErrorWithStatus(err)
errWithStatus, ok := handledErr.(globalerror.ErrorWithStatus)
assert.True(t, ok)
assert.True(t, errWithStatus.Equals(testData.expectedErr))
require.True(t, ok)
require.Truef(t, errWithStatus.Equals(testData.expectedErr), "errors don't match \nactual: '%v'\nexpected: '%v'", errWithStatus, testData.expectedErr)
}
}
}
Expand All @@ -3244,7 +3323,7 @@ func TestIngester_Push(t *testing.T) {
if len(res) == 0 {
res = nil
}
assert.Equal(t, testData.expectedIngested, res)
require.Equal(t, testData.expectedIngested, res)

// Read back samples to see what has been really ingested
exemplarRes, err := i.QueryExemplars(ctx, &client.ExemplarQueryRequest{
Expand Down Expand Up @@ -3307,9 +3386,11 @@ func TestIngester_Push(t *testing.T) {
assert.Equal(t, int64(expectedTenantsCount), usagestats.GetInt(memoryTenantsStatsName).Value())
assert.Equal(t, int64(expectedSamplesCount)+appendedSamplesStatsBefore, usagestats.GetCounter(appendedSamplesStatsName).Total())
assert.Equal(t, int64(expectedExemplarsCount)+appendedExemplarsStatsBefore, usagestats.GetCounter(appendedExemplarsStatsName).Total())
assert.Equal(t, int64(0), usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value())
assert.Equal(t, int64(0), usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value())
assert.Equal(t, int64(0), usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value())

assert.Equal(t, testData.oooNativeHistograms, usagestats.GetInt(tenantsWithOutOfOrderEnabledStatName).Value() == int64(0))
assert.Equal(t, oooTimeWindow, usagestats.GetInt(minOutOfOrderTimeWindowSecondsStatName).Value())
assert.Equal(t, oooTimeWindow, usagestats.GetInt(maxOutOfOrderTimeWindowSecondsStatName).Value())

})
}
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/util/globalerror/user.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ const (
NativeHistogramNegativeBucketCount ID = "native-histogram-negative-bucket-count"
NativeHistogramSpanNegativeOffset ID = "native-histogram-span-negative-offset"
NativeHistogramSpansBucketsMismatch ID = "native-histogram-spans-buckets-mismatch"
NativeHistogramDisabled ID = "native-histogram-disabled"
NativeHistogramOOODisabled ID = "native-histogram-ooo-disabled"

// Alertmanager errors
AlertmanagerMaxGrafanaConfigSize ID = "alertmanager-max-grafana-config-size"
Expand Down
Loading