From 405ca1339a38e6590a7f3a6a8af9cc1b7cf37d6c Mon Sep 17 00:00:00 2001 From: Kristina Pathak Date: Mon, 18 Nov 2024 10:13:17 -0800 Subject: [PATCH] [exporter/prometheus] fix: validate metric types and help/descriptions (#36356) #### Description Fixes bug where exporting fails due to different help messages for the same metric. With this solution, the exporter will always export metrics of the same name with the first description it receives. This also rejects metrics whose types have changed. These changes follow the [spec](https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/#metric-metadata-1): >Exporters MUST drop entire metrics to prevent conflicting TYPE comments, but SHOULD NOT drop metric points as a result of conflicting UNIT or HELP comments. Instead, all but one of the conflicting UNIT and HELP comments (but not metric points) SHOULD be dropped. If dropping a comment or metric points, the exporter SHOULD warn the user through error logging. Based on https://github.com/open-telemetry/opentelemetry-go/pull/3469 #### Link to tracking issue Fixes https://github.com/open-telemetry/opentelemetry-collector-contrib/issues/28617 #### Testing Unit test cases added. --------- Co-authored-by: David Ashpole Co-authored-by: Alex Boten <223565+codeboten@users.noreply.github.com> --- .chloggen/prometheus-metric-types-help.yaml | 27 +++++ exporter/prometheusexporter/collector.go | 99 ++++++++++++++++--- exporter/prometheusexporter/collector_test.go | 98 ++++++++++++++---- 3 files changed, 192 insertions(+), 32 deletions(-) create mode 100644 .chloggen/prometheus-metric-types-help.yaml diff --git a/.chloggen/prometheus-metric-types-help.yaml b/.chloggen/prometheus-metric-types-help.yaml new file mode 100644 index 000000000000..b8167777213a --- /dev/null +++ b/.chloggen/prometheus-metric-types-help.yaml @@ -0,0 +1,27 @@ +# Use this changelog template to create an entry for release notes. + +# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' +change_type: bug_fix + +# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) +component: prometheusexporter + +# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). +note: reject metrics whose types have changed, use pre-existing descriptions when help strings change + +# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. +issues: [28617] + +# (Optional) One or more lines of additional information to render under the primary note. +# These lines will be padded with 2 spaces and then inserted directly into the document. +# Use pipe (|) for multiline entries. +subtext: + +# If your change doesn't affect end users or the exported elements of any package, +# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. +# Optional: The change log or logs in which this entry should be included. +# e.g. '[user]' or '[user, api]' +# Include 'user' if the change is relevant to end users. +# Include 'api' if there is a change to a library API. +# Default: '[user]' +change_logs: [] diff --git a/exporter/prometheusexporter/collector.go b/exporter/prometheusexporter/collector.go index be676ac7c9a5..42035add2fbc 100644 --- a/exporter/prometheusexporter/collector.go +++ b/exporter/prometheusexporter/collector.go @@ -7,13 +7,17 @@ import ( "encoding/hex" "fmt" "sort" + "sync" + "time" "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" "github.com/prometheus/common/model" "go.opentelemetry.io/collector/pdata/pcommon" "go.opentelemetry.io/collector/pdata/pmetric" conventions "go.opentelemetry.io/collector/semconv/v1.25.0" "go.uber.org/zap" + "google.golang.org/protobuf/proto" prometheustranslator "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus" ) @@ -30,6 +34,13 @@ type collector struct { addMetricSuffixes bool namespace string constLabels prometheus.Labels + metricFamilies sync.Map + metricExpiration time.Duration +} + +type metricFamily struct { + lastSeen time.Time + mf *dto.MetricFamily } func newCollector(config *Config, logger *zap.Logger) *collector { @@ -40,6 +51,7 @@ func newCollector(config *Config, logger *zap.Logger) *collector { sendTimestamps: config.SendTimestamps, constLabels: config.ConstLabels, addMetricSuffixes: config.AddMetricSuffixes, + metricExpiration: config.MetricExpiration, } } @@ -104,7 +116,13 @@ func (c *collector) convertMetric(metric pmetric.Metric, resourceAttrs pcommon.M return nil, errUnknownMetricType } -func (c *collector) getMetricMetadata(metric pmetric.Metric, attributes pcommon.Map, resourceAttrs pcommon.Map) (*prometheus.Desc, []string) { +func (c *collector) getMetricMetadata(metric pmetric.Metric, mType *dto.MetricType, attributes pcommon.Map, resourceAttrs pcommon.Map) (*prometheus.Desc, []string, error) { + name := prometheustranslator.BuildCompliantName(metric, c.namespace, c.addMetricSuffixes) + help, err := c.validateMetrics(name, metric.Description(), mType) + if err != nil { + return nil, nil, err + } + keys := make([]string, 0, attributes.Len()+2) // +2 for job and instance labels. values := make([]string, 0, attributes.Len()+2) @@ -123,18 +141,17 @@ func (c *collector) getMetricMetadata(metric pmetric.Metric, attributes pcommon. values = append(values, instance) } - return prometheus.NewDesc( - prometheustranslator.BuildCompliantName(metric, c.namespace, c.addMetricSuffixes), - metric.Description(), - keys, - c.constLabels, - ), values + return prometheus.NewDesc(name, help, keys, c.constLabels), values, nil } func (c *collector) convertGauge(metric pmetric.Metric, resourceAttrs pcommon.Map) (prometheus.Metric, error) { ip := metric.Gauge().DataPoints().At(0) - desc, attributes := c.getMetricMetadata(metric, ip.Attributes(), resourceAttrs) + desc, attributes, err := c.getMetricMetadata(metric, dto.MetricType_GAUGE.Enum(), ip.Attributes(), resourceAttrs) + if err != nil { + return nil, err + } + var value float64 switch ip.ValueType() { case pmetric.NumberDataPointValueTypeInt: @@ -162,11 +179,16 @@ func (c *collector) convertSum(metric pmetric.Metric, resourceAttrs pcommon.Map) ip := metric.Sum().DataPoints().At(0) metricType := prometheus.GaugeValue + mType := dto.MetricType_GAUGE.Enum() if metric.Sum().IsMonotonic() { metricType = prometheus.CounterValue + mType = dto.MetricType_COUNTER.Enum() } - desc, attributes := c.getMetricMetadata(metric, ip.Attributes(), resourceAttrs) + desc, attributes, err := c.getMetricMetadata(metric, mType, ip.Attributes(), resourceAttrs) + if err != nil { + return nil, err + } var value float64 switch ip.ValueType() { case pmetric.NumberDataPointValueTypeInt: @@ -182,7 +204,6 @@ func (c *collector) convertSum(metric pmetric.Metric, resourceAttrs pcommon.Map) } var m prometheus.Metric - var err error if metricType == prometheus.CounterValue && ip.StartTimestamp().AsTime().Unix() > 0 { m, err = prometheus.NewConstMetricWithCreatedTimestamp(desc, metricType, value, ip.StartTimestamp().AsTime(), attributes...) } else { @@ -218,9 +239,11 @@ func (c *collector) convertSummary(metric pmetric.Metric, resourceAttrs pcommon. quantiles[qvj.Quantile()] = qvj.Value() } - desc, attributes := c.getMetricMetadata(metric, point.Attributes(), resourceAttrs) + desc, attributes, err := c.getMetricMetadata(metric, dto.MetricType_SUMMARY.Enum(), point.Attributes(), resourceAttrs) + if err != nil { + return nil, err + } var m prometheus.Metric - var err error if point.StartTimestamp().AsTime().Unix() > 0 { m, err = prometheus.NewConstSummaryWithCreatedTimestamp(desc, point.Count(), point.Sum(), quantiles, point.StartTimestamp().AsTime(), attributes...) } else { @@ -237,7 +260,10 @@ func (c *collector) convertSummary(metric pmetric.Metric, resourceAttrs pcommon. func (c *collector) convertDoubleHistogram(metric pmetric.Metric, resourceAttrs pcommon.Map) (prometheus.Metric, error) { ip := metric.Histogram().DataPoints().At(0) - desc, attributes := c.getMetricMetadata(metric, ip.Attributes(), resourceAttrs) + desc, attributes, err := c.getMetricMetadata(metric, dto.MetricType_HISTOGRAM.Enum(), ip.Attributes(), resourceAttrs) + if err != nil { + return nil, err + } indicesMap := make(map[float64]int) buckets := make([]float64, 0, ip.BucketCounts().Len()) @@ -266,7 +292,6 @@ func (c *collector) convertDoubleHistogram(metric pmetric.Metric, resourceAttrs exemplars := convertExemplars(ip.Exemplars()) var m prometheus.Metric - var err error if ip.StartTimestamp().AsTime().Unix() > 0 { m, err = prometheus.NewConstHistogramWithCreatedTimestamp(desc, ip.Count(), ip.Sum(), points, ip.StartTimestamp().AsTime(), attributes...) } else { @@ -403,4 +428,50 @@ func (c *collector) Collect(ch chan<- prometheus.Metric) { ch <- m c.logger.Debug(fmt.Sprintf("metric served: %s", m.Desc().String())) } + c.cleanupMetricFamilies() +} + +func (c *collector) validateMetrics(name, description string, metricType *dto.MetricType) (help string, err error) { + now := time.Now() + v, exist := c.metricFamilies.Load(name) + if !exist { + c.metricFamilies.Store(name, metricFamily{ + lastSeen: now, + mf: &dto.MetricFamily{ + Name: proto.String(name), + Help: proto.String(description), + Type: metricType, + }, + }) + return description, nil + } + emf := v.(metricFamily) + if emf.mf.GetType() != *metricType { + return "", fmt.Errorf("instrument type conflict, using existing type definition. instrument: %s, existing: %s, dropped: %s", name, emf.mf.GetType(), *metricType) + } + emf.lastSeen = now + c.metricFamilies.Store(name, emf) + if emf.mf.GetHelp() != description { + c.logger.Info( + "Instrument description conflict, using existing", + zap.String("instrument", name), + zap.String("existing", emf.mf.GetHelp()), + zap.String("dropped", description), + ) + } + return emf.mf.GetHelp(), nil +} + +func (c *collector) cleanupMetricFamilies() { + expirationTime := time.Now().Add(-c.metricExpiration) + + c.metricFamilies.Range(func(key, value any) bool { + v := value.(metricFamily) + if expirationTime.After(v.lastSeen) { + c.logger.Debug("metric expired", zap.String("instrument", key.(string))) + c.metricFamilies.Delete(key) + return true + } + return true + }) } diff --git a/exporter/prometheusexporter/collector_test.go b/exporter/prometheusexporter/collector_test.go index 9b5d31d7efdb..f1b87d9fe10b 100644 --- a/exporter/prometheusexporter/collector_test.go +++ b/exporter/prometheusexporter/collector_test.go @@ -10,6 +10,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + dto "github.com/prometheus/client_model/go" io_prometheus_client "github.com/prometheus/client_model/go" "github.com/stretchr/testify/require" "go.opentelemetry.io/collector/pdata/pcommon" @@ -17,6 +18,7 @@ import ( conventions "go.opentelemetry.io/collector/semconv/v1.25.0" "go.uber.org/zap" "go.uber.org/zap/zapcore" + "google.golang.org/protobuf/proto" "google.golang.org/protobuf/types/known/timestamppb" prometheustranslator "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/translator/prometheus" @@ -66,25 +68,85 @@ func TestConvertInvalidDataType(t *testing.T) { } } -func TestConvertInvalidMetric(t *testing.T) { - for _, mType := range []pmetric.MetricType{ - pmetric.MetricTypeHistogram, - pmetric.MetricTypeSum, - pmetric.MetricTypeGauge, - } { - metric := pmetric.NewMetric() - switch mType { - case pmetric.MetricTypeGauge: - metric.SetEmptyGauge().DataPoints().AppendEmpty() - case pmetric.MetricTypeSum: - metric.SetEmptySum().DataPoints().AppendEmpty() - case pmetric.MetricTypeHistogram: - metric.SetEmptyHistogram().DataPoints().AppendEmpty() - } - c := collector{} +func TestConvertMetric(t *testing.T) { + tests := []struct { + description string + mName string + mType pmetric.MetricType + mapVals map[string]metricFamily + err bool + }{ + { + description: "invalid histogram metric", + mType: pmetric.MetricTypeHistogram, + err: true, + }, + { + description: "invalid sum metric", + mType: pmetric.MetricTypeSum, + err: true, + }, + { + description: "invalid gauge metric", + mType: pmetric.MetricTypeGauge, + err: true, + }, + { + description: "metric type conflict", + mName: "testgauge", + mType: pmetric.MetricTypeGauge, + mapVals: map[string]metricFamily{ + "testgauge": { + mf: &io_prometheus_client.MetricFamily{ + Name: proto.String("testgauge"), + Type: dto.MetricType_COUNTER.Enum(), + }, + }, + }, + err: true, + }, + { + description: "metric description conflict", + mName: "testgauge", + mType: pmetric.MetricTypeGauge, + mapVals: map[string]metricFamily{ + "testgauge": { + mf: &io_prometheus_client.MetricFamily{ + Name: proto.String("testgauge"), + Type: dto.MetricType_GAUGE.Enum(), + Help: proto.String("test help value"), + }, + }, + }, + err: false, + }, + } + for _, tt := range tests { + t.Run(tt.description, func(t *testing.T) { + metric := pmetric.NewMetric() + metric.SetName(tt.mName) + switch tt.mType { + case pmetric.MetricTypeGauge: + metric.SetEmptyGauge().DataPoints().AppendEmpty() + case pmetric.MetricTypeSum: + metric.SetEmptySum().DataPoints().AppendEmpty() + case pmetric.MetricTypeHistogram: + metric.SetEmptyHistogram().DataPoints().AppendEmpty() + } + c := collector{ + logger: zap.NewNop(), + } + for k, v := range tt.mapVals { + c.metricFamilies.Store(k, v) + } - _, err := c.convertMetric(metric, pcommon.NewMap()) - require.Error(t, err) + _, err := c.convertMetric(metric, pcommon.NewMap()) + if tt.err { + require.Error(t, err) + return + } + require.NoError(t, err) + }) } }