open-telemetry · larsn777 · Nov 20, 2024 · Nov 21, 2024 · Nov 21, 2024 · jpkrohling
@@ -0,0 +1,27 @@
+# Use this changelog template to create an entry for release notes.
+
+# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix'
+change_type: enhancement
+
+# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver)
+component: processor/tail_sampling
+
+# A brief description of the change.  Surround your text with quotes ("") if it needs to start with a backtick (`).
+note: Add new tail sampling policy for sampling low-frequency spans.
+
+# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists.
+issues: [36487]
+
+# (Optional) One or more lines of additional information to render under the primary note.
+# These lines will be padded with 2 spaces and then inserted directly into the document.
+# Use pipe (|) for multiline entries.
+subtext: A new policy for sampling rare spans based on the use of count-min sketch.
+
+# If your change doesn't affect end users or the exported elements of any package,
+# you should instead start your pull request title with [chore] or use the "Skip Changelog" label.
+# Optional: The change log or logs in which this entry should be included.
+# e.g. '[user]' or '[user, api]'
+# Include 'user' if the change is relevant to end users.
+# Include 'api' if there is a change to a library API.
+# Default: '[user]'
+change_logs: []
@@ -35,7 +35,8 @@ Multiple policies exist today and it is straight forward to add more. These incl
 - `span_count`: Sample based on the minimum and/or maximum number of spans, inclusive. If the sum of all spans in the trace is outside the range threshold, the trace will not be sampled.
 - `boolean_attribute`: Sample based on boolean attribute (resource and record).
 - `ottl_condition`: Sample based on given boolean OTTL condition (span and span event).
-- `and`: Sample based on multiple policies, creates an AND policy 
+- `and`: Sample based on multiple policies, creates an AND policy
+- `rare_spans`: Sample low-frequency spans based on counting unique spans
 - `composite`: Sample based on a combination of above samplers, with ordering and rate allocation per sampler. Rate allocation allocates certain percentages of spans per policy order. 
   For example if we have set max_total_spans_per_second as 100 then we can set rate_allocation as follows
   1. test-composite-policy-1 = 50 % of max_total_spans_per_second = 50 spans_per_second
@@ -166,6 +167,19 @@ processors:
               ]
             }
          },
+         {
+            name: rare-spans-policy-1,
+            type: rare_spans,
+            rare_spans: {
+              error_probability: 0.01,
+              total_frequency: 1000,
+              max_error_value: 1,
+              observation_interval: 60m,
+              buckets_num: 4,
+              rare_span_frequency: 2,
+              sampled_spans_per_second: 500,
+              processed_spans_per_second: 1000,
+            }
          {
             name: composite-policy-1,
             type: composite,

@@ -43,6 +43,8 @@ const (
 	// OTTLCondition sample traces which match user provided OpenTelemetry Transformation Language
 	// conditions.
 	OTTLCondition PolicyType = "ottl_condition"
+	// RareSpans sample traces with rare spans
+	RareSpans PolicyType = "rare_spans"
 )
 
 // sharedPolicyCfg holds the common configuration to all policies that are used in derivative policy configurations
@@ -72,6 +74,50 @@ type sharedPolicyCfg struct {
 	BooleanAttributeCfg BooleanAttributeCfg `mapstructure:"boolean_attribute"`
 	// Configs for OTTL condition filter sampling policy evaluator
 	OTTLConditionCfg OTTLConditionCfg `mapstructure:"ottl_condition"`
+	// Configs for rare_spans policy
+	RareSpansCfg RareSpansCfg `mapstructure:"rare_spans"`
+}
+
+// RareSpansCfg configuration for the rare spans sampler.
+type RareSpansCfg struct {
+	// ErrorProbability error probability (δ, delta in the turms of Count-min sketch)
+	// defines the probability of error or failure rate of the estimation. If the
+	// probability is small, it means there is a low chance that the CMS will produce
+	// a count estimation that is too far from the true count. On the other hand,
+	// the smaller the value, the more times the hash will need to be calculated for
+	// each span. This in turn can negatively affect performance.
+	ErrorProbability float64 `mapstructure:"error_probability"`
+	// TotalFreq total number of spans that will need to be processed in
+	// ObservationInterval time interval. This parameter affects the accuracy
+	// of the span frequency calculation (`epsilon` in the terms of
+	// Count-min sketch):
+	//  - the closer the value is to the actual number of spans, the more
+	//    accurate the estimate will be;
+	//  - if the value is higher than the real one, this will lead to a very
+	//    accurate estimate;
+	//  - the larger this value, the more memory will be needed to calculate
+	//    the estimate.
+	TotalFreq float64 `mapstructure:"total_frequency"`
+	// MaxErrValue the maximum value of the overestimation at spans frequency
+	// calculation. Alongside with the TotalFreq option, it is used to calculate
+	// the `epsilon` (ε) parameter for the Count-Min sketch data structure.
+	// The lower the value of MaxErrValue, the more accurate the estimate of the
+	// frequency of each unique span will be. On the other hand, the smaller the
+	// value, the more memory will be allocated for CMS data structure.
+	MaxErrValue float64 `mapstructure:"max_error_value"`
+	// SpsSampledLimit maximum number of spans that can be sampled per second.
+	SpsSampledLimit int64 `mapstructure:"sampled_spans_per_second"`
+	// SpsSampledLimit maximum number of spans that can be processed per second.
+	SpsProcessedLimit int64 `mapstructure:"processed_spans_per_second"`
+	// ObservationInterval the time interval of the sliding window within which
+	// rare spans will be taken into account.
+	ObservationInterval time.Duration `mapstructure:"observation_interval"`
+	// Buckets number of segments in a sliding window.
+	Buckets uint8 `mapstructure:"buckets_num"`
+	// RareSpanFrequency frequency of occurrence of a span in the ObservationInterval
+	// at which the span will be sampled. For example, if the value is 1, then the
+	// span will be sampled only at its first occurrence in the ObservationInterval.
+	RareSpanFrequency uint32 `mapstructure:"rare_span_frequency"`
 }
 
 // CompositeSubPolicyCfg holds the common configuration to all policies under composite policy.

@@ -17,7 +17,7 @@ require (
 	go.opentelemetry.io/collector/featuregate v1.20.0
 	go.opentelemetry.io/collector/pdata v1.20.0
 	go.opentelemetry.io/collector/processor v0.114.0
-	go.opentelemetry.io/collector/semconv v0.114.0 // indirect
+	go.opentelemetry.io/collector/semconv v0.114.0
 	go.opentelemetry.io/otel v1.32.0
 	go.opentelemetry.io/otel/metric v1.32.0
 	go.opentelemetry.io/otel/sdk/metric v1.32.0
@@ -27,6 +27,7 @@ require (
 )
 
 require (
+	github.com/cespare/xxhash/v2 v2.3.0
 	go.opentelemetry.io/collector/component/componenttest v0.114.0
 	go.opentelemetry.io/collector/consumer/consumertest v0.114.0
 	go.opentelemetry.io/collector/processor/processortest v0.114.0

@@ -0,0 +1,110 @@
+// Copyright The OpenTelemetry Authors
+// SPDX-License-Identifier: Apache-2.0
+
+package cms // import "github.com/open-telemetry/opentelemetry-collector-contrib/processor/tailsamplingprocessor/internal/cms"
+
+import (
+	"math"
+)
+
+// CountMinSketch interface for Count-Min Sketch data structure. CMS is a
+// probabilistic data structure that provides an estimate of the frequency
+// of elements in a data stream.
+type CountMinSketch interface {
+	// Count returns the estimated frequency of the given key in the data
+	// stream.
+	Count(key []byte) uint32
+	// Insert increments the counters in the Count-Min Sketch for the given
+	// key.
+	Insert(element []byte)
+	// Clear resets the internal state of Count-Min Sketch
+	Clear()
+	// InsertWithCount increases the count of specified key and	returns a new
+	// estimated frequency of the key.
+	InsertWithCount(element []byte) uint32
+}
+
+type hasher interface {
+	Hash([]byte) uint32
+}
+
+// CountMinSketchCfg describes the main configuration options for CMS data
+// structure
+type CountMinSketchCfg struct {
+	// MaxErr approximation error, determines the error bound of the frequency
+	// estimates. Used together with the TotalFreq option to calculate the
+	// epsilon (ε) parameter for the CountMin Sketch.
+	MaxErr float64
+	// ErrorProbability error probability (δ, delta), defines the probability that
+	// the error exceeds the bound.
+	ErrorProbability float64
+	// TotalFreq total number of elements (keys) in the data stream. Used in
+	// calculation of epsilon (ε) parameter for the CountMin Sketch
+	TotalFreq float64
+}
+
+type CMS struct {
+	data [][]uint32
+	hs   []hasher
+}
+
+// NewCMS creates new CMS structure based on given width and depth.
+func NewCMS(w, h int) *CMS {
+	data := make([][]uint32, h)
+	hs := make([]hasher, h)
+	for i := 0; i < h; i++ {
+		hs[i] = NewHWHasher(uint32(w), i)
+		data[i] = make([]uint32, w)
+	}
+	return &CMS{
+		data: data,
+		hs:   hs,
+	}
+}
+
+// NewCMSWithErrorParams creates new CMS structure based on given config.
+// There CMS width = ⌈e/ε⌉, and depth = ⌈ln(1/δ)⌉
+func NewCMSWithErrorParams(cfg *CountMinSketchCfg) *CMS {
+	d := math.Ceil(math.Log2(1 / cfg.ErrorProbability))
+	w := math.Ceil(math.E / (cfg.MaxErr / cfg.TotalFreq))
+	return NewCMS(int(w), int(d))
+}
+
+// Insert inserts new element in CMS
+func (c *CMS) Insert(element []byte) {
+	for i, h := range c.hs {
+		c.data[i][h.Hash(element)]++
+	}
+}
+
+// Clear resets the CMS state
+func (c *CMS) Clear() {
+	for i := range c.hs {
+		for k := range c.data[i] {
+			c.data[i][k] = 0
+		}
+	}
+}
+
+// Count estimates the frequency of a given element
+func (c *CMS) Count(element []byte) uint32 {
+	var m uint32 = math.MaxUint32
+	for i, h := range c.hs {
+		m = min(m, c.data[i][h.Hash(element)])
+	}
+	return m
+}
+
+// InsertWithCount inserts the element to the CMS and returns the element's
+// frequency estimation. This method is equivalent to sequential calls to
+// Insert(element) and Count(element). However, in comparison with Count+Insert,
+// the InsertWithCount method has 2 times less number of hash calculations.
+func (c *CMS) InsertWithCount(element []byte) uint32 {
+	var m uint32 = math.MaxUint32
+	for i, h := range c.hs {
+		position := h.Hash(element)
+		c.data[i][position]++
+		m = min(m, c.data[i][position])
+	}
+	return m
+}