From c56a09b173c07a1876ad2034a303c540bbfcef10 Mon Sep 17 00:00:00 2001 From: Rodrigo Pastrana Date: Fri, 24 Jan 2025 15:15:12 -0500 Subject: [PATCH] HPCC-33298 JTrace support sampling configuration - Adds Jtrace sampler configuration - Adds OTel sampler initialization logic - Updates JTrace configuration README - Provides samples - Jlog trace/span ids suppressed if not sampled (not sure if this is wanted) Signed-off-by: Rodrigo Pastrana --- helm/examples/tracing/README.md | 4 + .../baremetal-otlp-http-localhost-sample.xml | 1 + .../otlp-http-collector-default-sampled.yaml | 10 +++ helm/hpcc/values.schema.json | 19 +++++ system/jlib/jtrace.cpp | 80 ++++++++++++++++++- 5 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 helm/examples/tracing/otlp-http-collector-default-sampled.yaml diff --git a/helm/examples/tracing/README.md b/helm/examples/tracing/README.md index 00fd4b8f77b..591561d64a4 100644 --- a/helm/examples/tracing/README.md +++ b/helm/examples/tracing/README.md @@ -12,6 +12,10 @@ All configuration options detailed here are part of the HPCC Systems Helm chart, - alwaysCreateGlobalIds - If true, assign newly created global ID to any requests that do not supply one. - optAlwaysCreateTraceIds - If true components generate trace/span ids if none are provided by the remote caller. - enableDefaultLogExporter - If true, creates a trace exporter outputting to the log using the default options +- sampler - Defines head sampling strategy. Decision to sample or drop a span or trace is not made by inspecting the trace as a whole. https://opentelemetry.io/docs/concepts/sampling/ + - type "AlwaysOff" | "AlwaysOn" | "Ratio" + - argument - Optional sampler type configuration value. Currently the argument is only used by the "Ratio" sampler type. The argument value is a string representing a numeric value between 0.0 and 1.0. This value represents the ratio of traces/spans to sample + - parentBased - Optional boolean. 
Determines if the sampler policy honors the remote root span sampled flag - resourceAttributes: - Defines OTel specific resource attribute configuration values which are appended to the runtime OTEL_RESOURCE_ATTRIBUTES. See OTel doc: https://opentelemetry.io/docs/specs/otel/configuration/sdk-environment-variables/#general-sdk-configuration - deploymentEnvironment - Defines deployment.environment, which is used to specify diff --git a/helm/examples/tracing/baremetal-otlp-http-localhost-sample.xml b/helm/examples/tracing/baremetal-otlp-http-localhost-sample.xml index e430888efa5..e4e69f5fd3b 100644 --- a/helm/examples/tracing/baremetal-otlp-http-localhost-sample.xml +++ b/helm/examples/tracing/baremetal-otlp-http-localhost-sample.xml @@ -5,6 +5,7 @@ + \ No newline at end of file diff --git a/helm/examples/tracing/otlp-http-collector-default-sampled.yaml b/helm/examples/tracing/otlp-http-collector-default-sampled.yaml new file mode 100644 index 00000000000..55b0d57f106 --- /dev/null +++ b/helm/examples/tracing/otlp-http-collector-default-sampled.yaml @@ -0,0 +1,10 @@ +global: + tracing: + sampler: + type: Ratio # Head sampling based on simple ratio + argument: "0.1" # only sample 10% of traces/spans + parentBased: true + exporters: + - type: OTLP-HTTP + endpoint: "localhost:4318/v1/traces" + consoleDebug: true \ No newline at end of file diff --git a/helm/hpcc/values.schema.json b/helm/hpcc/values.schema.json index 86d2966d332..a9e19ad4c54 100644 --- a/helm/hpcc/values.schema.json +++ b/helm/hpcc/values.schema.json @@ -1164,6 +1164,25 @@ "type": "boolean", "description": "If true, creates a trace exporter outputting to the log using the default options" }, + "sampler": { + "type": "object", + "properties": { + "type": { + "type": "string", + "enum": ["AlwaysOff", "AlwaysOn", "Ratio"], + "description": "Name of the Head Sampling type AlwaysOff|AlwaysOn|Ratio" + }, + "argument" : { + "type": "string", + "description": "Optional sampler type configuration value" + 
}, + "parentBased" : { + "type": "boolean", + "description": "Optional sets the sampling decision based on the Span’s parent, or absence of parent, to know which secondary sampler to use." + } + }, + "additionalProperties": false + }, "exporters": { "type": "array", "description": "List of trace exporters", diff --git a/system/jlib/jtrace.cpp b/system/jlib/jtrace.cpp index 9d17c644908..096f44614f0 100644 --- a/system/jlib/jtrace.cpp +++ b/system/jlib/jtrace.cpp @@ -23,6 +23,12 @@ #include "opentelemetry/sdk/trace/tracer_context_factory.h" //opentelemetry::sdk::trace::TracerContextFactory::Create(std::move(processors)); #include "opentelemetry/sdk/trace/simple_processor_factory.h" #include "opentelemetry/sdk/trace/batch_span_processor_factory.h" +#include +#include +#include +#include +#include +#include #include "opentelemetry/exporters/ostream/span_exporter_factory.h"// auto exporter = opentelemetry::exporter::trace::OStreamSpanExporterFactory::Create(); #include "opentelemetry/exporters/ostream/common_utils.h" #include "opentelemetry/exporters/memory/in_memory_span_exporter_factory.h" @@ -762,6 +768,9 @@ class CSpan : public CInterfaceOf if (span == nullptr) return false; + if (!span->IsRecording()) //if not sampled, we shouldn't consider this valid? 
+ return false; + auto spanCtx = span->GetContext(); return spanCtx.IsValid(); } @@ -1350,6 +1359,59 @@ std::unique_ptr CTraceManager::createP return opentelemetry::sdk::trace::SimpleSpanProcessorFactory::Create(std::move(exporter)); } +static std::unique_ptr createSampler(IPropertyTree * samplerTree) +{ + std::unique_ptr sampler; + + if (samplerTree) + { + const char * samplerType = samplerTree->queryProp("@type"); + if (!isEmptyString(samplerType)) + { + const char * samplerArgument = samplerTree->queryProp("@argument"); + if (strcmp("AlwaysOff", samplerType)==0) + { + sampler.reset(new opentelemetry::sdk::trace::AlwaysOffSampler()); + } + else if (strcmp("AlwaysOn", samplerType)==0) + { + sampler.reset(new opentelemetry::sdk::trace::AlwaysOnSampler()); + } + else if (strcmp("Ratio", samplerType)==0) + { + size_t pos; + double ratio = std::stod(samplerArgument, &pos); + if (ratio < 0 || ratio > 1) + { + OERRLOG("JTrace invalid ratio sampler configuration. Ratio must be LE 1.0 or GE 0.0"); + } + else + { + sampler.reset(new opentelemetry::sdk::trace::TraceIdRatioBasedSampler(ratio)); + } + } + else + { + WARNLOG("JTrace initialization: Invalid sampler type configured: '%s'", samplerType); + } + + if (sampler && samplerTree->getPropBool("@parentBased", true)) + { + return std::unique_ptr(new opentelemetry::sdk::trace::ParentBasedSampler( std::move(sampler))); + } + } + } + + if (!sampler) + { + WARNLOG("JTrace: Default Sampler 'Always ON' set"); + sampler = std::unique_ptr + (new opentelemetry::sdk::trace::AlwaysOnSampler); + } + + return sampler; +} + void CTraceManager::initTracerProviderAndGlobalInternals(const IPropertyTree * traceConfig) { /* @@ -1368,11 +1430,15 @@ void CTraceManager::initTracerProviderAndGlobalInternals(const IPropertyTree * t std::vector> processors; + std::unique_ptr sampler; + //By default trace spans to the logs in debug builds - so that developers get used to seeing them. 
//Default off for release builds to avoid flooding the logs, and because they are likely to use OTLP bool enableDefaultLogExporter = isDebugBuild(); if (traceConfig) { + sampler = createSampler(traceConfig->queryPropTree("sampler")); + IPropertyTree * resourceAttributesTree = traceConfig->queryPropTree("resourceAttributes"); if (resourceAttributesTree) { @@ -1398,6 +1464,12 @@ void CTraceManager::initTracerProviderAndGlobalInternals(const IPropertyTree * t enableDefaultLogExporter = traceConfig->getPropBool("enableDefaultLogExporter", enableDefaultLogExporter); } + if (!sampler) + { + sampler = std::unique_ptr + (new opentelemetry::sdk::trace::AlwaysOnSampler); + } + if (enableDefaultLogExporter) { //Simple option to create logging to the log file - primarily to aid developers. @@ -1407,9 +1479,9 @@ void CTraceManager::initTracerProviderAndGlobalInternals(const IPropertyTree * t auto jtraceResource = opentelemetry::sdk::resource::Resource::Create(resourceAtts); - // Default is an always-on sampler. std::unique_ptr context = - opentelemetry::sdk::trace::TracerContextFactory::Create(std::move(processors), jtraceResource); + opentelemetry::sdk::trace::TracerContextFactory::Create(std::move(processors), jtraceResource, std::move(sampler)); + std::shared_ptr provider = opentelemetry::sdk::trace::TracerProviderFactory::Create(std::move(context)); @@ -1424,6 +1496,10 @@ Expected Configuration format: disabled: true #optional - disable OTel tracing alwaysCreateGlobalIds : false #optional - should global ids always be created? alwaysCreateTraceIds #optional - should trace ids always be created? + sampler: #optional - controls how traces are either suppressed or sampled + type: #"AlwaysOff" | "AlwaysOn" | "Ratio" + argument: #optional sampler type configuration value + parentBased: #optional sets the sampling decision based on the Span’s parent, or absence of parent, to know which secondary sampler to use. 
exporters: #optional - Controls how trace data is exported/reported - type: OTLP #OS|OTLP|Prometheus|JLOG endpoint: "localhost:4317" #exporter specific key/value pairs