Skip to content

Commit

Permalink
Add support for OTEL tracing
Browse files Browse the repository at this point in the history
`crictl` now features 3 new CLI parameters:

- `--enable-tracing`: Enable OpenTelemetry tracing. (default: false)
- `--tracing-endpoint`: Address of the gRPC tracing collector to which spans will be sent. (default: "0.0.0.0:4317")
- `--tracing-sampling-rate-per-million`: Number of samples to collect per million OpenTelemetry spans. Set to 1000000 or -1 to always sample. (default: -1)

The tracer provider will be created on startup and the `Shutdown()`
invocation will ensure that all spans are processed before exiting the
binary.

The `hack/tracing` directory contains scripts for local testing:

```
> ./hack/tracing/start
…
Everything is ready, open http://localhost:16686 to access jaeger
```

Now run `crictl` with `--enable-tracing`:

```
> sudo ./build/bin/linux/amd64/crictl --enable-tracing ps
```

Jaeger should then show the collected traces and spans for the 3 RPCs
`ListContainers`, `ImageFsInfo` and `Version`.

Signed-off-by: Sascha Grunert <sgrunert@redhat.com>
  • Loading branch information
saschagrunert committed Jan 29, 2024
1 parent c631776 commit 66bb77f
Show file tree
Hide file tree
Showing 47 changed files with 2,007 additions and 502 deletions.
70 changes: 64 additions & 6 deletions cmd/crictl/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package main

import (
"context"
"fmt"
"os"
"runtime"
Expand All @@ -25,11 +26,15 @@ import (

"github.com/sirupsen/logrus"
"github.com/urfave/cli/v2"
sdktrace "go.opentelemetry.io/otel/sdk/trace"
"go.opentelemetry.io/otel/trace"
"go.opentelemetry.io/otel/trace/noop"

internalapi "k8s.io/cri-api/pkg/apis"
"k8s.io/kubernetes/pkg/kubelet/cri/remote"

"github.com/kubernetes-sigs/cri-tools/pkg/common"
"github.com/kubernetes-sigs/cri-tools/pkg/tracing"
"github.com/kubernetes-sigs/cri-tools/pkg/version"
)

Expand All @@ -53,9 +58,11 @@ var (
PullImageOnCreate bool
// DisablePullOnRun disable pulling image on run requests
DisablePullOnRun bool
// tracerProvider is the global OpenTelemetry tracing instance.
tracerProvider *sdktrace.TracerProvider
)

func getRuntimeService(context *cli.Context, timeout time.Duration) (res internalapi.RuntimeService, err error) {
func getRuntimeService(_ *cli.Context, timeout time.Duration) (res internalapi.RuntimeService, err error) {
if RuntimeEndpointIsSet && RuntimeEndpoint == "" {
return nil, fmt.Errorf("--runtime-endpoint is not set")
}
Expand All @@ -67,6 +74,13 @@ func getRuntimeService(context *cli.Context, timeout time.Duration) (res interna
t = timeout
}

// Use the noop tracer provider and not tracerProvider directly, otherwise
// we'll panic in the unary call interceptor
var tp trace.TracerProvider = noop.NewTracerProvider()
if tracerProvider != nil {
tp = tracerProvider
}

// If no EP set then use the default endpoint types
if !RuntimeEndpointIsSet {
logrus.Warningf("runtime connect using default endpoints: %v. "+
Expand All @@ -79,7 +93,7 @@ func getRuntimeService(context *cli.Context, timeout time.Duration) (res interna
for _, endPoint := range defaultRuntimeEndpoints {
logrus.Debugf("Connect using endpoint %q with %q timeout", endPoint, t)

res, err = remote.NewRemoteRuntimeService(endPoint, t, nil)
res, err = remote.NewRemoteRuntimeService(endPoint, t, tp)
if err != nil {
logrus.Error(err)
continue
Expand All @@ -90,10 +104,10 @@ func getRuntimeService(context *cli.Context, timeout time.Duration) (res interna
}
return res, err
}
return remote.NewRemoteRuntimeService(RuntimeEndpoint, t, nil)
return remote.NewRemoteRuntimeService(RuntimeEndpoint, t, tp)
}

func getImageService(context *cli.Context) (res internalapi.ImageManagerService, err error) {
func getImageService(*cli.Context) (res internalapi.ImageManagerService, err error) {
if ImageEndpoint == "" {
if RuntimeEndpointIsSet && RuntimeEndpoint == "" {
return nil, fmt.Errorf("--image-endpoint is not set")
Expand All @@ -103,6 +117,14 @@ func getImageService(context *cli.Context) (res internalapi.ImageManagerService,
}

logrus.Debugf("get image connection")

// Use the noop tracer provider and not tracerProvider directly, otherwise
// we'll panic in the unary call interceptor
var tp trace.TracerProvider = noop.NewTracerProvider()
if tracerProvider != nil {
tp = tracerProvider
}

// If no EP set then use the default endpoint types
if !ImageEndpointIsSet {
logrus.Warningf("image connect using default endpoints: %v. "+
Expand All @@ -115,7 +137,7 @@ func getImageService(context *cli.Context) (res internalapi.ImageManagerService,
for _, endPoint := range defaultRuntimeEndpoints {
logrus.Debugf("Connect using endpoint %q with %q timeout", endPoint, Timeout)

res, err = remote.NewRemoteImageService(endPoint, Timeout, nil)
res, err = remote.NewRemoteImageService(endPoint, Timeout, tp)
if err != nil {
logrus.Error(err)
continue
Expand All @@ -126,7 +148,7 @@ func getImageService(context *cli.Context) (res internalapi.ImageManagerService,
}
return res, err
}
return remote.NewRemoteImageService(ImageEndpoint, Timeout, nil)
return remote.NewRemoteImageService(ImageEndpoint, Timeout, tp)
}

func getTimeout(timeDuration time.Duration) time.Duration {
Expand Down Expand Up @@ -220,6 +242,20 @@ func main() {
Aliases: []string{"D"},
Usage: "Enable debug mode",
},
&cli.BoolFlag{
Name: "enable-tracing",
Usage: "Enable OpenTelemetry tracing.",
},
&cli.IntFlag{
Name: "tracing-sampling-rate-per-million",
Usage: "Number of samples to collect per million OpenTelemetry spans. Set to 1000000 or -1 to always sample.",
Value: -1,
},
&cli.StringFlag{
Name: "tracing-endpoint",
Usage: "Address to which the gRPC tracing collector will send spans to.",
Value: "0.0.0.0:4317",
},
}

app.Before = func(context *cli.Context) (err error) {
Expand Down Expand Up @@ -290,6 +326,19 @@ func main() {
if Debug {
logrus.SetLevel(logrus.DebugLevel)
}

// Configure tracing if enabled
if context.IsSet("enable-tracing") {
tracerProvider, err = tracing.Init(
context.Context,
context.String("tracing-endpoint"),
context.Int("tracing-sampling-rate-per-million"),
)
if err != nil {
return fmt.Errorf("init tracing: %w", err)
}
}

return nil
}
// sort all flags
Expand All @@ -301,4 +350,13 @@ func main() {
if err := app.Run(os.Args); err != nil {
logrus.Fatal(err)
}

// Ensure that all spans are processed.
if tracerProvider != nil {
ctx, cancel := context.WithTimeout(context.Background(), Timeout)
defer cancel()
if err := tracerProvider.Shutdown(ctx); err != nil {
logrus.Errorf("Unable to shutdown tracer provider: %v", err)
}
}
}
12 changes: 6 additions & 6 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ require (
github.com/opencontainers/selinux v1.11.0
github.com/sirupsen/logrus v1.9.3
github.com/urfave/cli/v2 v2.27.1
go.opentelemetry.io/otel v1.22.0
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0
go.opentelemetry.io/otel/sdk v1.19.0
go.opentelemetry.io/otel/trace v1.22.0
golang.org/x/net v0.20.0
golang.org/x/sys v0.16.0
golang.org/x/term v0.16.0
Expand Down Expand Up @@ -44,7 +48,7 @@ require (
github.com/emicklei/go-restful/v3 v3.11.0 // indirect
github.com/felixge/httpsnoop v1.0.3 // indirect
github.com/fsnotify/fsnotify v1.7.0 // indirect
github.com/go-logr/logr v1.3.0 // indirect
github.com/go-logr/logr v1.4.1 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/go-openapi/jsonpointer v0.19.6 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
Expand Down Expand Up @@ -77,12 +81,8 @@ require (
github.com/xrash/smetrics v0.0.0-20201216005158-039620a65673 // indirect
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.42.0 // indirect
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.44.0 // indirect
go.opentelemetry.io/otel v1.19.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 // indirect
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 // indirect
go.opentelemetry.io/otel/metric v1.19.0 // indirect
go.opentelemetry.io/otel/sdk v1.19.0 // indirect
go.opentelemetry.io/otel/trace v1.19.0 // indirect
go.opentelemetry.io/otel/metric v1.22.0 // indirect
go.opentelemetry.io/proto/otlp v1.0.0 // indirect
golang.org/x/mod v0.14.0 // indirect
golang.org/x/oauth2 v0.10.0 // indirect
Expand Down
15 changes: 8 additions & 7 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@ github.com/felixge/httpsnoop v1.0.3/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSw
github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA=
github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.3.0 h1:2y3SDp0ZXuc6/cjLSZ+Q3ir+QB9T/iG5yYRXqsagWSY=
github.com/go-logr/logr v1.3.0/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/logr v1.4.1 h1:pKouT5E8xu9zeFC39JXRDukb6JFQPXM5p5I91188VAQ=
github.com/go-logr/logr v1.4.1/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
github.com/go-logr/stdr v1.2.2 h1:hSWxHoqTgW2S2qGc0LTAI563KZ5YKYRhT3MFKZMbjag=
github.com/go-logr/stdr v1.2.2/go.mod h1:mMo/vtBO5dYbehREoey6XUKy/eSumjCCveDpRre4VKE=
github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE=
Expand Down Expand Up @@ -169,18 +170,18 @@ go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.4
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.42.0/go.mod h1:5z+/ZWJQKXa9YT34fQNx5K8Hd1EoIhvtUygUQPqEOgQ=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.44.0 h1:KfYpVmrjI7JuToy5k8XV3nkapjWx48k4E4JOtVstzQI=
go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.44.0/go.mod h1:SeQhzAEccGVZVEy7aH87Nh0km+utSpo1pTv6eMMop48=
go.opentelemetry.io/otel v1.19.0 h1:MuS/TNf4/j4IXsZuJegVzI1cwut7Qc00344rgH7p8bs=
go.opentelemetry.io/otel v1.19.0/go.mod h1:i0QyjOq3UPoTzff0PJB2N66fb4S0+rSbSB15/oyH9fY=
go.opentelemetry.io/otel v1.22.0 h1:xS7Ku+7yTFvDfDraDIJVpw7XPyuHlB9MCiqqX5mcJ6Y=
go.opentelemetry.io/otel v1.22.0/go.mod h1:eoV4iAi3Ea8LkAEI9+GFT44O6T/D0GWAVFyZVCC6pMI=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 h1:Mne5On7VWdx7omSrSSZvM4Kw7cS7NQkOOmLcgscI51U=
go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 h1:3d+S281UTjM+AbF31XSOYn1qXn3BgIdWl8HNEpx08Jk=
go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0/go.mod h1:0+KuTDyKL4gjKCF75pHOX4wuzYDUZYfAQdSu43o+Z2I=
go.opentelemetry.io/otel/metric v1.19.0 h1:aTzpGtV0ar9wlV4Sna9sdJyII5jTVJEvKETPiOKwvpE=
go.opentelemetry.io/otel/metric v1.19.0/go.mod h1:L5rUsV9kM1IxCj1MmSdS+JQAcVm319EUrDVLrt7jqt8=
go.opentelemetry.io/otel/metric v1.22.0 h1:lypMQnGyJYeuYPhOM/bgjbFM6WE44W1/T45er4d8Hhg=
go.opentelemetry.io/otel/metric v1.22.0/go.mod h1:evJGjVpZv0mQ5QBRJoBF64yMuOf4xCWdXjK8pzFvliY=
go.opentelemetry.io/otel/sdk v1.19.0 h1:6USY6zH+L8uMH8L3t1enZPR3WFEmSTADlqldyHtJi3o=
go.opentelemetry.io/otel/sdk v1.19.0/go.mod h1:NedEbbS4w3C6zElbLdPJKOpJQOrGUJ+GfzpjUvI0v1A=
go.opentelemetry.io/otel/trace v1.19.0 h1:DFVQmlVbfVeOuBRrwdtaehRrWiL1JoVs9CPIQ1Dzxpg=
go.opentelemetry.io/otel/trace v1.19.0/go.mod h1:mfaSyvGyEJEI0nyV2I4qhNQnbBOUUmYZpYojqMnX2vo=
go.opentelemetry.io/otel/trace v1.22.0 h1:Hg6pPujv0XG9QaVbGOBVHunyuLcCC3jN7WEhPx83XD0=
go.opentelemetry.io/otel/trace v1.22.0/go.mod h1:RbbHXVqKES9QhzZq/fE5UnOSILqRt40a21sPw2He1xo=
go.opentelemetry.io/proto/otlp v1.0.0 h1:T0TX0tmXU8a3CbNXzEKGeU5mIVOdf0oykP+u2lIVU/I=
go.opentelemetry.io/proto/otlp v1.0.0/go.mod h1:Sy6pihPLfYHkr3NkUbEhGHFhINUSI/v80hjKIs5JXpM=
go.uber.org/goleak v1.2.1 h1:NBol2c7O1ZokfZ0LEU9K6Whx/KnwvepVetCUhtKja4A=
Expand Down
30 changes: 30 additions & 0 deletions hack/tracing/env
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env bash

# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -uo pipefail

# Change into the directory that holds this file. BASH_SOURCE is used instead
# of $0 because this file is sourced (by stop, and through it by start), and
# $0 would name the invoking script or shell rather than this file.
cd "$(dirname "${BASH_SOURCE[0]}")"

# Prefer podman and fall back to docker. `command -v` is a shell builtin, so
# the lookup does not depend on an external `which` binary being installed.
# errexit is not enabled yet, so a failed lookup leaves the variable empty
# and is reported below instead of aborting silently.
CONTAINER_RUNTIME=$(command -v podman 2>/dev/null) || CONTAINER_RUNTIME=$(command -v docker 2>/dev/null)
if [[ -z "$CONTAINER_RUNTIME" ]]; then
    echo "Neither docker nor podman found in \$PATH" >&2
    exit 1
fi

# From here on any failing command aborts the (sourcing) script.
set -e

# Name of the Jaeger container and the runtime binary used to manage it.
export JAEGER_CTR=jaeger
export CONTAINER_RUNTIME
44 changes: 44 additions & 0 deletions hack/tracing/start
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/usr/bin/env bash

# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -uo pipefail

# Sourcing stop first stops a possibly running Jaeger container and, through
# env, provides JAEGER_CTR and CONTAINER_RUNTIME.
# shellcheck source=stop
source "$(dirname "${BASH_SOURCE[0]}")"/stop

JAEGER_IMG="jaegertracing/all-in-one:1.52"
PORT=14250
MAX_CNT=100

# The readiness probe below relies on netstat and discards its stderr, so a
# missing binary would otherwise only surface as "Giving up" after all retries
# have been used up. Fail before the container is started instead.
if ! command -v netstat >/dev/null 2>&1; then
    echo "netstat is required to wait for port $PORT but was not found in \$PATH" >&2
    exit 1
fi

echo "Starting $JAEGER_CTR"
"$CONTAINER_RUNTIME" run -d --rm --network host --name "$JAEGER_CTR" "$JAEGER_IMG"

# Poll every 3 seconds until something listens on $PORT; give up once
# MAX_CNT retries have elapsed.
for ((i = 0; i <= MAX_CNT; i++)); do
    if netstat -tuplen 2>/dev/null | grep -q "$PORT .* LISTEN"; then
        break
    fi

    if [[ $i == "$MAX_CNT" ]]; then
        echo "Giving up" >&2
        exit 1
    fi

    echo "Waiting for gRPC port $PORT to listen… ($i)"
    sleep 3
done

echo "Everything is ready, open http://localhost:16686 to access jaeger"
24 changes: 24 additions & 0 deletions hack/tracing/stop
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env bash

# Copyright 2023 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -u -o pipefail

# Load JAEGER_CTR and CONTAINER_RUNTIME from the env file next to this script.
# shellcheck source=env
. "$(dirname "${BASH_SOURCE[0]}")/env"

printf 'Stopping %s container\n' "$JAEGER_CTR"
# Best effort: output is discarded and a failing stop is deliberately ignored.
"$CONTAINER_RUNTIME" stop "$JAEGER_CTR" &>/dev/null || :
Loading

0 comments on commit 66bb77f

Please sign in to comment.