From 667a86315d35f69d179314ccc74d322ff1ef9c30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geyslan=20Greg=C3=B3rio?= Date: Tue, 5 Sep 2023 09:01:34 -0300 Subject: [PATCH 1/2] fix(ebpf): closed channel read This libbpfgo issue aquasecurity/libbpfgo#122 is actually a Tracee problem. It was caused by read attempts on a closed channel (lostEvChannel), when libbpgo had already closed it. --- pkg/ebpf/bpf_log.go | 11 +++-------- pkg/ebpf/capture.go | 11 +++-------- pkg/ebpf/events_processor.go | 23 +++++++++++++---------- pkg/ebpf/net_capture.go | 9 +++------ 4 files changed, 22 insertions(+), 32 deletions(-) diff --git a/pkg/ebpf/bpf_log.go b/pkg/ebpf/bpf_log.go index fad7ab30c189..7d3824810621 100644 --- a/pkg/ebpf/bpf_log.go +++ b/pkg/ebpf/bpf_log.go @@ -192,15 +192,10 @@ func (t *Tracee) processBPFLogs(ctx context.Context) { } case lost := <-t.lostBPFLogChannel: - // When terminating tracee-ebpf the lost channel receives multiple "0 lost events" events. - // This check prevents those 0 lost events messages to be written to stderr until the bug is fixed: - // https://github.com/aquasecurity/libbpfgo/issues/122 - if lost > 0 { - if err := t.stats.LostBPFLogsCount.Increment(lost); err != nil { - logger.Errorw("Incrementing lost BPF logs count", "error", err) - } - logger.Warnw(fmt.Sprintf("Lost %d ebpf logs events", lost)) + if err := t.stats.LostBPFLogsCount.Increment(lost); err != nil { + logger.Errorw("Incrementing lost BPF logs count", "error", err) } + logger.Warnw(fmt.Sprintf("Lost %d ebpf logs events", lost)) case <-ctx.Done(): return diff --git a/pkg/ebpf/capture.go b/pkg/ebpf/capture.go index 6d81f4e4e343..746c350dc3b5 100644 --- a/pkg/ebpf/capture.go +++ b/pkg/ebpf/capture.go @@ -214,15 +214,10 @@ func (t *Tracee) processFileCaptures(ctx context.Context) { } case lost := <-t.lostCapturesChannel: - // When terminating tracee-ebpf the lost channel receives multiple "0 lost events" events. - // This check prevents those 0 lost events messages to be written to stderr until the bug is fixed: - // https://github.com/aquasecurity/libbpfgo/issues/122 - if lost > 0 { - if err := t.stats.LostWrCount.Increment(lost); err != nil { - logger.Errorw("Incrementing lost capture count", "error", err) - } - logger.Warnw(fmt.Sprintf("Lost %d capture events", lost)) + if err := t.stats.LostWrCount.Increment(lost); err != nil { + logger.Errorw("Incrementing lost capture count", "error", err) } + logger.Warnw(fmt.Sprintf("Lost %d capture events", lost)) case <-ctx.Done(): return diff --git a/pkg/ebpf/events_processor.go b/pkg/ebpf/events_processor.go index 9f5302348cfd..32744acf317d 100644 --- a/pkg/ebpf/events_processor.go +++ b/pkg/ebpf/events_processor.go @@ -36,19 +36,22 @@ func (t *Tracee) processLostEvents() { logger.Debugw("Starting processLostEvents goroutine") defer logger.Debugw("Stopped processLostEvents goroutine") + // Since this is an end-stage goroutine, it should be terminated when: + // - lostEvChannel is closed, or finally when; + // - internal done channel is closed (not ctx). for { select { - case lost := <-t.lostEvChannel: - // When terminating tracee-ebpf the lost channel receives multiple "0 lost events" events. - // This check prevents those 0 lost events messages to be written to stderr until the bug is fixed: - // https://github.com/aquasecurity/libbpfgo/issues/122 - if lost > 0 { - if err := t.stats.LostEvCount.Increment(lost); err != nil { - logger.Errorw("Incrementing lost event count", "error", err) - } - logger.Warnw(fmt.Sprintf("Lost %d events", lost)) + case lost, ok := <-t.lostEvChannel: + if !ok { + return // lostEvChannel is closed, lost is zero value + } + + if err := t.stats.LostEvCount.Increment(lost); err != nil { + logger.Errorw("Incrementing lost event count", "error", err) } - // Since this is an end-state goroutine, it should be terminated only when Tracee done channel is closed. + logger.Warnw(fmt.Sprintf("Lost %d events", lost)) + + // internal done channel is closed when Tracee is stopped via Tracee.Close() case <-t.done: return } diff --git a/pkg/ebpf/net_capture.go b/pkg/ebpf/net_capture.go index b8a1ba15d054..e39c4e0bab54 100644 --- a/pkg/ebpf/net_capture.go +++ b/pkg/ebpf/net_capture.go @@ -62,13 +62,10 @@ func (t *Tracee) processNetCapEvents(ctx context.Context, in <-chan *trace.Event _ = t.stats.NetCapCount.Increment() case lost := <-t.lostNetCapChannel: - if lost > 0 { - // https://github.com/aquasecurity/libbpfgo/issues/122 - if err := t.stats.LostNtCapCount.Increment(lost); err != nil { - logger.Errorw("Incrementing lost network events count", "error", err) - } - logger.Warnw(fmt.Sprintf("Lost %d network capture events", lost)) + if err := t.stats.LostNtCapCount.Increment(lost); err != nil { + logger.Errorw("Incrementing lost network events count", "error", err) } + logger.Warnw(fmt.Sprintf("Lost %d network capture events", lost)) case <-ctx.Done(): return From eebd7e71416550dfbcd4c519858d0547372d249d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Geyslan=20Greg=C3=B3rio?= Date: Tue, 29 Aug 2023 15:40:47 -0300 Subject: [PATCH 2/2] fix(ebpf): close perfbuf instead of just stop This was an old TODO comment removed by https://github.com/aquasecurity/tracee/pull/296/files#diff-82aa2eb77c3ce782b8ebfb70eaba8a1a5a8a2dce9f3349352a3a3c0f909478f1L604 --- pkg/ebpf/tracee.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pkg/ebpf/tracee.go b/pkg/ebpf/tracee.go index 92a2289c483a..c6196cb2340d 100644 --- a/pkg/ebpf/tracee.go +++ b/pkg/ebpf/tracee.go @@ -1366,16 +1366,16 @@ func (t *Tracee) Run(ctx gocontext.Context) error { t.ready(ctx) // executes ready callback, non blocking <-ctx.Done() // block until ctx is cancelled elsewhere - // Stop perf buffers + // Close perf buffers - t.eventsPerfMap.Stop() + t.eventsPerfMap.Close() if t.config.BlobPerfBufferSize > 0 { - t.fileWrPerfMap.Stop() + t.fileWrPerfMap.Close() } if pcaps.PcapsEnabled(t.config.Capture.Net) { - t.netCapPerfMap.Stop() + t.netCapPerfMap.Close() } - t.bpfLogsPerfMap.Stop() + t.bpfLogsPerfMap.Close() // TODO: move logic below somewhere else (related to file writes)