Skip to content

Configurable timeout for dmesg scan #518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions services/sysinfo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# System Info
Service to retrieve system information from target system, such as kernel messages, uptime and journaling entries.

# Usage

### sanssh sysinfo uptime
Print the uptime of the system in below format: System_idx (ip:port) up for X days, X hours, X minutes, X seconds | total X seconds (X is a placeholder that will be replaced)

```bash
sanssh <sanssh-args> sysinfo uptime
```
Where:
- `<sanssh-args>` common sanssh arguments

Examples:
```bash
# Get data from a yml file
sanssh --targets=localhost sysinfo uptime
```

### sanssh sysinfo dmesg
Print the messages from kernel ring buffer.

```bash
sanssh <sanssh-args> sysinfo dmesg [--tail==<tail-n-lines>] [--grep=<grep-pattern>] [-i] [-v] [--dmesg-read-timeout=<timeout>]
```

Where:
- `<sanssh-args>` common sanssh arguments
- `<tail-n-lines>` specify number of lines to `tail`
- `<grep-pattern>` grep regex pattern to filter out messages with
- `-i` - ignore grep case
- `-v` - inverted match grep
- `<dmesg-read-timeout>` timeout collection of kernel messages after this duration, default is 2 seconds, must be specified in time.ParseDuration compatible format

Examples:
```bash
### Default
sanssh --targets localhost sysinfo dmesg
### Get messages matching NVMe SSD pattern
sanssh --targets localhost sysinfo dmesg --grep "nvme1n1.*ssd.*"
### Get messages not related to BTRFS (ignoring case)
sanssh --targets localhost sysinfo dmesg -grep "btrfs" -i -v
### Collect messages for 10 seconds
sanssh --targets localhost sysinfo dmesg -grep "btrfs" -i -v --dmesg-read-timeout=10s
```

### sanssh sysinfo journalctl
Get the log entries stored in journald by systemd-journald.service

```bash
sanssh <sanssh-args> sysinfo djournalctl [--since|--S=<since>] [--until|-U=<until>] [-tail=<tail>] [-u|-unit=<unit>] [--json]
```

Where:
- `<sanssh-args>` common sanssh arguments
- `<since>` Sets the date (YYYY-MM-DD HH:MM:SS) we want to filter from
- `<until>` Sets the date (YYYY-MM-DD HH:MM:SS) we want to filter until
- `<tail>` - If positive, the latest n records to fetch. By default, fetch latest 100 records. The upper limit is 10000 for now
- `<unit>` - Sets systemd unit to filter messages
- `--json` - Print the journal entries in JSON format(can work with jq for better visualization)

Examples:
```bash
### Get 5 journalctl entries in json format between 18th and 19th of December 2024
sanssh --targets localhost sysinfo journalctl --json --tail 5 --since "2024-12-18 00:00:00" --until "2024-12-19 00:00:00"
### Read entries from default.target unit
sanssh --targets localhost sysinfo journalctl --unit default.target
```
16 changes: 15 additions & 1 deletion services/sysinfo/client/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"encoding/json"
"flag"
"fmt"
"google.golang.org/protobuf/types/known/durationpb"
"io"
"os"
"time"
Expand Down Expand Up @@ -116,12 +117,13 @@ type dmesgCmd struct {
grep string
ignoreCase bool
invertMatch bool
timeout time.Duration
}

func (*dmesgCmd) Name() string { return "dmesg" }
func (*dmesgCmd) Synopsis() string { return "View the messages in kernel ring buffer" }
func (*dmesgCmd) Usage() string {
return `dmesg [--tail=N] [--grep=PATTERN] [-i] [-v]:
return `dmesg [--tail=N] [--grep=PATTERN] [-i] [-v] [--dmesg-read-timeout=N]:
Print the messages from kernel ring buffer.
`
}
Expand All @@ -131,18 +133,30 @@ func (p *dmesgCmd) SetFlags(f *flag.FlagSet) {
f.Int64Var(&p.tail, "tail", -1, "tail the latest n lines")
f.BoolVar(&p.ignoreCase, "i", false, "ignore case")
f.BoolVar(&p.invertMatch, "v", false, "invert match")
f.DurationVar(&p.timeout, "dmesg-read-timeout", 2*time.Second, "timeout collection of messages after specified duration, default is 2s, max 30s")
}

func (p *dmesgCmd) Execute(ctx context.Context, f *flag.FlagSet, args ...interface{}) subcommands.ExitStatus {
state := args[0].(*util.ExecuteState)
c := pb.NewSysInfoClientProxy(state.Conn)

if p.timeout > pb.MaxDmesgTimeout {
fmt.Printf("Invalid dmesg-read-timeout, value %s is higher than maximum dmesg-read-timeout of %s\n", p.timeout.String(), pb.MaxDmesgTimeout.String())
return subcommands.ExitUsageError
}
if p.timeout < pb.MinDmesgTimeout {
fmt.Printf("Invalid dmesg-read-timeout, value %s is lower than minimum value of %s\n", p.timeout.String(), pb.MinDmesgTimeout.String())
return subcommands.ExitUsageError
}

req := &pb.DmesgRequest{
TailLines: int32(p.tail),
Grep: p.grep,
IgnoreCase: p.ignoreCase,
InvertMatch: p.invertMatch,
Timeout: durationpb.New(p.timeout),
}

stream, err := c.DmesgOneMany(ctx, req)
if err != nil {
// Emit this to every error file as it's not specific to a given target.
Expand Down
30 changes: 25 additions & 5 deletions services/sysinfo/server/sysinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@ package server

import (
"context"
"regexp"

"go.opentelemetry.io/otel/attribute"
"google.golang.org/grpc"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
"google.golang.org/protobuf/types/known/durationpb"
"google.golang.org/protobuf/types/known/emptypb"

"go.opentelemetry.io/otel/attribute"
"regexp"
"time"

"github.com/Snowflake-Labs/sansshell/services"
pb "github.com/Snowflake-Labs/sansshell/services/sysinfo"
Expand Down Expand Up @@ -71,7 +70,16 @@ func (s *server) Dmesg(req *pb.DmesgRequest, stream pb.SysInfo_DmesgServer) erro
return status.Error(codes.InvalidArgument, "must provide grep argument before setting ignore_case or invert_match")
}

records, err := getKernelMessages()
timeout := pb.DefaultDmesgTimeout
if req.Timeout != nil {
timeout = req.Timeout.AsDuration()
}

if err := validateTimeout(timeout); err != nil {
return err
}

records, err := getKernelMessages(timeout, ctx.Done())
if err != nil {
recorder.CounterOrLog(ctx, sysinfoDmesgFailureCounter, 1, attribute.String("reason", "get kernel message error"))
return status.Errorf(codes.InvalidArgument, "can't get kernel message %v", err)
Expand Down Expand Up @@ -115,6 +123,18 @@ func (s *server) Dmesg(req *pb.DmesgRequest, stream pb.SysInfo_DmesgServer) erro
return nil
}

func validateTimeout(timeout time.Duration) error {
if timeout > pb.MaxDmesgTimeout {
return status.Errorf(codes.InvalidArgument, "dmesg-read-timeout value %s is higher than maximum value %s", timeout, pb.MaxDmesgTimeout)
}

if timeout < pb.MinDmesgTimeout {
return status.Errorf(codes.InvalidArgument, "dmesg-read-timeout value %s is lower than minimum value %s", timeout, pb.MinDmesgTimeout)
}

return nil
}

func (s *server) Journal(req *pb.JournalRequest, stream pb.SysInfo_JournalServer) error {
ctx := stream.Context()
recorder := metrics.RecorderFromContextOrNoop(ctx)
Expand Down
30 changes: 28 additions & 2 deletions services/sysinfo/server/sysinfo_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ var getUptime = func() (time.Duration, error) {
// we set 2 seconds timeout to explicitly close the channel
// If the package releases new feature to support non-blocking read, we can
// make corresponding changes below to get rid of hard code timeout setting
var getKernelMessages = func() ([]*pb.DmsgRecord, error) {
var getKernelMessages = func(timeout time.Duration, cancelCh <-chan struct{}) ([]*pb.DmsgRecord, error) {
parser, err := getKmsgParser()
if err != nil {
return nil, err
Expand All @@ -95,7 +95,26 @@ var getKernelMessages = func() ([]*pb.DmsgRecord, error) {
var records []*pb.DmsgRecord
messages := parser.Parse()
done := false
timeoutCh := time.After(timeout)
for !done {
// Select doesn't care about the order of statements, a chatty enough kernel will continue pushing messages
// into kmsg and therefore our cancellation and timeout logic will not be reached ever,
// so we do this check first to ensure we don't miss our "deadlines" or client-side cancellation
select {
case <-cancelCh:
parser.Close()
done = true
continue
default:
}
select {
case <-timeoutCh:
parser.Close()
done = true
continue
default:
}

select {
case msg, ok := <-messages:
if !ok {
Expand All @@ -106,7 +125,14 @@ var getKernelMessages = func() ([]*pb.DmsgRecord, error) {
Message: msg.Message,
Time: timestamppb.New(msg.Timestamp),
})
case <-time.After(2 * time.Second):

// messages channel can have excessive idle time, we want to utilize that to avoid excessive CPU usage
// hence we do a blocking read of the messages channel (no default statement) but at the same time
// do blocking read from other channels in case this idle window exceeds timeout or if client cancels command
case <-timeoutCh:
parser.Close()
done = true
case <-cancelCh:
parser.Close()
done = true
}
Expand Down
6 changes: 3 additions & 3 deletions services/sysinfo/server/sysinfo_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ func TestDmesg(t *testing.T) {
savedGetKernelMessages := getKernelMessages

// dmesg is not supported in other OS, so an error should be raised
getKernelMessages = func() ([]*pb.DmsgRecord, error) {
getKernelMessages = func(time.Duration, <-chan struct{}) ([]*pb.DmsgRecord, error) {
return nil, status.Errorf(codes.Unimplemented, "dmesg is not supported")
}
t.Cleanup(func() { getKernelMessages = savedGetKernelMessages })
Expand Down Expand Up @@ -252,8 +252,8 @@ func TestDmesg(t *testing.T) {
})
}

getKernelMessages = func() ([]*pb.DmsgRecord, error) {
_, err := savedGetKernelMessages()
getKernelMessages = func(time.Duration, <-chan struct{}) ([]*pb.DmsgRecord, error) {
_, err := savedGetKernelMessages(time.Duration(0), nil)
if err != nil {
return nil, err
}
Expand Down
11 changes: 11 additions & 0 deletions services/sysinfo/sysinfo.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
// Package sysinfo defines the RPC interface for the sansshell SysInfo actions.
package sysinfo

import "time"

// To regenerate the proto headers if the proto changes, just run go generate
// and this encodes the necessary magic:
//go:generate protoc --go_out=. --go_opt=paths=source_relative --go-grpc_out=require_unimplemented_servers=false:. --go-grpc_opt=paths=source_relative --go-grpcproxy_out=. --go-grpcproxy_opt=paths=source_relative sysinfo.proto
Expand All @@ -26,3 +28,12 @@ const TimeFormat_YYYYMMDDHHMMSS = "2006-01-02 15:04:05"

// Maximum journal entries we can fetch for each host
const JounalEntriesLimit = 10000

// Maximum Dmesg message capture timeout
const MaxDmesgTimeout = 30 * time.Second

// Minimum Dmesg message capture timeout
const MinDmesgTimeout = 2 * time.Second

// Default Dmesg message capture timeout
const DefaultDmesgTimeout = MinDmesgTimeout
Loading
Loading