diff --git a/cmd/log-validator/main.go b/cmd/log-validator/main.go index d2d539ed70a..8efcba48517 100644 --- a/cmd/log-validator/main.go +++ b/cmd/log-validator/main.go @@ -38,14 +38,9 @@ func main() { defer oTelShutdown(context.Background()) logger.Info(cmd.VersionString()) - v := validator.New(logger, stats) + v := validator.New(config.Files, logger, stats) defer v.Shutdown() - for _, filename := range config.Files { - err := v.TailValidateFile(filename) - cmd.FailOnError(err, "failed to tail file") - } - cmd.WaitForSignal() } diff --git a/log/validator/tail_logger.go b/log/validator/tail_logger.go index 9e8fcf6a2b1..ba8fdd0d328 100644 --- a/log/validator/tail_logger.go +++ b/log/validator/tail_logger.go @@ -6,7 +6,7 @@ import ( "github.com/letsencrypt/boulder/log" ) -// tailLogger is an adapter to the hpcloud/tail module's logging interface. +// tailLogger is an adapter to the nxadm/tail module's logging interface. type tailLogger struct { log.Logger } diff --git a/log/validator/validator.go b/log/validator/validator.go index 09cc82ac091..a73330cb3f3 100644 --- a/log/validator/validator.go +++ b/log/validator/validator.go @@ -1,11 +1,14 @@ package validator import ( + "context" "encoding/base64" "errors" "fmt" "os" + "path/filepath" "strings" + "sync" "time" "github.com/nxadm/tail" @@ -17,73 +20,132 @@ import ( var errInvalidChecksum = errors.New("invalid checksum length") type Validator struct { - tailers []*tail.Tail + // mu guards patterns and tailers to prevent Shutdown racing monitor + mu sync.Mutex + + // patterns is the list of glob patterns to monitor with filepath.Glob for logs + patterns []string + + // tailers is a map of filenames to the tailer which are currently being tailed + tailers map[string]*tail.Tail + + // monitorCancel cancels the monitor's context, so it exits + monitorCancel context.CancelFunc lineCounter *prometheus.CounterVec log log.Logger } -func New(logger log.Logger, stats prometheus.Registerer) *Validator { +// New Validator monitoring paths, which is a list of file globs. +func New(patterns []string, logger log.Logger, stats prometheus.Registerer) *Validator { lineCounter := prometheus.NewCounterVec(prometheus.CounterOpts{ Name: "log_lines", Help: "A counter of log lines processed, with status", }, []string{"filename", "status"}) stats.MustRegister(lineCounter) - return &Validator{log: logger, lineCounter: lineCounter} -} + monitorContext, monitorCancel := context.WithCancel(context.Background()) -// TailValidateFile takes a filename, and starts tailing it. -// An error is returned if the file couldn't be opened. -func (v *Validator) TailValidateFile(filename string) error { - t, err := tail.TailFile(filename, tail.Config{ - ReOpen: true, - MustExist: false, // sometimes files won't exist, so we must tolerate that - Follow: true, - Logger: tailLogger{v.log}, - CompleteLines: true, - }) - if err != nil { - return err + v := &Validator{ + patterns: patterns, + tailers: map[string]*tail.Tail{}, + log: logger, + monitorCancel: monitorCancel, + lineCounter: lineCounter, } - go func() { - // Emit no more than 1 error line per second. This prevents consuming large - // amounts of disk space in case there is problem that causes all log lines to - // be invalid. - outputLimiter := time.NewTicker(time.Second) - defer outputLimiter.Stop() + go v.monitor(monitorContext) + + return v +} + +// pollPaths expands v.patterns and calls v.tailValidateFile on each resulting file +func (v *Validator) pollPaths() { + v.mu.Lock() + defer v.mu.Unlock() + for _, pattern := range v.patterns { + paths, err := filepath.Glob(pattern) + if err != nil { + v.log.Err(err.Error()) + } - for line := range t.Lines { - if line.Err != nil { - v.log.Errf("error while tailing %s: %s", t.Filename, line.Err) + for _, path := range paths { + if _, ok := v.tailers[path]; ok { + // We are already tailing this file continue } - err := lineValid(line.Text) + + t, err := tail.TailFile(path, tail.Config{ + ReOpen: true, + MustExist: false, // sometimes files won't exist, so we must tolerate that + Follow: true, + Logger: tailLogger{v.log}, + CompleteLines: true, + }) if err != nil { - if errors.Is(err, errInvalidChecksum) { - v.lineCounter.WithLabelValues(t.Filename, "invalid checksum length").Inc() - } else { - v.lineCounter.WithLabelValues(t.Filename, "bad").Inc() - } - select { - case <-outputLimiter.C: - v.log.Errf("%s: %s %q", t.Filename, err, line.Text) - default: - } - } else { - v.lineCounter.WithLabelValues(t.Filename, "ok").Inc() + // TailFile shouldn't error when MustExist is false + v.log.Errf("unexpected error from TailFile: %v", err) } + + go v.tailValidate(path, t.Lines) + + v.tailers[path] = t + } + } +} + +// Monitor calls v.pollPaths every minute until its context is cancelled +func (v *Validator) monitor(ctx context.Context) { + for { + v.pollPaths() + + // Wait a minute, unless cancelled + timer := time.NewTimer(time.Minute) + select { + case <-ctx.Done(): + return + case <-timer.C: } - }() + } +} - v.tailers = append(v.tailers, t) +func (v *Validator) tailValidate(filename string, lines chan *tail.Line) { + // Emit no more than 1 error line per second. This prevents consuming large + // amounts of disk space in case there is problem that causes all log lines to + // be invalid. + outputLimiter := time.NewTicker(time.Second) + defer outputLimiter.Stop() - return nil + for line := range lines { + if line.Err != nil { + v.log.Errf("error while tailing %s: %s", filename, line.Err) + continue + } + err := lineValid(line.Text) + if err != nil { + if errors.Is(err, errInvalidChecksum) { + v.lineCounter.WithLabelValues(filename, "invalid checksum length").Inc() + } else { + v.lineCounter.WithLabelValues(filename, "bad").Inc() + } + select { + case <-outputLimiter.C: + v.log.Errf("%s: %s %q", filename, err, line.Text) + default: + } + } else { + v.lineCounter.WithLabelValues(filename, "ok").Inc() + } + } } // Shutdown should be called before process shutdown func (v *Validator) Shutdown() { + v.mu.Lock() + defer v.mu.Unlock() + + v.monitorCancel() + for _, t := range v.tailers { // The tail module seems to have a race condition that will generate // errors like this on shutdown: diff --git a/test/config-next/log-validator.json b/test/config-next/log-validator.json index 3a5dce900f0..61c6263605c 100644 --- a/test/config-next/log-validator.json +++ b/test/config-next/log-validator.json @@ -10,16 +10,8 @@ "files": [ "/var/log/akamai-purger.log", "/var/log/bad-key-revoker.log", - "/var/log/boulder-ca.log", - "/var/log/boulder-observer.log", - "/var/log/boulder-publisher.log", - "/var/log/boulder-ra.log", - "/var/log/boulder-remoteva.log", - "/var/log/boulder-sa.log", - "/var/log/boulder-va.log", - "/var/log/boulder-wfe2.log", - "/var/log/crl-storer.log", - "/var/log/crl-updater.log", + "/var/log/boulder-*.log", + "/var/log/crl-*.log", "/var/log/nonce-service.log", "/var/log/ocsp-responder.log" ]