From 18a78cba577b4b55bf452e071b03d3574f010def Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Ch=C3=A1vez?= Date: Wed, 5 Jul 2023 22:14:38 +0200 Subject: [PATCH 1/7] chore: adds memoize implementation for regexes. Currently we create and allocate memory for every regex we compile, however there are cases where you compile the same regex over and over e.g. https://github.com/corazawaf/coraza-caddy/issues/76. Here we implement the memoize pattern to be able to reuse the regex and reduce the memory consumption. --- go.mod | 1 + go.sum | 1 + internal/corazawaf/rule.go | 14 +++- internal/memoize/cache.go | 40 +++++++++ internal/memoize/cache_test.go | 28 +++++++ internal/memoize/memoize.go | 45 ++++++++++ internal/memoize/memoize_test.go | 128 +++++++++++++++++++++++++++++ internal/memoize/noop.go | 10 +++ internal/operators/restpath.go | 6 +- internal/operators/rx.go | 11 ++- internal/operators/validate_nid.go | 7 +- internal/seclang/directives.go | 11 ++- 12 files changed, 287 insertions(+), 15 deletions(-) create mode 100644 internal/memoize/cache.go create mode 100644 internal/memoize/cache_test.go create mode 100644 internal/memoize/memoize.go create mode 100644 internal/memoize/memoize_test.go create mode 100644 internal/memoize/noop.go diff --git a/go.mod b/go.mod index c779790a6..59da50095 100644 --- a/go.mod +++ b/go.mod @@ -24,6 +24,7 @@ require ( github.com/petar-dambovaliev/aho-corasick v0.0.0-20211021192214-5ab2d9280aa9 github.com/tidwall/gjson v1.14.4 golang.org/x/net v0.11.0 + golang.org/x/sync v0.1.0 rsc.io/binaryregexp v0.2.0 ) diff --git a/go.sum b/go.sum index bf2237977..12f51696a 100644 --- a/go.sum +++ b/go.sum @@ -37,6 +37,7 @@ golang.org/x/net v0.11.0/go.mod h1:2L/ixqYpgIVXmeoSA/4Lu7BzTG4KIyPIryS4IsOd1oQ= golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0 h1:wsuoTGHzEhffawBOhz5CYhcrV4IdKZbEyZjBMuTp12o= +golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20190922100055-0a153f010e69/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= diff --git a/internal/corazawaf/rule.go b/internal/corazawaf/rule.go index 7616ccaca..e67e6fb02 100644 --- a/internal/corazawaf/rule.go +++ b/internal/corazawaf/rule.go @@ -15,6 +15,7 @@ import ( "github.com/corazawaf/coraza/v3/experimental/plugins/macro" "github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes" "github.com/corazawaf/coraza/v3/internal/corazarules" + "github.com/corazawaf/coraza/v3/internal/memoize" "github.com/corazawaf/coraza/v3/types" "github.com/corazawaf/coraza/v3/types/variables" ) @@ -456,7 +457,12 @@ func (r *Rule) AddVariable(v variables.RuleVariable, key string, iscount bool) e var re *regexp.Regexp if len(key) > 2 && key[0] == '/' && key[len(key)-1] == '/' { key = key[1 : len(key)-1] - re = regexp.MustCompile(key) + + if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil { + panic(err) + } else { + re = vare.(*regexp.Regexp) + } } if multiphaseEvaluation { @@ -521,7 +527,11 @@ func (r *Rule) AddVariableNegation(v variables.RuleVariable, key string) error { var re *regexp.Regexp if len(key) > 2 && key[0] == '/' && key[len(key)-1] == '/' { key = key[1 : len(key)-1] - re = regexp.MustCompile(key) + if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil { + panic(err) + } else { + re = vare.(*regexp.Regexp) + } } // Prevent sigsev if r == nil { diff --git a/internal/memoize/cache.go b/internal/memoize/cache.go new file mode 100644 index 000000000..030509632 --- /dev/null +++ b/internal/memoize/cache.go @@ -0,0 +1,40 @@ +// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build !tinygo + +// Highly inspired in https://github.com/patrickmn/go-cache/blob/master/cache.go + +package memoize + +import ( + "sync" +) + +type cache struct { + mu sync.RWMutex + entries map[string]interface{} +} + +func newCache() *cache { + return &cache{ + entries: make(map[string]interface{}), + } +} + +func (c *cache) set(key string, value interface{}) { + c.mu.Lock() + c.entries[key] = value + c.mu.Unlock() +} + +func (c *cache) get(key string) (interface{}, bool) { + c.mu.RLock() + item, found := c.entries[key] + if !found { + c.mu.RUnlock() + return nil, false + } + c.mu.RUnlock() + return item, true +} diff --git a/internal/memoize/cache_test.go b/internal/memoize/cache_test.go new file mode 100644 index 000000000..9eec1fa88 --- /dev/null +++ b/internal/memoize/cache_test.go @@ -0,0 +1,28 @@ +// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build !tinygo + +package memoize + +import "testing" + +func TestCache(t *testing.T) { + tc := newCache() + + _, found := tc.get("key1") + if want, have := false, found; want != have { + t.Fatalf("unexpected value, want %t, have %t", want, have) + } + + tc.set("key1", 1) + + item, found := tc.get("key1") + if want, have := true, found; want != have { + t.Fatalf("unexpected value, want %t, have %t", want, have) + } + + if want, have := 1, item.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } +} diff --git a/internal/memoize/memoize.go b/internal/memoize/memoize.go new file mode 100644 index 000000000..da90083f6 --- /dev/null +++ b/internal/memoize/memoize.go @@ -0,0 +1,45 @@ +// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build !tinygo + +// https://github.com/kofalt/go-memoize/blob/master/memoize.go + +package memoize + +import ( + "golang.org/x/sync/singleflight" +) + +var doer = makeDoer(newCache(), &singleflight.Group{}) + +// Do executes and returns the results of the given function, unless there was a cached +// value of the same key. Only one execution is in-flight for a given key at a time. +// The boolean return value indicates whether v was previously stored. +func Do(key string, fn func() (interface{}, error)) (interface{}, error) { + value, err, _ := doer(key, fn) + return value, err +} + +// makeDoer returns a function that executes and returns the results of the given function +func makeDoer(cache *cache, group *singleflight.Group) func(string, func() (interface{}, error)) (interface{}, error, bool) { + return func(key string, fn func() (interface{}, error)) (interface{}, error, bool) { + // Check cache + value, found := cache.get(key) + if found { + return value, nil, true + } + + // Combine memoized function with a cache store + value, err, _ := group.Do(key, func() (interface{}, error) { + data, innerErr := fn() + if innerErr == nil { + cache.set(key, data) + } + + return data, innerErr + }) + + return value, err, false + } +} diff --git a/internal/memoize/memoize_test.go b/internal/memoize/memoize_test.go new file mode 100644 index 000000000..d5c0a480b --- /dev/null +++ b/internal/memoize/memoize_test.go @@ -0,0 +1,128 @@ +// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build !tinygo + +// https://github.com/kofalt/go-memoize/blob/master/memoize.go + +package memoize + +import ( + "errors" + "testing" + + "golang.org/x/sync/singleflight" +) + +func TestSuccessCall(t *testing.T) { + do := makeDoer(newCache(), &singleflight.Group{}) + + expensiveCalls := 0 + + // Function tracks how many times its been called + expensive := func() (interface{}, error) { + expensiveCalls++ + return expensiveCalls, nil + } + + // First call SHOULD NOT be cached + result, err, cached := do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // Second call on same key SHOULD be cached + result, err, cached = do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := true, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // First call on a new key SHOULD NOT be cached + result, err, cached = do("key2", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } +} + +func TestFailedCall(t *testing.T) { + do := makeDoer(newCache(), &singleflight.Group{}) + + calls := 0 + + // This function will fail IFF it has not been called before. + twoForTheMoney := func() (interface{}, error) { + calls++ + + if calls == 1 { + return calls, errors.New("Try again") + } else { + return calls, nil + } + } + + // First call should fail, and not be cached + result, err, cached := do("key1", twoForTheMoney) + if err == nil { + t.Fatalf("expected error") + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // Second call should succeed, and not be cached + result, err, cached = do("key1", twoForTheMoney) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // Third call should succeed, and be cached + result, err, cached = do("key1", twoForTheMoney) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := true, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } +} diff --git a/internal/memoize/noop.go b/internal/memoize/noop.go new file mode 100644 index 000000000..f2e7a6063 --- /dev/null +++ b/internal/memoize/noop.go @@ -0,0 +1,10 @@ +// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build tinygo + +package memoize + +func Do(_ string, fn func() (interface{}, error)) (interface{}, error) { + return fn() +} diff --git a/internal/operators/restpath.go b/internal/operators/restpath.go index 526cb35bf..f1e4a8911 100644 --- a/internal/operators/restpath.go +++ b/internal/operators/restpath.go @@ -11,6 +11,7 @@ import ( "strings" "github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes" + "github.com/corazawaf/coraza/v3/internal/memoize" ) var rePathTokenRe = regexp.MustCompile(`\{([^\}]+)\}`) @@ -30,11 +31,12 @@ func newRESTPath(options plugintypes.OperatorOptions) (plugintypes.Operator, err for _, token := range rePathTokenRe.FindAllStringSubmatch(data, -1) { data = strings.Replace(data, token[0], fmt.Sprintf("(?P<%s>.*)", token[1]), 1) } - re, err := regexp.Compile(data) + + re, err := memoize.Do(data, func() (interface{}, error) { return regexp.Compile(data) }) if err != nil { return nil, err } - return &restpath{re: re}, nil + return &restpath{re: re.(*regexp.Regexp)}, nil } func (o *restpath) Evaluate(tx plugintypes.TransactionState, value string) bool { diff --git a/internal/operators/rx.go b/internal/operators/rx.go index 147890e1c..e801c9f72 100644 --- a/internal/operators/rx.go +++ b/internal/operators/rx.go @@ -14,6 +14,7 @@ import ( "rsc.io/binaryregexp" "github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes" + "github.com/corazawaf/coraza/v3/internal/memoize" ) type rx struct { @@ -35,15 +36,14 @@ func newRX(options plugintypes.OperatorOptions) (plugintypes.Operator, error) { return newBinaryRX(options) } - re, err := regexp.Compile(data) + re, err := memoize.Do(data, func() (interface{}, error) { return regexp.Compile(data) }) if err != nil { return nil, err } - return &rx{re: re}, nil + return &rx{re: re.(*regexp.Regexp)}, nil } func (o *rx) Evaluate(tx plugintypes.TransactionState, value string) bool { - if tx.Capturing() { match := o.re.FindStringSubmatch(value) if len(match) == 0 { @@ -72,15 +72,14 @@ var _ plugintypes.Operator = (*binaryRX)(nil) func newBinaryRX(options plugintypes.OperatorOptions) (plugintypes.Operator, error) { data := options.Arguments - re, err := binaryregexp.Compile(data) + re, err := memoize.Do(data, func() (interface{}, error) { return binaryregexp.Compile(data) }) if err != nil { return nil, err } - return &binaryRX{re: re}, nil + return &binaryRX{re: re.(*binaryregexp.Regexp)}, nil } func (o *binaryRX) Evaluate(tx plugintypes.TransactionState, value string) bool { - if tx.Capturing() { match := o.re.FindStringSubmatch(value) if len(match) == 0 { diff --git a/internal/operators/validate_nid.go b/internal/operators/validate_nid.go index ce6192a2b..383f160b6 100644 --- a/internal/operators/validate_nid.go +++ b/internal/operators/validate_nid.go @@ -12,6 +12,7 @@ import ( "strings" "github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes" + "github.com/corazawaf/coraza/v3/internal/memoize" ) type validateNidFunction = func(input string) bool @@ -39,11 +40,13 @@ func newValidateNID(options plugintypes.OperatorOptions) (plugintypes.Operator, default: return nil, fmt.Errorf("invalid @validateNid argument") } - re, err := regexp.Compile(expr) + + re, err := memoize.Do(expr, func() (interface{}, error) { return regexp.Compile(expr) }) if err != nil { return nil, err } - return &validateNid{fn: fn, re: re}, nil + + return &validateNid{fn: fn, re: re.(*regexp.Regexp)}, nil } func (o *validateNid) Evaluate(tx plugintypes.TransactionState, value string) bool { diff --git a/internal/seclang/directives.go b/internal/seclang/directives.go index 8b0628f0a..7a158c349 100644 --- a/internal/seclang/directives.go +++ b/internal/seclang/directives.go @@ -16,6 +16,7 @@ import ( "github.com/corazawaf/coraza/v3/debuglog" "github.com/corazawaf/coraza/v3/internal/auditlog" "github.com/corazawaf/coraza/v3/internal/corazawaf" + "github.com/corazawaf/coraza/v3/internal/memoize" utils "github.com/corazawaf/coraza/v3/internal/strings" "github.com/corazawaf/coraza/v3/types" ) @@ -731,9 +732,13 @@ func directiveSecAuditLogRelevantStatus(options *DirectiveOptions) error { return errEmptyOptions } - var err error - options.WAF.AuditLogRelevantStatus, err = regexp.Compile(options.Opts) - return err + re, err := memoize.Do(options.Opts, func() (interface{}, error) { return regexp.Compile(options.Opts) }) + if err != nil { + return err + } + + options.WAF.AuditLogRelevantStatus = re.(*regexp.Regexp) + return nil } // Description: Defines which parts of each transaction are going to be recorded From 23382ac1a0ad21b17735fa7d17612f7fb0aa08ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Ch=C3=A1vez?= Date: Wed, 5 Jul 2023 22:27:59 +0200 Subject: [PATCH 2/7] docs: adds comments to code. --- internal/memoize/cache.go | 23 ++++++++++++++++++++--- internal/memoize/cache_test.go | 9 +++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/internal/memoize/cache.go b/internal/memoize/cache.go index 030509632..1ff5a8ae6 100644 --- a/internal/memoize/cache.go +++ b/internal/memoize/cache.go @@ -11,23 +11,32 @@ import ( "sync" ) +// Cache is a simple in-memory key-value store. type cache struct { - mu sync.RWMutex - entries map[string]interface{} + mu sync.RWMutex + isClosed bool + entries map[string]interface{} } +// newCache returns a new cache. func newCache() *cache { return &cache{ entries: make(map[string]interface{}), } } +// set the value for the given key. func (c *cache) set(key string, value interface{}) { c.mu.Lock() + defer c.mu.Unlock() + + if c.isClosed { + return + } c.entries[key] = value - c.mu.Unlock() } +// get the value for the given key. func (c *cache) get(key string) (interface{}, bool) { c.mu.RLock() item, found := c.entries[key] @@ -38,3 +47,11 @@ func (c *cache) get(key string) (interface{}, bool) { c.mu.RUnlock() return item, true } + +// Close the cache. +func (c *cache) Close() { + c.mu.Lock() + c.isClosed = true + c.entries = nil + c.mu.Unlock() +} diff --git a/internal/memoize/cache_test.go b/internal/memoize/cache_test.go index 9eec1fa88..0019f6646 100644 --- a/internal/memoize/cache_test.go +++ b/internal/memoize/cache_test.go @@ -25,4 +25,13 @@ func TestCache(t *testing.T) { if want, have := 1, item.(int); want != have { t.Fatalf("unexpected value, want %d, have %d", want, have) } + + tc.Close() + + tc.set("key1", 1) + + _, found = tc.get("key1") + if want, have := false, found; want != have { + t.Fatalf("unexpected value, want %t, have %t", want, have) + } } From be2fc241da34320c452d876d5d7b47a70c651d8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Ch=C3=A1vez?= Date: Thu, 6 Jul 2023 10:05:14 +0200 Subject: [PATCH 3/7] chore: simplify the memoize package by using sync.Map. --- internal/memoize/README.md | 11 ++++++ internal/memoize/cache.go | 57 -------------------------------- internal/memoize/cache_test.go | 37 --------------------- internal/memoize/memoize.go | 12 ++++--- internal/memoize/memoize_test.go | 47 ++++++++++++++++++++++++-- internal/memoize/noop.go | 2 +- magefile.go | 10 ++++++ 7 files changed, 73 insertions(+), 103 deletions(-) create mode 100644 internal/memoize/README.md delete mode 100644 internal/memoize/cache.go delete mode 100644 internal/memoize/cache_test.go diff --git a/internal/memoize/README.md b/internal/memoize/README.md new file mode 100644 index 000000000..1e593efa9 --- /dev/null +++ b/internal/memoize/README.md @@ -0,0 +1,11 @@ +# Memoize + +Memoize allows to cache certain expensive function calls and +cache the result. The main advantage in Coraza is to memoize +the regexes when the connects spins up more than one WAF in +the same process and hence same regexes are being compiled +over and over. + +Currently it is opt-in under the `memoize_regex` build tag +as under a misuse it could lead to a memory leak as currently +the cache is global. diff --git a/internal/memoize/cache.go b/internal/memoize/cache.go deleted file mode 100644 index 1ff5a8ae6..000000000 --- a/internal/memoize/cache.go +++ /dev/null @@ -1,57 +0,0 @@ -// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors -// SPDX-License-Identifier: Apache-2.0 - -//go:build !tinygo - -// Highly inspired in https://github.com/patrickmn/go-cache/blob/master/cache.go - -package memoize - -import ( - "sync" -) - -// Cache is a simple in-memory key-value store. -type cache struct { - mu sync.RWMutex - isClosed bool - entries map[string]interface{} -} - -// newCache returns a new cache. -func newCache() *cache { - return &cache{ - entries: make(map[string]interface{}), - } -} - -// set the value for the given key. -func (c *cache) set(key string, value interface{}) { - c.mu.Lock() - defer c.mu.Unlock() - - if c.isClosed { - return - } - c.entries[key] = value -} - -// get the value for the given key. -func (c *cache) get(key string) (interface{}, bool) { - c.mu.RLock() - item, found := c.entries[key] - if !found { - c.mu.RUnlock() - return nil, false - } - c.mu.RUnlock() - return item, true -} - -// Close the cache. -func (c *cache) Close() { - c.mu.Lock() - c.isClosed = true - c.entries = nil - c.mu.Unlock() -} diff --git a/internal/memoize/cache_test.go b/internal/memoize/cache_test.go deleted file mode 100644 index 0019f6646..000000000 --- a/internal/memoize/cache_test.go +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors -// SPDX-License-Identifier: Apache-2.0 - -//go:build !tinygo - -package memoize - -import "testing" - -func TestCache(t *testing.T) { - tc := newCache() - - _, found := tc.get("key1") - if want, have := false, found; want != have { - t.Fatalf("unexpected value, want %t, have %t", want, have) - } - - tc.set("key1", 1) - - item, found := tc.get("key1") - if want, have := true, found; want != have { - t.Fatalf("unexpected value, want %t, have %t", want, have) - } - - if want, have := 1, item.(int); want != have { - t.Fatalf("unexpected value, want %d, have %d", want, have) - } - - tc.Close() - - tc.set("key1", 1) - - _, found = tc.get("key1") - if want, have := false, found; want != have { - t.Fatalf("unexpected value, want %t, have %t", want, have) - } -} diff --git a/internal/memoize/memoize.go b/internal/memoize/memoize.go index da90083f6..426d683cd 100644 --- a/internal/memoize/memoize.go +++ b/internal/memoize/memoize.go @@ -1,17 +1,19 @@ // Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors // SPDX-License-Identifier: Apache-2.0 -//go:build !tinygo +//go:build !tinygo && memoize_regex // https://github.com/kofalt/go-memoize/blob/master/memoize.go package memoize import ( + "sync" + "golang.org/x/sync/singleflight" ) -var doer = makeDoer(newCache(), &singleflight.Group{}) +var doer = makeDoer(new(sync.Map), new(singleflight.Group)) // Do executes and returns the results of the given function, unless there was a cached // value of the same key. Only one execution is in-flight for a given key at a time. @@ -22,10 +24,10 @@ func Do(key string, fn func() (interface{}, error)) (interface{}, error) { } // makeDoer returns a function that executes and returns the results of the given function -func makeDoer(cache *cache, group *singleflight.Group) func(string, func() (interface{}, error)) (interface{}, error, bool) { +func makeDoer(cache *sync.Map, group *singleflight.Group) func(string, func() (interface{}, error)) (interface{}, error, bool) { return func(key string, fn func() (interface{}, error)) (interface{}, error, bool) { // Check cache - value, found := cache.get(key) + value, found := cache.Load(key) if found { return value, nil, true } @@ -34,7 +36,7 @@ func makeDoer(cache *cache, group *singleflight.Group) func(string, func() (inte value, err, _ := group.Do(key, func() (interface{}, error) { data, innerErr := fn() if innerErr == nil { - cache.set(key, data) + cache.Store(key, data) } return data, innerErr diff --git a/internal/memoize/memoize_test.go b/internal/memoize/memoize_test.go index d5c0a480b..3f89c9d24 100644 --- a/internal/memoize/memoize_test.go +++ b/internal/memoize/memoize_test.go @@ -1,7 +1,7 @@ // Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors // SPDX-License-Identifier: Apache-2.0 -//go:build !tinygo +//go:build !tinygo && memoize_regex // https://github.com/kofalt/go-memoize/blob/master/memoize.go @@ -9,13 +9,54 @@ package memoize import ( "errors" + "sync" "testing" "golang.org/x/sync/singleflight" ) +func TestDo(t *testing.T) { + expensiveCalls := 0 + + // Function tracks how many times its been called + expensive := func() (interface{}, error) { + expensiveCalls++ + return expensiveCalls, nil + } + + // First call SHOULD NOT be cached + result, err := Do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + // Second call on same key SHOULD be cached + result, err = Do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + // First call on a new key SHOULD NOT be cached + result, err = Do("key2", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } +} + func TestSuccessCall(t *testing.T) { - do := makeDoer(newCache(), &singleflight.Group{}) + do := makeDoer(new(sync.Map), &singleflight.Group{}) expensiveCalls := 0 @@ -69,7 +110,7 @@ func TestSuccessCall(t *testing.T) { } func TestFailedCall(t *testing.T) { - do := makeDoer(newCache(), &singleflight.Group{}) + do := makeDoer(new(sync.Map), &singleflight.Group{}) calls := 0 diff --git a/internal/memoize/noop.go b/internal/memoize/noop.go index f2e7a6063..fd1e13184 100644 --- a/internal/memoize/noop.go +++ b/internal/memoize/noop.go @@ -1,7 +1,7 @@ // Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors // SPDX-License-Identifier: Apache-2.0 -//go:build tinygo +//go:build tinygo || !memoize_regex package memoize diff --git a/magefile.go b/magefile.go index 11c2c84e7..ca601846c 100644 --- a/magefile.go +++ b/magefile.go @@ -110,12 +110,22 @@ func Test() error { if err := sh.RunV("go", "test", "./..."); err != nil { return err } + + if err := sh.RunV("go", "test", "-tags=memoize_regex", "./..."); err != nil { + return err + } + if err := sh.RunV("go", "test", "./examples/http-server"); err != nil { return err } + if err := sh.RunV("go", "test", "./testing/coreruleset"); err != nil { return err } + + if err := sh.RunV("go", "test", "-tags=memoize_regex", "./testing/coreruleset"); err != nil { + return err + } // Execute FTW tests with multiphase evaluation enabled as well if err := sh.RunV("go", "test", "-tags=coraza.rule.multiphase_evaluation", "./testing/coreruleset"); err != nil { return err From 73be18c6eaf528da36b272143a9e3e63872086cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Ch=C3=A1vez?= Date: Sat, 29 Jul 2023 22:55:17 +0200 Subject: [PATCH 4/7] feat: extends memoize to ahocorasick and allow impl for tinygo but not synced as no concurrency. --- internal/memoize/README.md | 2 +- internal/memoize/noop.go | 2 +- internal/memoize/nosync.go | 34 ++++ internal/memoize/nosync_test.go | 167 ++++++++++++++++++ internal/memoize/{memoize.go => sync.go} | 2 +- .../memoize/{memoize_test.go => sync_test.go} | 2 +- internal/operators/from_file.go | 10 +- internal/operators/from_file_test.go | 2 +- internal/operators/pm.go | 4 +- internal/operators/pm_from_dataset.go | 5 +- internal/operators/pm_from_file.go | 9 +- magefile.go | 5 +- 12 files changed, 227 insertions(+), 17 deletions(-) create mode 100644 internal/memoize/nosync.go create mode 100644 internal/memoize/nosync_test.go rename internal/memoize/{memoize.go => sync.go} (97%) rename internal/memoize/{memoize_test.go => sync_test.go} (99%) diff --git a/internal/memoize/README.md b/internal/memoize/README.md index 1e593efa9..5c6c04aef 100644 --- a/internal/memoize/README.md +++ b/internal/memoize/README.md @@ -6,6 +6,6 @@ the regexes when the connects spins up more than one WAF in the same process and hence same regexes are being compiled over and over. -Currently it is opt-in under the `memoize_regex` build tag +Currently it is opt-in under the `memoize_builders` build tag as under a misuse it could lead to a memory leak as currently the cache is global. diff --git a/internal/memoize/noop.go b/internal/memoize/noop.go index fd1e13184..959408aff 100644 --- a/internal/memoize/noop.go +++ b/internal/memoize/noop.go @@ -1,7 +1,7 @@ // Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors // SPDX-License-Identifier: Apache-2.0 -//go:build tinygo || !memoize_regex +//go:build !memoize_builders package memoize diff --git a/internal/memoize/nosync.go b/internal/memoize/nosync.go new file mode 100644 index 000000000..5680de8a1 --- /dev/null +++ b/internal/memoize/nosync.go @@ -0,0 +1,34 @@ +// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build tinygo && memoize_builders + +package memoize + +var doer = makeDoer(new(sync.Map)) + +// Do executes and returns the results of the given function, unless there was a cached +// value of the same key. Only one execution is in-flight for a given key at a time. +// The boolean return value indicates whether v was previously stored. +func Do(key string, fn func() (interface{}, error)) (interface{}, error) { + value, err, _ := doer(key, fn) + return value, err +} + +// makeDoer returns a function that executes and returns the results of the given function +func makeDoer(cache *sync.Map) func(string, func() (interface{}, error)) (interface{}, error, bool) { + return func(key string, fn func() (interface{}, error)) (interface{}, error, bool) { + // Check cache + value, found := cache.Load(key) + if found { + return value, nil, true + } + + data, err := fn() + if err == nil { + cache.Store(key, data) + } + + return data, err, false + } +} diff --git a/internal/memoize/nosync_test.go b/internal/memoize/nosync_test.go new file mode 100644 index 000000000..c942a522c --- /dev/null +++ b/internal/memoize/nosync_test.go @@ -0,0 +1,167 @@ +// Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors +// SPDX-License-Identifier: Apache-2.0 + +//go:build tinygo && memoize_builders + +// https://github.com/kofalt/go-memoize/blob/master/memoize.go + +package memoize + +import ( + "errors" + "sync" + "testing" +) + +func TestDo(t *testing.T) { + expensiveCalls := 0 + + // Function tracks how many times its been called + expensive := func() (interface{}, error) { + expensiveCalls++ + return expensiveCalls, nil + } + + // First call SHOULD NOT be cached + result, err := Do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + // Second call on same key SHOULD be cached + result, err = Do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + // First call on a new key SHOULD NOT be cached + result, err = Do("key2", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } +} + +func TestSuccessCall(t *testing.T) { + do := makeDoer(new(sync.Map)) + + expensiveCalls := 0 + + // Function tracks how many times its been called + expensive := func() (interface{}, error) { + expensiveCalls++ + return expensiveCalls, nil + } + + // First call SHOULD NOT be cached + result, err, cached := do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // Second call on same key SHOULD be cached + result, err, cached = do("key1", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := true, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // First call on a new key SHOULD NOT be cached + result, err, cached = do("key2", expensive) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } +} + +func TestFailedCall(t *testing.T) { + do := makeDoer(new(sync.Map)) + + calls := 0 + + // This function will fail IFF it has not been called before. + twoForTheMoney := func() (interface{}, error) { + calls++ + + if calls == 1 { + return calls, errors.New("Try again") + } else { + return calls, nil + } + } + + // First call should fail, and not be cached + result, err, cached := do("key1", twoForTheMoney) + if err == nil { + t.Fatalf("expected error") + } + + if want, have := 1, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // Second call should succeed, and not be cached + result, err, cached = do("key1", twoForTheMoney) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := false, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } + + // Third call should succeed, and be cached + result, err, cached = do("key1", twoForTheMoney) + if err != nil { + t.Fatalf("unexpected error: %s", err.Error()) + } + + if want, have := 2, result.(int); want != have { + t.Fatalf("unexpected value, want %d, have %d", want, have) + } + + if want, have := true, cached; want != have { + t.Fatalf("unexpected caching, want %t, have %t", want, have) + } +} diff --git a/internal/memoize/memoize.go b/internal/memoize/sync.go similarity index 97% rename from internal/memoize/memoize.go rename to internal/memoize/sync.go index 426d683cd..a8d5c9610 100644 --- a/internal/memoize/memoize.go +++ b/internal/memoize/sync.go @@ -1,7 +1,7 @@ // Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors // SPDX-License-Identifier: Apache-2.0 -//go:build !tinygo && memoize_regex +//go:build !tinygo && memoize_builders // https://github.com/kofalt/go-memoize/blob/master/memoize.go diff --git a/internal/memoize/memoize_test.go b/internal/memoize/sync_test.go similarity index 99% rename from internal/memoize/memoize_test.go rename to internal/memoize/sync_test.go index 3f89c9d24..d995d1d41 100644 --- a/internal/memoize/memoize_test.go +++ b/internal/memoize/sync_test.go @@ -1,7 +1,7 @@ // Copyright 2023 Juan Pablo Tosso and the OWASP Coraza contributors // SPDX-License-Identifier: Apache-2.0 -//go:build !tinygo && memoize_regex +//go:build !tinygo && memoize_builders // https://github.com/kofalt/go-memoize/blob/master/memoize.go diff --git a/internal/operators/from_file.go b/internal/operators/from_file.go index cc4a7bacd..4d011c1af 100644 --- a/internal/operators/from_file.go +++ b/internal/operators/from_file.go @@ -10,15 +10,15 @@ import ( "path" ) -var errEmptyPaths = errors.New("empty paths") +var errEmptyDirs = errors.New("empty dirs") -func loadFromFile(filepath string, paths []string, root fs.FS) ([]byte, error) { +func loadFromFile(filepath string, dirs []string, root fs.FS) ([]byte, error) { if path.IsAbs(filepath) { return fs.ReadFile(root, filepath) } - if len(paths) == 0 { - return nil, errEmptyPaths + if len(dirs) == 0 { + return nil, errEmptyDirs } // handling files by operators is hard because we must know the paths where we can @@ -29,7 +29,7 @@ func loadFromFile(filepath string, paths []string, root fs.FS) ([]byte, error) { err error ) - for _, p := range paths { + for _, p := range dirs { absFilepath := path.Join(p, filepath) content, err = fs.ReadFile(root, absFilepath) if err != nil { diff --git a/internal/operators/from_file_test.go b/internal/operators/from_file_test.go index 228f3983b..b5c642725 100644 --- a/internal/operators/from_file_test.go +++ b/internal/operators/from_file_test.go @@ -29,7 +29,7 @@ func getTestFile(t *testing.T) (string, string) { func TestLoadFromFileNoPaths(t *testing.T) { _, err := loadFromFile("non-existing-file", nil, io.OSFS{}) if err == nil { - t.Errorf("expected error: %s", errEmptyPaths.Error()) + t.Errorf("expected error: %s", errEmptyDirs.Error()) } } diff --git a/internal/operators/pm.go b/internal/operators/pm.go index df7ada7ee..8c78cc35b 100644 --- a/internal/operators/pm.go +++ b/internal/operators/pm.go @@ -11,6 +11,7 @@ import ( ahocorasick "github.com/petar-dambovaliev/aho-corasick" "github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes" + "github.com/corazawaf/coraza/v3/internal/memoize" ) type pm struct { @@ -31,8 +32,9 @@ func newPM(options plugintypes.OperatorOptions) (plugintypes.Operator, error) { DFA: true, }) + m, _ := memoize.Do(data, func() (interface{}, error) { return builder.Build(dict), nil }) // TODO this operator is supposed to support snort data syntax: "@pm A|42|C|44|F" - return &pm{matcher: builder.Build(dict)}, nil + return &pm{matcher: m.(ahocorasick.AhoCorasick)}, nil } func (o *pm) Evaluate(tx plugintypes.TransactionState, value string) bool { diff --git a/internal/operators/pm_from_dataset.go b/internal/operators/pm_from_dataset.go index 9e3cc5385..fc4d57594 100644 --- a/internal/operators/pm_from_dataset.go +++ b/internal/operators/pm_from_dataset.go @@ -11,6 +11,7 @@ import ( ahocorasick "github.com/petar-dambovaliev/aho-corasick" "github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes" + "github.com/corazawaf/coraza/v3/internal/memoize" ) func newPMFromDataset(options plugintypes.OperatorOptions) (plugintypes.Operator, error) { @@ -26,7 +27,9 @@ func newPMFromDataset(options plugintypes.OperatorOptions) (plugintypes.Operator DFA: true, }) - return &pm{matcher: builder.Build(dataset)}, nil + m, _ := memoize.Do(data, func() (interface{}, error) { return builder.Build(dataset), nil }) + + return &pm{matcher: m.(ahocorasick.AhoCorasick)}, nil } func init() { diff --git a/internal/operators/pm_from_file.go b/internal/operators/pm_from_file.go index d15b934e0..035a448e4 100644 --- a/internal/operators/pm_from_file.go +++ b/internal/operators/pm_from_file.go @@ -13,12 +13,13 @@ import ( ahocorasick "github.com/petar-dambovaliev/aho-corasick" "github.com/corazawaf/coraza/v3/experimental/plugins/plugintypes" + "github.com/corazawaf/coraza/v3/internal/memoize" ) func newPMFromFile(options plugintypes.OperatorOptions) (plugintypes.Operator, error) { - path := options.Arguments + filepath := options.Arguments - data, err := loadFromFile(path, options.Path, options.Root) + data, err := loadFromFile(filepath, options.Path, options.Root) if err != nil { return nil, err } @@ -44,7 +45,9 @@ func newPMFromFile(options plugintypes.OperatorOptions) (plugintypes.Operator, e DFA: false, }) - return &pm{matcher: builder.Build(lines)}, nil + m, _ := memoize.Do(strings.Join(options.Path, ",")+filepath, func() (interface{}, error) { return builder.Build(lines), nil }) + + return &pm{matcher: m.(ahocorasick.AhoCorasick)}, nil } func init() { diff --git a/magefile.go b/magefile.go index a9a90d0af..960f0e418 100644 --- a/magefile.go +++ b/magefile.go @@ -111,7 +111,7 @@ func Test() error { return err } - if err := sh.RunV("go", "test", "-tags=memoize_regex", "./..."); err != nil { + if err := sh.RunV("go", "test", "-tags=memoize_builders", "./..."); err != nil { return err } @@ -123,9 +123,10 @@ func Test() error { return err } - if err := sh.RunV("go", "test", "-tags=memoize_regex", "./testing/coreruleset"); err != nil { + if err := sh.RunV("go", "test", "-tags=memoize_builders", "./testing/coreruleset"); err != nil { return err } + // Execute FTW tests with multiphase evaluation enabled as well if err := sh.RunV("go", "test", "-tags=coraza.rule.multiphase_evaluation", "./testing/coreruleset"); err != nil { return err From d43a91f31195ece6487035d40c8d673815e8ebbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Ch=C3=A1vez?= Date: Sat, 29 Jul 2023 23:09:59 +0200 Subject: [PATCH 5/7] tests: covers memoize_builders in tinygo. --- .github/workflows/tinygo.yml | 3 +++ internal/corazawaf/rule.go | 4 ++-- internal/memoize/README.md | 10 +++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/tinygo.yml b/.github/workflows/tinygo.yml index 43ba052b9..b714ce1b2 100644 --- a/.github/workflows/tinygo.yml +++ b/.github/workflows/tinygo.yml @@ -47,3 +47,6 @@ jobs: - name: Tests run: tinygo test ./... + + - name: Tests memoize + run: tinygo test -tags=memoize_builders ./... diff --git a/internal/corazawaf/rule.go b/internal/corazawaf/rule.go index e67e6fb02..14016b957 100644 --- a/internal/corazawaf/rule.go +++ b/internal/corazawaf/rule.go @@ -459,7 +459,7 @@ func (r *Rule) AddVariable(v variables.RuleVariable, key string, iscount bool) e key = key[1 : len(key)-1] if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil { - panic(err) + return err } else { re = vare.(*regexp.Regexp) } @@ -528,7 +528,7 @@ func (r *Rule) AddVariableNegation(v variables.RuleVariable, key string) error { if len(key) > 2 && key[0] == '/' && key[len(key)-1] == '/' { key = key[1 : len(key)-1] if vare, err := memoize.Do(key, func() (interface{}, error) { return regexp.Compile(key) }); err != nil { - panic(err) + return err } else { re = vare.(*regexp.Regexp) } diff --git a/internal/memoize/README.md b/internal/memoize/README.md index 5c6c04aef..45733793b 100644 --- a/internal/memoize/README.md +++ b/internal/memoize/README.md @@ -2,10 +2,10 @@ Memoize allows to cache certain expensive function calls and cache the result. The main advantage in Coraza is to memoize -the regexes when the connects spins up more than one WAF in -the same process and hence same regexes are being compiled -over and over. +the regexes and aho-corasick dictionaries when the connects +spins up more than one WAF in the same process and hence same +regexes are being compiled over and over. Currently it is opt-in under the `memoize_builders` build tag -as under a misuse it could lead to a memory leak as currently -the cache is global. +as under a misuse (e.g. using after build time) it could lead +to a memory leak as currently the cache is global. From 3ed990b86911eddfdd86cb732d048b30ba8e24b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Ch=C3=A1vez?= Date: Sat, 5 Aug 2023 22:16:19 +0200 Subject: [PATCH 6/7] chore: fixes nosync for tinygo. --- internal/memoize/nosync.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/internal/memoize/nosync.go b/internal/memoize/nosync.go index 5680de8a1..d238244d9 100644 --- a/internal/memoize/nosync.go +++ b/internal/memoize/nosync.go @@ -5,6 +5,8 @@ package memoize +import "sync" + var doer = makeDoer(new(sync.Map)) // Do executes and returns the results of the given function, unless there was a cached From 8d52562a34700ae64f3ed6fe7c75f6e5a010b05d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Carlos=20Ch=C3=A1vez?= Date: Sat, 5 Aug 2023 22:23:24 +0200 Subject: [PATCH 7/7] docs: updates docs. --- README.md | 3 +++ internal/memoize/README.md | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index e57e5a53f..8edb6ddc6 100644 --- a/README.md +++ b/README.md @@ -106,6 +106,9 @@ have compatibility guarantees across minor versions - use with care. the operator with `plugins.RegisterOperator` to reduce binary size / startup overhead. * `coraza.rule.multiphase_valuation` - enables evaluation of rule variables in the phases that they are ready, not only the phase the rule is defined for. +* `memoize_builders` - enables memoization of builders for regex and aho-corasick +dictionaries to reduce memory consumption in deployments that launch several coraza +instances. For more context check [this issue](https://github.com/corazawaf/coraza-caddy/issues/76) ## E2E Testing diff --git a/internal/memoize/README.md b/internal/memoize/README.md index 45733793b..5ebe09aba 100644 --- a/internal/memoize/README.md +++ b/internal/memoize/README.md @@ -9,3 +9,9 @@ regexes are being compiled over and over. Currently it is opt-in under the `memoize_builders` build tag as under a misuse (e.g. using after build time) it could lead to a memory leak as currently the cache is global. + +**Important:** Connectors with *live reload* functionality (e.g. Caddy) +could lead to memory leaks which might or might not be negligible in +most of the cases as usually config changes in a WAF are about a few +rules, this is old objects will be still alive in memory until the program +stops.