-
Notifications
You must be signed in to change notification settings - Fork 2.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[receiver/filelog] Fix issue where flushed tokens could be truncated (#…
…37596) Fixes #35042 (and #32100 again) The issue affected unterminated logs of particular lengths. Specifically, longer than our internal `scanner.DefaultBufferSize` (16kB) and shorter than `max_log_size`. The failure mode was described in #32100 but was apparently only fixed in some circumstances. I believe this is a more robust fix. I'll articulate the exact failure mode again here: 1. During a poll cycle, `reader.ReadToEnd` is called. Within this, a scanner is created which starts with a default buffer size. The buffer is filled, but no terminator is found. Therefore the scanner resizes the buffer to accommodate more data, hoping to find a terminator. Eventually, the buffer is large enough to contain all content until EOF, but still no terminator was found. At this time, the flush timer has not expired, so `reader.ReadToEnd` returns without emitting anything. 2. During the _next_ poll cycle, `reader.ReadToEnd` creates a new scanner, also with default buffer size. The first time it looks for a terminator, it of course doesn't find one, but at this time the flush timer has expired. Therefore, instead of resizing the buffer and continuing to look for a terminator, it just emits what it has. What should happen instead is that the scanner continues to resize the buffer to find as much of the unterminated token as possible before emitting it. Therefore, this fix introduces a simple layer into the split func stack which allows us to reason about unterminated tokens more carefully. It captures the length of unterminated tokens and ensures that when we recreate a scanner, we will start with a buffer size that is appropriate to read the same content as last time, plus one additional byte. The extra byte allows us to check if new content has been added, in which case we will resume resizing. If no new content is found, the flusher will emit the entire unterminated token as one.
- Loading branch information
1 parent
1438738
commit 68b24ea
Showing
7 changed files
with
282 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Use this changelog template to create an entry for release notes. | ||
|
||
# One of 'breaking', 'deprecation', 'new_component', 'enhancement', 'bug_fix' | ||
change_type: bug_fix | ||
|
||
# The name of the component, or a single word describing the area of concern, (e.g. filelogreceiver) | ||
component: filelogreceiver | ||
|
||
# A brief description of the change. Surround your text with quotes ("") if it needs to start with a backtick (`). | ||
note: Fix issue where flushed tokens could be truncated. | ||
|
||
# Mandatory: One or more tracking issues related to the change. You can use the PR number here if no issue exists. | ||
issues: [35042] | ||
|
||
# (Optional) One or more lines of additional information to render under the primary note. | ||
# These lines will be padded with 2 spaces and then inserted directly into the document. | ||
# Use pipe (|) for multiline entries. | ||
subtext: | ||
|
||
# If your change doesn't affect end users or the exported elements of any package, | ||
# you should instead start your pull request title with [chore] or use the "Skip Changelog" label. | ||
# Optional: The change log or logs in which this entry should be included. | ||
# e.g. '[user]' or '[user, api]' | ||
# Include 'user' if the change is relevant to end users. | ||
# Include 'api' if there is a change to a library API. | ||
# Default: '[user]' | ||
change_logs: [] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Copyright The OpenTelemetry Authors | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package tokenlen // import "github.com/open-telemetry/opentelemetry-collector-contrib/pkg/stanza/tokenlen" | ||
|
||
import "bufio" | ||
|
||
// State tracks the potential length of a token before any terminator checking | ||
type State struct { | ||
MinimumLength int | ||
} | ||
|
||
// Func wraps a bufio.SplitFunc to track potential token lengths | ||
// Records the length of the data before delegating to the wrapped function | ||
func (s *State) Func(splitFunc bufio.SplitFunc) bufio.SplitFunc { | ||
if s == nil { | ||
return splitFunc | ||
} | ||
|
||
return func(data []byte, atEOF bool) (advance int, token []byte, err error) { | ||
// Note the potential token length but don't update state until we know | ||
// whether or not a token is actually returned | ||
potentialLen := len(data) | ||
|
||
advance, token, err = splitFunc(data, atEOF) | ||
if advance == 0 && token == nil && err == nil { | ||
// The splitFunc is asking for more data. Remember how much | ||
// we saw previously so the buffer can be sized appropriately. | ||
s.MinimumLength = potentialLen | ||
} else { | ||
// A token was returned. This state represented that token, so clear it. | ||
s.MinimumLength = 0 | ||
} | ||
return advance, token, err | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
// Copyright The OpenTelemetry Authors | ||
// SPDX-License-Identifier: Apache-2.0 | ||
|
||
package tokenlen | ||
|
||
import ( | ||
"bufio" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestTokenLenState_Func(t *testing.T) { | ||
cases := []struct { | ||
name string | ||
input []byte | ||
atEOF bool | ||
expectedLen int | ||
expectedToken []byte | ||
expectedAdv int | ||
expectedErr error | ||
}{ | ||
{ | ||
name: "no token yet", | ||
input: []byte("partial"), | ||
atEOF: false, | ||
expectedLen: len("partial"), | ||
}, | ||
{ | ||
name: "complete token", | ||
input: []byte("complete\ntoken"), | ||
atEOF: false, | ||
expectedLen: 0, // should clear state after finding token | ||
expectedToken: []byte("complete"), | ||
expectedAdv: len("complete\n"), | ||
}, | ||
{ | ||
name: "growing token", | ||
input: []byte("growing"), | ||
atEOF: false, | ||
expectedLen: len("growing"), | ||
}, | ||
{ | ||
name: "flush at EOF", | ||
input: []byte("flush"), | ||
atEOF: true, | ||
expectedLen: 0, // should clear state after flushing | ||
expectedToken: []byte("flush"), | ||
expectedAdv: len("flush"), | ||
}, | ||
} | ||
|
||
for _, tc := range cases { | ||
t.Run(tc.name, func(t *testing.T) { | ||
state := &State{} | ||
splitFunc := state.Func(bufio.ScanLines) | ||
|
||
adv, token, err := splitFunc(tc.input, tc.atEOF) | ||
require.Equal(t, tc.expectedErr, err) | ||
require.Equal(t, tc.expectedToken, token) | ||
require.Equal(t, tc.expectedAdv, adv) | ||
require.Equal(t, tc.expectedLen, state.MinimumLength) | ||
}) | ||
} | ||
} | ||
|
||
func TestTokenLenState_GrowingToken(t *testing.T) { | ||
state := &State{} | ||
splitFunc := state.Func(bufio.ScanLines) | ||
|
||
// First call with partial token | ||
adv, token, err := splitFunc([]byte("part"), false) | ||
require.NoError(t, err) | ||
require.Nil(t, token) | ||
require.Equal(t, 0, adv) | ||
require.Equal(t, len("part"), state.MinimumLength) | ||
|
||
// Second call with longer partial token | ||
adv, token, err = splitFunc([]byte("partial"), false) | ||
require.NoError(t, err) | ||
require.Nil(t, token) | ||
require.Equal(t, 0, adv) | ||
require.Equal(t, len("partial"), state.MinimumLength) | ||
|
||
// Final call with complete token | ||
adv, token, err = splitFunc([]byte("partial\ntoken"), false) | ||
require.NoError(t, err) | ||
require.Equal(t, []byte("partial"), token) | ||
require.Equal(t, len("partial\n"), adv) | ||
require.Equal(t, 0, state.MinimumLength) // State should be cleared after emitting token | ||
} |