seccomp-bpf: Render syscall rules after binary search tree traversa…

…l code. Structure before: ``` if sysno < current node value: goto left_node if sysno > current node value: goto right_node // Render rules for current sysno here... left_node: // Recursively render left node code here... right_node: // Recursively render right node code here... ``` This is fine, but if the "render rules for current sysno" part is larger enough for any syscall in the BST, this makes the jumps to `left_node` and `right_node` of the current nodes (and all its ancestor nodes) have to use unconditional jumps (i.e. extra instructions) during BST traversal. Since BST traversal must be fast, it is better to keep all the rules for each syscall separate from the BST traversal code. This ensures that the BST traversal code all fits in the maximum unconditional jump size (255 instructions), and then we do just one possibly-unconditional jump to the set of rules for that syscall. Compare that to the previous structure, where multiple jumps during BST traversal could have been unconditional jumps. Structure after: ``` if sysno < current node value: goto left_node if sysno > current node value: goto right_node goto sysno_rules left_node: // Recursively render left node traversal code here... right_node: // Recursively render right node traversal code here... sysno_rules: // Render rules for current sysno here... // Recursively render left node syscall rules code here... // Recursively render right node syscall rules code here... ``` With all the optimizations done in previous CLs, BSTs in practice are actually small enough that both the traversal and and the syscall rules together all fit under 255 instructions, so this only rarely comes into play. However, as we add more syscalls and syscall rules, the effect of this optimization should increase. This actually results in slightly larger bytecode (because most syscall filter rules do fit in just a few instructions, but now they have to be grouped at the end which takes a bigger jump to reach), but *execution* is still faster, because only one unconditional jump is ever done per program execution. Benchmarks expectedly don't show much change, except KVM which is quite happy for some reason: ``` │ before │ after │ │ sec/op │ sec/op vs base │ SentrySystrap/Postgres-48 51.43n ± 20% 50.50n ± 14% ~ (p=0.394 n=93+92) SentryKVM/Postgres-48 48.15n ± 10% 40.19n ± 14% -16.53% (p=0.012 n=96+99) NVProxyIoctl/nvproxy-48 65.75n ± 2% 65.43n ± 2% ~ (p=0.703 n=99+98) geomean 57.84n 57.69n -0.49% ``` (Most other benchmarks including`futex` and such show no change, which makes sense because they're part of the "hot" syscalls which aren't in a BST at all.) PiperOrigin-RevId: 595816325
google · Jan 4, 2024 · 1e61310 · 1e61310
1 parent 9e2db2c
commit 1e61310
Showing 1 changed file with 32 additions and 22 deletions.
diff --git a/pkg/seccomp/seccomp.go b/pkg/seccomp/seccomp.go
@@ -823,18 +823,16 @@ func (ors orderedRuleSets) renderBST(program *syscallProgram, ls *labelSet, alre
 	frag := program.Record()
 	root := createBST(orderedSysnos)
 	root.root = true
-	if err := root.traverse(
-		buildBSTProgram,
-		knownRange{
-			lowerBoundExclusive: -1,
-			// sysno fits in 32 bits, so this is definitely out of bounds:
-			upperBoundExclusive: 1 << 32,
-			previouslyChecked:   alreadyChecked,
-		},
-		syscallMap,
-		program,
-		noSycallNumberMatch,
-	); err != nil {
+	knownRng := knownRange{
+		lowerBoundExclusive: -1,
+		// sysno fits in 32 bits, so this is definitely out of bounds:
+		upperBoundExclusive: 1 << 32,
+		previouslyChecked:   alreadyChecked,
+	}
+	if err := root.traverse(renderBSTTraversal, knownRng, syscallMap, program, noSycallNumberMatch); err != nil {
+		return nil, err
+	}
+	if err := root.traverse(renderBSTRules, knownRng, syscallMap, program, noSycallNumberMatch); err != nil {
 		return nil, err
 	}
 	frag.MustHaveJumpedToOrReturned([]label{noSycallNumberMatch, defaultLabel}, possibleActions)
@@ -858,8 +856,8 @@ func createBST(syscalls []uintptr) *node {
 	return &parent
 }
 
-// buildBSTProgram converts a binary tree started in 'root' into BPF code. The outline of the code
-// is as follows, given a BST with:
+// renderBSTTraversal renders the traversal bytecode for a binary search tree.
+// The outline of the code is as follows, given a BST with:
 //
 //		     22
 //		    /  \
@@ -870,38 +868,41 @@ func createBST(syscalls []uintptr) *node {
 //		index_22: // SYS_PIPE(22), root
 //		(A < 22) ? goto index_9  : continue
 //		(A > 22) ? goto index_24 : continue
-//		(args OK for SYS_PIPE) ? return action : goto defaultLabel
+//		goto checkArgs_22
 //
 //		index_9: // SYS_MMAP(9), single child
 //		(A < 9)  ? goto index_8  : continue
 //		(A == 9) ? continue : goto defaultLabel
-//		(args OK for SYS_MMAP) ? return action : goto defaultLabel
+//		goto checkArgs_9
 //
 //		index_8: // SYS_LSEEK(8), leaf
 //		(A == 8) ? continue : goto defaultLabel
-//		(args OK for SYS_LSEEK) ? return action : goto defaultLabel
+//		goto checkArgs_8
 //
 //		index_24: // SYS_SCHED_YIELD(24)
 //		(A < 24) ? goto index_23 : continue
 //		(A > 22) ? goto index_50 : continue
-//		(args OK for SYS_SCHED_YIELD) ? return action : goto defaultLabel
+//		goto checkArgs_24
 //
 //		index_23: // SYS_SELECT(23), leaf with parent nodes adjacent in value
 //		# Notice that we do not check for equality at all here, since we've
 //		# already established that the syscall number is 23 from the
 //		# two parent nodes that we've already traversed.
 //		# This is tracked in the `rng knownRange` argument during traversal.
-//		(args OK for SYS_SELECT) ? return action : goto defaultLabel
+//		goto rules_23
 //
 //		index_50: // SYS_LISTEN(50), leaf
 //		(A == 50) ? continue : goto defaultLabel
-//		(args OK for SYS_LISTEN) ? return action : goto defaultLabel
-func buildBSTProgram(n *node, rng knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error {
+//		goto checkArgs_50
+//
+// All of the "checkArgs_XYZ" labels are not defined in this function; they
+// are created using the `renderBSTRules` function, which is expected to be
+// called after this one on the entire BST.
+func renderBSTTraversal(n *node, rng knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error {
 	// Root node is never referenced by label, skip it.
 	if !n.root {
 		program.Label(n.label())
 	}
-	nodeLabelSet := &labelSet{prefix: string(n.label())}
 	sysno := n.value
 	nodeFrag := program.Record()
 	checkArgsLabel := label(fmt.Sprintf("checkArgs_%d", sysno))
@@ -921,13 +922,22 @@ func buildBSTProgram(n *node, rng knownRange, syscallMap map[uintptr]singleSysca
 	}
 	program.JumpTo(checkArgsLabel)
 	nodeFrag.MustHaveJumpedTo(n.left.label(), n.right.label(), checkArgsLabel, searchFailed)
+	return nil
+}
 
+// renderBSTRules renders the `checkArgs_XYZ` labels that `renderBSTTraversal`
+// jumps to as part of the BST traversal code. It contains all the
+// argument-specific syscall rules for each syscall number.
+func renderBSTRules(n *node, rng knownRange, syscallMap map[uintptr]singleSyscallRuleSet, program *syscallProgram, searchFailed label) error {
+	sysno := n.value
+	checkArgsLabel := label(fmt.Sprintf("checkArgs_%d", sysno))
 	program.Label(checkArgsLabel)
 	ruleSetsFrag := program.Record()
 	possibleActions := make(map[linux.BPFAction]struct{})
 	for _, ra := range syscallMap[sysno].rules {
 		possibleActions[ra.action] = struct{}{}
 	}
+	nodeLabelSet := &labelSet{prefix: string(n.label())}
 	syscallMap[sysno].Render(program, nodeLabelSet, defaultLabel)
 	ruleSetsFrag.MustHaveJumpedToOrReturned(
 		[]label{