BPF optimizer: Optimize redundant loads.

This is useful in conjunction cases when a single argument is compared against multiple values, e.g. for `fcntl(2)` where `args[1]` may be `F_GETFL`, `F_SETFL`, or `F_GETFD`. There is no need to reload the value of `args[1]` into the register if it is already the same as the one that was loaded from the previous check. This helps with syscall rules where one specific argument may be one of many specific values, especially `ioctl` rules (critical for KVM and `nvproxy` performance), and the benchmarks seem to agree: ``` # Time to evaluate seccomp filter for a syscall: # (Many syscalls for which the rules are not "this argument must be one of # these values" show no change and are omitted from this list) │ before │ after │ │ sec/op │ sec/op vs base │ SentrySystrap/Postgres/futex 62.39n ± 13% 53.06n ± 39% ~ (p=0.184 n=19+20) SentrySystrap/Postgres/nanosleep 237.7n ± 33% 119.3n ± 142% ~ (p=0.089 n=12) [...] SentryKVM/Postgres/ioctl 60.88n ± 17% 56.63n ± 12% -6.98% (p=0.026 n=20+19) SentryKVM/Postgres/rt_sigreturn 34.67n ± 32% 29.33n ± 49% ~ (p=0.443 n=20+18) [...] NVProxyIoctl/nvproxy/ioctl_3222292175 58.81n ± 8% 54.85n ± 8% -6.73% (p=0.015 n=29+30) NVProxyIoctl/nvproxy/ioctl_3221767894 58.22n ± 5% 54.06n ± 6% -7.14% (p=0.022 n=29+30) NVProxyIoctl/nvproxy/ioctl_3222292009 58.33n ± 8% 55.67n ± 3% ~ (p=0.070 n=28+30) NVProxyIoctl/nvproxy/ioctl_3223340586 61.57n ± 7% 58.56n ± 7% ~ (p=0.059 n=27+29) NVProxyIoctl/nvproxy/ioctl_3223340587 63.10n ± 9% 58.78n ± 7% ~ (p=0.140 n=29+30) NVProxyIoctl/nvproxy/ioctl_3223864875 63.66n ± 7% 58.25n ± 3% -8.50% (p=0.030 n=29) NVProxyIoctl/nvproxy/ioctl_3224389163 64.12n ± 5% 60.96n ± 4% ~ (p=0.114 n=29+30) NVProxyIoctl/nvproxy/ioctl_3223078452 63.90n ± 9% 58.25n ± 6% -8.84% (p=0.014 n=29) NVProxyIoctl/nvproxy/ioctl_3222816309 66.07n ± 10% 58.08n ± 5% -12.10% (p=0.003 n=29+30) NVProxyIoctl/nvproxy/ioctl_3233302090 65.55n ± 7% 58.88n ± 8% -10.17% (p=0.000 n=29+30) NVProxyIoctl/nvproxy/ioctl_3224913486 65.62n ± 6% 61.10n ± 7% ~ (p=0.072 n=28+29) NVProxyIoctl/nvproxy/ioctl_3223340623 63.45n ± 7% 59.47n ± 6% -6.26% (p=0.018 n=29+30) NVProxyIoctl/nvproxy/ioctl_3223864926 66.35n ± 8% 59.45n ± 5% -10.41% (p=0.000 n=29+30) NVProxyIoctl/nvproxy/ioctl_805306369 62.98n ± 6% 56.26n ± 7% -10.66% (p=0.006 n=29+30) NVProxyIoctl/nvproxy/ioctl_75 69.09n ± 7% 64.87n ± 8% ~ (p=0.083 n=29+28) NVProxyIoctl/nvproxy/ioctl_805306370 72.30n ± 8% 57.81n ± 7% -20.04% (p=0.000 n=29+30) NVProxyIoctl/nvproxy/ioctl_23 79.59n ± 5% 73.65n ± 4% -7.46% (p=0.001 n=29) NVProxyIoctl/nvproxy/ioctl_24 79.70n ± 4% 73.86n ± 3% -7.33% (p=0.014 n=29+30) NVProxyIoctl/nvproxy/ioctl_25 79.32n ± 4% 77.69n ± 6% ~ (p=0.468 n=29) NVProxyIoctl/nvproxy/ioctl_26 80.15n ± 5% 76.61n ± 6% ~ (p=0.140 n=29+30) NVProxyIoctl/nvproxy/ioctl_27 80.02n ± 5% 75.77n ± 7% -5.31% (p=0.003 n=29+30) NVProxyIoctl/nvproxy/ioctl_28 80.52n ± 6% 76.79n ± 3% -4.63% (p=0.028 n=29+30) NVProxyIoctl/nvproxy/ioctl_33 85.05n ± 8% 78.23n ± 5% -8.02% (p=0.028 n=29+30) NVProxyIoctl/nvproxy/ioctl_34 83.57n ± 7% 78.74n ± 4% -5.77% (p=0.035 n=29+30) NVProxyIoctl/nvproxy/ioctl_37 82.20n ± 6% 79.85n ± 7% ~ (p=0.474 n=29+30) NVProxyIoctl/nvproxy/ioctl_38 85.38n ± 6% 78.23n ± 6% -8.38% (p=0.001 n=29+30) NVProxyIoctl/nvproxy/ioctl_39 85.53n ± 4% 79.87n ± 6% -6.62% (p=0.007 n=29+30) NVProxyIoctl/nvproxy/ioctl_45 86.29n ± 4% 83.78n ± 8% ~ (p=0.176 n=29+30) NVProxyIoctl/nvproxy/ioctl_65 86.89n ± 4% 82.06n ± 5% -5.56% (p=0.028 n=29+30) NVProxyIoctl/nvproxy/ioctl_68 88.80n ± 6% 80.78n ± 7% -9.03% (p=0.000 n=29+30) NVProxyIoctl/nvproxy/ioctl_72 90.13n ± 3% 82.77n ± 6% -8.17% (p=0.005 n=29+30) NVProxyIoctl/nvproxy/ioctl_73 87.23n ± 6% 81.86n ± 4% -6.16% (p=0.039 n=29+30) NVProxyIoctl/nvproxy/ioctl_21506 55.90n ± 7% 56.20n ± 6% ~ (p=0.728 n=28+30) NVProxyIoctl/nvproxy/ioctl_3224913447 57.26n ± 9% 55.15n ± 7% -3.68% (p=0.028 n=27+30) NVProxyIoctl/nvproxy-48 69.44n ± 7% 64.64n ± 6% -6.91% (p=0.002 n=10) geomean 61.27n 56.73n -7.41% # Number of BPF instructions before optimizations: │ before │ after │ │ gen-instr │ gen-instr vs base │ SentrySystrap/Postgres-48 1.487k ± 0% 1.487k ± 0% ~ (p=1.000 n=10) ¹ SentryKVM/Postgres-48 1.345k ± 0% 1.345k ± 0% ~ (p=1.000 n=10) ¹ NVProxyIoctl/nvproxy-48 1.931k ± 0% 1.931k ± 0% ~ (p=1.000 n=10) ¹ geomean 1.569k 1.569k +0.00% ¹ all samples are equal # Number of BPF instructions after optimizations: │ before │ after │ │ opt-instr │ opt-instr vs base │ SentrySystrap/Postgres-48 382.0 ± 0% 355.0 ± 0% -7.07% (p=0.000 n=10) SentryKVM/Postgres-48 322.0 ± 0% 297.0 ± 0% -7.76% (p=0.000 n=10) NVProxyIoctl/nvproxy-48 473.0 ± 0% 409.0 ± 0% -13.53% (p=0.000 n=10) geomean 387.5 350.7 -9.50% # Ratio of post-optimizer #instructions vs pre-optimizer #instructions: │ before │ after │ │ compression-ratio │ compression-ratio vs base │ SentrySystrap/Postgres-48 3.893 ± 0% 4.189 ± 0% +7.60% (p=0.000 n=10) SentryKVM/Postgres-48 4.177 ± 0% 4.529 ± 0% +8.43% (p=0.000 n=10) NVProxyIoctl/nvproxy-48 4.082 ± 0% 4.721 ± 0% +15.65% (p=0.000 n=10) geomean 4.049 4.474 +10.50% # Time to run optimizer: │ before │ after │ │ opt-sec │ opt-sec vs base │ SentrySystrap/Postgres-48 4.699m ± 3% 5.327m ± 4% +13.34% (p=0.000 n=10) SentryKVM/Postgres-48 4.130m ± 3% 4.651m ± 3% +12.60% (p=0.000 n=10) NVProxyIoctl/nvproxy-48 10.19m ± 3% 10.42m ± 1% +2.26% (p=0.023 n=10) geomean 5.826m 6.367m +9.28% ``` Note that this only applies in practice because we extract each 32-bit "half" of each syscall argument into its own matcher rule. So the conjunction that this optimizes for isn't exactly over a single argument, but over a single 32-bit half of an argument. PiperOrigin-RevId: 595563115
google · Jan 4, 2024 · c7e3ea3 · c7e3ea3
1 parent 0bb4cfd
commit c7e3ea3
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 0 deletions.
diff --git a/pkg/bpf/optimizer.go b/pkg/bpf/optimizer.go
@@ -273,6 +273,77 @@ func optimizeJumpsToReturn(insns []Instruction) ([]Instruction, bool) {
 	return insns, changed
 }
 
+// removeRedundantLoads removes some redundant load instructions
+// when the value in register A is already the same value as what is
+// being loaded.
+func removeRedundantLoads(insns []Instruction) ([]Instruction, bool) {
+	// reverseWalk maps instruction indexes I to the set of instruction indexes
+	// that, after their execution, may result in the control flow jumping to I.
+	reverseWalk := make([]map[int]struct{}, len(insns))
+	for pc := range insns {
+		reverseWalk[pc] = make(map[int]struct{})
+	}
+	for pc, ins := range insns {
+		if ins.IsReturn() {
+			continue // Return instructions are terminal.
+		}
+		if ins.IsJump() {
+			for _, offset := range ins.JumpOffsets() {
+				reverseWalk[pc+int(offset.Offset)+1][pc] = struct{}{}
+			}
+			continue
+		}
+		// All other instructions flow through.
+		reverseWalk[pc+1][pc] = struct{}{}
+	}
+
+	// Now look for redundant load instructions.
+	// We iterate backwards here so that we can remove instructions as we go
+	// without a past iteration interfering with the results of the current
+	// iteration. This is relying on the property that BPF programs may only
+	// flow forwards (no backwards jumps).
+	changed := false
+	for pc := len(insns) - 1; pc >= 0; pc-- {
+		ins := insns[pc]
+		if ins.OpCode&instructionClassMask != Ld {
+			continue
+		}
+		// Walk backwards until either we've reached the beginning of the program,
+		// or we've reached an operation which modifies register A.
+		lastModifiedA := -1
+		beforePCs := reverseWalk[pc]
+	walk:
+		for {
+			switch len(beforePCs) {
+			case 0:
+				// We've reached the beginning of the program without modifying A.
+				break walk
+			case 1:
+				var beforePC int
+				for bpc := range beforePCs { // Note: we know that this map only has one element.
+					beforePC = bpc
+				}
+				if !insns[beforePC].ModifiesRegisterA() {
+					beforePCs = reverseWalk[beforePC]
+					continue walk
+				}
+				lastModifiedA = beforePC
+				break walk
+			default:
+				// Multiple ways to get to `pc`.
+				// For simplicity, we only support the single-branch case right now.
+				break walk
+			}
+		}
+		if lastModifiedA != -1 && insns[pc].Equal(insns[lastModifiedA]) {
+			insns = append(insns[:pc], insns[pc+1:]...)
+			decrementJumps(insns, pc)
+			changed = true
+		}
+	}
+	return insns, changed
+}
+
 // jumpRewriteOperation rewrites a jump target.
 type jumpRewriteOperation struct {
 	pc        int      // Rewrite instruction at this offset.
@@ -587,6 +658,7 @@ func Optimize(insns []Instruction) []Instruction {
 		optimizeJumpsToReturn,
 		removeZeroInstructionJumps,
 		removeDeadCode,
+		removeRedundantLoads,
 		optimizeJumpsToSmallestSetOfReturns,
 	})
 }
diff --git a/pkg/bpf/optimizer_test.go b/pkg/bpf/optimizer_test.go
@@ -141,6 +141,50 @@ func TestOptimize(t *testing.T) {
 				Stmt(Ret|K, 0),
 			},
 		},
+		{
+			name: "redundant loads removed",
+			optimizers: []optimizerFunc{
+				removeRedundantLoads,
+			},
+			insns: []Instruction{
+				Stmt(Ld|Imm|W, 42), // Not removed.
+				Jump(Jmp|Jeq|K, 42, 0, 0),
+				Jump(Jmp|Jgt|K, 42, 1, 0),
+				Stmt(Ld|Imm|W, 42), // Removed as it is equal in all cases.
+				Jump(Jmp|Jeq|K, 42, 1, 0),
+				Stmt(Ld|Imm|W, 43),
+				Stmt(Ld|Imm|W, 43),        // Not removed as the jump above may skip over previous instruction.
+				Stmt(Ld|Imm|W, 43),        // Removed.
+				Stmt(Ld|Imm|W, 44),        // Not removed.
+				Stmt(Ld|Imm|W, 44),        // Removed.
+				Stmt(Ld|Abs|W, 0),         // Not removed.
+				Stmt(Ld|Abs|W, 0),         // Removed.
+				Stmt(Ld|Abs|W, 0),         // Removed.
+				Stmt(Ld|Abs|W, 4),         // Not removed.
+				Stmt(Ld|Abs|W, 0),         // Not removed.
+				Stmt(Ld|Abs|W, 11),        // Not removed.
+				Jump(Jmp|Jeq|K, 42, 1, 0), // "True" branch will be culled.
+				Stmt(Ld|Abs|W, 11),        // Removed as there is only one way to get here and it already loads 11.
+				Stmt(Ld|Abs|W, 11),        // There are two branches to get here but the first gets culled, so it is still removed.
+				Stmt(Ld|Abs|W, 11),        // Removed.
+				Stmt(Ret|K, 0),
+			},
+			want: []Instruction{
+				Stmt(Ld|Imm|W, 42),
+				Jump(Jmp|Jeq|K, 42, 0, 0),
+				Jump(Jmp|Jgt|K, 42, 0, 0),
+				Jump(Jmp|Jeq|K, 42, 1, 0),
+				Stmt(Ld|Imm|W, 43),
+				Stmt(Ld|Imm|W, 43),
+				Stmt(Ld|Imm|W, 44),
+				Stmt(Ld|Abs|W, 0),
+				Stmt(Ld|Abs|W, 4),
+				Stmt(Ld|Abs|W, 0),
+				Stmt(Ld|Abs|W, 11),
+				Jump(Jmp|Jeq|K, 42, 0, 0),
+				Stmt(Ret|K, 0),
+			},
+		},
 		{
 			name: "jumps to return",
 			optimizers: []optimizerFunc{