From 96b5a7fe2187f32cfbb9dd7012ac3c4039836c11 Mon Sep 17 00:00:00 2001
From: Charles Oliver Nutter
Date: Tue, 10 Oct 2023 13:43:44 -0500
Subject: [PATCH] Pool a matcher per thread

Matching is a leaf operation, so we can pool a matcher per thread to
avoid the cost of constructing and initializing the entire matcher for
each string.

This patch does eliminate all object creation in typical usage, reusing
all objects and expecting users to pull match data out of them before
using the regex again. However in benchmarks of a regex-heavy
StringScanner-based CSV library (jruby/jruby#7604) the overhead of
fetching and clearing the pooled matcher appears to hurt performance
more than it helps to remove the allocations.
---
Reviewer notes (text between the "---" line and the diff is not part of
the commit message):

* Regex.matchers is declared as ThreadLocal<Matcher>. As a raw
  ThreadLocal, "Matcher matcher = matchers.get()" does not compile,
  because get() then returns Object.
* StackMachine.reset zero-fills repeatStk, which is the state of a newly
  allocated int[], instead of filling every slot with the array length.
  java.util.Arrays is assumed to be imported by StackMachine.java
  already; this patch does not add the import.
* Both edits keep the number of added lines, so the hunk headers and the
  diffstat below are unchanged. The post-image blob ids on the "index"
  lines of Regex.java and StackMachine.java were not recomputed.
* Regex.matcher(bytes, p, end) returns the same Matcher object on every
  call a thread makes on a given Regex, resetting it each time. Callers
  must read the match data before asking that Regex for another matcher
  on the same thread.

 src/org/joni/ByteCodeMachine.java | 14 ++++++++++++++
 src/org/joni/Matcher.java         | 22 ++++++++++++++++++----
 src/org/joni/Regex.java           | 14 +++++++++++++-
 src/org/joni/StackEntry.java      | 10 ++++++++++
 src/org/joni/StackMachine.java    | 19 +++++++++++++++++++
 5 files changed, 74 insertions(+), 5 deletions(-)

diff --git a/src/org/joni/ByteCodeMachine.java b/src/org/joni/ByteCodeMachine.java
index d25ea6d3..535f35a1 100644
--- a/src/org/joni/ByteCodeMachine.java
+++ b/src/org/joni/ByteCodeMachine.java
@@ -57,6 +57,20 @@ class ByteCodeMachine extends StackMachine {
         this.code = regex.code;
     }
 
+    public void reset(byte[]bytes, int p, int end) {
+        super.reset(bytes, p, end);
+
+        interrupted = false;
+        bestLen = 0;
+        s = 0;
+        range = 0;
+        sprev = 0;
+        sstart = 0;
+        sbegin = 0;
+        pkeep = 0;
+        ip = 0;
+    }
+
     @Override
     public void interrupt() {
         interrupted = true;
diff --git a/src/org/joni/Matcher.java b/src/org/joni/Matcher.java
index a2d4d18f..c1fc50bc 100644
--- a/src/org/joni/Matcher.java
+++ b/src/org/joni/Matcher.java
@@ -35,13 +35,13 @@ public abstract class Matcher extends IntHolder {
     protected final Regex regex;
     protected final Encoding enc;
 
-    protected final byte[]bytes;
-    protected final int str;
-    protected final int end;
+    protected byte[]bytes;
+    protected int str;
+    protected int end;
 
     protected int msaStart;
     protected int msaOptions;
-    protected final Region msaRegion;
+    protected Region msaRegion;
     protected int msaBestLen;
     protected int msaBestS;
     protected int msaGpos;
@@ -58,6 +58,20 @@ public abstract class Matcher extends IntHolder {
         this.msaRegion = region;
     }
 
+    public void reset(byte[]bytes, int p, int end) {
+        if (this.msaRegion != null) this.msaRegion.clear();
+        this.bytes = bytes;
+        this.str = p;
+        this.end = end;
+        this.msaStart = 0;
+        this.msaOptions = 0;
+        this.msaBestLen = 0;
+        this.msaBestS = 0;
+        this.msaGpos = 0;
+        this.msaBegin = 0;
+        this.msaEnd = 0;
+    }
+
     // main matching method
     protected abstract int matchAt(int range, int sstart, int sprev, boolean interrupt) throws InterruptedException;
 
diff --git a/src/org/joni/Regex.java b/src/org/joni/Regex.java
index 5e983124..9ecbe46b 100644
--- a/src/org/joni/Regex.java
+++ b/src/org/joni/Regex.java
@@ -25,8 +25,10 @@
 import static org.joni.Option.isDontCaptureGroup;
 
 import java.nio.charset.Charset;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Iterator;
+import java.util.List;
 
 import org.jcodings.CaseFoldCodeItem;
 import org.jcodings.Encoding;
@@ -182,9 +184,19 @@ public Matcher matcherNoRegion(byte[]bytes) {
     }
 
     public Matcher matcher(byte[]bytes, int p, int end) {
-        return factory.create(this, numMem == 0 ? null : Region.newRegion(numMem + 1), bytes, p, end);
+        Matcher matcher = matchers.get();
+
+        if (matcher == null) {
+            matchers.set(matcher = factory.create(this, numMem == 0 ? null : Region.newRegion(numMem + 1), bytes, p, end));
+        } else {
+            matcher.reset(bytes, p, end);
+        }
+
+        return matcher;
     }
 
+    private final ThreadLocal<Matcher> matchers = new ThreadLocal<>();
+
     public Matcher matcherNoRegion(byte[]bytes, int p, int end) {
         return factory.create(this, null, bytes, p, end);
     }
diff --git a/src/org/joni/StackEntry.java b/src/org/joni/StackEntry.java
index 3ad2fc2e..3445518f 100644
--- a/src/org/joni/StackEntry.java
+++ b/src/org/joni/StackEntry.java
@@ -23,6 +23,10 @@ class StackEntry {
     int type;
     private int E1, E2, E3, E4;
 
+    void reset() {
+        type = E1 = E2 = E3 = E4 = 0;
+    }
+
     // first union member
     /* byte code position */
     void setStatePCode(int pcode) {
@@ -188,4 +192,10 @@ void setStateCheck(int check) {
     int getStateCheck() {
         return E5;
     }
+
+    void reset() {
+        super.reset();
+
+        E5 = 0;
+    }
 }
\ No newline at end of file
diff --git a/src/org/joni/StackMachine.java b/src/org/joni/StackMachine.java
index 71adff26..1caad444 100644
--- a/src/org/joni/StackMachine.java
+++ b/src/org/joni/StackMachine.java
@@ -56,6 +56,18 @@ protected StackMachine(Regex regex, Region region, byte[]bytes, int p , int end)
         repeatStk = n > 0 ? new int[n] : null;
     }
 
+    public void reset(byte[]bytes, int p, int end) {
+        super.reset(bytes, p, end);
+
+        if (regex.requireStack) resetStack(stack);
+        if (repeatStk != null) {
+            Arrays.fill(repeatStk, 0);
+        }
+        stk = 0;
+        stateCheckBuff = null;
+        stateCheckBuffSize = 0;
+    }
+
     protected final void stackInit() {
         if (stack != null) pushEnsured(ALT, regex.codeLength - 1); /* bottom stack */
         if (repeatStk != null) {
@@ -71,6 +83,13 @@ private static StackEntry[] allocateStack() {
         return stack;
     }
 
+    private static void resetStack(StackEntry[] stack) {
+        for (int i = 0; i < stack.length; i++) {
+            StackEntry entry = stack[i];
+            if (entry != null) entry.reset();
+        }
+    }
+
     private void doubleStack() {
         StackEntry[] newStack = new StackEntry[stack.length << 1];
         System.arraycopy(stack, 0, newStack, 0, stack.length);