Skip to content

Commit

Permalink
Omit leading empty matches from Pattern.split, improve performance
Browse files Browse the repository at this point in the history
Fixes #131.

This change modifies Pattern.split to omit the empty leading substring
produced by a zero-width match at the start of the input. This behavior
has been specified since JDK8, so the change brings RE2/J split into line
with more recent JDK implementations.

Furthermore, the split function no longer needs to determine the number of
matches before assembling the result. The upshot is that the number of
find() calls is halved in many cases. The benchmark in the previous
change shows a significant improvement.

Reference impl (JDK):
BenchmarkSplit.benchmarkSplit     JDK  avgt    5  14.217 ± 0.410  us/op

RE2J (before):
BenchmarkSplit.benchmarkSplit    RE2J  avgt    5  95.807 ± 6.737  us/op

RE2J (after):
BenchmarkSplit.benchmarkSplit    RE2J  avgt    5  49.092 ± 0.717  us/op
  • Loading branch information
sjamesr committed Jun 27, 2022
1 parent 7ba33e2 commit 7bf197f
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 34 deletions.
70 changes: 46 additions & 24 deletions java/com/google/re2j/Pattern.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@
package com.google.re2j;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

/**
Expand Down Expand Up @@ -223,38 +225,58 @@ public String[] split(String input, int limit) {

/** Helper: run split on m's input. */
private String[] split(Matcher m, int limit) {
int matchCount = 0;
int arraySize = 0;
List<String> result = new ArrayList<String>();
int emptiesSkipped = 0;
int last = 0;

while (m.find()) {
matchCount++;
if (limit != 0 || last < m.start()) {
arraySize = matchCount;
if (last == 0 && m.end() == 0) {
// Zero-width match at the beginning, skip (JDK8+ behavior).
last = m.end();
continue;
}

if (limit > 0 && result.size() == limit - 1) {
// no more room for matches
break;
}

if (last == m.start()) {
if (limit == 0) {
// Empty match, may or may not be trailing.
emptiesSkipped++;
last = m.end();
continue;
}
} else {
// If emptiesSkipped > 0 then limit == 0 and we have non-trailing empty matches to add before
// this non-empty match.
while (emptiesSkipped > 0) {
result.add("");
emptiesSkipped--;
}
}

result.add(m.substring(last, m.start()));
last = m.end();
}
if (last < m.inputLength() || limit != 0) {
matchCount++;
arraySize = matchCount;
}

int trunc = 0;
if (limit > 0 && arraySize > limit) {
arraySize = limit;
trunc = 1;
}
String[] array = new String[arraySize];
int i = 0;
last = 0;
m.reset();
while (m.find() && i < arraySize - trunc) {
array[i++] = m.substring(last, m.start());
last = m.end();
if (limit == 0 && last != m.inputLength()) {
// Unlimited match, no more delimiters but we have a non-empty input at the end. Catch up any skipped empty
// matches, then emit the final match.
while (emptiesSkipped > 0) {
result.add("");
emptiesSkipped--;
}

result.add(m.substring(last, m.inputLength()));
}
if (i < arraySize) {
array[i] = m.substring(last, m.inputLength());

if (limit != 0 || result.isEmpty()) {
result.add(m.substring(last, m.inputLength()));
}
return array;

return result.toArray(new String[0]);
}

/**
Expand Down
24 changes: 14 additions & 10 deletions javatests/com/google/re2j/PatternTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -148,16 +148,20 @@ public void testSplit() {
// http://docs.oracle.com/javase/1.5.0/docs/api/java/util/regex/Pattern.html#split(java.lang.CharSequence, int)

String s = "boo:and:foo";
String regexp1 = ":";
String regexp2 = "o";

ApiTestUtils.testSplit(regexp1, s, 2, new String[] {"boo", "and:foo"});
ApiTestUtils.testSplit(regexp1, s, 5, new String[] {"boo", "and", "foo"});
ApiTestUtils.testSplit(regexp1, s, -2, new String[] {"boo", "and", "foo"});
ApiTestUtils.testSplit(regexp2, s, 5, new String[] {"b", "", ":and:f", "", ""});
ApiTestUtils.testSplit(regexp2, s, -2, new String[] {"b", "", ":and:f", "", ""});
ApiTestUtils.testSplit(regexp2, s, 0, new String[] {"b", "", ":and:f"});
ApiTestUtils.testSplit(regexp2, s, new String[] {"b", "", ":and:f"});

ApiTestUtils.testSplit(":", s, 2, new String[] {"boo", "and:foo"});
ApiTestUtils.testSplit(":", s, 5, new String[] {"boo", "and", "foo"});
ApiTestUtils.testSplit(":", s, -2, new String[] {"boo", "and", "foo"});
ApiTestUtils.testSplit("o", s, 5, new String[] {"b", "", ":and:f", "", ""});
ApiTestUtils.testSplit("o", s, -2, new String[] {"b", "", ":and:f", "", ""});
ApiTestUtils.testSplit("o", s, 0, new String[] {"b", "", ":and:f"});
ApiTestUtils.testSplit("o", s, new String[] {"b", "", ":and:f"});

// From https://github.com/google/re2j/issues/131.
ApiTestUtils.testSplit("x*", "foo", new String[] {"f", "o", "o"});
ApiTestUtils.testSplit("x*", "foo", 1, new String[] {"foo"});
ApiTestUtils.testSplit("x*", "f", 2, new String[] {"f", ""});
ApiTestUtils.testSplit(":", ":a::b", new String[] {"", "a", "", "b"});
}

@Test
Expand Down

0 comments on commit 7bf197f

Please sign in to comment.