Skip to content

Commit e7c6f76

Browse files
authored
Task/faster (#1)
* ⚡️ rules “compiler”
  Pre-parse robots.txt rules.
* 🎨 cleanups
* 🐛 .NETFramework 4.8 compat
* ⬆️ upgrade dependencies
---------
Signed-off-by: Yves Bastide <stid@acm.org>
1 parent 99f5f15 commit e7c6f76

11 files changed

+1337 / -609 lines changed

RobotsTxt/Extensions.cs

Lines changed: 63 additions & 50 deletions
Original file line number | Diff line number | Diff line change
@@ -1,73 +1,86 @@
1-
namespace RobotsTxt
1+
namespace RobotsTxt;
2+
3+
public static class MyExtensions
24
{
3-
public static class MyExtensions
5+
#if !NETCOREAPP
6+
public static bool Contains(this ReadOnlySpan<byte> self, byte other)
47
{
5-
public static bool EqualsIgnoreCase(this ReadOnlySpan<byte> self, ReadOnlySpan<byte> other)
8+
foreach (var c in self)
69
{
7-
if (self.Length != other.Length)
8-
{
9-
return false;
10-
}
11-
12-
for (var i = 0; i < self.Length; i++)
10+
if (c == other)
1311
{
14-
var c1 = self[i];
15-
var c2 = other[i];
16-
if ('A' <= c1 && c1 <= 'Z')
17-
c1 += 32;
18-
if ('A' <= c2 && c2 <= 'Z')
19-
c2 += 32;
20-
if (c1 != c2)
21-
{
22-
return false;
23-
}
12+
return true;
2413
}
14+
}
15+
return false;
16+
}
17+
#endif
2518

26-
return true;
19+
public static bool EqualsIgnoreCase(this ReadOnlySpan<byte> self, ReadOnlySpan<byte> other)
20+
{
21+
if (self.Length != other.Length)
22+
{
23+
return false;
2724
}
2825

29-
public static bool StartsWithIgnoreCase(this ReadOnlySpan<byte> span, ReadOnlySpan<byte> value)
26+
for (var i = 0; i < self.Length; i++)
3027
{
31-
if (span.Length < value.Length)
28+
var c1 = self[i];
29+
var c2 = other[i];
30+
if ('A' <= c1 && c1 <= 'Z')
31+
c1 += 32;
32+
if ('A' <= c2 && c2 <= 'Z')
33+
c2 += 32;
34+
if (c1 != c2)
3235
{
3336
return false;
3437
}
35-
36-
for (var i = 0; i < value.Length; i++)
37-
{
38-
var c1 = span[i];
39-
var c2 = value[i];
40-
if ('A' <= c1 && c1 <= 'Z')
41-
c1 += 32;
42-
if ('A' <= c2 && c2 <= 'Z')
43-
c2 += (byte)' ';
44-
if (c1 != c2)
45-
{
46-
return false;
47-
}
48-
}
49-
50-
return true;
5138
}
5239

53-
public static bool IsXDigit(this byte c)
54-
{
55-
return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
56-
}
40+
return true;
41+
}
5742

58-
public static bool IsAlpha(this byte c)
43+
public static bool StartsWithIgnoreCase(this ReadOnlySpan<byte> span, ReadOnlySpan<byte> value)
44+
{
45+
if (span.Length < value.Length)
5946
{
60-
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
47+
return false;
6148
}
6249

63-
public static bool IsSpace(this byte c)
50+
for (var i = 0; i < value.Length; i++)
6451
{
65-
return c == ' ' || c == '\t';
52+
var c1 = span[i];
53+
var c2 = value[i];
54+
if ('A' <= c1 && c1 <= 'Z')
55+
c1 += 32;
56+
if ('A' <= c2 && c2 <= 'Z')
57+
c2 += (byte)' ';
58+
if (c1 != c2)
59+
{
60+
return false;
61+
}
6662
}
6763

68-
public static byte ToUpper(this byte c)
69-
{
70-
return (byte)('a' <= c && c <= 'z' ? c - ' ' : c);
71-
}
64+
return true;
65+
}
66+
67+
public static bool IsXDigit(this byte c)
68+
{
69+
return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F');
70+
}
71+
72+
public static bool IsAlpha(this byte c)
73+
{
74+
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
75+
}
76+
77+
public static bool IsSpace(this byte c)
78+
{
79+
return c == ' ' || c == '\t';
80+
}
81+
82+
public static byte ToUpper(this byte c)
83+
{
84+
return (byte)('a' <= c && c <= 'z' ? c - ' ' : c);
7285
}
7386
}

RobotsTxt/IRobotsParseHandler.cs

Lines changed: 10 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,14 +1,13 @@
1-
namespace RobotsTxt
1+
namespace RobotsTxt;
2+
3+
public interface IRobotsParseHandler
24
{
3-
public interface IRobotsParseHandler
4-
{
5-
void HandleRobotsStart();
6-
void HandleRobotsEnd();
5+
void HandleRobotsStart();
6+
void HandleRobotsEnd();
77

8-
void HandleUserAgent(int lineNum, ReadOnlySpan<byte> value);
9-
void HandleAllow(int lineNum, ReadOnlySpan<byte> value);
10-
void HandleDisallow(int lineNum, ReadOnlySpan<byte> value);
11-
void HandleSitemap(int lineNum, ReadOnlySpan<byte> value);
12-
void HandleUnknownAction(int lineNum, ReadOnlySpan<byte> action, ReadOnlySpan<byte> value);
13-
}
8+
void HandleUserAgent(int lineNum, ReadOnlySpan<byte> userAgent);
9+
void HandleAllow(int lineNum, ReadOnlySpan<byte> value);
10+
void HandleDisallow(int lineNum, ReadOnlySpan<byte> value);
11+
void HandleSitemap(int lineNum, ReadOnlySpan<byte> value);
12+
void HandleUnknownAction(int lineNum, ReadOnlySpan<byte> action, ReadOnlySpan<byte> value);
1413
}
Lines changed: 116 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -1,72 +1,139 @@
1-
namespace RobotsTxt
1+
using System.Runtime.CompilerServices;
2+
3+
namespace RobotsTxt;
4+
5+
/// <summary>
6+
/// A RobotsMatchStrategy defines a strategy for matching individual lines in a
7+
/// robots.txt file. Each Match* method should return a match priority, which is
8+
/// interpreted as:
9+
///
10+
/// match priority &lt; 0:
11+
/// No match.
12+
///
13+
/// match priority == 0:
14+
/// Match, but treat it as if matched an empty pattern.
15+
///
16+
/// match priority &gt; 0:
17+
/// Match.
18+
/// </summary>
19+
internal static class LongestMatchRobotsMatchStrategy
220
{
21+
internal static int MatchAllowSlow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
22+
{
23+
return MatchesSlow(path, pattern) ? pattern.Length : -1;
24+
}
25+
26+
internal static int MatchDisallowSlow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
27+
{
28+
return MatchesSlow(path, pattern) ? pattern.Length : -1;
29+
}
330

4-
/// <summary>
5-
/// A RobotsMatchStrategy defines a strategy for matching individual lines in a
6-
/// robots.txt file. Each Match* method should return a match priority, which is
7-
/// interpreted as:
8-
///
9-
/// match priority &lt; 0:
10-
/// No match.
11-
///
12-
/// match priority == 0:
13-
/// Match, but treat it as if matched an empty pattern.
14-
///
15-
/// match priority &gt; 0:
16-
/// Match.
17-
/// </summary>
18-
internal static class LongestMatchRobotsMatchStrategy
31+
internal static bool MatchesSlow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
1932
{
20-
internal static int MatchAllow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
33+
var pathlen = path.Length;
34+
var pos = new int[pathlen + 1];
35+
var numpos = 1;
36+
var patlen = pattern.Length;
37+
for (var j = 0; j < patlen; j++)
2138
{
22-
return Matches(path, pattern) ? pattern.Length : -1;
39+
var ch = pattern[j];
40+
if (ch == '$' && j + 1 == patlen)
41+
{
42+
return pos[numpos - 1] == pathlen;
43+
}
44+
45+
if (ch == '*')
46+
{
47+
numpos = pathlen - pos[0] + 1;
48+
for (var i = 1; i < numpos; i++)
49+
{
50+
pos[i] = pos[i - 1] + 1;
51+
}
52+
}
53+
else
54+
{
55+
// Includes '$' when not at end of pattern.
56+
var newnumpos = 0;
57+
for (var i = 0; i < numpos; i++)
58+
{
59+
if (pos[i] < pathlen && path[pos[i]] == ch)
60+
{
61+
pos[newnumpos++] = pos[i] + 1;
62+
}
63+
}
64+
65+
numpos = newnumpos;
66+
if (numpos == 0) return false;
67+
}
2368
}
2469

25-
internal static int MatchDisallow(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
70+
return true;
71+
}
72+
73+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
74+
internal static int MatchAllowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
75+
{
76+
return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1;
77+
}
78+
79+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
80+
internal static int MatchDisallowFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
81+
{
82+
return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1;
83+
}
84+
85+
[MethodImpl(MethodImplOptions.AggressiveInlining)]
86+
internal static bool MatchesFast(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern, bool haveWildcards)
87+
{
88+
if (pattern.Length == 0) return true;
89+
if (path.Length == 0) return pattern.Length == 0;
90+
91+
if (!haveWildcards)
2692
{
27-
return Matches(path, pattern) ? pattern.Length : -1;
93+
return path.IndexOf(pattern) != -1;
2894
}
2995

30-
internal static bool Matches(ReadOnlySpan<byte> path, ReadOnlySpan<byte> pattern)
96+
Span<int> pos = stackalloc int[path.Length + 1];
97+
var numpos = 1;
98+
99+
for (var j = 0; j < pattern.Length; j++)
31100
{
32-
var pathlen = path.Length;
33-
var pos = new int[pathlen + 1];
34-
int numpos = 1;
35-
var patlen = pattern.Length;
36-
for (var j = 0; j < patlen; j++)
101+
var ch = pattern[j];
102+
103+
// Check for end anchor
104+
if (ch == '$' && j + 1 == pattern.Length)
105+
{
106+
return pos[numpos - 1] == path.Length;
107+
}
108+
109+
if (ch == '*')
37110
{
38-
var ch = pattern[j];
39-
if (ch == '$' && j + 1 == patlen)
111+
var startPos = pos[0];
112+
numpos = path.Length - startPos + 1;
113+
114+
for (var i = 0; i < numpos; i++)
40115
{
41-
return (pos[numpos - 1] == pathlen);
116+
pos[i] = startPos + i;
42117
}
118+
}
119+
else
120+
{
121+
var newnumpos = 0;
122+
var pathLen = path.Length;
43123

44-
if (ch == '*')
124+
for (var i = 0; i < numpos && pos[i] < pathLen; i++)
45125
{
46-
numpos = pathlen - pos[0] + 1;
47-
for (int i = 1; i < numpos; i++)
126+
if (path[pos[i]] == ch)
48127
{
49-
pos[i] = pos[i - 1] + 1;
128+
pos[newnumpos++] = pos[i] + 1;
50129
}
51130
}
52-
else
53-
{
54-
// Includes '$' when not at end of pattern.
55-
int newnumpos = 0;
56-
for (int i = 0; i < numpos; i++)
57-
{
58-
if (pos[i] < pathlen && path[pos[i]] == ch)
59-
{
60-
pos[newnumpos++] = pos[i] + 1;
61-
}
62-
}
63131

64-
numpos = newnumpos;
65-
if (numpos == 0) return false;
66-
}
132+
if (newnumpos == 0) return false;
133+
numpos = newnumpos;
67134
}
68-
69-
return true;
70135
}
136+
137+
return true;
71138
}
72139
}

0 commit comments

Comments
 (0)