diff --git a/RobotsTxt/Extensions.cs b/RobotsTxt/Extensions.cs index ee5e28c..b874a79 100644 --- a/RobotsTxt/Extensions.cs +++ b/RobotsTxt/Extensions.cs @@ -1,73 +1,86 @@ -namespace RobotsTxt +namespace RobotsTxt; + +public static class MyExtensions { - public static class MyExtensions +#if !NETCOREAPP + public static bool Contains(this ReadOnlySpan self, byte other) { - public static bool EqualsIgnoreCase(this ReadOnlySpan self, ReadOnlySpan other) + foreach (var c in self) { - if (self.Length != other.Length) - { - return false; - } - - for (var i = 0; i < self.Length; i++) + if (c == other) { - var c1 = self[i]; - var c2 = other[i]; - if ('A' <= c1 && c1 <= 'Z') - c1 += 32; - if ('A' <= c2 && c2 <= 'Z') - c2 += 32; - if (c1 != c2) - { - return false; - } + return true; } + } + return false; + } +#endif - return true; + public static bool EqualsIgnoreCase(this ReadOnlySpan self, ReadOnlySpan other) + { + if (self.Length != other.Length) + { + return false; } - public static bool StartsWithIgnoreCase(this ReadOnlySpan span, ReadOnlySpan value) + for (var i = 0; i < self.Length; i++) { - if (span.Length < value.Length) + var c1 = self[i]; + var c2 = other[i]; + if ('A' <= c1 && c1 <= 'Z') + c1 += 32; + if ('A' <= c2 && c2 <= 'Z') + c2 += 32; + if (c1 != c2) { return false; } - - for (var i = 0; i < value.Length; i++) - { - var c1 = span[i]; - var c2 = value[i]; - if ('A' <= c1 && c1 <= 'Z') - c1 += 32; - if ('A' <= c2 && c2 <= 'Z') - c2 += (byte)' '; - if (c1 != c2) - { - return false; - } - } - - return true; } - public static bool IsXDigit(this byte c) - { - return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'); - } + return true; + } - public static bool IsAlpha(this byte c) + public static bool StartsWithIgnoreCase(this ReadOnlySpan span, ReadOnlySpan value) + { + if (span.Length < value.Length) { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); + return false; } - public static bool IsSpace(this byte c) + for (var i = 0; i 
< value.Length; i++) { - return c == ' ' || c == '\t'; + var c1 = span[i]; + var c2 = value[i]; + if ('A' <= c1 && c1 <= 'Z') + c1 += 32; + if ('A' <= c2 && c2 <= 'Z') + c2 += (byte)' '; + if (c1 != c2) + { + return false; + } } - public static byte ToUpper(this byte c) - { - return (byte)('a' <= c && c <= 'z' ? c - ' ' : c); - } + return true; + } + + public static bool IsXDigit(this byte c) + { + return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'); + } + + public static bool IsAlpha(this byte c) + { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); + } + + public static bool IsSpace(this byte c) + { + return c == ' ' || c == '\t'; + } + + public static byte ToUpper(this byte c) + { + return (byte)('a' <= c && c <= 'z' ? c - ' ' : c); } } diff --git a/RobotsTxt/IRobotsParseHandler.cs b/RobotsTxt/IRobotsParseHandler.cs index 8a6cf60..1b673ec 100644 --- a/RobotsTxt/IRobotsParseHandler.cs +++ b/RobotsTxt/IRobotsParseHandler.cs @@ -1,14 +1,13 @@ -namespace RobotsTxt +namespace RobotsTxt; + +public interface IRobotsParseHandler { - public interface IRobotsParseHandler - { - void HandleRobotsStart(); - void HandleRobotsEnd(); + void HandleRobotsStart(); + void HandleRobotsEnd(); - void HandleUserAgent(int lineNum, ReadOnlySpan value); - void HandleAllow(int lineNum, ReadOnlySpan value); - void HandleDisallow(int lineNum, ReadOnlySpan value); - void HandleSitemap(int lineNum, ReadOnlySpan value); - void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value); - } + void HandleUserAgent(int lineNum, ReadOnlySpan userAgent); + void HandleAllow(int lineNum, ReadOnlySpan value); + void HandleDisallow(int lineNum, ReadOnlySpan value); + void HandleSitemap(int lineNum, ReadOnlySpan value); + void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value); } diff --git a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs index 2898083..b1942b6 100644 --- 
a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs +++ b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs @@ -1,72 +1,139 @@ -namespace RobotsTxt +using System.Runtime.CompilerServices; + +namespace RobotsTxt; + +/// +/// A RobotsMatchStrategy defines a strategy for matching individual lines in a +/// robots.txt file. Each Match* method should return a match priority, which is +/// interpreted as: +/// +/// match priority < 0: +/// No match. +/// +/// match priority == 0: +/// Match, but treat it as if matched an empty pattern. +/// +/// match priority > 0: +/// Match. +/// +internal static class LongestMatchRobotsMatchStrategy { + internal static int MatchAllowSlow(ReadOnlySpan path, ReadOnlySpan pattern) + { + return MatchesSlow(path, pattern) ? pattern.Length : -1; + } + + internal static int MatchDisallowSlow(ReadOnlySpan path, ReadOnlySpan pattern) + { + return MatchesSlow(path, pattern) ? pattern.Length : -1; + } - /// - /// A RobotsMatchStrategy defines a strategy for matching individual lines in a - /// robots.txt file. Each Match* method should return a match priority, which is - /// interpreted as: - /// - /// match priority < 0: - /// No match. - /// - /// match priority == 0: - /// Match, but treat it as if matched an empty pattern. - /// - /// match priority > 0: - /// Match. - /// - internal static class LongestMatchRobotsMatchStrategy + internal static bool MatchesSlow(ReadOnlySpan path, ReadOnlySpan pattern) { - internal static int MatchAllow(ReadOnlySpan path, ReadOnlySpan pattern) + var pathlen = path.Length; + var pos = new int[pathlen + 1]; + var numpos = 1; + var patlen = pattern.Length; + for (var j = 0; j < patlen; j++) { - return Matches(path, pattern) ? 
pattern.Length : -1; + var ch = pattern[j]; + if (ch == '$' && j + 1 == patlen) + { + return pos[numpos - 1] == pathlen; + } + + if (ch == '*') + { + numpos = pathlen - pos[0] + 1; + for (var i = 1; i < numpos; i++) + { + pos[i] = pos[i - 1] + 1; + } + } + else + { + // Includes '$' when not at end of pattern. + var newnumpos = 0; + for (var i = 0; i < numpos; i++) + { + if (pos[i] < pathlen && path[pos[i]] == ch) + { + pos[newnumpos++] = pos[i] + 1; + } + } + + numpos = newnumpos; + if (numpos == 0) return false; + } } - internal static int MatchDisallow(ReadOnlySpan path, ReadOnlySpan pattern) + return true; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int MatchAllowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int MatchDisallowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool MatchesFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + if (pattern.Length == 0) return true; + if (path.Length == 0) return pattern.Length == 0; + + if (!haveWildcards) { - return Matches(path, pattern) ? 
pattern.Length : -1; + return path.StartsWith(pattern); /* literal rule = prefix match anchored at the start of the path, same as MatchesSlow */ } - internal static bool Matches(ReadOnlySpan path, ReadOnlySpan pattern) + Span pos = path.Length < 256 ? stackalloc int[path.Length + 1] : new int[path.Length + 1]; /* heap fallback so a very long path cannot overflow the stack */ + var numpos = 1; + + for (var j = 0; j < pattern.Length; j++) { - var pathlen = path.Length; - var pos = new int[pathlen + 1]; - int numpos = 1; - var patlen = pattern.Length; - for (var j = 0; j < patlen; j++) + var ch = pattern[j]; + + // Check for end anchor + if (ch == '$' && j + 1 == pattern.Length) + { + return pos[numpos - 1] == path.Length; + } + + if (ch == '*') { - var ch = pattern[j]; - if (ch == '$' && j + 1 == patlen) + var startPos = pos[0]; + numpos = path.Length - startPos + 1; + + for (var i = 0; i < numpos; i++) { - return (pos[numpos - 1] == pathlen); + pos[i] = startPos + i; } + } + else + { + var newnumpos = 0; + var pathLen = path.Length; - if (ch == '*') + for (var i = 0; i < numpos && pos[i] < pathLen; i++) { - numpos = pathlen - pos[0] + 1; - for (int i = 1; i < numpos; i++) + if (path[pos[i]] == ch) { - pos[i] = pos[i - 1] + 1; + pos[newnumpos++] = pos[i] + 1; } } - else - { - // Includes '$' when not at end of pattern. - int newnumpos = 0; - for (int i = 0; i < numpos; i++) - { - if (pos[i] < pathlen && path[pos[i]] == ch) - { - pos[newnumpos++] = pos[i] + 1; - } - } - numpos = newnumpos; - if (numpos == 0) return false; - } + if (newnumpos == 0) return false; + numpos = newnumpos; } - - return true; } + + return true; } } diff --git a/RobotsTxt/ParsedRobotsKey.cs b/RobotsTxt/ParsedRobotsKey.cs index b10b356..b5bb154 100644 --- a/RobotsTxt/ParsedRobotsKey.cs +++ b/RobotsTxt/ParsedRobotsKey.cs @@ -1,93 +1,91 @@ using System.Diagnostics; -namespace RobotsTxt +namespace RobotsTxt; + +internal class ParsedRobotsKey { - class ParsedRobotsKey - { - private byte[]? _keyText; - const bool AllowFrequentTypos = true; + private byte[]? _keyText; + private const bool AllowFrequentTypos = true; - public enum KeyType - { - // Generic high level fields. 
- UserAgent, - Sitemap, + public enum KeyType + { + // Generic high level fields. + UserAgent, + Sitemap, - // Fields within a user-agent. - Allow, - Disallow, + // Fields within a user-agent. + Allow, + Disallow, - // Unrecognized field; kept as-is. High number so that additions to the - // enumeration above does not change the serialization. - Unknown = 128, - }; + // Unrecognized field; kept as-is. High number so that additions to the + // enumeration above does not change the serialization. + Unknown = 128, + } - public void Parse(ReadOnlySpan key) + public void Parse(ReadOnlySpan key) + { + _keyText = null; + if (KeyIsUserAgent(key)) { - _keyText = null; - if (KeyIsUserAgent(key)) - { - Type = KeyType.UserAgent; - } - else if (KeyIsAllow(key)) - { - Type = KeyType.Allow; - } - else if (KeyIsDisallow(key)) - { - Type = KeyType.Disallow; - } - else if (KeyIsSitemap(key)) - { - Type = KeyType.Sitemap; - } - else - { - Type = KeyType.Unknown; - UnknownText = key.ToArray(); - } + Type = KeyType.UserAgent; } - - private bool KeyIsSitemap(ReadOnlySpan key) + else if (KeyIsAllow(key)) { - return key.StartsWithIgnoreCase("sitemap"u8) || - key.StartsWithIgnoreCase("site-map"u8); + Type = KeyType.Allow; } - - private bool KeyIsDisallow(ReadOnlySpan key) + else if (KeyIsDisallow(key)) { - return ( - key.StartsWithIgnoreCase("disallow"u8) || - (AllowFrequentTypos && (key.StartsWithIgnoreCase("dissallow"u8) || - key.StartsWithIgnoreCase("dissalow"u8) || - key.StartsWithIgnoreCase("disalow"u8) || - key.StartsWithIgnoreCase("diasllow"u8) || - key.StartsWithIgnoreCase("disallaw"u8)))); + Type = KeyType.Disallow; } - - private bool KeyIsAllow(ReadOnlySpan key) + else if (KeyIsSitemap(key)) { - return key.StartsWithIgnoreCase("allow"u8); + Type = KeyType.Sitemap; } - - private bool KeyIsUserAgent(ReadOnlySpan key) + else { - return key.StartsWithIgnoreCase("user-agent"u8) || - (AllowFrequentTypos && (key.StartsWithIgnoreCase("useragent"u8) || - key.StartsWithIgnoreCase("user 
agent"u8))); + Type = KeyType.Unknown; + UnknownText = key.ToArray(); } + } + + private static bool KeyIsSitemap(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("sitemap"u8) || + key.StartsWithIgnoreCase("site-map"u8); + } + + private static bool KeyIsDisallow(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("disallow"u8) || + (AllowFrequentTypos && (key.StartsWithIgnoreCase("dissallow"u8) || + key.StartsWithIgnoreCase("dissalow"u8) || + key.StartsWithIgnoreCase("disalow"u8) || + key.StartsWithIgnoreCase("diasllow"u8) || + key.StartsWithIgnoreCase("disallaw"u8))); + } + + private static bool KeyIsAllow(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("allow"u8); + } + + private static bool KeyIsUserAgent(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("user-agent"u8) || + (AllowFrequentTypos && (key.StartsWithIgnoreCase("useragent"u8) || + key.StartsWithIgnoreCase("user agent"u8))); + } - public KeyType Type { get; private set; } = KeyType.Unknown; + public KeyType Type { get; private set; } = KeyType.Unknown; - public byte[]? UnknownText + public byte[]? 
UnknownText + { + get { - get - { - Debug.Assert(Type == KeyType.Unknown); - return _keyText; - } - private set => _keyText = value; + Debug.Assert(Type == KeyType.Unknown); + return _keyText; } + private set => _keyText = value; } } diff --git a/RobotsTxt/RobotsMachine.cs b/RobotsTxt/RobotsMachine.cs new file mode 100644 index 0000000..891c50c --- /dev/null +++ b/RobotsTxt/RobotsMachine.cs @@ -0,0 +1,219 @@ +using System.Runtime.CompilerServices; + +namespace RobotsTxt; + +public class RobotsMachine : IRobotsParseHandler +{ + private class State; + + private class UserAgentState : State; + + private class AllowState(byte[] pattern, bool haveWildcards) : State + { + public byte[] Pattern { get; } = pattern; + public bool HaveWildcards { get; } = haveWildcards; + } + + private class DisallowState(byte[] pattern, bool haveWildcards) : State + { + public byte[] Pattern { get; } = pattern; + public bool HaveWildcards { get; } = haveWildcards; + } + + private readonly List _userAgents; + + private readonly List _globalStates = []; + private readonly List _specificStates = []; + + private bool _currentAgentIsSpecific; // True if we're in a block for our agent. + private bool EverSeenSpecificAgent => _specificStates.Count > 0; + + public RobotsMachine(byte[] robotsBody, List userAgents) + { + _userAgents = userAgents; + ParseRobotsTxt(robotsBody, this); + } + + private static void ParseRobotsTxt(byte[] robotsBody, IRobotsParseHandler parseCallback) + { + var parser = new RobotsTxtParser(robotsBody, parseCallback); + parser.Parse(); + } + + public void HandleRobotsStart() + { + } + + public void HandleRobotsEnd() + { + } + + internal static ReadOnlySpan ExtractUserAgent(ReadOnlySpan userAgent) + { + // Allowed characters in user-agent are [a-zA-Z_-]. 
+ var i = 0; + for (; i < userAgent.Length; i++) + { + var c = userAgent[i]; + if (!(c.IsAlpha() || c == '_' || c == '-')) + { + break; + } + } + + return userAgent[..i]; + } + + public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) + { + // Google-specific optimization: a '*' followed by space and more characters + // in a user-agent record is still regarded a global rule. + if (userAgent.Length >= 1 && userAgent[0] == '*' && (userAgent.Length == 1 || userAgent[1].IsSpace())) + { + _globalStates.Add(new UserAgentState()); + _currentAgentIsSpecific = false; + return; + } + userAgent = ExtractUserAgent(userAgent); + foreach (var ua in _userAgents) + { + if (!userAgent.EqualsIgnoreCase(ua)) continue; + _specificStates.Add(new UserAgentState()); + _currentAgentIsSpecific = true; + return; + } + } + + private bool SeenAnyAgent => _specificStates.Count > 0 || _globalStates.Count > 0; + + public void HandleAllow(int lineNum, ReadOnlySpan value) + { + if (!SeenAnyAgent) + return; + var states = _currentAgentIsSpecific ? _specificStates : _globalStates; + var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); + states.Add(new AllowState(value.ToArray(), haveWildcards)); + } + public void HandleDisallow(int lineNum, ReadOnlySpan value) + { + if (!SeenAnyAgent) + return; + var states = _currentAgentIsSpecific ? 
_specificStates : _globalStates; + var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); + states.Add(new DisallowState(value.ToArray(), haveWildcards)); + } + + public void HandleSitemap(int lineNum, ReadOnlySpan value) + { + } + + public void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value) + { + } + + public bool PathAllowedByRobots(byte[] path) + { + return !Disallow(path); + } + + private bool Disallow(byte[] path) + { + if (!SeenAnyAgent) + return false; + + var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates); + if (allowHierarchy.Priority > 0 || disallowHierarchy.Priority > 0) + { + return disallowHierarchy.Priority > allowHierarchy.Priority; + } + + if (EverSeenSpecificAgent) + { + // Matching group for user-agent but either without disallow or empty one, + // i.e. priority == 0. + return false; + } + + (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates); + + if (disallowHierarchy.Priority > 0 || allowHierarchy.Priority > 0) + { + return disallowHierarchy.Priority > allowHierarchy.Priority; + } + + return false; + } + + private static (Match, Match) AssessAccessRules(byte[] path, List states) + { + Match allowHierarchy = new(); // Characters of 'url' matching Allow. + Match disallowHierarchy = new(); // Characters of 'url' matching Disallow. 
+ foreach (var state in states) + { + switch (state) + { + case AllowState allow: + CheckAllow(path, allow.Pattern, allow.HaveWildcards, allowHierarchy); + break; + case DisallowState disallow: + CheckDisallow(path, disallow.Pattern, disallow.HaveWildcards, disallowHierarchy); + break; + } + } + return (allowHierarchy, disallowHierarchy); + } + + private class Match(int priority = Match.NoMatchPriority) + { + private const int NoMatchPriority = -1; + + public int Priority { get; set; } = priority; + } + + private static readonly byte[] IndexHtmBytes = "/index.htm"u8.ToArray(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void CheckAllow(byte[] path, ReadOnlySpan pattern, bool haveWildcards, Match allow) + { + while (true) + { + var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards); + if (priority >= 0) + { + if (allow.Priority < priority) + { + allow.Priority = priority; + } + } + else + { + // Google-specific optimization: 'index.htm' and 'index.html' are normalized + // to '/'. 
+ var slashPos = pattern.LastIndexOf((byte)'/'); + + if (slashPos != -1 && pattern[slashPos..].StartsWith(IndexHtmBytes)) + { + var len = slashPos + 1; + var newpattern = new byte[len + 1]; + pattern[..len].CopyTo(newpattern); + newpattern[len] = (byte)'$'; + pattern = newpattern; + haveWildcards = true; + continue; + } + } + break; + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private static void CheckDisallow(byte[] path, ReadOnlySpan value, bool haveWildcards, Match disallow) + { + var priority = LongestMatchRobotsMatchStrategy.MatchDisallowFast(path, value, haveWildcards); + if (priority < 0) return; + if (disallow.Priority < priority) + { + disallow.Priority = priority; + } + } +} diff --git a/RobotsTxt/RobotsMatcher.cs b/RobotsTxt/RobotsMatcher.cs index b944065..9be04ad 100644 --- a/RobotsTxt/RobotsMatcher.cs +++ b/RobotsTxt/RobotsMatcher.cs @@ -1,96 +1,95 @@ using System.Diagnostics; using System.Text; -namespace RobotsTxt +namespace RobotsTxt; + +/// +/// Create a RobotsMatcher with the default matching strategy. The default +/// matching strategy is longest-match as opposed to the former internet draft +/// that provisioned first-match strategy. Analysis shows that longest-match, +/// while more restrictive for crawlers, is what webmasters assume when writing +/// directives. For example, in case of conflicting matches (both Allow and +/// Disallow), the longest match is the one the user wants. For example, in +/// case of a robots.txt file that has the following rules +/// Allow: / +/// Disallow: /cgi-bin +/// it's pretty obvious what the webmaster wants: they want to allow crawl of +/// every URI except /cgi-bin. However, according to the expired internet +/// standard, crawlers should be allowed to crawl everything with such a rule. +/// +public class RobotsMatcher : IRobotsParseHandler { - /// - /// Create a RobotsMatcher with the default matching strategy. 
The default - /// matching strategy is longest-match as opposed to the former internet draft - /// that provisioned first-match strategy. Analysis shows that longest-match, - /// while more restrictive for crawlers, is what webmasters assume when writing - /// directives. For example, in case of conflicting matches (both Allow and - /// Disallow), the longest match is the one the user wants. For example, in - /// case of a robots.txt file that has the following rules - /// Allow: / - /// Disallow: /cgi-bin - /// it's pretty obvious what the webmaster wants: they want to allow crawl of - /// every URI except /cgi-bin. However, according to the expired internet - /// standard, crawlers should be allowed to crawl everything with such a rule. - /// - public class RobotsMatcher : IRobotsParseHandler + public void HandleRobotsStart() { - public void HandleRobotsStart() - { - // This is a new robots.txt file, so we need to reset all the instance member - // variables. We do it in the same order the instance member variables are - // declared, so it's easier to keep track of which ones we have (or maybe - // haven't!) done. - _allow.Clear(); - _disallow.Clear(); - - _seenGlobalAgent = false; - _seenSpecificAgent = false; - _everSeenSpecificAgent = false; - _seenSeparator = false; - } + // This is a new robots.txt file, so we need to reset all the instance member + // variables. We do it in the same order the instance member variables are + // declared, so it's easier to keep track of which ones we have (or maybe + // haven't!) done. + _allow.Clear(); + _disallow.Clear(); + + _seenGlobalAgent = false; + _seenSpecificAgent = false; + _everSeenSpecificAgent = false; + _seenSeparator = false; + } - public void HandleRobotsEnd() - { - } + public void HandleRobotsEnd() + { + } - internal static ReadOnlySpan ExtractUserAgent(ReadOnlySpan userAgent) + internal static ReadOnlySpan ExtractUserAgent(ReadOnlySpan userAgent) + { + // Allowed characters in user-agent are [a-zA-Z_-]. 
+ var i = 0; + for (; i < userAgent.Length; i++) { - // Allowed characters in user-agent are [a-zA-Z_-]. - var i = 0; - for (; i < userAgent.Length; i++) + var c = userAgent[i]; + if (!(c.IsAlpha() || c == '_' || c == '-')) { - var c = userAgent[i]; - if (!(c.IsAlpha() || c == '_' || c == '-')) - { - break; - } + break; } - - return userAgent.Slice(0, i); } - public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) + return userAgent[..i]; + } + + public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) + { + if (_seenSeparator) { - if (_seenSeparator) - { - _seenSpecificAgent = _seenGlobalAgent = _seenSeparator = false; - } + _seenSpecificAgent = _seenGlobalAgent = _seenSeparator = false; + } - // Google-specific optimization: a '*' followed by space and more characters - // in a user-agent record is still regarded a global rule. - if (userAgent.Length >= 1 && userAgent[0] == '*' && (userAgent.Length == 1 || userAgent[1].IsSpace())) - { - _seenGlobalAgent = true; - } - else + // Google-specific optimization: a '*' followed by space and more characters + // in a user-agent record is still regarded a global rule. 
+ if (userAgent.Length >= 1 && userAgent[0] == '*' && (userAgent.Length == 1 || userAgent[1].IsSpace())) + { + _seenGlobalAgent = true; + } + else + { + userAgent = ExtractUserAgent(userAgent); + Debug.Assert(_userAgents != null); + foreach (var ua in _userAgents) { - userAgent = ExtractUserAgent(userAgent); - Debug.Assert(_userAgents != null); - foreach (var ua in _userAgents) - { - if (userAgent.EqualsIgnoreCase(ua)) - { - _everSeenSpecificAgent = _seenSpecificAgent = true; - break; - } - } + if (!userAgent.EqualsIgnoreCase(ua)) continue; + _everSeenSpecificAgent = _seenSpecificAgent = true; + break; } } + } - readonly byte[] _indexHtmBytes = "/index.htm"u8.ToArray(); + private readonly byte[] _indexHtmBytes = "/index.htm"u8.ToArray(); - public void HandleAllow(int lineNum, ReadOnlySpan value) + public void HandleAllow(int lineNum, ReadOnlySpan value) + { + while (true) { - if (!SeenAnyAgent) - return; + if (!SeenAnyAgent) return; Debug.Assert(_allow != null); _seenSeparator = true; - var priority = LongestMatchRobotsMatchStrategy.MatchAllow(_path, value); + var priority = LongestMatchRobotsMatchStrategy.MatchAllowSlow(_path, value); if (priority >= 0) { if (_seenSpecificAgent) @@ -98,7 +97,7 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) Debug.Assert(_allow.Specific != null); if (_allow.Specific.Priority < priority) { - _allow.Specific.Set(priority, lineNum); + _allow.Specific.Set(priority); } } else @@ -107,7 +106,7 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) Debug.Assert(_allow.Global != null); if (_allow.Global.Priority < priority) { - _allow.Global.Set(priority, lineNum); + _allow.Global.Set(priority); } } } @@ -117,204 +116,197 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) // to '/'. 
var slashPos = value.LastIndexOf((byte)'/'); - if (slashPos != -1 && - value.Slice(slashPos).StartsWith(_indexHtmBytes)) - { - var len = slashPos + 1; - var newpattern = new byte[len + 1]; - value.Slice(0, len).CopyTo(newpattern); - newpattern[len] = (byte)'$'; - HandleAllow(lineNum, newpattern); - } + if (slashPos == -1 || !value[slashPos..].StartsWith(_indexHtmBytes)) return; + var len = slashPos + 1; + var newpattern = new byte[len + 1]; + value[..len].CopyTo(newpattern); + newpattern[len] = (byte)'$'; + value = newpattern; + continue; } + break; } + } - public void HandleDisallow(int lineNum, ReadOnlySpan value) + public void HandleDisallow(int lineNum, ReadOnlySpan value) + { + if (!SeenAnyAgent) + return; + _seenSeparator = true; + var priority = LongestMatchRobotsMatchStrategy.MatchDisallowSlow(_path, value); + if (priority < 0) return; + if (_seenSpecificAgent) { - if (!SeenAnyAgent) - return; - _seenSeparator = true; - var priority = LongestMatchRobotsMatchStrategy.MatchDisallow(_path, value); - if (priority >= 0) + if (_disallow.Specific.Priority < priority) { - if (_seenSpecificAgent) - { - if (_disallow.Specific.Priority < priority) - { - _disallow.Specific.Set(priority, lineNum); - } - } - else - { - Debug.Assert(_seenGlobalAgent); - if (_disallow.Global.Priority < priority) - { - _disallow.Global.Set(priority, lineNum); - } - } + _disallow.Specific.Set(priority); } } - - public void HandleSitemap(int lineNum, ReadOnlySpan value) + else { + Debug.Assert(_seenGlobalAgent); + if (_disallow.Global.Priority < priority) + { + _disallow.Global.Set(priority); + } } + } - public void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value) - { - } + public void HandleSitemap(int lineNum, ReadOnlySpan value) + { + } - private void InitUserAgentsAndPath(List userAgents, byte[] path) - { - _userAgents = userAgents; - Debug.Assert(path.Length > 0 && path[0] == '/'); - _path = path; - } + public void HandleUnknownAction(int lineNum, ReadOnlySpan 
action, ReadOnlySpan value) + { + } + + private void InitUserAgentsAndPath(List userAgents, byte[] path) + { + _userAgents = userAgents; + Debug.Assert(path.Length > 0 && path[0] == '/'); + _path = path; + } + + private bool SeenAnyAgent => _seenGlobalAgent || _seenSpecificAgent; + + public bool AllowedByRobots(byte[] robotsBody, List userAgents, string url) + { + // The url is not normalized (escaped, percent encoded) here because the user + // is asked to provide it in escaped form already. + var path = GetPathParamsQuery(url); + return PathAllowedByRobots(robotsBody, userAgents, new UTF8Encoding().GetBytes(path)); + } + + public bool PathAllowedByRobots(byte[] robotsBody, List userAgents, byte[] path) + { + InitUserAgentsAndPath(userAgents, path); + ParseRobotsTxt(robotsBody, this); + return !Disallow(); + } - private bool SeenAnyAgent => _seenGlobalAgent || _seenSpecificAgent; + private bool Disallow() + { + Debug.Assert(_allow != null); + Debug.Assert(_disallow != null); - public bool AllowedByRobots(byte[] robotsBody, List userAgents, string url) + if (_allow.Specific.Priority > 0 || _disallow.Specific.Priority > 0) { - // The url is not normalized (escaped, percent encoded) here because the user - // is asked to provide it in escaped form already. - var path = GetPathParamsQuery(url); - return PathAllowedByRobots(robotsBody, userAgents, new UTF8Encoding().GetBytes(path)); + return _disallow.Specific.Priority > _allow.Specific.Priority; } - public bool PathAllowedByRobots(byte[] robotsBody, List userAgents, byte[] path) + if (_everSeenSpecificAgent) { - InitUserAgentsAndPath(userAgents, path); - ParseRobotsTxt(robotsBody, this); - return !Disallow(); + // Matching group for user-agent but either without disallow or empty one, + // i.e. priority == 0. 
+ return false; } - private bool Disallow() + if (_disallow.Global.Priority > 0 || _allow.Global.Priority > 0) { - Debug.Assert(_allow != null); - Debug.Assert(_disallow != null); - - if (_allow.Specific.Priority > 0 || _disallow.Specific.Priority > 0) - { - return (_disallow.Specific.Priority > _allow.Specific.Priority); - } + return _disallow.Global.Priority > _allow.Global.Priority; + } - if (_everSeenSpecificAgent) - { - // Matching group for user-agent but either without disallow or empty one, - // i.e. priority == 0. - return false; - } + return false; + } - if (_disallow.Global.Priority > 0 || _allow.Global.Priority > 0) - { - return _disallow.Global.Priority > _allow.Global.Priority; - } + internal static void ParseRobotsTxt(byte[] robotsBody, IRobotsParseHandler parseCallback) + { + var parser = new RobotsTxtParser(robotsBody, parseCallback); + parser.Parse(); + } - return false; + internal static string GetPathParamsQuery(string url) + { + var searchStart = 0; + if (url is ['/', '/', ..,]) searchStart = 2; + var earlyPath = url.IndexOfAny(['/', '?', ';',], searchStart); + var protocolEnd = url.IndexOf("://", searchStart, StringComparison.Ordinal); + if (earlyPath < protocolEnd) + { + protocolEnd = -1; } - internal static void ParseRobotsTxt(byte[] robotsBody, IRobotsParseHandler parseCallback) + if (protocolEnd == -1) { - var parser = new RobotsTxtParser(robotsBody, parseCallback); - parser.Parse(); + protocolEnd = searchStart; } - - internal static string GetPathParamsQuery(string url) + else { - var searchStart = 0; - if (url is ['/', '/', ..]) searchStart = 2; - var earlyPath = url.IndexOfAny(['/', '?', ';',], searchStart); - var protocolEnd = url.IndexOf("://", searchStart, StringComparison.Ordinal); - if (earlyPath < protocolEnd) - { - protocolEnd = -1; - } - - if (protocolEnd == -1) - { - protocolEnd = searchStart; - } - else - { - protocolEnd += 3; - } + protocolEnd += 3; + } - var pathStart = url.IndexOfAny(['/', '?', ';',], protocolEnd); - if 
(pathStart != -1) - { - var hashPos = url.IndexOf('#', searchStart); - if (hashPos >= 0 && hashPos < pathStart) return "/"; - var pathEnd = (hashPos == -1) ? url.Length : hashPos; - if (url[pathStart] != '/') - { - // Prepend a slash if the result would start e.g. with '?'. - return "/" + url.Substring(pathStart, pathEnd - pathStart); - } + var pathStart = url.IndexOfAny(['/', '?', ';',], protocolEnd); + if (pathStart == -1) return "/"; + var hashPos = url.IndexOf('#', searchStart); + if (hashPos >= 0 && hashPos < pathStart) return "/"; + var pathEnd = hashPos == -1 ? url.Length : hashPos; + return url[pathStart] != '/' + ? + // Prepend a slash if the result would start e.g. with '?'. +#if !NETCOREAPP + "/" + url.Substring(pathStart, pathEnd - pathStart) +#else + string.Concat("/", url.AsSpan(pathStart, pathEnd - pathStart)) +#endif + : url.Substring(pathStart, pathEnd - pathStart); + } - return url.Substring(pathStart, pathEnd - pathStart); - } + private class Match(int priority = Match.NoMatchPriority) + { + private const int NoMatchPriority = -1; - return "/"; + public void Set(int priority) + { + Priority = priority; } - class Match(int priority = Match.NoMatchPriority, int line = 0) + public void Clear() { - private const int NoMatchPriority = -1; + Set(NoMatchPriority); + } - public void Set(int priority, int line) - { - Priority = priority; - Line = line; - } + public int Priority { get; private set; } = priority; + } - public void Clear() - { - Set(NoMatchPriority, 0); - } + // For each of the directives within user-agents, we keep global and specific + // match scores. + private class MatchHierarchy + { + public readonly Match Global = new(); // Match for '*' + public readonly Match Specific = new(); // Match for queried agent. - public int Priority { get; private set; } = priority; - public int Line { get; private set; } = line; - } - // For each of the directives within user-agents, we keep global and specific - // match scores. 
- class MatchHierarchy + public void Clear() { - public readonly Match Global = new Match(); // Match for '*' - public readonly Match Specific = new Match(); // Match for queried agent. - - - public void Clear() - { - Global.Clear(); - Specific.Clear(); - } + Global.Clear(); + Specific.Clear(); } + } - readonly MatchHierarchy _allow = new MatchHierarchy(); // Characters of 'url' matching Allow. - readonly MatchHierarchy _disallow = new MatchHierarchy(); // Characters of 'url' matching Disallow. + private readonly MatchHierarchy _allow = new(); // Characters of 'url' matching Allow. + private readonly MatchHierarchy _disallow = new(); // Characters of 'url' matching Disallow. - bool _seenGlobalAgent; // True if processing global agent rules. - bool _seenSpecificAgent; // True if processing our specific agent. - bool _everSeenSpecificAgent; // True if we ever saw a block for our agent. - bool _seenSeparator; // True if saw any key: value pair. + private bool _seenGlobalAgent; // True if processing global agent rules. + private bool _seenSpecificAgent; // True if processing our specific agent. + private bool _everSeenSpecificAgent; // True if we ever saw a block for our agent. + private bool _seenSeparator; // True if saw any key: value pair. - // The path we want to pattern match. Set by InitUserAgentsAndPath. - byte[]? _path; - private List? _userAgents; // Set by InitUserAgentsAndPath. + // The path we want to pattern match. Set by InitUserAgentsAndPath. + private byte[]? _path; + private List? _userAgents; // Set by InitUserAgentsAndPath. 
- public bool OneAgentAllowedByRobots(byte[] robotsContent, byte[] userAgent, string url) - { - var userAgents = new List { userAgent, }; - return AllowedByRobots(robotsContent, userAgents, url); - } + public bool OneAgentAllowedByRobots(byte[] robotsContent, byte[] userAgent, string url) + { + var userAgents = new List { userAgent, }; + return AllowedByRobots(robotsContent, userAgents, url); + } - internal static bool IsValidUserAgentToObey(Span userAgent) - { - return userAgent.Length > 0 && ExtractUserAgent(userAgent) == userAgent; - } - internal static bool IsValidUserAgentToObey(string userAgent) - { - return IsValidUserAgentToObey(Encoding.UTF8.GetBytes(userAgent)); - } + public static bool IsValidUserAgentToObey(Span userAgent) + { + return userAgent.Length > 0 && ExtractUserAgent(userAgent) == userAgent; + } + public static bool IsValidUserAgentToObey(string userAgent) + { + return IsValidUserAgentToObey(Encoding.UTF8.GetBytes(userAgent)); } } diff --git a/RobotsTxt/RobotsTxt.csproj b/RobotsTxt/RobotsTxt.csproj index 5f94b42..4ca386c 100644 --- a/RobotsTxt/RobotsTxt.csproj +++ b/RobotsTxt/RobotsTxt.csproj @@ -14,8 +14,8 @@ - - + + diff --git a/RobotsTxt/RobotsTxtParser.cs b/RobotsTxt/RobotsTxtParser.cs index fcf4609..5fb8418 100644 --- a/RobotsTxt/RobotsTxtParser.cs +++ b/RobotsTxt/RobotsTxtParser.cs @@ -1,248 +1,244 @@ -namespace RobotsTxt +namespace RobotsTxt; + +public class RobotsTxtParser(byte[] robotsBody, IRobotsParseHandler handler) { - public class RobotsTxtParser(byte[] robotsBody, IRobotsParseHandler handler) - { - static readonly byte[] UtfBom = [0xEF, 0xBB, 0xBF]; - static readonly byte[] HexDigits = "0123456789ABCDEF"u8.ToArray(); + private static readonly byte[] UtfBom = [0xEF, 0xBB, 0xBF,]; + private static readonly byte[] HexDigits = "0123456789ABCDEF"u8.ToArray(); - public void Parse() + public void Parse() + { + // Certain browsers limit the URL length to 2083 bytes. 
In a robots.txt, it's + // fairly safe to assume any valid line isn't going to be more than many times + // that max url length of 2KB. We want some padding for + // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well. + // If so, we can ignore the chars on a line past that. + const int maxLineLen = 2083 * 8; + // Allocate a buffer used to process the current line. + var lineBuffer = new byte[maxLineLen]; + var linePos = 0; + var lineNum = 0; + var bomPos = 0; + var lastWasCarriageReturn = false; + handler.HandleRobotsStart(); + + foreach (var ch in robotsBody) { - // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's - // fairly safe to assume any valid line isn't going to be more than many times - // that max url length of 2KB. We want some padding for - // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well. - // If so, we can ignore the chars on a line past that. - const int maxLineLen = 2083 * 8; - // Allocate a buffer used to process the current line. - var lineBuffer = new byte[maxLineLen]; - var linePos = 0; - var lineNum = 0; - var bomPos = 0; - bool lastWasCarriageReturn = false; - handler.HandleRobotsStart(); - - foreach (var ch in robotsBody) + // Google-specific optimization: UTF-8 byte order marks should never + // appear in a robots.txt file, but they do nevertheless. Skipping + // possible BOM-prefix in the first bytes of the input. + if (bomPos < 3 && ch == UtfBom[bomPos++]) { - // Google-specific optimization: UTF-8 byte order marks should never - // appear in a robots.txt file, but they do nevertheless. Skipping - // possible BOM-prefix in the first bytes of the input. - if (bomPos < 3 && ch == UtfBom[bomPos++]) - { - continue; - } + continue; + } - bomPos = 3; - if (ch != '\n' && ch != '\r') + bomPos = 3; + if (ch != '\n' && ch != '\r') + { + // Non-line-ending char case. + // Put in next spot on current line, as long as there's room. 
+ if (linePos < maxLineLen) { - // Non-line-ending char case. - // Put in next spot on current line, as long as there's room. - if (linePos < maxLineLen) - { - lineBuffer[linePos++] = ch; - } + lineBuffer[linePos++] = ch; } - else + } + else + { + // Line-ending character char case. + var span = lineBuffer.AsSpan(0, linePos); + // Only emit an empty line if this was not due to the second character + // of the DOS line-ending \r\n . + var isCrlfContinuation = span.Length == 0 && lastWasCarriageReturn && ch == '\n'; + if (!isCrlfContinuation) { - // Line-ending character char case. - var span = lineBuffer.AsSpan(0, linePos); - // Only emit an empty line if this was not due to the second character - // of the DOS line-ending \r\n . - bool isCrlfContinuation = span.Length == 0 && lastWasCarriageReturn && ch == '\n'; - if (!isCrlfContinuation) - { - ParseAndEmitLine(++lineNum, span); - } - - linePos = 0; - lastWasCarriageReturn = ch == '\r'; + ParseAndEmitLine(++lineNum, span); } + + linePos = 0; + lastWasCarriageReturn = ch == '\r'; } + } - var spanLeft = lineBuffer.AsSpan(0, linePos); - ParseAndEmitLine(++lineNum, spanLeft); - handler.HandleRobotsEnd(); + var spanLeft = lineBuffer.AsSpan(0, linePos); + ParseAndEmitLine(++lineNum, spanLeft); + handler.HandleRobotsEnd(); + } + + private void ParseAndEmitLine(int currentLine, ReadOnlySpan line) + { + if (!GetKeyAndValueFrom(out var stringKey, out var value, line)) + { + return; } - void ParseAndEmitLine(int currentLine, ReadOnlySpan line) + var key = new ParsedRobotsKey(); + key.Parse(stringKey); + if (NeedEscapeValueForKey(key)) { - if (!GetKeyAndValueFrom(out var stringKey, out var value, line)) - { - return; - } + var escapedValue = MaybeEscapePattern(value); + EmitKeyValueToHandler(currentLine, key, escapedValue); + } + else + { + EmitKeyValueToHandler(currentLine, key, value); + } + } + + private void EmitKeyValueToHandler(int currentLine, ParsedRobotsKey key, ReadOnlySpan value) + { + switch (key.Type) + { + case 
ParsedRobotsKey.KeyType.UserAgent: + handler.HandleUserAgent(currentLine, value); + break; + case ParsedRobotsKey.KeyType.Sitemap: + handler.HandleSitemap(currentLine, value); + break; + case ParsedRobotsKey.KeyType.Allow: + handler.HandleAllow(currentLine, value); + break; + case ParsedRobotsKey.KeyType.Disallow: + handler.HandleDisallow(currentLine, value); + break; + case ParsedRobotsKey.KeyType.Unknown: + handler.HandleUnknownAction(currentLine, key.UnknownText, value); + break; + default: + throw new ArgumentOutOfRangeException(nameof(key)); + } + } - ParsedRobotsKey key = new ParsedRobotsKey(); - key.Parse(stringKey); - if (NeedEscapeValueForKey(key)) + public static ReadOnlySpan MaybeEscapePattern(ReadOnlySpan src) + { + var numToEscape = 0; + var needCapitalize = false; + for (var i = 0; i < src.Length; i++) + { + // (a) % escape sequence. + var c = src[i]; + if (c == '%' && i + 2 < src.Length && + (('a' <= src[i + 1] && src[i + 1] <= 'f') || ('a' <= src[i + 2] && src[i + 2] <= 'f'))) { - var escapedValue = MaybeEscapePattern(value); - EmitKeyValueToHandler(currentLine, key, escapedValue); + needCapitalize = true; + i += 2; } - else + // (b) needs escaping. + else if (c >= 0x80) { - EmitKeyValueToHandler(currentLine, key, value); + numToEscape += 1; } + // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F). 
} - private void EmitKeyValueToHandler(int currentLine, ParsedRobotsKey key, ReadOnlySpan value) + if (numToEscape == 0 && !needCapitalize) { - switch (key.Type) - { - case ParsedRobotsKey.KeyType.UserAgent: - handler.HandleUserAgent(currentLine, value); - break; - case ParsedRobotsKey.KeyType.Sitemap: - handler.HandleSitemap(currentLine, value); - break; - case ParsedRobotsKey.KeyType.Allow: - handler.HandleAllow(currentLine, value); - break; - case ParsedRobotsKey.KeyType.Disallow: - handler.HandleDisallow(currentLine, value); - break; - case ParsedRobotsKey.KeyType.Unknown: - handler.HandleUnknownAction(currentLine, key.UnknownText, value); - break; - default: - throw new ArgumentOutOfRangeException(); - } + return src; } - public static ReadOnlySpan MaybeEscapePattern(ReadOnlySpan src) + var dst = new byte[numToEscape * 2 + src.Length]; + var j = 0; + for (var i = 0; i < src.Length; i++) { - int numToEscape = 0; - bool needCapitalize = false; - for (int i = 0; i < src.Length; i++) + var c = src[i]; + if (c == '%' && i + 2 < src.Length && src[i + 1].IsXDigit() && src[i + 2].IsXDigit()) { - // (a) % escape sequence. - var c = src[i]; - if (c == '%' && i + 2 < src.Length && - (('a' <= src[i + 1] && src[i + 1] <= 'f') || ('a' <= src[i + 2] && src[i + 2] <= 'f'))) - { - needCapitalize = true; - i += 2; - } - // (b) needs escaping. - else if (c >= 0x80) - { - numToEscape += 1; - } - // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F). 
+ dst[j++] = src[i++]; + dst[j++] = src[i++].ToUpper(); + dst[j++] = src[i++].ToUpper(); } - - if (numToEscape == 0 && !needCapitalize) + else if (c >= 0x80) { - return src; + dst[j++] = (byte)'%'; + dst[j++] = HexDigits[(c >> 4) & 0xf]; + dst[j++] = HexDigits[c & 0xf]; } - - var dst = new byte[numToEscape * 2 + src.Length]; - var j = 0; - for (int i = 0; i < src.Length; i++) + else { - var c = src[i]; - if (c == '%' && i + 2 < src.Length && src[i + 1].IsXDigit() && src[i + 2].IsXDigit()) - { - dst[j++] = src[i++]; - dst[j++] = src[i++].ToUpper(); - dst[j++] = src[i++].ToUpper(); - } - else if (c >= 0x80) - { - dst[j++] = (byte)'%'; - dst[j++] = HexDigits[(c >> 4) & 0xf]; - dst[j++] = HexDigits[c & 0xf]; - } - else - { - dst[j++] = c; - } + dst[j++] = c; } - - return dst; } - private bool NeedEscapeValueForKey(ParsedRobotsKey key) + return dst; + } + + private static bool NeedEscapeValueForKey(ParsedRobotsKey key) + { + return key.Type switch { - switch (key.Type) - { - case ParsedRobotsKey.KeyType.UserAgent: - case ParsedRobotsKey.KeyType.Sitemap: - return false; - default: - return true; - } - } + ParsedRobotsKey.KeyType.UserAgent or ParsedRobotsKey.KeyType.Sitemap => false, + _ => true, + }; + } - internal static bool GetKeyAndValueFrom(out ReadOnlySpan key, out ReadOnlySpan value, - ReadOnlySpan line) + internal static bool GetKeyAndValueFrom(out ReadOnlySpan key, out ReadOnlySpan value, + ReadOnlySpan line) + { + var comment = line.IndexOf((byte)'#'); + if (comment != -1) { - var comment = line.IndexOf((byte)'#'); - if (comment != -1) - { - line = line.Slice(0, comment); - } + line = line[..comment]; + } - line = StripWhitespaceSlowly(line); + line = StripWhitespaceSlowly(line); - // Rules must match the following pattern: - // [ \t]*:[ \t]* - var sep = line.IndexOf((byte)':'); - if (sep == -1) + // Rules must match the following pattern: + // [ \t]*:[ \t]* + var sep = line.IndexOf((byte)':'); + if (sep == -1) + { + // Google-specific optimization: some people 
forget the colon, so we need to + // accept whitespace in its stead. + sep = line.IndexOfAny((byte)' ', (byte)'\t'); + if (sep != -1) { - // Google-specific optimization: some people forget the colon, so we need to - // accept whitespace in its stead. - sep = line.IndexOfAny((byte)' ', (byte)'\t'); - if (sep != -1) + var val = line[(sep + 1)..]; + if (val.IndexOfAny((byte)' ', (byte)'\t') != -1) { - var val = line.Slice(sep + 1); - if (val.IndexOfAny((byte)' ', (byte)'\t') != -1) - { - // We only accept whitespace as a separator if there are exactly two - // sequences of non-whitespace characters. If we get here, there were - // more than 2 such sequences since we stripped trailing whitespace - // above. - key = null; - value = null; - return false; - } + // We only accept whitespace as a separator if there are exactly two + // sequences of non-whitespace characters. If we get here, there were + // more than 2 such sequences since we stripped trailing whitespace + // above. + key = null; + value = null; + return false; } } + } - if (sep == -1) - { - key = null; - value = null; - return false; // Couldn't find a separator. - } - - key = line.Slice(0, sep); // Key starts at beginning of line. And stops at the separator. - key = StripWhitespaceSlowly(key); // Get rid of any trailing whitespace. - if (key.Length > 0) - { - value = line.Slice(sep + 1); // Value starts after the separator. - value = StripWhitespaceSlowly(value); // Get rid of any leading whitespace. - return true; - } - + if (sep == -1) + { + key = null; value = null; - return false; + return false; // Couldn't find a separator. } - internal static ReadOnlySpan StripWhitespaceSlowly(ReadOnlySpan s) + key = line[..sep]; // Key starts at beginning of line. And stops at the separator. + key = StripWhitespaceSlowly(key); // Get rid of any trailing whitespace. 
+ if (key.Length > 0) { - int start, end; - for (start = 0; start < s.Length; start++) - { - if (s[start] != ' ' && s[start] != '\t') - break; - } + value = line[(sep + 1)..]; // Value starts after the separator. + value = StripWhitespaceSlowly(value); // Get rid of any leading whitespace. + return true; + } - for (end = s.Length; end > start; end--) - { - if (s[end - 1] != ' ' && s[end - 1] != '\t') - break; - } + value = null; + return false; + } + + internal static ReadOnlySpan StripWhitespaceSlowly(ReadOnlySpan s) + { + int start, end; + for (start = 0; start < s.Length; start++) + { + if (s[start] != ' ' && s[start] != '\t') + break; + } - return s.Slice(start, end - start); + for (end = s.Length; end > start; end--) + { + if (s[end - 1] != ' ' && s[end - 1] != '\t') + break; } + + return s.Slice(start, end - start); } } diff --git a/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs b/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs index 82dacae..7c40643 100644 --- a/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs +++ b/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs @@ -1,5 +1,7 @@ using System.Text; + using Xunit; + using RobotsTxt; namespace TestRobotsTxt @@ -22,11 +24,19 @@ public class TestsLongestMatchRobotsMatchStrategy public void TestMatch(string path, string pattern, bool expected) { var actual = - LongestMatchRobotsMatchStrategy.Matches( + LongestMatchRobotsMatchStrategy.MatchesSlow( Encoding.UTF8.GetBytes(path), Encoding.UTF8.GetBytes(pattern) ); Assert.Equal(expected, actual); + var haveWildcards = pattern.Length >= 1 && (pattern.Contains('*') || pattern[^1] == '$'); + actual = + LongestMatchRobotsMatchStrategy.MatchesFast( + Encoding.UTF8.GetBytes(path), + Encoding.UTF8.GetBytes(pattern), + haveWildcards + ); + Assert.Equal(expected, actual); } } } diff --git a/TestRobotsTxt/TestRobotsMachine.cs b/TestRobotsTxt/TestRobotsMachine.cs new file mode 100644 index 0000000..ae88f13 --- /dev/null +++ b/TestRobotsTxt/TestRobotsMachine.cs 
@@ -0,0 +1,434 @@ +using System.Text; + +using RobotsTxt; + +using Xunit; + +namespace TestRobotsTxt; + +public class TestRobotsMachine +{ + private readonly byte[][] _robotsTxt = new byte[][] + { + @"# ROW robots from TAS +# update 08-12-2024 semi configs and login redirect blocked +# Updated on 06-09-2023 +# Added Disallow: /*/orders for all bots - SELF-223 +# Added Disallow for COAs for google, bing and APAC bots 09-27-2024 + +User-Agent: * +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: */search/*focus=papers +Disallow: /*/*/life-science/assistant + +#Specific allows for chatGPT - note directives apply to both bots +User-agent: GPTBot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +User-agent: ChatGPT-User +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /*/orders +Disallow: 
/*undefined/undefined +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +User-Agent: Googlebot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Allow: /api?operation=PricingAndAvailability +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: */search/*focus=papers +Disallow: /*/*/life-science/assistant + +# added 03-20-2024 +User-Agent: Bingbot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Allow: /api?operation=PricingAndAvailability +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-Agent: Botify +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Allow: /api?operation=PricingAndAvailability +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +User-Agent: Adsbot-Google +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +# APAC Bots +# China +User-Agent: Baiduspider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: /api +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: Sosospider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: Sogou spider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: Sogou+spider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: YoudaoBot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +# Naverbot - Korea +User-agent: Yeti +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +# Daum +User-agent: DAUM +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +# Yandex +# Added gc fb sid id redirect param clean 12/13/2021 +User-agent: YandexBot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search/?focus= +Disallow: /*/search?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +Clean-param: redirect /*/login +Clean-param: gc /* +Clean-param: fb /* +Clean-param: redirect /* +Clean-param: sid /* +Clean-param: id /* + +# Added 11-03-2022 +User-agent: PetalBot +Disallow: / + +User-agent: ConveraCrawler +Disallow: / + +User-agent: DotBot +Disallow: / + +User-agent: ingenieur +Disallow: / + +User-agent: Mail.Ru +Disallow: / + +User-agent: JikeSpider +Disallow: / + +User-agent: EasouSpider +Disallow: / + +User-agent: YisouSpider +Disallow: / + +Sitemap: https://www.sigmaaldrich.com/sitemap_index.xml +"u8.ToArray(), + @"User-agent: * +Disallow: /account/ +Disallow: /adRedir.do* +Disallow: /ads/ +Disallow: /b2b/ +Disallow: /billboard/ +Disallow: /cart/ +Disallow: /catalog/browseCatalog.do* 
+Disallow: /catalogrequest/ +Disallow: /catalog/search.do* +Disallow: /checkout/ +Disallow: /common/ +Disallow: /compare/ +Disallow: /contracts/ +Disallow: /csl/ +Disallow: /customerservice/ +Disallow: /default/ +Disallow: /employeepurchase +Disallow: /employeepurchases.do* +Disallow: /epp +Disallow: /examples/ +Disallow: /inkTonerManuf.do* +Disallow: /internal/ +Disallow: /mb/search.do* +Disallow: /mb/stores/list.do* +Disallow: /mb/wifiConnect.do* +Disallow: /mb/cart.do* +Disallow: /orderhistory/ +Disallow: /printconfigurator/ +Disallow: /promo/ +Disallow: /qp/ +Disallow: /shop/ +Disallow: /storelocator/wifiConnect.do* +Disallow: /stores/wifiConnect.do* +Disallow: /tealeaf/ +Disallow: /textSearch.do* +Disallow: /txtSearchDD.do* +Disallow: /userprofile/ +Disallow: /vendor/ +Disallow: /workflow/ +Disallow: /select +Disallow: /businessrewards/ +Disallow: /ccpa/lookup.do* +Disallow: /a/search/ +Disallow: /b/widget/ +Disallow: /b/*/*/*/*/N-* +Disallow: /b/clearance/ +Allow: /b/clearance/Featured_Items--Clearance/clearance +Sitemap: https://www.example.com/sitemap.xml"u8.ToArray(), + }; + + [Theory] + [InlineData(0, "/US/en/search/7423-31-6?focus=papers&page=1&perpage=30&sort=relevance&term=7423-31-6&type=citation_search", false)] + [InlineData(1, "/", true)] + public void Test1(int index, string path, bool expected) + { + var machine = new RobotsMachine(_robotsTxt[index], + ["botify"u8.ToArray(), "googlebot"u8.ToArray()]); + var actual = machine.PathAllowedByRobots(Encoding.UTF8.GetBytes(path)); + Assert.Equal(expected, actual); + } +} diff --git a/TestRobotsTxt/TestRobotsTxt.csproj b/TestRobotsTxt/TestRobotsTxt.csproj index 9493cfb..70211f6 100644 --- a/TestRobotsTxt/TestRobotsTxt.csproj +++ b/TestRobotsTxt/TestRobotsTxt.csproj @@ -11,13 +11,13 @@ - - - + + + runtime; build; native; contentfiles; analyzers; buildtransitive all - + runtime; build; native; contentfiles; analyzers; buildtransitive all