From 14272476847bafdf4a85c5fba4710a1cc6724fb3 Mon Sep 17 00:00:00 2001 From: Yves Bastide Date: Sun, 7 Sep 2025 20:31:51 +0200 Subject: [PATCH 1/4] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20rules=20=E2=80=9Ccompi?= =?UTF-8?q?ler=E2=80=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-parse robots.txt rules. --- RobotsTxt/LongestMatchRobotsMatchStrategy.cs | 81 +++- RobotsTxt/RobotsMachine.cs | 239 ++++++++++ RobotsTxt/RobotsMatcher.cs | 24 +- .../TestLongestMatchRobotsMatchStrategy.cs | 12 +- TestRobotsTxt/TestRobotsMachine.cs | 434 ++++++++++++++++++ 5 files changed, 770 insertions(+), 20 deletions(-) create mode 100644 RobotsTxt/RobotsMachine.cs create mode 100644 TestRobotsTxt/TestRobotsMachine.cs diff --git a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs index 2898083..27d626a 100644 --- a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs +++ b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs @@ -1,4 +1,6 @@ -namespace RobotsTxt +using System.Runtime.CompilerServices; + +namespace RobotsTxt { /// @@ -17,17 +19,17 @@ /// internal static class LongestMatchRobotsMatchStrategy { - internal static int MatchAllow(ReadOnlySpan path, ReadOnlySpan pattern) + internal static int MatchAllowSlow(ReadOnlySpan path, ReadOnlySpan pattern) { - return Matches(path, pattern) ? pattern.Length : -1; + return MatchesSlow(path, pattern) ? pattern.Length : -1; } - internal static int MatchDisallow(ReadOnlySpan path, ReadOnlySpan pattern) + internal static int MatchDisallowSlow(ReadOnlySpan path, ReadOnlySpan pattern) { - return Matches(path, pattern) ? pattern.Length : -1; + return MatchesSlow(path, pattern) ? pattern.Length : -1; } - internal static bool Matches(ReadOnlySpan path, ReadOnlySpan pattern) + internal static bool MatchesSlow(ReadOnlySpan path, ReadOnlySpan pattern) { var pathlen = path.Length; var pos = new int[pathlen + 1]; @@ -68,5 +70,72 @@ internal static bool Matches(ReadOnlySpan path, ReadOnlySpan pattern return true; } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int MatchAllowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int MatchDisallowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + return MatchesFast(path, pattern, haveWildcards) ? 
pattern.Length : -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool MatchesFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + if (pattern.Length == 0) return true; + if (path.Length == 0) return pattern.Length == 0; + + if (!haveWildcards) + { + return path.IndexOf(pattern) != -1; + } + + Span pos = stackalloc int[path.Length + 1]; + int numpos = 1; + + for (var j = 0; j < pattern.Length; j++) + { + var ch = pattern[j]; + + // Check for end anchor + if (ch == '$' && j + 1 == pattern.Length) + { + return pos[numpos - 1] == path.Length; + } + + if (ch == '*') + { + int startPos = pos[0]; + numpos = path.Length - startPos + 1; + + for (int i = 0; i < numpos; i++) + { + pos[i] = startPos + i; + } + } + else + { + int newnumpos = 0; + int pathLen = path.Length; + + for (int i = 0; i < numpos && pos[i] < pathLen; i++) + { + if (path[pos[i]] == ch) + { + pos[newnumpos++] = pos[i] + 1; + } + } + + if (newnumpos == 0) return false; + numpos = newnumpos; + } + } + + return true; + } } } diff --git a/RobotsTxt/RobotsMachine.cs b/RobotsTxt/RobotsMachine.cs new file mode 100644 index 0000000..61061d1 --- /dev/null +++ b/RobotsTxt/RobotsMachine.cs @@ -0,0 +1,239 @@ +using System.Runtime.CompilerServices; + +namespace RobotsTxt; + +public class RobotsMachine : IRobotsParseHandler +{ + class State + { + } + + // class StartState : State + // { + // } + + class UserAgentState(UserAgentState.UserAgentType type) : State + { + // Either store all UAs with their rules, or just the last useful one. + public enum UserAgentType + { + // Unknown, + Global, + Specific, + } + + // Remove? + public UserAgentType Type { get; } = type; + } + + class AllowState(byte[] pattern, bool haveWildcards) : State + { + public byte[] Pattern { get; } = pattern; + public bool HaveWildcards { get; } = haveWildcards; + } + + class DisallowState(byte[] pattern, bool haveWildcards) : State + { + public byte[] Pattern { get; } = pattern; + public bool HaveWildcards { get; } = haveWildcards; + } + + private readonly List _userAgents; + + private List _globalStates = new(); + private List _specificStates = new(); + + private bool _currentAgentIsSpecific = false; // True if we're in a block for our agent. + private bool EverSeenSpecificAgent => _specificStates.Count > 0; + + public RobotsMachine(byte[] robotsBody, List userAgents) + { + _userAgents = userAgents; + ParseRobotsTxt(robotsBody, this); + } + + private static void ParseRobotsTxt(byte[] robotsBody, IRobotsParseHandler parseCallback) + { + var parser = new RobotsTxtParser(robotsBody, parseCallback); + parser.Parse(); + } + + public void HandleRobotsStart() + { + } + + public void HandleRobotsEnd() + { + } + + internal static ReadOnlySpan ExtractUserAgent(ReadOnlySpan userAgent) + { + // Allowed characters in user-agent are [a-zA-Z_-]. + var i = 0; + for (; i < userAgent.Length; i++) + { + var c = userAgent[i]; + if (!(c.IsAlpha() || c == '_' || c == '-')) + { + break; + } + } + + return userAgent.Slice(0, i); + } + + public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) + { + // Google-specific optimization: a '*' followed by space and more characters + // in a user-agent record is still regarded a global rule. 
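+        // For example, a record like "User-agent: * foo" (a hypothetical
+        // value) selects the global group exactly as "User-agent: *" does;
+        // everything after the '*' and the space is ignored.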
+ if (userAgent.Length >= 1 && userAgent[0] == '*' && (userAgent.Length == 1 || userAgent[1].IsSpace())) + { + _globalStates.Add(new UserAgentState(UserAgentState.UserAgentType.Global)); + _currentAgentIsSpecific = false; + return; + } + userAgent = ExtractUserAgent(userAgent); + foreach (var ua in _userAgents) + { + if (!userAgent.EqualsIgnoreCase(ua)) continue; + _specificStates.Add(new UserAgentState(UserAgentState.UserAgentType.Specific)); + _currentAgentIsSpecific = true; + return; + } + } + + private bool SeenAnyAgent => _specificStates.Count > 0 || _globalStates.Count > 0; + + public void HandleAllow(int lineNum, ReadOnlySpan value) + { + if (!SeenAnyAgent) + return; + var states = _currentAgentIsSpecific ? _specificStates : _globalStates; + var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); + states.Add(new AllowState(value.ToArray(), haveWildcards)); + } + public void HandleDisallow(int lineNum, ReadOnlySpan value) + { + if (!SeenAnyAgent) + return; + var states = _currentAgentIsSpecific ? _specificStates : _globalStates; + var haveWildcards = value.Length >= 1 && (value.Contains((byte)'*') || value[^1] == '$'); + states.Add(new DisallowState(value.ToArray(), haveWildcards)); + } + + public void HandleSitemap(int lineNum, ReadOnlySpan value) + { + } + + public void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value) + { + } + + public bool PathAllowedByRobots(byte[] path) + { + return !Disallow(path); + } + + private bool Disallow(byte[] path) + { + if (!SeenAnyAgent) + return false; + + var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates); + if (allowHierarchy.Priority > 0 || disallowHierarchy.Priority > 0) + { + return (disallowHierarchy.Priority > allowHierarchy.Priority); + } + + if (EverSeenSpecificAgent) + { + // Matching group for user-agent but either without disallow or empty one, + // i.e. priority == 0. + return false; + } + + (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _globalStates); + + if (disallowHierarchy.Priority > 0 || allowHierarchy.Priority > 0) + { + return disallowHierarchy.Priority > allowHierarchy.Priority; + } + + return false; + } + + private (Match, Match) AssessAccessRules(byte[] path, List states) + { + Match allowHierarchy = new(); // Characters of 'url' matching Allow. + Match disallowHierarchy = new(); // Characters of 'url' matching Disallow. + foreach (var state in states) + { + switch (state) + { + case AllowState allow: + CheckAllow(path, allow.Pattern, allow.HaveWildcards, allowHierarchy); + break; + case DisallowState disallow: + CheckDisallow(path, disallow.Pattern, disallow.HaveWildcards, disallowHierarchy); + break; + } + } + return (allowHierarchy, disallowHierarchy); + } + + private class Match(int priority = Match.NoMatchPriority) + { + private const int NoMatchPriority = -1; + + public void Clear() + { + Priority = NoMatchPriority; + } + + public int Priority { get; set; } = priority; + } + + readonly byte[] _indexHtmBytes = "/index.htm"u8.ToArray(); + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void CheckAllow(byte[] path, ReadOnlySpan pattern, bool haveWildcards, Match allow) + { + var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards); + if (priority >= 0) + { + if (allow.Priority < priority) + { + allow.Priority = priority; + } + } + else + { + // Google-specific optimization: 'index.htm' and 'index.html' are normalized + // to '/'. 
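+            // For example, a hypothetical "Allow: /folder/index.html" rule is
+            // retried below as "Allow: /folder/$", so "/folder/" itself is
+            // allowed as well.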
+ var slashPos = pattern.LastIndexOf((byte)'/'); + + if (slashPos != -1 && + pattern.Slice(slashPos).StartsWith(_indexHtmBytes)) + { + var len = slashPos + 1; + var newpattern = new byte[len + 1]; + pattern.Slice(0, len).CopyTo(newpattern); + newpattern[len] = (byte)'$'; + CheckAllow(path, newpattern, true, allow); + } + } + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + private void CheckDisallow(byte[] path, ReadOnlySpan value, bool haveWildcards, Match disallow) + { + var priority = LongestMatchRobotsMatchStrategy.MatchDisallowFast(path, value, haveWildcards); + if (priority >= 0) + { + if (disallow.Priority < priority) + { + disallow.Priority = priority; + } + } + } +} diff --git a/RobotsTxt/RobotsMatcher.cs b/RobotsTxt/RobotsMatcher.cs index b944065..fbe6add 100644 --- a/RobotsTxt/RobotsMatcher.cs +++ b/RobotsTxt/RobotsMatcher.cs @@ -90,7 +90,7 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) return; Debug.Assert(_allow != null); _seenSeparator = true; - var priority = LongestMatchRobotsMatchStrategy.MatchAllow(_path, value); + var priority = LongestMatchRobotsMatchStrategy.MatchAllowSlow(_path, value); if (priority >= 0) { if (_seenSpecificAgent) @@ -98,7 +98,7 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) Debug.Assert(_allow.Specific != null); if (_allow.Specific.Priority < priority) { - _allow.Specific.Set(priority, lineNum); + _allow.Specific.Set(priority); } } else @@ -107,7 +107,7 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) Debug.Assert(_allow.Global != null); if (_allow.Global.Priority < priority) { - _allow.Global.Set(priority, lineNum); + _allow.Global.Set(priority); } } } @@ -134,14 +134,14 @@ public void HandleDisallow(int lineNum, ReadOnlySpan value) if (!SeenAnyAgent) return; _seenSeparator = true; - var priority = LongestMatchRobotsMatchStrategy.MatchDisallow(_path, value); + var priority = LongestMatchRobotsMatchStrategy.MatchDisallowSlow(_path, value); if (priority >= 0) { if (_seenSpecificAgent) { if (_disallow.Specific.Priority < priority) { - _disallow.Specific.Set(priority, lineNum); + _disallow.Specific.Set(priority); } } else @@ -149,7 +149,7 @@ public void HandleDisallow(int lineNum, ReadOnlySpan value) Debug.Assert(_seenGlobalAgent); if (_disallow.Global.Priority < priority) { - _disallow.Global.Set(priority, lineNum); + _disallow.Global.Set(priority); } } } @@ -256,23 +256,21 @@ internal static string GetPathParamsQuery(string url) return "/"; } - class Match(int priority = Match.NoMatchPriority, int line = 0) + private class Match(int priority = Match.NoMatchPriority) { private const int NoMatchPriority = -1; - public void Set(int priority, int line) + public void Set(int priority) { Priority = priority; - Line = line; } public void Clear() { - Set(NoMatchPriority, 0); + Set(NoMatchPriority); } public int Priority { get; private set; } = priority; - public int Line { get; private set; } = line; } // For each of the directives within user-agents, we keep global and specific @@ -308,11 +306,11 @@ public bool OneAgentAllowedByRobots(byte[] robotsContent, byte[] userAgent, stri return AllowedByRobots(robotsContent, userAgents, url); } - internal static bool IsValidUserAgentToObey(Span userAgent) + public static bool IsValidUserAgentToObey(Span userAgent) { return userAgent.Length > 0 && ExtractUserAgent(userAgent) == userAgent; } - internal static bool IsValidUserAgentToObey(string userAgent) + public static bool IsValidUserAgentToObey(string userAgent) { return 
IsValidUserAgentToObey(Encoding.UTF8.GetBytes(userAgent)); } diff --git a/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs b/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs index 82dacae..7c40643 100644 --- a/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs +++ b/TestRobotsTxt/TestLongestMatchRobotsMatchStrategy.cs @@ -1,5 +1,7 @@ using System.Text; + using Xunit; + using RobotsTxt; namespace TestRobotsTxt @@ -22,11 +24,19 @@ public class TestsLongestMatchRobotsMatchStrategy public void TestMatch(string path, string pattern, bool expected) { var actual = - LongestMatchRobotsMatchStrategy.Matches( + LongestMatchRobotsMatchStrategy.MatchesSlow( Encoding.UTF8.GetBytes(path), Encoding.UTF8.GetBytes(pattern) ); Assert.Equal(expected, actual); + var haveWildcards = pattern.Length >= 1 && (pattern.Contains('*') || pattern[^1] == '$'); + actual = + LongestMatchRobotsMatchStrategy.MatchesFast( + Encoding.UTF8.GetBytes(path), + Encoding.UTF8.GetBytes(pattern), + haveWildcards + ); + Assert.Equal(expected, actual); } } } diff --git a/TestRobotsTxt/TestRobotsMachine.cs b/TestRobotsTxt/TestRobotsMachine.cs new file mode 100644 index 0000000..ae88f13 --- /dev/null +++ b/TestRobotsTxt/TestRobotsMachine.cs @@ -0,0 +1,434 @@ +using System.Text; + +using RobotsTxt; + +using Xunit; + +namespace TestRobotsTxt; + +public class TestRobotsMachine +{ + private readonly byte[][] _robotsTxt = new byte[][] + { + @"# ROW robots from TAS +# update 08-12-2024 semi configs and login redirect blocked +# Updated on 06-09-2023 +# Added Disallow: /*/orders for all bots - SELF-223 +# Added Disallow for COAs for google, bing and APAC bots 09-27-2024 + +User-Agent: * +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: */search/*focus=papers +Disallow: /*/*/life-science/assistant + +#Specific allows for chatGPT - note directives apply to both bots +User-agent: GPTBot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +User-agent: ChatGPT-User +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +User-Agent: Googlebot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Allow: /api?operation=PricingAndAvailability +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: */search/*focus=papers +Disallow: /*/*/life-science/assistant + +# added 03-20-2024 +User-Agent: Bingbot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Allow: /api?operation=PricingAndAvailability +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-Agent: Botify +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Allow: /api?operation=PricingAndAvailability +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +User-Agent: Adsbot-Google +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: */jcr:content/ +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/life-science/assistant + +# APAC Bots +# China +User-Agent: Baiduspider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: /api +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: Sosospider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: Sogou spider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: Sogou+spider +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +User-agent: YoudaoBot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +# Naverbot - Korea +User-agent: Yeti +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +# Daum +User-agent: DAUM +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search?focus= +Disallow: /*/search/?focus= +Disallow: /*/product/compare? +Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +# Yandex +# Added gc fb sid id redirect param clean 12/13/2021 +User-agent: YandexBot +Allow: / +Disallow: /*/semi-configurators/sirna?term= +Disallow: /*/semi-configurators/shrna?term= +Disallow: */jcr:content/ +Disallow: login?redirect +Disallow: /*/login?redirect +Disallow: /api +Disallow: /*/search/?focus= +Disallow: /*/search?focus= +Disallow: /*/product/compare? 
+Disallow: /*/product/compare- +Disallow: /*/orders +Disallow: /*undefined/undefined +Disallow: /*/*/life-science/quality-and-regulatory-management/m-clarity-program +Disallow: /*/*/services/support/bulk-quotation-request +Disallow: /*/*/coa/ +Disallow: /certificates/Graphics/COfAInfo/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/ +Disallow: /certificates/sapfs/PROD/sap/certificate_pdfs/COA/Q14/ +Disallow: /*/*/life-science/assistant + +Clean-param: redirect /*/login +Clean-param: gc /* +Clean-param: fb /* +Clean-param: redirect /* +Clean-param: sid /* +Clean-param: id /* + +# Added 11-03-2022 +User-agent: PetalBot +Disallow: / + +User-agent: ConveraCrawler +Disallow: / + +User-agent: DotBot +Disallow: / + +User-agent: ingenieur +Disallow: / + +User-agent: Mail.Ru +Disallow: / + +User-agent: JikeSpider +Disallow: / + +User-agent: EasouSpider +Disallow: / + +User-agent: YisouSpider +Disallow: / + +Sitemap: https://www.sigmaaldrich.com/sitemap_index.xml +"u8.ToArray(), + @"User-agent: * +Disallow: /account/ +Disallow: /adRedir.do* +Disallow: /ads/ +Disallow: /b2b/ +Disallow: /billboard/ +Disallow: /cart/ +Disallow: /catalog/browseCatalog.do* +Disallow: /catalogrequest/ +Disallow: /catalog/search.do* +Disallow: /checkout/ +Disallow: /common/ +Disallow: /compare/ +Disallow: /contracts/ +Disallow: /csl/ +Disallow: /customerservice/ +Disallow: /default/ +Disallow: /employeepurchase +Disallow: /employeepurchases.do* +Disallow: /epp +Disallow: /examples/ +Disallow: /inkTonerManuf.do* +Disallow: /internal/ +Disallow: /mb/search.do* +Disallow: /mb/stores/list.do* +Disallow: /mb/wifiConnect.do* +Disallow: /mb/cart.do* +Disallow: /orderhistory/ +Disallow: /printconfigurator/ +Disallow: /promo/ +Disallow: /qp/ +Disallow: /shop/ +Disallow: /storelocator/wifiConnect.do* +Disallow: /stores/wifiConnect.do* +Disallow: /tealeaf/ +Disallow: /textSearch.do* +Disallow: /txtSearchDD.do* +Disallow: /userprofile/ +Disallow: /vendor/ +Disallow: /workflow/ +Disallow: /select +Disallow: /businessrewards/ +Disallow: /ccpa/lookup.do* +Disallow: /a/search/ +Disallow: /b/widget/ +Disallow: /b/*/*/*/*/N-* +Disallow: /b/clearance/ +Allow: /b/clearance/Featured_Items--Clearance/clearance +Sitemap: https://www.example.com/sitemap.xml"u8.ToArray(), + }; + + [Theory] + [InlineData(0, "/US/en/search/7423-31-6?focus=papers&page=1&perpage=30&sort=relevance&term=7423-31-6&type=citation_search", false)] + [InlineData(1, "/", true)] + public void Test1(int index, string path, bool expected) + { + var machine = new RobotsMachine(_robotsTxt[index], + ["botify"u8.ToArray(), "googlebot"u8.ToArray()]); + var actual = machine.PathAllowedByRobots(Encoding.UTF8.GetBytes(path)); + Assert.Equal(expected, actual); + } +} From 8b01c214b07a4bafe05fb893db50ff403fbf21f3 Mon Sep 17 00:00:00 2001 From: Yves Bastide Date: Sun, 5 Oct 2025 16:43:51 +0200 Subject: [PATCH 2/4] =?UTF-8?q?=F0=9F=8E=A8=20cleanups?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Yves Bastide --- RobotsTxt/Extensions.cs | 103 +++-- RobotsTxt/IRobotsParseHandler.cs | 21 +- RobotsTxt/LongestMatchRobotsMatchStrategy.cs | 196 +++++---- RobotsTxt/ParsedRobotsKey.cs | 136 +++--- RobotsTxt/RobotsMachine.cs | 102 ++--- RobotsTxt/RobotsMatcher.cs | 434 +++++++++---------- RobotsTxt/RobotsTxtParser.cs | 388 ++++++++--------- 7 files changed, 670 insertions(+), 710 deletions(-) diff --git a/RobotsTxt/Extensions.cs b/RobotsTxt/Extensions.cs index ee5e28c..107233d 100644 --- a/RobotsTxt/Extensions.cs +++ 
b/RobotsTxt/Extensions.cs @@ -1,73 +1,72 @@ -namespace RobotsTxt +namespace RobotsTxt; + +public static class MyExtensions { - public static class MyExtensions + public static bool EqualsIgnoreCase(this ReadOnlySpan self, ReadOnlySpan other) { - public static bool EqualsIgnoreCase(this ReadOnlySpan self, ReadOnlySpan other) + if (self.Length != other.Length) + { + return false; + } + + for (var i = 0; i < self.Length; i++) { - if (self.Length != other.Length) + var c1 = self[i]; + var c2 = other[i]; + if ('A' <= c1 && c1 <= 'Z') + c1 += 32; + if ('A' <= c2 && c2 <= 'Z') + c2 += 32; + if (c1 != c2) { return false; } + } - for (var i = 0; i < self.Length; i++) - { - var c1 = self[i]; - var c2 = other[i]; - if ('A' <= c1 && c1 <= 'Z') - c1 += 32; - if ('A' <= c2 && c2 <= 'Z') - c2 += 32; - if (c1 != c2) - { - return false; - } - } + return true; + } - return true; + public static bool StartsWithIgnoreCase(this ReadOnlySpan span, ReadOnlySpan value) + { + if (span.Length < value.Length) + { + return false; } - public static bool StartsWithIgnoreCase(this ReadOnlySpan span, ReadOnlySpan value) + for (var i = 0; i < value.Length; i++) { - if (span.Length < value.Length) + var c1 = span[i]; + var c2 = value[i]; + if ('A' <= c1 && c1 <= 'Z') + c1 += 32; + if ('A' <= c2 && c2 <= 'Z') + c2 += (byte)' '; + if (c1 != c2) { return false; } - - for (var i = 0; i < value.Length; i++) - { - var c1 = span[i]; - var c2 = value[i]; - if ('A' <= c1 && c1 <= 'Z') - c1 += 32; - if ('A' <= c2 && c2 <= 'Z') - c2 += (byte)' '; - if (c1 != c2) - { - return false; - } - } - - return true; } - public static bool IsXDigit(this byte c) - { - return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'); - } + return true; + } - public static bool IsAlpha(this byte c) - { - return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); - } + public static bool IsXDigit(this byte c) + { + return ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F'); + } - public static bool IsSpace(this byte c) - { - return c == ' ' || c == '\t'; - } + public static bool IsAlpha(this byte c) + { + return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'); + } - public static byte ToUpper(this byte c) - { - return (byte)('a' <= c && c <= 'z' ? c - ' ' : c); - } + public static bool IsSpace(this byte c) + { + return c == ' ' || c == '\t'; + } + + public static byte ToUpper(this byte c) + { + return (byte)('a' <= c && c <= 'z' ? 
c - ' ' : c); } } diff --git a/RobotsTxt/IRobotsParseHandler.cs b/RobotsTxt/IRobotsParseHandler.cs index 8a6cf60..1b673ec 100644 --- a/RobotsTxt/IRobotsParseHandler.cs +++ b/RobotsTxt/IRobotsParseHandler.cs @@ -1,14 +1,13 @@ -namespace RobotsTxt +namespace RobotsTxt; + +public interface IRobotsParseHandler { - public interface IRobotsParseHandler - { - void HandleRobotsStart(); - void HandleRobotsEnd(); + void HandleRobotsStart(); + void HandleRobotsEnd(); - void HandleUserAgent(int lineNum, ReadOnlySpan value); - void HandleAllow(int lineNum, ReadOnlySpan value); - void HandleDisallow(int lineNum, ReadOnlySpan value); - void HandleSitemap(int lineNum, ReadOnlySpan value); - void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value); - } + void HandleUserAgent(int lineNum, ReadOnlySpan userAgent); + void HandleAllow(int lineNum, ReadOnlySpan value); + void HandleDisallow(int lineNum, ReadOnlySpan value); + void HandleSitemap(int lineNum, ReadOnlySpan value); + void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value); } diff --git a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs index 27d626a..b1942b6 100644 --- a/RobotsTxt/LongestMatchRobotsMatchStrategy.cs +++ b/RobotsTxt/LongestMatchRobotsMatchStrategy.cs @@ -1,141 +1,139 @@ using System.Runtime.CompilerServices; -namespace RobotsTxt +namespace RobotsTxt; + +/// +/// A RobotsMatchStrategy defines a strategy for matching individual lines in a +/// robots.txt file. Each Match* method should return a match priority, which is +/// interpreted as: +/// +/// match priority < 0: +/// No match. +/// +/// match priority == 0: +/// Match, but treat it as if matched an empty pattern. +/// +/// match priority > 0: +/// Match. +/// +internal static class LongestMatchRobotsMatchStrategy { + internal static int MatchAllowSlow(ReadOnlySpan path, ReadOnlySpan pattern) + { + return MatchesSlow(path, pattern) ? pattern.Length : -1; + } - /// - /// A RobotsMatchStrategy defines a strategy for matching individual lines in a - /// robots.txt file. Each Match* method should return a match priority, which is - /// interpreted as: - /// - /// match priority < 0: - /// No match. - /// - /// match priority == 0: - /// Match, but treat it as if matched an empty pattern. - /// - /// match priority > 0: - /// Match. - /// - internal static class LongestMatchRobotsMatchStrategy + internal static int MatchDisallowSlow(ReadOnlySpan path, ReadOnlySpan pattern) { - internal static int MatchAllowSlow(ReadOnlySpan path, ReadOnlySpan pattern) - { - return MatchesSlow(path, pattern) ? pattern.Length : -1; - } + return MatchesSlow(path, pattern) ? pattern.Length : -1; + } - internal static int MatchDisallowSlow(ReadOnlySpan path, ReadOnlySpan pattern) + internal static bool MatchesSlow(ReadOnlySpan path, ReadOnlySpan pattern) + { + var pathlen = path.Length; + var pos = new int[pathlen + 1]; + var numpos = 1; + var patlen = pattern.Length; + for (var j = 0; j < patlen; j++) { - return MatchesSlow(path, pattern) ? 
pattern.Length : -1; - } + var ch = pattern[j]; + if (ch == '$' && j + 1 == patlen) + { + return pos[numpos - 1] == pathlen; + } - internal static bool MatchesSlow(ReadOnlySpan path, ReadOnlySpan pattern) - { - var pathlen = path.Length; - var pos = new int[pathlen + 1]; - int numpos = 1; - var patlen = pattern.Length; - for (var j = 0; j < patlen; j++) + if (ch == '*') { - var ch = pattern[j]; - if (ch == '$' && j + 1 == patlen) + numpos = pathlen - pos[0] + 1; + for (var i = 1; i < numpos; i++) { - return (pos[numpos - 1] == pathlen); + pos[i] = pos[i - 1] + 1; } - - if (ch == '*') + } + else + { + // Includes '$' when not at end of pattern. + var newnumpos = 0; + for (var i = 0; i < numpos; i++) { - numpos = pathlen - pos[0] + 1; - for (int i = 1; i < numpos; i++) + if (pos[i] < pathlen && path[pos[i]] == ch) { - pos[i] = pos[i - 1] + 1; + pos[newnumpos++] = pos[i] + 1; } } - else - { - // Includes '$' when not at end of pattern. - int newnumpos = 0; - for (int i = 0; i < numpos; i++) - { - if (pos[i] < pathlen && path[pos[i]] == ch) - { - pos[newnumpos++] = pos[i] + 1; - } - } - numpos = newnumpos; - if (numpos == 0) return false; - } + numpos = newnumpos; + if (numpos == 0) return false; } - - return true; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int MatchAllowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) - { - return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; - } + return true; + } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static int MatchDisallowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int MatchAllowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static int MatchDisallowFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + return MatchesFast(path, pattern, haveWildcards) ? pattern.Length : -1; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + internal static bool MatchesFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + { + if (pattern.Length == 0) return true; + if (path.Length == 0) return pattern.Length == 0; + + if (!haveWildcards) { - return MatchesFast(path, pattern, haveWildcards) ? 
pattern.Length : -1; + return path.IndexOf(pattern) != -1; } - [MethodImpl(MethodImplOptions.AggressiveInlining)] - internal static bool MatchesFast(ReadOnlySpan path, ReadOnlySpan pattern, bool haveWildcards) + Span pos = stackalloc int[path.Length + 1]; + var numpos = 1; + + for (var j = 0; j < pattern.Length; j++) { - if (pattern.Length == 0) return true; - if (path.Length == 0) return pattern.Length == 0; + var ch = pattern[j]; - if (!haveWildcards) + // Check for end anchor + if (ch == '$' && j + 1 == pattern.Length) { - return path.IndexOf(pattern) != -1; + return pos[numpos - 1] == path.Length; } - Span pos = stackalloc int[path.Length + 1]; - int numpos = 1; - - for (var j = 0; j < pattern.Length; j++) + if (ch == '*') { - var ch = pattern[j]; + var startPos = pos[0]; + numpos = path.Length - startPos + 1; - // Check for end anchor - if (ch == '$' && j + 1 == pattern.Length) + for (var i = 0; i < numpos; i++) { - return pos[numpos - 1] == path.Length; + pos[i] = startPos + i; } + } + else + { + var newnumpos = 0; + var pathLen = path.Length; - if (ch == '*') + for (var i = 0; i < numpos && pos[i] < pathLen; i++) { - int startPos = pos[0]; - numpos = path.Length - startPos + 1; - - for (int i = 0; i < numpos; i++) + if (path[pos[i]] == ch) { - pos[i] = startPos + i; + pos[newnumpos++] = pos[i] + 1; } } - else - { - int newnumpos = 0; - int pathLen = path.Length; - - for (int i = 0; i < numpos && pos[i] < pathLen; i++) - { - if (path[pos[i]] == ch) - { - pos[newnumpos++] = pos[i] + 1; - } - } - if (newnumpos == 0) return false; - numpos = newnumpos; - } + if (newnumpos == 0) return false; + numpos = newnumpos; } - - return true; } + + return true; } } diff --git a/RobotsTxt/ParsedRobotsKey.cs b/RobotsTxt/ParsedRobotsKey.cs index b10b356..b5bb154 100644 --- a/RobotsTxt/ParsedRobotsKey.cs +++ b/RobotsTxt/ParsedRobotsKey.cs @@ -1,93 +1,91 @@ using System.Diagnostics; -namespace RobotsTxt +namespace RobotsTxt; + +internal class ParsedRobotsKey { - class ParsedRobotsKey - { - private byte[]? _keyText; - const bool AllowFrequentTypos = true; + private byte[]? _keyText; + private const bool AllowFrequentTypos = true; - public enum KeyType - { - // Generic high level fields. - UserAgent, - Sitemap, + public enum KeyType + { + // Generic high level fields. + UserAgent, + Sitemap, - // Fields within a user-agent. - Allow, - Disallow, + // Fields within a user-agent. + Allow, + Disallow, - // Unrecognized field; kept as-is. High number so that additions to the - // enumeration above does not change the serialization. - Unknown = 128, - }; + // Unrecognized field; kept as-is. High number so that additions to the + // enumeration above does not change the serialization. 
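+        // Keys matching none of the directives above keep their original text
+        // (see UnknownText) and are surfaced through HandleUnknownAction.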
+ Unknown = 128, + } - public void Parse(ReadOnlySpan key) + public void Parse(ReadOnlySpan key) + { + _keyText = null; + if (KeyIsUserAgent(key)) { - _keyText = null; - if (KeyIsUserAgent(key)) - { - Type = KeyType.UserAgent; - } - else if (KeyIsAllow(key)) - { - Type = KeyType.Allow; - } - else if (KeyIsDisallow(key)) - { - Type = KeyType.Disallow; - } - else if (KeyIsSitemap(key)) - { - Type = KeyType.Sitemap; - } - else - { - Type = KeyType.Unknown; - UnknownText = key.ToArray(); - } + Type = KeyType.UserAgent; } - - private bool KeyIsSitemap(ReadOnlySpan key) + else if (KeyIsAllow(key)) { - return key.StartsWithIgnoreCase("sitemap"u8) || - key.StartsWithIgnoreCase("site-map"u8); + Type = KeyType.Allow; } - - private bool KeyIsDisallow(ReadOnlySpan key) + else if (KeyIsDisallow(key)) { - return ( - key.StartsWithIgnoreCase("disallow"u8) || - (AllowFrequentTypos && (key.StartsWithIgnoreCase("dissallow"u8) || - key.StartsWithIgnoreCase("dissalow"u8) || - key.StartsWithIgnoreCase("disalow"u8) || - key.StartsWithIgnoreCase("diasllow"u8) || - key.StartsWithIgnoreCase("disallaw"u8)))); + Type = KeyType.Disallow; } - - private bool KeyIsAllow(ReadOnlySpan key) + else if (KeyIsSitemap(key)) { - return key.StartsWithIgnoreCase("allow"u8); + Type = KeyType.Sitemap; } - - private bool KeyIsUserAgent(ReadOnlySpan key) + else { - return key.StartsWithIgnoreCase("user-agent"u8) || - (AllowFrequentTypos && (key.StartsWithIgnoreCase("useragent"u8) || - key.StartsWithIgnoreCase("user agent"u8))); + Type = KeyType.Unknown; + UnknownText = key.ToArray(); } + } + + private static bool KeyIsSitemap(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("sitemap"u8) || + key.StartsWithIgnoreCase("site-map"u8); + } + + private static bool KeyIsDisallow(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("disallow"u8) || + (AllowFrequentTypos && (key.StartsWithIgnoreCase("dissallow"u8) || + key.StartsWithIgnoreCase("dissalow"u8) || + key.StartsWithIgnoreCase("disalow"u8) || + key.StartsWithIgnoreCase("diasllow"u8) || + key.StartsWithIgnoreCase("disallaw"u8))); + } + + private static bool KeyIsAllow(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("allow"u8); + } + + private static bool KeyIsUserAgent(ReadOnlySpan key) + { + return key.StartsWithIgnoreCase("user-agent"u8) || + (AllowFrequentTypos && (key.StartsWithIgnoreCase("useragent"u8) || + key.StartsWithIgnoreCase("user agent"u8))); + } - public KeyType Type { get; private set; } = KeyType.Unknown; + public KeyType Type { get; private set; } = KeyType.Unknown; - public byte[]? UnknownText + public byte[]? UnknownText + { + get { - get - { - Debug.Assert(Type == KeyType.Unknown); - return _keyText; - } - private set => _keyText = value; + Debug.Assert(Type == KeyType.Unknown); + return _keyText; } + private set => _keyText = value; } } diff --git a/RobotsTxt/RobotsMachine.cs b/RobotsTxt/RobotsMachine.cs index 61061d1..891c50c 100644 --- a/RobotsTxt/RobotsMachine.cs +++ b/RobotsTxt/RobotsMachine.cs @@ -4,35 +4,17 @@ namespace RobotsTxt; public class RobotsMachine : IRobotsParseHandler { - class State - { - } + private class State; - // class StartState : State - // { - // } + private class UserAgentState : State; - class UserAgentState(UserAgentState.UserAgentType type) : State - { - // Either store all UAs with their rules, or just the last useful one. - public enum UserAgentType - { - // Unknown, - Global, - Specific, - } - - // Remove? 
- public UserAgentType Type { get; } = type; - } - - class AllowState(byte[] pattern, bool haveWildcards) : State + private class AllowState(byte[] pattern, bool haveWildcards) : State { public byte[] Pattern { get; } = pattern; public bool HaveWildcards { get; } = haveWildcards; } - class DisallowState(byte[] pattern, bool haveWildcards) : State + private class DisallowState(byte[] pattern, bool haveWildcards) : State { public byte[] Pattern { get; } = pattern; public bool HaveWildcards { get; } = haveWildcards; @@ -40,10 +22,10 @@ class DisallowState(byte[] pattern, bool haveWildcards) : State private readonly List _userAgents; - private List _globalStates = new(); - private List _specificStates = new(); + private readonly List _globalStates = []; + private readonly List _specificStates = []; - private bool _currentAgentIsSpecific = false; // True if we're in a block for our agent. + private bool _currentAgentIsSpecific; // True if we're in a block for our agent. private bool EverSeenSpecificAgent => _specificStates.Count > 0; public RobotsMachine(byte[] robotsBody, List userAgents) @@ -79,7 +61,7 @@ internal static ReadOnlySpan ExtractUserAgent(ReadOnlySpan userAgent } } - return userAgent.Slice(0, i); + return userAgent[..i]; } public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) @@ -88,7 +70,7 @@ public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) // in a user-agent record is still regarded a global rule. if (userAgent.Length >= 1 && userAgent[0] == '*' && (userAgent.Length == 1 || userAgent[1].IsSpace())) { - _globalStates.Add(new UserAgentState(UserAgentState.UserAgentType.Global)); + _globalStates.Add(new UserAgentState()); _currentAgentIsSpecific = false; return; } @@ -96,7 +78,7 @@ public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) foreach (var ua in _userAgents) { if (!userAgent.EqualsIgnoreCase(ua)) continue; - _specificStates.Add(new UserAgentState(UserAgentState.UserAgentType.Specific)); + _specificStates.Add(new UserAgentState()); _currentAgentIsSpecific = true; return; } @@ -142,7 +124,7 @@ private bool Disallow(byte[] path) var (allowHierarchy, disallowHierarchy) = AssessAccessRules(path, _specificStates); if (allowHierarchy.Priority > 0 || disallowHierarchy.Priority > 0) { - return (disallowHierarchy.Priority > allowHierarchy.Priority); + return disallowHierarchy.Priority > allowHierarchy.Priority; } if (EverSeenSpecificAgent) @@ -162,7 +144,7 @@ private bool Disallow(byte[] path) return false; } - private (Match, Match) AssessAccessRules(byte[] path, List states) + private static (Match, Match) AssessAccessRules(byte[] path, List states) { Match allowHierarchy = new(); // Characters of 'url' matching Allow. Match disallowHierarchy = new(); // Characters of 'url' matching Disallow. 
@@ -185,55 +167,53 @@ private class Match(int priority = Match.NoMatchPriority) { private const int NoMatchPriority = -1; - public void Clear() - { - Priority = NoMatchPriority; - } - public int Priority { get; set; } = priority; } - readonly byte[] _indexHtmBytes = "/index.htm"u8.ToArray(); + private static readonly byte[] IndexHtmBytes = "/index.htm"u8.ToArray(); [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void CheckAllow(byte[] path, ReadOnlySpan pattern, bool haveWildcards, Match allow) + private static void CheckAllow(byte[] path, ReadOnlySpan pattern, bool haveWildcards, Match allow) { - var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards); - if (priority >= 0) + while (true) { - if (allow.Priority < priority) + var priority = LongestMatchRobotsMatchStrategy.MatchAllowFast(path, pattern, haveWildcards); + if (priority >= 0) { - allow.Priority = priority; + if (allow.Priority < priority) + { + allow.Priority = priority; + } } - } - else - { - // Google-specific optimization: 'index.htm' and 'index.html' are normalized - // to '/'. - var slashPos = pattern.LastIndexOf((byte)'/'); - - if (slashPos != -1 && - pattern.Slice(slashPos).StartsWith(_indexHtmBytes)) + else { - var len = slashPos + 1; - var newpattern = new byte[len + 1]; - pattern.Slice(0, len).CopyTo(newpattern); - newpattern[len] = (byte)'$'; - CheckAllow(path, newpattern, true, allow); + // Google-specific optimization: 'index.htm' and 'index.html' are normalized + // to '/'. + var slashPos = pattern.LastIndexOf((byte)'/'); + + if (slashPos != -1 && pattern[slashPos..].StartsWith(IndexHtmBytes)) + { + var len = slashPos + 1; + var newpattern = new byte[len + 1]; + pattern[..len].CopyTo(newpattern); + newpattern[len] = (byte)'$'; + pattern = newpattern; + haveWildcards = true; + continue; + } } + break; } } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private void CheckDisallow(byte[] path, ReadOnlySpan value, bool haveWildcards, Match disallow) + private static void CheckDisallow(byte[] path, ReadOnlySpan value, bool haveWildcards, Match disallow) { var priority = LongestMatchRobotsMatchStrategy.MatchDisallowFast(path, value, haveWildcards); - if (priority >= 0) + if (priority < 0) return; + if (disallow.Priority < priority) { - if (disallow.Priority < priority) - { - disallow.Priority = priority; - } + disallow.Priority = priority; } } } diff --git a/RobotsTxt/RobotsMatcher.cs b/RobotsTxt/RobotsMatcher.cs index fbe6add..c6c5603 100644 --- a/RobotsTxt/RobotsMatcher.cs +++ b/RobotsTxt/RobotsMatcher.cs @@ -1,93 +1,92 @@ using System.Diagnostics; using System.Text; -namespace RobotsTxt +namespace RobotsTxt; + +/// +/// Create a RobotsMatcher with the default matching strategy. The default +/// matching strategy is longest-match as opposed to the former internet draft +/// that provisioned first-match strategy. Analysis shows that longest-match, +/// while more restrictive for crawlers, is what webmasters assume when writing +/// directives. For example, in case of conflicting matches (both Allow and +/// Disallow), the longest match is the one the user wants. For example, in +/// case of a robots.txt file that has the following rules +/// Allow: / +/// Disallow: /cgi-bin +/// it's pretty obvious what the webmaster wants: they want to allow crawl of +/// every URI except /cgi-bin. However, according to the expired internet +/// standard, crawlers should be allowed to crawl everything with such a rule. 
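+/// Under longest-match, a path like /cgi-bin/run matches both rules, but
+/// "Disallow: /cgi-bin" (priority 8, its pattern length) outranks "Allow: /"
+/// (priority 1), so that path is blocked while everything else stays allowed.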
+/// +public class RobotsMatcher : IRobotsParseHandler { - /// - /// Create a RobotsMatcher with the default matching strategy. The default - /// matching strategy is longest-match as opposed to the former internet draft - /// that provisioned first-match strategy. Analysis shows that longest-match, - /// while more restrictive for crawlers, is what webmasters assume when writing - /// directives. For example, in case of conflicting matches (both Allow and - /// Disallow), the longest match is the one the user wants. For example, in - /// case of a robots.txt file that has the following rules - /// Allow: / - /// Disallow: /cgi-bin - /// it's pretty obvious what the webmaster wants: they want to allow crawl of - /// every URI except /cgi-bin. However, according to the expired internet - /// standard, crawlers should be allowed to crawl everything with such a rule. - /// - public class RobotsMatcher : IRobotsParseHandler + public void HandleRobotsStart() { - public void HandleRobotsStart() - { - // This is a new robots.txt file, so we need to reset all the instance member - // variables. We do it in the same order the instance member variables are - // declared, so it's easier to keep track of which ones we have (or maybe - // haven't!) done. - _allow.Clear(); - _disallow.Clear(); - - _seenGlobalAgent = false; - _seenSpecificAgent = false; - _everSeenSpecificAgent = false; - _seenSeparator = false; - } + // This is a new robots.txt file, so we need to reset all the instance member + // variables. We do it in the same order the instance member variables are + // declared, so it's easier to keep track of which ones we have (or maybe + // haven't!) done. + _allow.Clear(); + _disallow.Clear(); + + _seenGlobalAgent = false; + _seenSpecificAgent = false; + _everSeenSpecificAgent = false; + _seenSeparator = false; + } - public void HandleRobotsEnd() - { - } + public void HandleRobotsEnd() + { + } - internal static ReadOnlySpan ExtractUserAgent(ReadOnlySpan userAgent) + internal static ReadOnlySpan ExtractUserAgent(ReadOnlySpan userAgent) + { + // Allowed characters in user-agent are [a-zA-Z_-]. + var i = 0; + for (; i < userAgent.Length; i++) { - // Allowed characters in user-agent are [a-zA-Z_-]. - var i = 0; - for (; i < userAgent.Length; i++) + var c = userAgent[i]; + if (!(c.IsAlpha() || c == '_' || c == '-')) { - var c = userAgent[i]; - if (!(c.IsAlpha() || c == '_' || c == '-')) - { - break; - } + break; } - - return userAgent.Slice(0, i); } - public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) + return userAgent[..i]; + } + + public void HandleUserAgent(int lineNum, ReadOnlySpan userAgent) + { + if (_seenSeparator) { - if (_seenSeparator) - { - _seenSpecificAgent = _seenGlobalAgent = _seenSeparator = false; - } + _seenSpecificAgent = _seenGlobalAgent = _seenSeparator = false; + } - // Google-specific optimization: a '*' followed by space and more characters - // in a user-agent record is still regarded a global rule. - if (userAgent.Length >= 1 && userAgent[0] == '*' && (userAgent.Length == 1 || userAgent[1].IsSpace())) - { - _seenGlobalAgent = true; - } - else + // Google-specific optimization: a '*' followed by space and more characters + // in a user-agent record is still regarded a global rule. 
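+        // Both flags can end up set for a single group, e.g. when
+        // "User-agent: *" and one of the queried agent names appear on
+        // consecutive lines with no rule in between.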
+ if (userAgent.Length >= 1 && userAgent[0] == '*' && (userAgent.Length == 1 || userAgent[1].IsSpace())) + { + _seenGlobalAgent = true; + } + else + { + userAgent = ExtractUserAgent(userAgent); + Debug.Assert(_userAgents != null); + foreach (var ua in _userAgents) { - userAgent = ExtractUserAgent(userAgent); - Debug.Assert(_userAgents != null); - foreach (var ua in _userAgents) - { - if (userAgent.EqualsIgnoreCase(ua)) - { - _everSeenSpecificAgent = _seenSpecificAgent = true; - break; - } - } + if (!userAgent.EqualsIgnoreCase(ua)) continue; + _everSeenSpecificAgent = _seenSpecificAgent = true; + break; } } + } - readonly byte[] _indexHtmBytes = "/index.htm"u8.ToArray(); + private readonly byte[] _indexHtmBytes = "/index.htm"u8.ToArray(); - public void HandleAllow(int lineNum, ReadOnlySpan value) + public void HandleAllow(int lineNum, ReadOnlySpan value) + { + while (true) { - if (!SeenAnyAgent) - return; + if (!SeenAnyAgent) return; Debug.Assert(_allow != null); _seenSeparator = true; var priority = LongestMatchRobotsMatchStrategy.MatchAllowSlow(_path, value); @@ -117,202 +116,193 @@ public void HandleAllow(int lineNum, ReadOnlySpan value) // to '/'. var slashPos = value.LastIndexOf((byte)'/'); - if (slashPos != -1 && - value.Slice(slashPos).StartsWith(_indexHtmBytes)) - { - var len = slashPos + 1; - var newpattern = new byte[len + 1]; - value.Slice(0, len).CopyTo(newpattern); - newpattern[len] = (byte)'$'; - HandleAllow(lineNum, newpattern); - } + if (slashPos == -1 || !value[slashPos..].StartsWith(_indexHtmBytes)) return; + var len = slashPos + 1; + var newpattern = new byte[len + 1]; + value[..len].CopyTo(newpattern); + newpattern[len] = (byte)'$'; + value = newpattern; + continue; } + break; } + } - public void HandleDisallow(int lineNum, ReadOnlySpan value) + public void HandleDisallow(int lineNum, ReadOnlySpan value) + { + if (!SeenAnyAgent) + return; + _seenSeparator = true; + var priority = LongestMatchRobotsMatchStrategy.MatchDisallowSlow(_path, value); + if (priority < 0) return; + if (_seenSpecificAgent) { - if (!SeenAnyAgent) - return; - _seenSeparator = true; - var priority = LongestMatchRobotsMatchStrategy.MatchDisallowSlow(_path, value); - if (priority >= 0) + if (_disallow.Specific.Priority < priority) { - if (_seenSpecificAgent) - { - if (_disallow.Specific.Priority < priority) - { - _disallow.Specific.Set(priority); - } - } - else - { - Debug.Assert(_seenGlobalAgent); - if (_disallow.Global.Priority < priority) - { - _disallow.Global.Set(priority); - } - } + _disallow.Specific.Set(priority); } } - - public void HandleSitemap(int lineNum, ReadOnlySpan value) + else { + Debug.Assert(_seenGlobalAgent); + if (_disallow.Global.Priority < priority) + { + _disallow.Global.Set(priority); + } } + } - public void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value) - { - } + public void HandleSitemap(int lineNum, ReadOnlySpan value) + { + } - private void InitUserAgentsAndPath(List userAgents, byte[] path) - { - _userAgents = userAgents; - Debug.Assert(path.Length > 0 && path[0] == '/'); - _path = path; - } + public void HandleUnknownAction(int lineNum, ReadOnlySpan action, ReadOnlySpan value) + { + } + + private void InitUserAgentsAndPath(List userAgents, byte[] path) + { + _userAgents = userAgents; + Debug.Assert(path.Length > 0 && path[0] == '/'); + _path = path; + } + + private bool SeenAnyAgent => _seenGlobalAgent || _seenSpecificAgent; + + public bool AllowedByRobots(byte[] robotsBody, List userAgents, string url) + { + // The url is not normalized 
(escaped, percent encoded) here because the user + // is asked to provide it in escaped form already. + var path = GetPathParamsQuery(url); + return PathAllowedByRobots(robotsBody, userAgents, new UTF8Encoding().GetBytes(path)); + } + + public bool PathAllowedByRobots(byte[] robotsBody, List userAgents, byte[] path) + { + InitUserAgentsAndPath(userAgents, path); + ParseRobotsTxt(robotsBody, this); + return !Disallow(); + } - private bool SeenAnyAgent => _seenGlobalAgent || _seenSpecificAgent; + private bool Disallow() + { + Debug.Assert(_allow != null); + Debug.Assert(_disallow != null); - public bool AllowedByRobots(byte[] robotsBody, List userAgents, string url) + if (_allow.Specific.Priority > 0 || _disallow.Specific.Priority > 0) { - // The url is not normalized (escaped, percent encoded) here because the user - // is asked to provide it in escaped form already. - var path = GetPathParamsQuery(url); - return PathAllowedByRobots(robotsBody, userAgents, new UTF8Encoding().GetBytes(path)); + return _disallow.Specific.Priority > _allow.Specific.Priority; } - public bool PathAllowedByRobots(byte[] robotsBody, List userAgents, byte[] path) + if (_everSeenSpecificAgent) { - InitUserAgentsAndPath(userAgents, path); - ParseRobotsTxt(robotsBody, this); - return !Disallow(); + // Matching group for user-agent but either without disallow or empty one, + // i.e. priority == 0. + return false; } - private bool Disallow() + if (_disallow.Global.Priority > 0 || _allow.Global.Priority > 0) { - Debug.Assert(_allow != null); - Debug.Assert(_disallow != null); - - if (_allow.Specific.Priority > 0 || _disallow.Specific.Priority > 0) - { - return (_disallow.Specific.Priority > _allow.Specific.Priority); - } + return _disallow.Global.Priority > _allow.Global.Priority; + } - if (_everSeenSpecificAgent) - { - // Matching group for user-agent but either without disallow or empty one, - // i.e. priority == 0. - return false; - } + return false; + } - if (_disallow.Global.Priority > 0 || _allow.Global.Priority > 0) - { - return _disallow.Global.Priority > _allow.Global.Priority; - } + internal static void ParseRobotsTxt(byte[] robotsBody, IRobotsParseHandler parseCallback) + { + var parser = new RobotsTxtParser(robotsBody, parseCallback); + parser.Parse(); + } - return false; + internal static string GetPathParamsQuery(string url) + { + var searchStart = 0; + if (url is ['/', '/', ..,]) searchStart = 2; + var earlyPath = url.IndexOfAny(['/', '?', ';',], searchStart); + var protocolEnd = url.IndexOf("://", searchStart, StringComparison.Ordinal); + if (earlyPath < protocolEnd) + { + protocolEnd = -1; } - internal static void ParseRobotsTxt(byte[] robotsBody, IRobotsParseHandler parseCallback) + if (protocolEnd == -1) { - var parser = new RobotsTxtParser(robotsBody, parseCallback); - parser.Parse(); + protocolEnd = searchStart; } - - internal static string GetPathParamsQuery(string url) + else { - var searchStart = 0; - if (url is ['/', '/', ..]) searchStart = 2; - var earlyPath = url.IndexOfAny(['/', '?', ';',], searchStart); - var protocolEnd = url.IndexOf("://", searchStart, StringComparison.Ordinal); - if (earlyPath < protocolEnd) - { - protocolEnd = -1; - } - - if (protocolEnd == -1) - { - protocolEnd = searchStart; - } - else - { - protocolEnd += 3; - } + protocolEnd += 3; + } - var pathStart = url.IndexOfAny(['/', '?', ';',], protocolEnd); - if (pathStart != -1) - { - var hashPos = url.IndexOf('#', searchStart); - if (hashPos >= 0 && hashPos < pathStart) return "/"; - var pathEnd = (hashPos == -1) ? 
-                if (url[pathStart] != '/')
-                {
-                    // Prepend a slash if the result would start e.g. with '?'.
-                    return "/" + url.Substring(pathStart, pathEnd - pathStart);
-                }
-
-                return url.Substring(pathStart, pathEnd - pathStart);
-            }
-
-            return "/";
-        }
+        var pathStart = url.IndexOfAny(['/', '?', ';',], protocolEnd);
+        if (pathStart == -1) return "/";
+        var hashPos = url.IndexOf('#', searchStart);
+        if (hashPos >= 0 && hashPos < pathStart) return "/";
+        var pathEnd = hashPos == -1 ? url.Length : hashPos;
+        return url[pathStart] != '/'
+            ?
+            // Prepend a slash if the result would start e.g. with '?'.
+            string.Concat("/", url.AsSpan(pathStart, pathEnd - pathStart))
+            : url.Substring(pathStart, pathEnd - pathStart);
+    }

-        private class Match(int priority = Match.NoMatchPriority)
-        {
-            private const int NoMatchPriority = -1;
-
-            public void Set(int priority)
-            {
-                Priority = priority;
-            }
-
-            public void Clear()
-            {
-                Set(NoMatchPriority);
-            }
-
-            public int Priority { get; private set; } = priority;
-        }
+    private class Match(int priority = Match.NoMatchPriority)
+    {
+        private const int NoMatchPriority = -1;
+
+        public void Set(int priority)
+        {
+            Priority = priority;
+        }
+
+        public void Clear()
+        {
+            Set(NoMatchPriority);
+        }
+
+        public int Priority { get; private set; } = priority;
+    }

-        // For each of the directives within user-agents, we keep global and specific
-        // match scores.
-        class MatchHierarchy
-        {
-            public readonly Match Global = new Match(); // Match for '*'
-            public readonly Match Specific = new Match(); // Match for queried agent.
-
-
-            public void Clear()
-            {
-                Global.Clear();
-                Specific.Clear();
-            }
-        }
+    // For each of the directives within user-agents, we keep global and specific
+    // match scores.
+    private class MatchHierarchy
+    {
+        public readonly Match Global = new(); // Match for '*'
+        public readonly Match Specific = new(); // Match for queried agent.
+
+        public void Clear()
+        {
+            Global.Clear();
+            Specific.Clear();
+        }
+    }

-        readonly MatchHierarchy _allow = new MatchHierarchy(); // Characters of 'url' matching Allow.
-        readonly MatchHierarchy _disallow = new MatchHierarchy(); // Characters of 'url' matching Disallow.
+    private readonly MatchHierarchy _allow = new(); // Characters of 'url' matching Allow.
+    private readonly MatchHierarchy _disallow = new(); // Characters of 'url' matching Disallow.

-        bool _seenGlobalAgent; // True if processing global agent rules.
-        bool _seenSpecificAgent; // True if processing our specific agent.
-        bool _everSeenSpecificAgent; // True if we ever saw a block for our agent.
-        bool _seenSeparator; // True if saw any key: value pair.
+    private bool _seenGlobalAgent; // True if processing global agent rules.
+    private bool _seenSpecificAgent; // True if processing our specific agent.
+    private bool _everSeenSpecificAgent; // True if we ever saw a block for our agent.
+    private bool _seenSeparator; // True if saw any key: value pair.

-        // The path we want to pattern match. Set by InitUserAgentsAndPath.
-        byte[]? _path;
-        private List<byte[]>? _userAgents; // Set by InitUserAgentsAndPath.
+    // The path we want to pattern match. Set by InitUserAgentsAndPath.
+    private byte[]? _path;
+    private List<byte[]>? _userAgents; // Set by InitUserAgentsAndPath.
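A quick sanity sketch of GetPathParamsQuery, not part of the patch; expected results follow directly from the branches above (the method is internal, so reachable from the test assembly):

    var p1 = RobotsMatcher.GetPathParamsQuery("http://example.com/a/b?c=d"); // "/a/b?c=d"
    var p2 = RobotsMatcher.GetPathParamsQuery("http://example.com");         // "/"  (no path at all)
    var p3 = RobotsMatcher.GetPathParamsQuery("http://example.com/a#frag");  // "/a" (fragment dropped)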
-        public bool OneAgentAllowedByRobots(byte[] robotsContent, byte[] userAgent, string url)
-        {
-            var userAgents = new List<byte[]> { userAgent, };
-            return AllowedByRobots(robotsContent, userAgents, url);
-        }
+    public bool OneAgentAllowedByRobots(byte[] robotsContent, byte[] userAgent, string url)
+    {
+        var userAgents = new List<byte[]> { userAgent, };
+        return AllowedByRobots(robotsContent, userAgents, url);
+    }

-        public static bool IsValidUserAgentToObey(Span<byte> userAgent)
-        {
-            return userAgent.Length > 0 && ExtractUserAgent(userAgent) == userAgent;
-        }
-        public static bool IsValidUserAgentToObey(string userAgent)
-        {
-            return IsValidUserAgentToObey(Encoding.UTF8.GetBytes(userAgent));
-        }
+    public static bool IsValidUserAgentToObey(Span<byte> userAgent)
+    {
+        return userAgent.Length > 0 && ExtractUserAgent(userAgent) == userAgent;
+    }
+    public static bool IsValidUserAgentToObey(string userAgent)
+    {
+        return IsValidUserAgentToObey(Encoding.UTF8.GetBytes(userAgent));
    }
}
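For illustration (not part of the patch): IsValidUserAgentToObey accepts only names made of [a-zA-Z_-], since ExtractUserAgent must consume the whole span for the equality to hold:

    var v1 = RobotsMatcher.IsValidUserAgentToObey("Googlebot");     // true: letters, '_' and '-' only
    var v2 = RobotsMatcher.IsValidUserAgentToObey("Googlebot/2.1"); // false: '/' ends the extracted name early
    var v3 = RobotsMatcher.IsValidUserAgentToObey("");              // false: empty names are rejected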
diff --git a/RobotsTxt/RobotsTxtParser.cs b/RobotsTxt/RobotsTxtParser.cs
index fcf4609..5fb8418 100644
--- a/RobotsTxt/RobotsTxtParser.cs
+++ b/RobotsTxt/RobotsTxtParser.cs
@@ -1,248 +1,244 @@
-namespace RobotsTxt
+namespace RobotsTxt;
+
+public class RobotsTxtParser(byte[] robotsBody, IRobotsParseHandler handler)
 {
-    public class RobotsTxtParser(byte[] robotsBody, IRobotsParseHandler handler)
-    {
-        static readonly byte[] UtfBom = [0xEF, 0xBB, 0xBF];
-        static readonly byte[] HexDigits = "0123456789ABCDEF"u8.ToArray();
+    private static readonly byte[] UtfBom = [0xEF, 0xBB, 0xBF,];
+    private static readonly byte[] HexDigits = "0123456789ABCDEF"u8.ToArray();

-        public void Parse()
-        {
-            // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
-            // fairly safe to assume any valid line isn't going to be more than many times
-            // that max url length of 2KB. We want some padding for
-            // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
-            // If so, we can ignore the chars on a line past that.
-            const int maxLineLen = 2083 * 8;
-            // Allocate a buffer used to process the current line.
-            var lineBuffer = new byte[maxLineLen];
-            var linePos = 0;
-            var lineNum = 0;
-            var bomPos = 0;
-            bool lastWasCarriageReturn = false;
-            handler.HandleRobotsStart();
-
-            foreach (var ch in robotsBody)
-            {
-                // Google-specific optimization: UTF-8 byte order marks should never
-                // appear in a robots.txt file, but they do nevertheless. Skipping
-                // possible BOM-prefix in the first bytes of the input.
-                if (bomPos < 3 && ch == UtfBom[bomPos++])
-                {
-                    continue;
-                }
+    public void Parse()
+    {
+        // Certain browsers limit the URL length to 2083 bytes. In a robots.txt, it's
+        // fairly safe to assume any valid line isn't going to be more than many times
+        // that max URL length of 2KB. We want some padding for
+        // UTF-8 encoding/nulls/etc. but a much smaller bound would be okay as well.
+        // If so, we can ignore the chars on a line past that.
+        const int maxLineLen = 2083 * 8;
+        // Allocate a buffer used to process the current line.
+        var lineBuffer = new byte[maxLineLen];
+        var linePos = 0;
+        var lineNum = 0;
+        var bomPos = 0;
+        var lastWasCarriageReturn = false;
+        handler.HandleRobotsStart();
+
+        foreach (var ch in robotsBody)
+        {
+            // Google-specific optimization: UTF-8 byte order marks should never
+            // appear in a robots.txt file, but they do nevertheless. Skipping
+            // possible BOM-prefix in the first bytes of the input.
+            if (bomPos < 3 && ch == UtfBom[bomPos++])
+            {
+                continue;
+            }

-                bomPos = 3;
-                if (ch != '\n' && ch != '\r')
-                {
-                    // Non-line-ending char case.
-                    // Put in next spot on current line, as long as there's room.
-                    if (linePos < maxLineLen)
-                    {
-                        lineBuffer[linePos++] = ch;
-                    }
-                }
-                else
-                {
-                    // Line-ending character char case.
-                    var span = lineBuffer.AsSpan(0, linePos);
-                    // Only emit an empty line if this was not due to the second character
-                    // of the DOS line-ending \r\n .
-                    bool isCrlfContinuation = span.Length == 0 && lastWasCarriageReturn && ch == '\n';
-                    if (!isCrlfContinuation)
-                    {
-                        ParseAndEmitLine(++lineNum, span);
-                    }
-
-                    linePos = 0;
-                    lastWasCarriageReturn = ch == '\r';
-                }
-            }
+            bomPos = 3;
+            if (ch != '\n' && ch != '\r')
+            {
+                // Non-line-ending char case.
+                // Put in next spot on current line, as long as there's room.
+                if (linePos < maxLineLen)
+                {
+                    lineBuffer[linePos++] = ch;
+                }
+            }
+            else
+            {
+                // Line-ending char case.
+                var span = lineBuffer.AsSpan(0, linePos);
+                // Only emit an empty line if this was not due to the second character
+                // of the DOS line-ending \r\n.
+                var isCrlfContinuation = span.Length == 0 && lastWasCarriageReturn && ch == '\n';
+                if (!isCrlfContinuation)
+                {
+                    ParseAndEmitLine(++lineNum, span);
+                }
+
+                linePos = 0;
+                lastWasCarriageReturn = ch == '\r';
+            }
+        }

-            var spanLeft = lineBuffer.AsSpan(0, linePos);
-            ParseAndEmitLine(++lineNum, spanLeft);
-            handler.HandleRobotsEnd();
-        }
+        var spanLeft = lineBuffer.AsSpan(0, linePos);
+        ParseAndEmitLine(++lineNum, spanLeft);
+        handler.HandleRobotsEnd();
+    }

-        void ParseAndEmitLine(int currentLine, ReadOnlySpan<byte> line)
-        {
-            if (!GetKeyAndValueFrom(out var stringKey, out var value, line))
-            {
-                return;
-            }
-
-            ParsedRobotsKey key = new ParsedRobotsKey();
-            key.Parse(stringKey);
-            if (NeedEscapeValueForKey(key))
-            {
-                var escapedValue = MaybeEscapePattern(value);
-                EmitKeyValueToHandler(currentLine, key, escapedValue);
-            }
-            else
-            {
-                EmitKeyValueToHandler(currentLine, key, value);
-            }
-        }
+    private void ParseAndEmitLine(int currentLine, ReadOnlySpan<byte> line)
+    {
+        if (!GetKeyAndValueFrom(out var stringKey, out var value, line))
+        {
+            return;
+        }
+
+        var key = new ParsedRobotsKey();
+        key.Parse(stringKey);
+        if (NeedEscapeValueForKey(key))
+        {
+            var escapedValue = MaybeEscapePattern(value);
+            EmitKeyValueToHandler(currentLine, key, escapedValue);
+        }
+        else
+        {
+            EmitKeyValueToHandler(currentLine, key, value);
+        }
+    }

-        private void EmitKeyValueToHandler(int currentLine, ParsedRobotsKey key, ReadOnlySpan<byte> value)
-        {
-            switch (key.Type)
-            {
-                case ParsedRobotsKey.KeyType.UserAgent:
-                    handler.HandleUserAgent(currentLine, value);
-                    break;
-                case ParsedRobotsKey.KeyType.Sitemap:
-                    handler.HandleSitemap(currentLine, value);
-                    break;
-                case ParsedRobotsKey.KeyType.Allow:
-                    handler.HandleAllow(currentLine, value);
-                    break;
-                case ParsedRobotsKey.KeyType.Disallow:
-                    handler.HandleDisallow(currentLine, value);
-                    break;
-                case ParsedRobotsKey.KeyType.Unknown:
-                    handler.HandleUnknownAction(currentLine, key.UnknownText, value);
-                    break;
-                default:
-                    throw new ArgumentOutOfRangeException();
-            }
-        }
+    private void EmitKeyValueToHandler(int currentLine, ParsedRobotsKey key, ReadOnlySpan<byte> value)
+    {
+        switch (key.Type)
+        {
+            case ParsedRobotsKey.KeyType.UserAgent:
+                handler.HandleUserAgent(currentLine, value);
+                break;
+            case ParsedRobotsKey.KeyType.Sitemap:
+                handler.HandleSitemap(currentLine, value);
+                break;
+            case ParsedRobotsKey.KeyType.Allow:
+                handler.HandleAllow(currentLine, value);
+                break;
+            case ParsedRobotsKey.KeyType.Disallow:
+                handler.HandleDisallow(currentLine, value);
+                break;
+            case ParsedRobotsKey.KeyType.Unknown:
+                handler.HandleUnknownAction(currentLine, key.UnknownText, value);
+                break;
+            default:
+                throw new ArgumentOutOfRangeException(nameof(key));
+        }
+    }
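The parser is push-based: Parse() walks the body once and emits one callback per recognized line. A minimal sketch, not part of the patch, with a hypothetical recording handler; it assumes the IRobotsParseHandler surface used by EmitKeyValueToHandler above, and Encoding.UTF8.GetString(ReadOnlySpan<byte>) as available on modern .NET:

    using System;
    using System.Text;
    using RobotsTxt;

    // Hypothetical handler that logs each parsed directive with its line number.
    class RecordingHandler : IRobotsParseHandler
    {
        public void HandleRobotsStart() { }
        public void HandleRobotsEnd() { }
        public void HandleUserAgent(int lineNum, ReadOnlySpan<byte> value)
            => Console.WriteLine($"{lineNum}: user-agent {Encoding.UTF8.GetString(value)}");
        public void HandleAllow(int lineNum, ReadOnlySpan<byte> value)
            => Console.WriteLine($"{lineNum}: allow {Encoding.UTF8.GetString(value)}");
        public void HandleDisallow(int lineNum, ReadOnlySpan<byte> value)
            => Console.WriteLine($"{lineNum}: disallow {Encoding.UTF8.GetString(value)}");
        public void HandleSitemap(int lineNum, ReadOnlySpan<byte> value) { }
        public void HandleUnknownAction(int lineNum, ReadOnlySpan<byte> action, ReadOnlySpan<byte> value) { }
    }

    // CRLF and LF endings both yield exactly one ParseAndEmitLine call per line.
    new RobotsTxtParser("User-agent: *\r\nDisallow: /private/\n"u8.ToArray(), new RecordingHandler()).Parse();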
-        public static ReadOnlySpan<byte> MaybeEscapePattern(ReadOnlySpan<byte> src)
-        {
-            int numToEscape = 0;
-            bool needCapitalize = false;
-            for (int i = 0; i < src.Length; i++)
-            {
-                // (a) % escape sequence.
-                var c = src[i];
-                if (c == '%' && i + 2 < src.Length &&
-                    (('a' <= src[i + 1] && src[i + 1] <= 'f') || ('a' <= src[i + 2] && src[i + 2] <= 'f')))
-                {
-                    needCapitalize = true;
-                    i += 2;
-                }
-                // (b) needs escaping.
-                else if (c >= 0x80)
-                {
-                    numToEscape += 1;
-                }
-                // (c) Already escaped and escape-characters normalized (eg. %2f -> %2F).
-            }
-
-            if (numToEscape == 0 && !needCapitalize)
-            {
-                return src;
-            }
-
-            var dst = new byte[numToEscape * 2 + src.Length];
-            var j = 0;
-            for (int i = 0; i < src.Length; i++)
-            {
-                var c = src[i];
-                if (c == '%' && i + 2 < src.Length && src[i + 1].IsXDigit() && src[i + 2].IsXDigit())
-                {
-                    dst[j++] = src[i++];
-                    dst[j++] = src[i++].ToUpper();
-                    dst[j++] = src[i++].ToUpper();
-                }
-                else if (c >= 0x80)
-                {
-                    dst[j++] = (byte)'%';
-                    dst[j++] = HexDigits[(c >> 4) & 0xf];
-                    dst[j++] = HexDigits[c & 0xf];
-                }
-                else
-                {
-                    dst[j++] = c;
-                }
-            }
-
-            return dst;
-        }
+    public static ReadOnlySpan<byte> MaybeEscapePattern(ReadOnlySpan<byte> src)
+    {
+        var numToEscape = 0;
+        var needCapitalize = false;
+        for (var i = 0; i < src.Length; i++)
+        {
+            // (a) % escape sequence.
+            var c = src[i];
+            if (c == '%' && i + 2 < src.Length &&
+                (('a' <= src[i + 1] && src[i + 1] <= 'f') || ('a' <= src[i + 2] && src[i + 2] <= 'f')))
+            {
+                needCapitalize = true;
+                i += 2;
+            }
+            // (b) needs escaping.
+            else if (c >= 0x80)
+            {
+                numToEscape += 1;
+            }
+            // (c) Already escaped and escape-characters normalized (e.g. %2f -> %2F).
+        }
+
+        if (numToEscape == 0 && !needCapitalize)
+        {
+            return src;
+        }
+
+        var dst = new byte[numToEscape * 2 + src.Length];
+        var j = 0;
+        for (var i = 0; i < src.Length; i++)
+        {
+            var c = src[i];
+            if (c == '%' && i + 2 < src.Length && src[i + 1].IsXDigit() && src[i + 2].IsXDigit())
+            {
+                dst[j++] = src[i++];
+                dst[j++] = src[i++].ToUpper();
+                dst[j++] = src[i].ToUpper();
+            }
+            else if (c >= 0x80)
+            {
+                dst[j++] = (byte)'%';
+                dst[j++] = HexDigits[(c >> 4) & 0xf];
+                dst[j++] = HexDigits[c & 0xf];
+            }
+            else
+            {
+                dst[j++] = c;
+            }
+        }
+
+        return dst;
+    }

-        private bool NeedEscapeValueForKey(ParsedRobotsKey key)
-        {
-            switch (key.Type)
-            {
-                case ParsedRobotsKey.KeyType.UserAgent:
-                case ParsedRobotsKey.KeyType.Sitemap:
-                    return false;
-                default:
-                    return true;
-            }
-        }
+    private static bool NeedEscapeValueForKey(ParsedRobotsKey key)
+    {
+        return key.Type switch
+        {
+            ParsedRobotsKey.KeyType.UserAgent or ParsedRobotsKey.KeyType.Sitemap => false,
+            _ => true,
+        };
+    }
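Two notes on the block above, not part of the patch. First, the copy loop must not advance i on the third escape byte (the for-loop's own i++ does that), otherwise the byte following each %XX sequence would be skipped; the new side fixes that. Second, NeedEscapeValueForKey applies the normalization to every value except user-agent and sitemap, so expected behaviour looks like this:

    var e1 = RobotsTxtParser.MaybeEscapePattern("/a%2fb"u8); // "/a%2Fb": lowercase hex digits capitalized
    var e2 = RobotsTxtParser.MaybeEscapePattern("/ä"u8);     // "/%C3%A4": bytes >= 0x80 percent-escaped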
-        internal static bool GetKeyAndValueFrom(out ReadOnlySpan<byte> key, out ReadOnlySpan<byte> value,
-            ReadOnlySpan<byte> line)
-        {
-            var comment = line.IndexOf((byte)'#');
-            if (comment != -1)
-            {
-                line = line.Slice(0, comment);
-            }
-
-            line = StripWhitespaceSlowly(line);
-
-            // Rules must match the following pattern:
-            //   <key>[ \t]*:[ \t]*<value>
-            var sep = line.IndexOf((byte)':');
-            if (sep == -1)
-            {
-                // Google-specific optimization: some people forget the colon, so we need to
-                // accept whitespace in its stead.
-                sep = line.IndexOfAny((byte)' ', (byte)'\t');
-                if (sep != -1)
-                {
-                    var val = line.Slice(sep + 1);
-                    if (val.IndexOfAny((byte)' ', (byte)'\t') != -1)
-                    {
-                        // We only accept whitespace as a separator if there are exactly two
-                        // sequences of non-whitespace characters. If we get here, there were
-                        // more than 2 such sequences since we stripped trailing whitespace
-                        // above.
-                        key = null;
-                        value = null;
-                        return false;
-                    }
-                }
-            }
-
-            if (sep == -1)
-            {
-                key = null;
-                value = null;
-                return false; // Couldn't find a separator.
-            }
-
-            key = line.Slice(0, sep); // Key starts at beginning of line. And stops at the separator.
-            key = StripWhitespaceSlowly(key); // Get rid of any trailing whitespace.
-            if (key.Length > 0)
-            {
-                value = line.Slice(sep + 1); // Value starts after the separator.
-                value = StripWhitespaceSlowly(value); // Get rid of any leading whitespace.
-                return true;
-            }
-
-            value = null;
-            return false;
-        }
+    internal static bool GetKeyAndValueFrom(out ReadOnlySpan<byte> key, out ReadOnlySpan<byte> value,
+        ReadOnlySpan<byte> line)
+    {
+        var comment = line.IndexOf((byte)'#');
+        if (comment != -1)
+        {
+            line = line[..comment];
+        }
+
+        line = StripWhitespaceSlowly(line);
+
+        // Rules must match the following pattern:
+        //   <key>[ \t]*:[ \t]*<value>
+        var sep = line.IndexOf((byte)':');
+        if (sep == -1)
+        {
+            // Google-specific optimization: some people forget the colon, so we need to
+            // accept whitespace in its stead.
+            sep = line.IndexOfAny((byte)' ', (byte)'\t');
+            if (sep != -1)
+            {
+                var val = line[(sep + 1)..];
+                if (val.IndexOfAny((byte)' ', (byte)'\t') != -1)
+                {
+                    // We only accept whitespace as a separator if there are exactly two
+                    // sequences of non-whitespace characters. If we get here, there were
+                    // more than 2 such sequences since we stripped trailing whitespace
+                    // above.
+                    key = null;
+                    value = null;
+                    return false;
+                }
+            }
+        }
+
+        if (sep == -1)
+        {
+            key = null;
+            value = null;
+            return false; // Couldn't find a separator.
+        }
+
+        key = line[..sep]; // Key starts at beginning of line and stops at the separator.
+        key = StripWhitespaceSlowly(key); // Get rid of any trailing whitespace.
+        if (key.Length > 0)
+        {
+            value = line[(sep + 1)..]; // Value starts after the separator.
+            value = StripWhitespaceSlowly(value); // Get rid of any leading whitespace.
+            return true;
+        }
+
+        value = null;
+        return false;
+    }
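Expected key/value splits, for reference (a sketch, not part of the patch; the method is internal, so reachable from the tests):

    // "user-agent: FooBot" -> key "user-agent", value "FooBot"
    // "Disallow /private"  -> key "Disallow",   value "/private"  (missing-colon fallback)
    // "foo bar baz"        -> false: three tokens and no colon
    var ok = RobotsTxtParser.GetKeyAndValueFrom(out var key, out var value, "Disallow /private"u8);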
-        internal static ReadOnlySpan<byte> StripWhitespaceSlowly(ReadOnlySpan<byte> s)
-        {
-            int start, end;
-            for (start = 0; start < s.Length; start++)
-            {
-                if (s[start] != ' ' && s[start] != '\t')
-                    break;
-            }
-
-            for (end = s.Length; end > start; end--)
-            {
-                if (s[end - 1] != ' ' && s[end - 1] != '\t')
-                    break;
-            }
-
-            return s.Slice(start, end - start);
-        }
-    }
-}
+    internal static ReadOnlySpan<byte> StripWhitespaceSlowly(ReadOnlySpan<byte> s)
+    {
+        int start, end;
+        for (start = 0; start < s.Length; start++)
+        {
+            if (s[start] != ' ' && s[start] != '\t')
+                break;
+        }
+
+        for (end = s.Length; end > start; end--)
+        {
+            if (s[end - 1] != ' ' && s[end - 1] != '\t')
+                break;
+        }
+
+        return s.Slice(start, end - start);
+    }
+}

From 7bc4306410c53344d587b2fc905a27d1888f8255 Mon Sep 17 00:00:00 2001
From: Yves Bastide
Date: Thu, 9 Oct 2025 15:01:15 +0200
Subject: [PATCH 3/4] =?UTF-8?q?=F0=9F=90=9B=20.NETFramework=204.8=20compat?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Yves Bastide
---
 RobotsTxt/Extensions.cs    | 14 ++++++++++++++
 RobotsTxt/RobotsMatcher.cs |  4 ++++
 2 files changed, 18 insertions(+)

diff --git a/RobotsTxt/Extensions.cs b/RobotsTxt/Extensions.cs
index 107233d..b874a79 100644
--- a/RobotsTxt/Extensions.cs
+++ b/RobotsTxt/Extensions.cs
@@ -2,6 +2,20 @@ namespace RobotsTxt;

 public static class MyExtensions
 {
+#if !NETCOREAPP
+    public static bool Contains(this ReadOnlySpan<byte> self, byte other)
+    {
+        foreach (var c in self)
+        {
+            if (c == other)
+            {
+                return true;
+            }
+        }
+        return false;
+    }
+#endif
+
     public static bool EqualsIgnoreCase(this ReadOnlySpan<byte> self, ReadOnlySpan<byte> other)
     {
         if (self.Length != other.Length)
diff --git a/RobotsTxt/RobotsMatcher.cs b/RobotsTxt/RobotsMatcher.cs
index c6c5603..9be04ad 100644
--- a/RobotsTxt/RobotsMatcher.cs
+++ b/RobotsTxt/RobotsMatcher.cs
@@ -243,7 +243,11 @@ internal static string GetPathParamsQuery(string url)
         return url[pathStart] != '/'
             ?
             // Prepend a slash if the result would start e.g. with '?'.
+#if !NETCOREAPP
+            "/" + url.Substring(pathStart, pathEnd - pathStart)
+#else
             string.Concat("/", url.AsSpan(pathStart, pathEnd - pathStart))
+#endif
             : url.Substring(pathStart, pathEnd - pathStart);
     }

From 19acdd5aaa05c5bf9c155c17e7fc3f263d40dfe0 Mon Sep 17 00:00:00 2001
From: Yves Bastide
Date: Thu, 9 Oct 2025 15:01:52 +0200
Subject: [PATCH 4/4] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20upgrade=20dependencies?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Yves Bastide
---
 RobotsTxt/RobotsTxt.csproj         | 4 ++--
 TestRobotsTxt/TestRobotsTxt.csproj | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/RobotsTxt/RobotsTxt.csproj b/RobotsTxt/RobotsTxt.csproj
index 5f94b42..4ca386c 100644
--- a/RobotsTxt/RobotsTxt.csproj
+++ b/RobotsTxt/RobotsTxt.csproj
@@ -14,8 +14,8 @@
-    <PackageReference ... />
-    <PackageReference ... />
+    <PackageReference ... />
+    <PackageReference ... />

diff --git a/TestRobotsTxt/TestRobotsTxt.csproj b/TestRobotsTxt/TestRobotsTxt.csproj
index 9493cfb..70211f6 100644
--- a/TestRobotsTxt/TestRobotsTxt.csproj
+++ b/TestRobotsTxt/TestRobotsTxt.csproj
@@ -11,13 +11,13 @@
-    <PackageReference ... />
-    <PackageReference ... />
-    <PackageReference ... >
+    <PackageReference ... />
+    <PackageReference ... />
+    <PackageReference ... >
       <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
       <PrivateAssets>all</PrivateAssets>
-    <PackageReference ... >
+    <PackageReference ... >
       <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
       <PrivateAssets>all</PrivateAssets>
     </PackageReference>
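Closing illustration, not part of the patches: end-to-end use of the matcher after this series, with hypothetical names (FooBot, example.com); RobotsMatcher is assumed to have a default constructor:

    using System.Text;
    using RobotsTxt;

    var robots = Encoding.UTF8.GetBytes("User-agent: FooBot\nDisallow: /private/\n");
    var matcher = new RobotsMatcher();
    // A specific FooBot group takes precedence over any '*' group; within a group,
    // the longest matching Allow/Disallow pattern decides.
    var ok = matcher.OneAgentAllowedByRobots(robots, "FooBot"u8.ToArray(),
        "https://example.com/public/page.html"); // true; "/private/x" would be false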