From 3ebebdcdc176408396a70a341e67e01a82f4aaba Mon Sep 17 00:00:00 2001 From: Angelo Vargas Date: Fri, 3 Apr 2015 18:15:36 -0500 Subject: [PATCH 1/6] Added support for Yahoo Gemini's crawler --- Tests/fixtures/bots.yml | 5 ++++- regexes/bots.yml | 8 +++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 72cb8504f9..429ecb572e 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -695,4 +695,7 @@ name: 'Tiny Tiny RSS' - user_agent: 'Tiny Tiny RSS/1.11.4c63934 (http://tt-rss.org/)' - name: 'Tiny Tiny RSS' \ No newline at end of file + name: 'Tiny Tiny RSS' +- + user_agent: 'Mozilla/5.0 (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)' + name: 'Yahoo Gemini' diff --git a/regexes/bots.yml b/regexes/bots.yml index 9ec5f9ed1e..495413a71d 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -933,7 +933,13 @@ name: 'Yottaa' url: 'http://www.yottaa.com/' - +- regex: 'Yahoo Ad monitoring.*yahoo-ad-monitoring-SLN24857.*' + name: 'Yahoo Gemini' + category: 'Crawler' + url: 'https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html' + producer: + name: 'Yahoo! Inc.' + url: 'http://www.yahoo.com' - regex: 'lycos' name: 'Lycos' From 8461a3512b82ea0baf6a1bee63fc38d0cae7dcdd Mon Sep 17 00:00:00 2001 From: Angelo Vargas Date: Fri, 3 Apr 2015 18:16:11 -0500 Subject: [PATCH 2/6] Added support for Outbrain's crawler --- Tests/fixtures/bots.yml | 3 +++ regexes/bots.yml | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 429ecb572e..e3d5fbed91 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -699,3 +699,6 @@ - user_agent: 'Mozilla/5.0 (compatible; Yahoo Ad monitoring; https://help.yahoo.com/kb/yahoo-ad-monitoring-SLN24857.html)' name: 'Yahoo Gemini' +- + user_agent: 'Mozilla/5.0 (Java) outbrain' + name: 'Outbrain' \ No newline at end of file diff --git a/regexes/bots.yml b/regexes/bots.yml index 495413a71d..eb5b4c9d8b 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -941,6 +941,15 @@ name: 'Yahoo! Inc.' url: 'http://www.yahoo.com' +- regex: '.*Java.*outbrain' + name: 'Outbrain' + category: 'Crawler' + url: '' + producer: + name: 'Outbrain' + url: 'http://www.outbrain.com/' + + - regex: 'lycos' name: 'Lycos' From 88c442bc7e337bbefd19d74bbed03d0efae99cd1 Mon Sep 17 00:00:00 2001 From: Angelo Vargas Date: Fri, 3 Apr 2015 18:28:56 -0500 Subject: [PATCH 3/6] Added support for HubPages --- Tests/fixtures/bots.yml | 5 ++++- regexes/bots.yml | 7 +++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index e3d5fbed91..96b786a297 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -701,4 +701,7 @@ name: 'Yahoo Gemini' - user_agent: 'Mozilla/5.0 (Java) outbrain' - name: 'Outbrain' \ No newline at end of file + name: 'Outbrain' +- + user_agent: 'HubPages V0.2.2 (http://hubpages.com/help/crawlingpolicy)' + name: 'HubPages' \ No newline at end of file diff --git a/regexes/bots.yml b/regexes/bots.yml index eb5b4c9d8b..f7fcfa9b20 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -949,6 +949,13 @@ name: 'Outbrain' url: 'http://www.outbrain.com/' +- regex: 'HubPages.*crawlingpolicy' + name: 'HubPages' + category: 'Crawler' + url: 'http://hubpages.com/help/crawlingpolicy' + producer: + name: 'HubPages' + url: 'http://hubpages.com/' - regex: 'lycos' name: 'Lycos' From 08b162243cbc07a02f68a691e76ad23340f65f20 Mon Sep 17 00:00:00 2001 From: Angelo Vargas Date: Fri, 3 Apr 2015 18:30:25 -0500 Subject: [PATCH 4/6] Added support for ADMantX's crawler --- Tests/fixtures/bots.yml | 5 ++++- regexes/bots.yml | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 96b786a297..8cfbdbadac 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -704,4 +704,7 @@ name: 'Outbrain' - user_agent: 'HubPages V0.2.2 (http://hubpages.com/help/crawlingpolicy)' - name: 'HubPages' \ No newline at end of file + name: 'HubPages' +- + user_agent: 'ADmantX Platform Semantic Analyzer - ADmantX Inc. - www.admantx.com - support@admantx.com' + name: 'ADMantX' \ No newline at end of file diff --git a/regexes/bots.yml b/regexes/bots.yml index f7fcfa9b20..bc08508aa4 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -1002,6 +1002,8 @@ - regex: 'NetLyzer FastProbe' name: 'NetLyzer FastProbe' +- regex: 'AdMantX.*admantx.com' + name: 'ADMantX' - regex: '(nuhk|TsolCrawler|Yammybot|Openbot|Gulper Web Bot|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|teoma|blitzbot|oegp|furlbot|http%20client|polybot|htdig|mogimogi|larbin|scrubby|searchsight|seekbot|semanticdiscovery|snappy|vortex|zao|zeal|fast-webcrawler|converacrawler|dataparksearch|findlinks|BrowserMob|HttpMonitor|ThumbShotsBot|URL2PNG|ZooShot|GomezA|Catchpoint bot|Google SketchUp|Read%20Later|Minimo|RackspaceBot)' name: 'Bot' From e1ac0ebc4571cd12a195899bbd18d29e4fd80066 Mon Sep 17 00:00:00 2001 From: Angelo Vargas Date: Fri, 3 Apr 2015 18:35:00 -0500 Subject: [PATCH 5/6] Added support for Pinterest's bot --- Tests/fixtures/bots.yml | 5 ++++- regexes/bots.yml | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index 8cfbdbadac..bbd5ac09a7 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -707,4 +707,7 @@ name: 'HubPages' - user_agent: 'ADmantX Platform Semantic Analyzer - ADmantX Inc. - www.admantx.com - support@admantx.com' - name: 'ADMantX' \ No newline at end of file + name: 'ADMantX' +- + user_agent: 'Pinterest/0.2 (+http://www.pinterest.com/)' + name: 'Pinterest' diff --git a/regexes/bots.yml b/regexes/bots.yml index bc08508aa4..a5e6a13700 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -957,6 +957,14 @@ name: 'HubPages' url: 'http://hubpages.com/' +- regex: 'Pinterest/\d\.\d.*www.pinterest.com.*' + name: 'Pinterest' + url: '' + category: 'Crawler' + producer: + name: 'Pinterest' + url: 'http://www.pinterest.com/' + - regex: 'lycos' name: 'Lycos' From aee0a818273ce9c8880bce33782514db13352484 Mon Sep 17 00:00:00 2001 From: Angelo Vargas Date: Fri, 3 Apr 2015 18:37:48 -0500 Subject: [PATCH 6/6] Added support for Server Density --- Tests/fixtures/bots.yml | 3 +++ regexes/bots.yml | 3 +++ 2 files changed, 6 insertions(+) diff --git a/Tests/fixtures/bots.yml b/Tests/fixtures/bots.yml index bbd5ac09a7..51d26a6a0d 100644 --- a/Tests/fixtures/bots.yml +++ b/Tests/fixtures/bots.yml @@ -711,3 +711,6 @@ - user_agent: 'Pinterest/0.2 (+http://www.pinterest.com/)' name: 'Pinterest' +- + user_agent: 'Server Density Service Monitoring v2' + name: 'Server Density' diff --git a/regexes/bots.yml b/regexes/bots.yml index a5e6a13700..0014ed0e76 100644 --- a/regexes/bots.yml +++ b/regexes/bots.yml @@ -1013,6 +1013,9 @@ - regex: 'AdMantX.*admantx.com' name: 'ADMantX' +- regex: 'Server Density Service Monitoring.*' + name: 'Server Density' + - regex: '(nuhk|TsolCrawler|Yammybot|Openbot|Gulper Web Bot|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|teoma|blitzbot|oegp|furlbot|http%20client|polybot|htdig|mogimogi|larbin|scrubby|searchsight|seekbot|semanticdiscovery|snappy|vortex|zao|zeal|fast-webcrawler|converacrawler|dataparksearch|findlinks|BrowserMob|HttpMonitor|ThumbShotsBot|URL2PNG|ZooShot|GomezA|Catchpoint bot|Google SketchUp|Read%20Later|Minimo|RackspaceBot)' name: 'Bot'