Skip to content

Commit

Permalink
Merge pull request #29 from nicoja/master
Browse files Browse the repository at this point in the history
Handle multiple user-agent
  • Loading branch information
freekmurze authored Dec 7, 2020
2 parents e4672f2 + cf7bedb commit fe32f62
Show file tree
Hide file tree
Showing 4 changed files with 133 additions and 8 deletions.
44 changes: 38 additions & 6 deletions src/RobotsTxt.php
Original file line number Diff line number Diff line change
Expand Up @@ -126,38 +126,60 @@ protected function getDisallowsPerUserAgent(string $content): array

$disallowsPerUserAgent = [];

$currentUserAgent = null;
$currentUserAgents = [];

$treatAllowDisallowLine = false;

foreach ($lines as $line) {
if ($this->isCommentLine($line)) {
if ($this->isComment($line)) {
continue;
}

if ($this->isEmptyLine($line)) {
continue;
}

if ($this->isUserAgentLine($line)) {
if ($treatAllowDisallowLine) {
$treatAllowDisallowLine = false;
$currentUserAgents = [];
}
$disallowsPerUserAgent[$this->parseUserAgent($line)] = [];

$currentUserAgent = &$disallowsPerUserAgent[$this->parseUserAgent($line)];
$currentUserAgents[] = &$disallowsPerUserAgent[$this->parseUserAgent($line)];

continue;
}

if ($currentUserAgent === null) {
if ($this->isDisallowLine($line)) {
$treatAllowDisallowLine = true;
}

if ($this->isAllowLine($line)) {
$treatAllowDisallowLine = true;
continue;
}

$disallowUrl = $this->parseDisallow($line);

$currentUserAgent[$disallowUrl] = $disallowUrl;
foreach ($currentUserAgents as &$currentUserAgent) {
$currentUserAgent[$disallowUrl] = $disallowUrl;
}
}

return $disallowsPerUserAgent;
}

protected function isCommentLine(string $line): bool
/**
 * Tell whether a robots.txt line is a comment, i.e. whether its first
 * non-whitespace character is "#".
 */
protected function isComment(string $line): bool
{
    $stripped = trim($line);

    // An empty string has no first character and is never a comment.
    return $stripped !== '' && $stripped[0] === '#';
}

/**
 * Tell whether a line contains nothing but whitespace.
 */
protected function isEmptyLine(string $line): bool
{
    $withoutWhitespace = trim($line);

    return $withoutWhitespace === '';
}

protected function isUserAgentLine(string $line): bool
{
return strpos(trim(strtolower($line)), 'user-agent') === 0;
Expand All @@ -173,6 +195,16 @@ protected function parseDisallow(string $line): string
return trim(substr_replace(strtolower(trim($line)), '', 0, 8), ': ');
}

/**
 * Determine whether a robots.txt line is a "Disallow" directive.
 *
 * The line is trimmed, lower-cased and stripped of spaces before its
 * first eight characters are compared with the keyword, so "Disallow:",
 * "disallow :" and "DISALLOW:" are all recognised.
 *
 * The declared return type is bool: the method returns the result of a
 * strict comparison, which the previous "string" declaration coerced to
 * "1"/"" (or rejected with a TypeError under strict_types).
 *
 * @param string $line A single raw line from a robots.txt file.
 *
 * @return bool True when the line starts with the "disallow" keyword.
 */
protected function isDisallowLine(string $line): bool
{
    $normalized = str_replace(' ', '', strtolower(trim($line)));

    return trim(substr($normalized, 0, 8), ': ') === 'disallow';
}

/**
 * Determine whether a robots.txt line is an "Allow" directive.
 *
 * The line is trimmed, lower-cased and stripped of spaces; its first six
 * characters (the keyword plus the separator) are then cleaned of ":" and
 * spaces and compared with the keyword, so "Allow:", "allow :" and
 * "ALLOW:" are all recognised while "Disallow:" is not.
 *
 * The declared return type is bool: the method returns the result of a
 * strict comparison, which the previous "string" declaration coerced to
 * "1"/"" (or rejected with a TypeError under strict_types).
 *
 * @param string $line A single raw line from a robots.txt file.
 *
 * @return bool True when the line starts with the "allow" keyword.
 */
protected function isAllowLine(string $line): bool
{
    $normalized = str_replace(' ', '', strtolower(trim($line)));

    return trim(substr($normalized, 0, 6), ': ') === 'allow';
}

/**
* @deprecated
*/
Expand Down
72 changes: 72 additions & 0 deletions tests/RobotsTxtTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -128,4 +128,76 @@ public function the_disallows_user_agent_check_is_case_insensitive()
$this->assertFalse($robots->allows('/no-agents', 'UserAgent007'));
$this->assertFalse($robots->allows('/no-agents', strtolower('UserAgent007')));
}

/** @test */
public function it_can_handle_multiple_user_agent_query_strings()
{
    $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');
    $url = '/en/admin?print=true';

    // The fixture group for UserAgent010/011 lists "/*?print" under Disallow.
    foreach (['UserAgent010', 'UserAgent011'] as $blockedAgent) {
        $this->assertFalse($robots->allows($url, $blockedAgent));
    }

    // The fixture group for UserAgent012/013 lists "/*?print" under Allow.
    foreach (['UserAgent012', 'UserAgent013'] as $permittedAgent) {
        $this->assertTrue($robots->allows($url, $permittedAgent));
    }
}

/** @test */
public function it_can_handle_multiple_user_agent_root_path()
{
    $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');

    // The root path must stay reachable for every agent of both groups.
    $agents = ['UserAgent010', 'UserAgent011', 'UserAgent012', 'UserAgent013'];

    foreach ($agents as $agent) {
        $this->assertTrue($robots->allows('/', $agent));
    }
}

/** @test */
public function it_can_handle_multiple_user_agent_first_in_list()
{
    $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');

    // Path => expected result for the first agent of the shared group.
    $expectations = [
        '/fr/ad' => true,
        '/fr/admin' => false,
        '/fr/admin/' => true,
        '/fr/admin?' => true,
        '/fr/admin?test' => true,
    ];

    foreach ($expectations as $path => $isAllowed) {
        $this->assertSame($isAllowed, $robots->allows($path, 'UserAgent010'));
    }
}

/** @test */
public function it_can_handle_multiple_user_agent_last_in_list()
{
    $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');

    // Path => expected result for the last agent of the shared group.
    $expectations = [
        '/fr/ad' => true,
        '/fr/admin' => false,
        '/fr/admin/' => true,
        '/fr/admin?' => true,
        '/fr/admin?test' => true,
    ];

    foreach ($expectations as $path => $isAllowed) {
        $this->assertSame($isAllowed, $robots->allows($path, 'UserAgent011'));
    }
}

/** @test */
public function it_can_handle_multiple_user_agent_first_in_list_with_empty_and_comment_lines()
{
    $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');

    // Every assertion targets the FIRST agent of the group. The fixture
    // separates it from the second agent and the rules with blank lines
    // and a comment.
    $agent = 'UserAgent012';

    $this->assertTrue($robots->allows('/fr/ad', $agent));
    $this->assertTrue($robots->allows('/fr/admin', $agent));
    $this->assertTrue($robots->allows('/fr/admin/', $agent));
    $this->assertTrue($robots->allows('/fr/admin?', $agent));
    $this->assertTrue($robots->allows('/fr/admin?test', $agent));

    // Previously checked UserAgent013, which only repeated the
    // "last in list" test and left this agent unverified for the
    // Disallow rule that follows the comment line.
    $this->assertFalse($robots->allows('/es/admin-disallow/', $agent));
}

/** @test */
public function it_can_handle_multiple_user_agent_last_in_list_with_empty_and_comment_line()
{
    $robots = RobotsTxt::readFrom(__DIR__.'/data/robots.txt');

    // Path => expected result for the last agent of the group whose
    // fixture rules are interleaved with blank lines and a comment.
    $expectations = [
        '/fr/ad' => true,
        '/fr/admin' => true,
        '/fr/admin/' => true,
        '/fr/admin?' => true,
        '/fr/admin?test' => true,
        '/es/admin-disallow/' => false,
    ];

    foreach ($expectations as $path => $isAllowed) {
        $this->assertSame($isAllowed, $robots->allows($path, 'UserAgent013'));
    }
}
}
23 changes: 22 additions & 1 deletion tests/data/robots.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,25 @@ User-agent: google
Disallow: /
User-agent: UserAgent007

Disallow: /no-agents
Disallow: /no-agents

User-agent: UserAgent010
User-agent: UserAgent011
Disallow: /*?print
Disallow: /nl/admin/
Disallow: /en/admin/*
Disallow: /fr/admin$
Disallow: /es/admin-disallow/

User-agent: UserAgent012

User-agent: UserAgent013

Allow: /*?print
Disallow: /nl/admin/

Disallow: /en/admin/*
Allow: /fr/admin$

#comment
Disallow: /es/admin-disallow/
2 changes: 1 addition & 1 deletion tests/server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,6 @@
"author": "",
"license": "MIT",
"dependencies": {
"express": "^4.13.3"
"express": "^4.17.1"
}
}

0 comments on commit fe32f62

Please sign in to comment.