diff --git a/.gitignore b/.gitignore index 0973003..baf74ba 100644 --- a/.gitignore +++ b/.gitignore @@ -2,4 +2,5 @@ vendor/* composer.lock composer.phar -build/logs/coverage.xml +codeclimate.json +build/logs/* diff --git a/phpunit.xml b/phpunit.xml index 552a2dd..a59c8a7 100644 --- a/phpunit.xml +++ b/phpunit.xml @@ -1,17 +1,12 @@ - + - test/AtSymbol.php - test/CrawlDelay.php - test/EmptyDisallow.php - test/Host.php - test/CleanParam.php - test/Whitespaces.php - test/Comments.php + test/cases diff --git a/robotstxtparser.php b/source/robotstxtparser.php similarity index 76% rename from robotstxtparser.php rename to source/robotstxtparser.php index 52193f0..9682aaa 100644 --- a/robotstxtparser.php +++ b/source/robotstxtparser.php @@ -72,7 +72,7 @@ public function __construct($content, $encoding = self::DEFAULT_ENCODING) // Ensure that there's a newline at the end of the file, otherwise the // last line is ignored - $this->content .= "\n"; + $this->content .= PHP_EOL; // set default state $this->state = self::STATE_ZERO_POINT; @@ -81,27 +81,6 @@ public function __construct($content, $encoding = self::DEFAULT_ENCODING) $this->prepareRules(); } - public function getRules($userAgent = NULL) - { - if (is_null($userAgent)) { - //return all rules - return $this->rules; - } - else { - if (isset($this->rules[$userAgent])) { - return $this->rules[$userAgent]; - } - else { - return array(); - } - } - } - - public function getContent() - { - return $this->content; - } - // signals /** @@ -151,10 +130,10 @@ protected function lineSeparator() { */ protected function newLine() { - return ($this->current_char == "\n" - || $this->current_word == "\r\n" - || $this->current_word == "\n\r" - ); + return in_array(PHP_EOL, array( + $this->current_char, + $this->current_word + )); } /** @@ -248,6 +227,7 @@ protected function zeroPoint() /** * Read directive + * * @return RobotsTxtParser */ protected function readDirective() @@ -275,6 +255,7 @@ protected function readDirective() /** * Skip space + * * @return RobotsTxtParser */ protected function skipSpace() @@ -314,41 +295,79 @@ protected function readValue() return $this; } + /** + * Add value to directive based on the directive type + */ private function addValueToDirective() - { - if ($this->current_directive == self::DIRECTIVE_USERAGENT) - { - if (empty($this->rules[$this->current_word])) { - $this->rules[$this->current_word] = array(); - } - $this->userAgent = $this->current_word; - } - elseif ($this->current_directive == self::DIRECTIVE_CRAWL_DELAY) - { - $this->rules[$this->userAgent][$this->current_directive] = (double) $this->current_word; - } - elseif ($this->current_directive == self::DIRECTIVE_SITEMAP) { - $this->rules[$this->userAgent][$this->current_directive][] = $this->current_word; - } - elseif ($this->current_directive == self::DIRECTIVE_CLEAN_PARAM) { - $this->rules[$this->userAgent][$this->current_directive][] = $this->current_word; - } - elseif ($this->current_directive == self::DIRECTIVE_HOST) { - $this->rules[$this->userAgent][$this->current_directive] = $this->current_word; - } - else { - if (!empty($this->current_word)) { - if ($this->current_directive == self::DIRECTIVE_ALLOW - || $this->current_directive == self::DIRECTIVE_DISALLOW - ) { - $this->current_word = "/".ltrim($this->current_word, '/'); - } - $this->rules[$this->userAgent][$this->current_directive][] = self::prepareRegexRule($this->current_word); - } - } - $this->current_word = ""; - $this->switchState(self::STATE_ZERO_POINT); - } + { + switch ($this->current_directive) + { + case self::DIRECTIVE_USERAGENT: + $this->setUserAgent($this->current_word); + break; + + case self::DIRECTIVE_CRAWL_DELAY: + $this->addRule("floatval", false); + break; + + case self::DIRECTIVE_SITEMAP: + case self::DIRECTIVE_CLEAN_PARAM: + $this->addRule(); + break; + + case self::DIRECTIVE_HOST: + $this->addRule("trim", false); + break; + + case self::DIRECTIVE_ALLOW: + case self::DIRECTIVE_DISALLOW: + if (empty($this->current_word)) { + break; + } + $this->addRule("self::prepareRegexRule"); + break; + } + + // clean-up + $this->current_word = ""; + $this->switchState(self::STATE_ZERO_POINT); + } + + /** + * Set current user agent + * @param string $newAgent + */ + private function setUserAgent($newAgent = "*") + { + $this->userAgent = $newAgent; + + // create empty array if not there yet + if (empty($this->rules[$this->userAgent])) { + $this->rules[$this->userAgent] = array(); + } + } + + /** + * Prepare rule value and set the one + * @param callable $convert + * @param bool $append + * @return void + */ + private function addRule($convert = null, $append = true) + { + // convert value + $value = (!is_null($convert)) + ? call_user_func($convert, $this->current_word) + : $this->current_word; + + // set to rules + if ($append === true) { + $this->rules[$this->userAgent][$this->current_directive][] = $value; + } + else { + $this->rules[$this->userAgent][$this->current_directive] = $value; + } + } /** * Machine step @@ -383,13 +402,14 @@ protected function step() /** * Convert robots.txt rules to php regex - * + * * @link https://developers.google.com/webmasters/control-crawl-index/docs/robots_txt * @param string $value * @return string */ protected function prepareRegexRule($value) { + $value = "/" . ltrim($value, '/'); $value = str_replace('$', '\$', $value); $value = str_replace('?', '\?', $value); $value = str_replace('.', '\.', $value); @@ -509,4 +529,32 @@ public function getSitemaps($userAgent = '*') return $this->rules[$userAgent][self::DIRECTIVE_SITEMAP]; } + + /** + * Get rules based on user agent + * + * @param string|null $userAgent + * @return array + */ + public function getRules($userAgent = null) + { + // return all rules + if (is_null($userAgent)) { + return $this->rules; + } + elseif (isset($this->rules[$userAgent])) { + return $this->rules[$userAgent]; + } + else { + return array(); + } + } + + /** + * @return string + */ + public function getContent() + { + return $this->content; + } } diff --git a/test/bootstrap.php b/test/bootstrap.php new file mode 100644 index 0000000..2f94b84 --- /dev/null +++ b/test/bootstrap.php @@ -0,0 +1,2 @@ +assertInstanceOf('RobotsTxtParser', $parser); - $rules = $parser->getRules('*'); - $this->assertEmpty($rules, 'expected remove comments'); } @@ -38,9 +25,7 @@ public function testRemoveCommentsFromValue($robotsTxtContent, $expectedDisallow { $parser = new RobotsTxtParser($robotsTxtContent); $this->assertInstanceOf('RobotsTxtParser', $parser); - $rules = $parser->getRules('*'); - $this->assertNotEmpty($rules, 'expected data'); $this->assertArrayHasKey('disallow', $rules); $this->assertNotEmpty($rules['disallow'], 'disallow expected'); @@ -82,7 +67,7 @@ public function generateDataFor2Test() return array( array( "User-agent: * - Disallow: /tech #comment", + Disallow: /tech #comment", 'disallowValue' => '/tech', ), ); diff --git a/test/CrawlDelay.php b/test/cases/CrawlDelay.php similarity index 79% rename from test/CrawlDelay.php rename to test/cases/CrawlDelay.php index 050e410..67b1d81 100644 --- a/test/CrawlDelay.php +++ b/test/cases/CrawlDelay.php @@ -1,17 +1,6 @@