diff --git a/src/Smalot/PdfParser/RawData/RawDataParser.php b/src/Smalot/PdfParser/RawData/RawDataParser.php index 7d6d45f3..ec1c6a59 100644 --- a/src/Smalot/PdfParser/RawData/RawDataParser.php +++ b/src/Smalot/PdfParser/RawData/RawDataParser.php @@ -631,58 +631,60 @@ protected function getRawObject($pdfData, $offset = 0) case '[': // \x5B LEFT SQUARE BRACKET case ']': // \x5D RIGHT SQUARE BRACKET - // array object - $objtype = $char; - ++$offset; - if ('[' == $char) { + // array object + $objtype = $char; + ++$offset; + if ('[' == $char) { + // get array content + $objval = []; + do { + $oldOffset = $offset; + // get element + $element = $this->getRawObject($pdfData, $offset); + $offset = $element[2]; + $objval[] = $element; + } while ((']' != $element[0]) && ($offset != $oldOffset)); + // remove closing delimiter + array_pop($objval); + } + break; + + case '<': // \x3C LESS-THAN SIGN + case '>': // \x3E GREATER-THAN SIGN + if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) { + // dictionary object + $objtype = $char.$char; + $offset += 2; + if ('<' == $char) { // get array content $objval = []; do { + $oldOffset = $offset; // get element $element = $this->getRawObject($pdfData, $offset); $offset = $element[2]; $objval[] = $element; - } while (']' != $element[0]); + } while (('>>' != $element[0]) && ($offset != $oldOffset)); // remove closing delimiter array_pop($objval); } - break; - - case '<': // \x3C LESS-THAN SIGN - case '>': // \x3E GREATER-THAN SIGN - if (isset($pdfData[($offset + 1)]) && ($pdfData[($offset + 1)] == $char)) { - // dictionary object - $objtype = $char.$char; - $offset += 2; - if ('<' == $char) { - // get array content - $objval = []; - do { - // get element - $element = $this->getRawObject($pdfData, $offset); - $offset = $element[2]; - $objval[] = $element; - } while ('>>' != $element[0]); - // remove closing delimiter - array_pop($objval); - } - } else { - // hexadecimal string object - $objtype = $char; - ++$offset; - $pregResult = preg_match( + } else { + // hexadecimal string object + $objtype = $char; + ++$offset; + $pregResult = preg_match( '/^([0-9A-Fa-f\x09\x0a\x0c\x0d\x20]+)>/iU', substr($pdfData, $offset), $matches ); - if (('<' == $char) && 1 == $pregResult) { - // remove white space characters - $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", ''); - $offset += \strlen($matches[0]); - } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) { - $offset = $endpos + 1; - } + if (('<' == $char) && 1 == $pregResult) { + // remove white space characters + $objval = strtr($matches[1], "\x09\x0a\x0c\x0d\x20", ''); + $offset += \strlen($matches[0]); + } elseif (false !== ($endpos = strpos($pdfData, '>', $offset))) { + $offset = $endpos + 1; } + } break; default: diff --git a/tests/Integration/RawData/RawDataParserTest.php b/tests/Integration/RawData/RawDataParserTest.php new file mode 100644 index 00000000..de2c5af0 --- /dev/null +++ b/tests/Integration/RawData/RawDataParserTest.php @@ -0,0 +1,87 @@ + + * @date 2020-06-01 + * + * @author Sébastien MALOT + * @date 2017-01-03 + * + * @license LGPLv3 + * @url + * + * PdfParser is a pdf library written in PHP, extraction oriented. + * Copyright (C) 2017 - Sébastien MALOT + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. + * If not, see . + */ + +namespace Tests\Smalot\PdfParser\Integration\RawData; + +use Smalot\PdfParser\RawData\RawDataParser; +use Tests\Smalot\PdfParser\TestCase; + +class RawDataParserHelper extends RawDataParser +{ + /** + * Expose protected function "getRawObject". + */ + public function exposeGetRawObject($pdfData, $offset = 0) + { + return $this->getRawObject($pdfData, $offset); + } +} + +class RawDataParserTest extends TestCase +{ + protected function setUp() + { + parent::setUp(); + + $this->fixture = new RawDataParserHelper(); + } + + /** + * Tests buggy behavior of getRawObject. + * + * When PDF has corrupted xref table getRawObject may run into an infinite loop. + * + * @see https://github.com/smalot/pdfparser/issues/372 + * @see https://github.com/smalot/pdfparser/pull/377 + */ + public function testGetRawObjectIssue372() + { + // The following $data content is a minimal example to trigger the infinite loop + $data = '<>'; + + // calling "getRawObject" via "exposeGetRawObject" would result in an infinite loop + // if the fix is not there. + $result = $this->fixture->exposeGetRawObject($data); + + $this->assertEquals( + [ + '<<', + [ + ['/', 'Producer', 11], + ['(', 'eDkºãa˜þõ‚LÅòÕ�PïÙ��', 52], + ], + 52, + ], + $result + ); + } +}