diff --git a/src/PhpSpreadsheet/Reader/Security/XmlScanner.php b/src/PhpSpreadsheet/Reader/Security/XmlScanner.php index 362a70c127..be64981920 100644 --- a/src/PhpSpreadsheet/Reader/Security/XmlScanner.php +++ b/src/PhpSpreadsheet/Reader/Security/XmlScanner.php @@ -6,6 +6,9 @@ class XmlScanner { + private const ENCODING_PATTERN = '/encoding\\s*=\\s*(["\'])(.+?)\\1/s'; + private const ENCODING_UTF7 = '/encoding\\s*=\\s*(["\'])UTF-7\\1/si'; + private string $pattern; /** @var ?callable */ @@ -36,13 +39,24 @@ private static function forceString(mixed $arg): string private function toUtf8(string $xml): string { $charset = $this->findCharSet($xml); + $foundUtf7 = $charset === 'UTF-7'; if ($charset !== 'UTF-8') { + $testStart = '/^.{0,4}\\s*findCharSet($xml); - if ($charset !== 'UTF-8') { - throw new Reader\Exception('Suspicious Double-encoded XML, spreadsheet file load() aborted to prevent XXE/XEE attacks'); + if ($startWithXml1 === 1 && preg_match($testStart, $xml) !== 1) { + throw new Reader\Exception('Double encoding not permitted'); } + $foundUtf7 = $foundUtf7 || (preg_match(self::ENCODING_UTF7, $xml) === 1); + $xml = preg_replace(self::ENCODING_PATTERN, '', $xml) ?? $xml; + } else { + $foundUtf7 = $foundUtf7 || (preg_match(self::ENCODING_UTF7, $xml) === 1); + } + if ($foundUtf7) { + throw new Reader\Exception('UTF-7 encoding not permitted'); + } + if (substr($xml, 0, Reader\Csv::UTF8_BOM_LEN) === Reader\Csv::UTF8_BOM) { + $xml = substr($xml, Reader\Csv::UTF8_BOM_LEN); } return $xml; @@ -50,15 +64,16 @@ private function toUtf8(string $xml): string private function findCharSet(string $xml): string { - $patterns = [ - '/encoding\\s*=\\s*"([^"]*]?)"/', - "/encoding\\s*=\\s*'([^']*?)'/", - ]; - - foreach ($patterns as $pattern) { - if (preg_match($pattern, $xml, $matches)) { - return strtoupper($matches[1]); - } + if (substr($xml, 0, 4) === "\x4c\x6f\xa7\x94") { + throw new Reader\Exception('EBCDIC encoding not permitted'); + } + $encoding = Reader\Csv::guessEncodingBom('', $xml); + if ($encoding !== '') { + return $encoding; + } + $xml = str_replace("\0", '', $xml); + if (preg_match(self::ENCODING_PATTERN, $xml, $matches)) { + return strtoupper($matches[2]); } return 'UTF-8'; @@ -71,13 +86,15 @@ private function findCharSet(string $xml): string */ public function scan($xml): string { + // Don't rely purely on libxml_disable_entity_loader() + $pattern = '/\\0*' . implode('\\0*', str_split($this->pattern)) . '\\0*/'; + $xml = "$xml"; + if (preg_match($pattern, $xml)) { + throw new Reader\Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks'); + } $xml = $this->toUtf8($xml); - - // Don't rely purely on libxml_disable_entity_loader() - $pattern = '/\\0?' . implode('\\0?', str_split($this->pattern)) . '\\0?/'; - if (preg_match($pattern, $xml)) { throw new Reader\Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks'); } @@ -90,7 +107,7 @@ public function scan($xml): string } /** - * Scan theXML for use of getSecurityScannerOrThrow()->scan($data); // Why? //$data = str_replace("'", '"', $data); // fix headers with single quote @@ -106,14 +106,6 @@ public function canRead(string $filename): bool } } - // Retrieve charset encoding - if (preg_match('//m', $data, $matches)) { - $charSet = strtoupper($matches[1]); - if (preg_match('/^ISO-8859-\d[\dL]?$/i', $charSet) === 1) { - $data = StringHelper::convertEncoding($data, 'UTF-8', $charSet); - $data = (string) preg_replace('/()/um', '$1' . 'UTF-8' . '$2', $data, 1); - } - } $this->fileContents = $data; return $valid; diff --git a/tests/PhpSpreadsheetTests/Reader/Security/XmlScannerTest.php b/tests/PhpSpreadsheetTests/Reader/Security/XmlScannerTest.php index aa214b4a06..9299161f0e 100644 --- a/tests/PhpSpreadsheetTests/Reader/Security/XmlScannerTest.php +++ b/tests/PhpSpreadsheetTests/Reader/Security/XmlScannerTest.php @@ -30,7 +30,11 @@ public static function providerValidXML(): array self::assertNotFalse($glob); foreach ($glob as $file) { $filename = realpath($file); - $expectedResult = file_get_contents($file); + $expectedResult = (string) file_get_contents($file); + if (preg_match('/UTF-16(LE|BE)?/', $file, $matches) == 1) { + $expectedResult = (string) mb_convert_encoding($expectedResult, 'UTF-8', $matches[0]); + $expectedResult = preg_replace('/encoding\\s*=\\s*[\'"]UTF-\\d+(LE|BE)?[\'"]/', '', $expectedResult) ?? $expectedResult; + } $tests[basename($file)] = [$filename, $expectedResult]; } @@ -132,19 +136,47 @@ public function testEncodingAllowsMixedCase(): void self::assertSame($input, $output); } - public function testUtf7Whitespace(): void + /** + * @dataProvider providerInvalidXlsx + */ + public function testInvalidXlsx(string $filename, string $message): void { $this->expectException(ReaderException::class); - $this->expectExceptionMessage('Double-encoded'); + $this->expectExceptionMessage($message); $reader = new Xlsx(); - $reader->load('tests/data/Reader/XLSX/utf7white.dontuse'); + $reader->load("tests/data/Reader/XLSX/$filename"); } - public function testUtf8Entity(): void + public static function providerInvalidXlsx(): array + { + return [ + ['utf7white.dontuse', 'UTF-7 encoding not permitted'], + ['utf7quoteorder.dontuse', 'UTF-7 encoding not permitted'], + ['utf8and16.dontuse', 'Double encoding not permitted'], + ['utf8and16.entity.dontuse', 'Detected use of ENTITY'], + ['utf8entity.dontuse', 'Detected use of ENTITY'], + ['utf16entity.dontuse', 'Detected use of ENTITY'], + ['ebcdic.dontuse', 'EBCDIC encoding not permitted'], + ]; + } + + /** + * @dataProvider providerValidUtf16 + */ + public function testValidUtf16(string $filename): void { - $this->expectException(ReaderException::class); - $this->expectExceptionMessage('Detected use of ENTITY'); $reader = new Xlsx(); - $reader->load('tests/data/Reader/XLSX/utf8entity.dontuse'); + $spreadsheet = $reader->load("tests/data/Reader/XLSX/$filename"); + $sheet = $spreadsheet->getActiveSheet(); + self::assertSame(1, $sheet->getCell('A1')->getValue()); + $spreadsheet->disconnectWorksheets(); + } + + public static function providerValidUtf16(): array + { + return [ + ['utf16be.xlsx'], + ['utf16be.bom.xlsx'], + ]; } } diff --git a/tests/data/Reader/XLSX/ebcdic.dontuse b/tests/data/Reader/XLSX/ebcdic.dontuse new file mode 100644 index 0000000000..9d366bd127 Binary files /dev/null and b/tests/data/Reader/XLSX/ebcdic.dontuse differ diff --git a/tests/data/Reader/XLSX/utf16be.bom.xlsx b/tests/data/Reader/XLSX/utf16be.bom.xlsx new file mode 100644 index 0000000000..d18c3600e6 Binary files /dev/null and b/tests/data/Reader/XLSX/utf16be.bom.xlsx differ diff --git a/tests/data/Reader/XLSX/utf16be.xlsx b/tests/data/Reader/XLSX/utf16be.xlsx new file mode 100644 index 0000000000..6bcaf0767d Binary files /dev/null and b/tests/data/Reader/XLSX/utf16be.xlsx differ diff --git a/tests/data/Reader/XLSX/utf16entity.dontuse b/tests/data/Reader/XLSX/utf16entity.dontuse new file mode 100644 index 0000000000..2462cb870a Binary files /dev/null and b/tests/data/Reader/XLSX/utf16entity.dontuse differ diff --git a/tests/data/Reader/XLSX/utf7quoteorder.dontuse b/tests/data/Reader/XLSX/utf7quoteorder.dontuse new file mode 100644 index 0000000000..1ad4adab22 Binary files /dev/null and b/tests/data/Reader/XLSX/utf7quoteorder.dontuse differ diff --git a/tests/data/Reader/XLSX/utf8and16.dontuse b/tests/data/Reader/XLSX/utf8and16.dontuse new file mode 100644 index 0000000000..e95fecdc04 Binary files /dev/null and b/tests/data/Reader/XLSX/utf8and16.dontuse differ diff --git a/tests/data/Reader/XLSX/utf8and16.entity.dontuse b/tests/data/Reader/XLSX/utf8and16.entity.dontuse new file mode 100644 index 0000000000..b604458d0a Binary files /dev/null and b/tests/data/Reader/XLSX/utf8and16.entity.dontuse differ