Skip to content

Commit

Permalink
Merge commit from fork
Browse files Browse the repository at this point in the history
* Security Patch

* Throw Exception for EBCDIC Encoding

* Mixed UTF-8 and UTF-16

Further mischief. I don't know whether the examples are truly valid XML, but PhpSpreadsheet is letting them sneak through.
  • Loading branch information
oleibman authored Nov 10, 2024
1 parent 7c973ab commit c04a938
Show file tree
Hide file tree
Showing 10 changed files with 77 additions and 36 deletions.
53 changes: 35 additions & 18 deletions src/PhpSpreadsheet/Reader/Security/XmlScanner.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@

class XmlScanner
{
private const ENCODING_PATTERN = '/encoding\\s*=\\s*(["\'])(.+?)\\1/s';
private const ENCODING_UTF7 = '/encoding\\s*=\\s*(["\'])UTF-7\\1/si';

private string $pattern;

/** @var ?callable */
Expand Down Expand Up @@ -36,29 +39,41 @@ private static function forceString(mixed $arg): string
private function toUtf8(string $xml): string
{
$charset = $this->findCharSet($xml);
$foundUtf7 = $charset === 'UTF-7';
if ($charset !== 'UTF-8') {
$testStart = '/^.{0,4}\\s*<?xml/s';
$startWithXml1 = preg_match($testStart, $xml);
$xml = self::forceString(mb_convert_encoding($xml, 'UTF-8', $charset));

$charset = $this->findCharSet($xml);
if ($charset !== 'UTF-8') {
throw new Reader\Exception('Suspicious Double-encoded XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
if ($startWithXml1 === 1 && preg_match($testStart, $xml) !== 1) {
throw new Reader\Exception('Double encoding not permitted');
}
$foundUtf7 = $foundUtf7 || (preg_match(self::ENCODING_UTF7, $xml) === 1);
$xml = preg_replace(self::ENCODING_PATTERN, '', $xml) ?? $xml;
} else {
$foundUtf7 = $foundUtf7 || (preg_match(self::ENCODING_UTF7, $xml) === 1);
}
if ($foundUtf7) {
throw new Reader\Exception('UTF-7 encoding not permitted');
}
if (substr($xml, 0, Reader\Csv::UTF8_BOM_LEN) === Reader\Csv::UTF8_BOM) {
$xml = substr($xml, Reader\Csv::UTF8_BOM_LEN);
}

return $xml;
}

private function findCharSet(string $xml): string
{
$patterns = [
'/encoding\\s*=\\s*"([^"]*]?)"/',
"/encoding\\s*=\\s*'([^']*?)'/",
];

foreach ($patterns as $pattern) {
if (preg_match($pattern, $xml, $matches)) {
return strtoupper($matches[1]);
}
if (substr($xml, 0, 4) === "\x4c\x6f\xa7\x94") {
throw new Reader\Exception('EBCDIC encoding not permitted');
}
$encoding = Reader\Csv::guessEncodingBom('', $xml);
if ($encoding !== '') {
return $encoding;
}
$xml = str_replace("\0", '', $xml);
if (preg_match(self::ENCODING_PATTERN, $xml, $matches)) {
return strtoupper($matches[2]);
}

return 'UTF-8';
Expand All @@ -71,13 +86,15 @@ private function findCharSet(string $xml): string
*/
public function scan($xml): string
{
// Don't rely purely on libxml_disable_entity_loader()
$pattern = '/\\0*' . implode('\\0*', str_split($this->pattern)) . '\\0*/';

$xml = "$xml";
if (preg_match($pattern, $xml)) {
throw new Reader\Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
}

$xml = $this->toUtf8($xml);

// Don't rely purely on libxml_disable_entity_loader()
$pattern = '/\\0?' . implode('\\0?', str_split($this->pattern)) . '\\0?/';

if (preg_match($pattern, $xml)) {
throw new Reader\Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
}
Expand All @@ -90,7 +107,7 @@ public function scan($xml): string
}

/**
* Scan theXML for use of <!ENTITY to prevent XXE/XEE attacks.
* Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks.
*/
public function scanFile(string $filestream): string
{
Expand Down
12 changes: 2 additions & 10 deletions src/PhpSpreadsheet/Reader/Xml.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
use PhpOffice\PhpSpreadsheet\Settings;
use PhpOffice\PhpSpreadsheet\Shared\Date;
use PhpOffice\PhpSpreadsheet\Shared\File;
use PhpOffice\PhpSpreadsheet\Shared\StringHelper;
use PhpOffice\PhpSpreadsheet\Spreadsheet;
use PhpOffice\PhpSpreadsheet\Worksheet\SheetView;
use PhpOffice\PhpSpreadsheet\Worksheet\Worksheet;
Expand Down Expand Up @@ -91,7 +90,8 @@ public function canRead(string $filename): bool
];

// Open file
$data = file_get_contents($filename) ?: '';
$data = (string) file_get_contents($filename);
$data = $this->getSecurityScannerOrThrow()->scan($data);

// Why?
//$data = str_replace("'", '"', $data); // fix headers with single quote
Expand All @@ -106,14 +106,6 @@ public function canRead(string $filename): bool
}
}

// Retrieve charset encoding
if (preg_match('/<?xml.*encoding=[\'"](.*?)[\'"].*?>/m', $data, $matches)) {
$charSet = strtoupper($matches[1]);
if (preg_match('/^ISO-8859-\d[\dL]?$/i', $charSet) === 1) {
$data = StringHelper::convertEncoding($data, 'UTF-8', $charSet);
$data = (string) preg_replace('/(<?xml.*encoding=[\'"]).*?([\'"].*?>)/um', '$1' . 'UTF-8' . '$2', $data, 1);
}
}
$this->fileContents = $data;

return $valid;
Expand Down
48 changes: 40 additions & 8 deletions tests/PhpSpreadsheetTests/Reader/Security/XmlScannerTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,11 @@ public static function providerValidXML(): array
self::assertNotFalse($glob);
foreach ($glob as $file) {
$filename = realpath($file);
$expectedResult = file_get_contents($file);
$expectedResult = (string) file_get_contents($file);
if (preg_match('/UTF-16(LE|BE)?/', $file, $matches) == 1) {
$expectedResult = (string) mb_convert_encoding($expectedResult, 'UTF-8', $matches[0]);
$expectedResult = preg_replace('/encoding\\s*=\\s*[\'"]UTF-\\d+(LE|BE)?[\'"]/', '', $expectedResult) ?? $expectedResult;
}
$tests[basename($file)] = [$filename, $expectedResult];
}

Expand Down Expand Up @@ -132,19 +136,47 @@ public function testEncodingAllowsMixedCase(): void
self::assertSame($input, $output);
}

public function testUtf7Whitespace(): void
/**
* @dataProvider providerInvalidXlsx
*/
public function testInvalidXlsx(string $filename, string $message): void
{
$this->expectException(ReaderException::class);
$this->expectExceptionMessage('Double-encoded');
$this->expectExceptionMessage($message);
$reader = new Xlsx();
$reader->load('tests/data/Reader/XLSX/utf7white.dontuse');
$reader->load("tests/data/Reader/XLSX/$filename");
}

public function testUtf8Entity(): void
/**
 * Data sets for testInvalidXlsx (referenced via its dataProvider annotation).
 *
 * Each row is a pair: the name of a fixture file and the text passed to
 * expectExceptionMessage() for that file. The test loads the file from
 * tests/data/Reader/XLSX/.
 *
 * @return array<int, array{0: string, 1: string}>
 */
public static function providerInvalidXlsx(): array
{
return [
// Declared or detected UTF-7 encoding.
['utf7white.dontuse', 'UTF-7 encoding not permitted'],
['utf7quoteorder.dontuse', 'UTF-7 encoding not permitted'],
// NOTE(review): fixture name suggests mixed UTF-8/UTF-16 content; the binary fixture is not visible here.
['utf8and16.dontuse', 'Double encoding not permitted'],
// Fixtures expected to trip the ENTITY detection.
['utf8and16.entity.dontuse', 'Detected use of ENTITY'],
['utf8entity.dontuse', 'Detected use of ENTITY'],
['utf16entity.dontuse', 'Detected use of ENTITY'],
// Fixture expected to be rejected as EBCDIC.
['ebcdic.dontuse', 'EBCDIC encoding not permitted'],
];
}

/**
* @dataProvider providerValidUtf16
*/
public function testValidUtf16(string $filename): void
{
$this->expectException(ReaderException::class);
$this->expectExceptionMessage('Detected use of ENTITY');
$reader = new Xlsx();
$reader->load('tests/data/Reader/XLSX/utf8entity.dontuse');
$spreadsheet = $reader->load("tests/data/Reader/XLSX/$filename");
$sheet = $spreadsheet->getActiveSheet();
self::assertSame(1, $sheet->getCell('A1')->getValue());
$spreadsheet->disconnectWorksheets();
}

/**
 * Data sets for testValidUtf16 (referenced via its dataProvider annotation).
 *
 * Each row holds a single fixture file name, which the test loads from
 * tests/data/Reader/XLSX/.
 *
 * @return array<int, array{0: string}>
 */
public static function providerValidUtf16(): array
{
return [
// NOTE(review): names suggest UTF-16BE fixtures without and with a BOM; the binary fixtures are not visible here.
['utf16be.xlsx'],
['utf16be.bom.xlsx'],
];
}
}
Binary file added tests/data/Reader/XLSX/ebcdic.dontuse
Binary file not shown.
Binary file added tests/data/Reader/XLSX/utf16be.bom.xlsx
Binary file not shown.
Binary file added tests/data/Reader/XLSX/utf16be.xlsx
Binary file not shown.
Binary file added tests/data/Reader/XLSX/utf16entity.dontuse
Binary file not shown.
Binary file added tests/data/Reader/XLSX/utf7quoteorder.dontuse
Binary file not shown.
Binary file added tests/data/Reader/XLSX/utf8and16.dontuse
Binary file not shown.
Binary file added tests/data/Reader/XLSX/utf8and16.entity.dontuse
Binary file not shown.

0 comments on commit c04a938

Please sign in to comment.