From 209c5673e538e026abfd108c30ebaed71f2faaec Mon Sep 17 00:00:00 2001 From: Rene Fischer Date: Wed, 14 Jul 2021 11:00:43 +0200 Subject: [PATCH] ITST-20737: XLSX reader incorrectly handles empty elements - Fix - Refactor $row_open logic into $reader_points_at_new_row logic to simplify the code and reduce code duplication. - Add test EmptyRowsTest to cover the failing behavior. - Sidechange: Extend __construct() phpdoc with missing note regarding OutputColumnNames parameter. --- lib/Reader.php | 87 ++++++++++++++++++----------------------- tests/EmptyRowsTest.php | 37 ++++++++++++++++++ 2 files changed, 74 insertions(+), 50 deletions(-) create mode 100644 tests/EmptyRowsTest.php diff --git a/lib/Reader.php b/lib/Reader.php index 40f63a4..16f0be7 100644 --- a/lib/Reader.php +++ b/lib/Reader.php @@ -53,8 +53,8 @@ class Reader implements Iterator, Countable /** @var bool Internal storage for the result of the valid() method related to the Iterator interface. */ private $valid = false; - /** @var bool Whether the reader is currently looking at an element within a node. */ - private $row_open = false; + /** @var bool Whether the reader is currently pointing at a starting node. */ + private $reader_points_at_new_row = false; /** @var int Current row number in the file. */ private $row_number = 0; @@ -73,6 +73,8 @@ class Reader implements Iterator, Countable * If true, row content will not contain empty cells * - SharedStringsConfiguration (SharedStringsConfiguration) * Configuration options to control shared string reading and caching behaviour + * - OutputColumnNames (bool) + * If true, output columns will use Excel-style column names (A-ZZ) instead of numbers as column keys. * * @throws Exception * @throws RuntimeException @@ -236,7 +238,7 @@ public function rewind() $this->worksheet_reader->open($this->worksheet_path); $this->valid = true; - $this->row_open = false; + $this->reader_points_at_new_row = false; $this->current_row = false; $this->row_number = 0; } @@ -267,59 +269,45 @@ public function current() public function next() { $this->row_number++; - $this->current_row = array(); - // Walk through the document until the beginning of the first spreadsheet row. - if (!$this->row_open) { - while ($this->valid = $this->worksheet_reader->read()) { - if (!$this->worksheet_reader->matchesElement('row')) { - continue; - } - - $this->row_open = true; - - /* Getting the row spanning area (stored as e.g. "1:12", or more rarely, "1:3 6:8 11:12") - * so that the last cells will be present, even if empty. */ - $row_spans = $this->worksheet_reader->getAttributeNsId('spans'); - - if ($row_spans) { - $row_spans = explode(':', $row_spans); - $current_row_column_count = array_pop($row_spans); // Always get the last segment, regardless of spans structure. - } else { - $current_row_column_count = 0; - } - - // If configured: Return empty strings for empty values - if ($current_row_column_count > 0 && !$this->skip_empty_cells) { - $this->current_row = array_fill(0, $current_row_column_count, ''); - } - - // Do not read further than here if the current 'row' node is not the one to be read - if ((int) $this->worksheet_reader->getAttributeNsId('r') !== $this->row_number) { - return self::adjustRowOutput($this->current_row); - } - break; + // Ensure that the read pointer is pointing at an opening element. + while (!$this->reader_points_at_new_row && $this->valid = $this->worksheet_reader->read()) { + if ($this->worksheet_reader->matchesElement('row')) { + $this->reader_points_at_new_row = true; } - + } + if (!$this->reader_points_at_new_row) { // No (further) rows found for reading. - if (!$this->row_open) { - return array(); - } + return array(); + } + + /* Get the row spanning area (stored as e.g. "1:12", or more rarely, "1:3 6:8 11:12") + * so that the last cells will be present, even if empty. */ + $row_spans = $this->worksheet_reader->getAttributeNsId('spans'); + if ($row_spans) { + $row_spans = explode(':', $row_spans); + $current_row_column_count = array_pop($row_spans); // Always get the last segment, regardless of spans structure. + } else { + $current_row_column_count = 0; + } + + // If configured: Return empty strings for empty values + if ($current_row_column_count > 0 && !$this->skip_empty_cells) { + $this->current_row = array_fill(0, $current_row_column_count, ''); } - // Do not read further than here if the current 'row' node is not the one to be read if ((int) $this->worksheet_reader->getAttributeNsId('r') !== $this->row_number) { - $row_spans = $this->worksheet_reader->getAttributeNsId('spans'); - if ($row_spans) { - $row_spans = explode(':', $row_spans); - $current_row_column_count = array_pop($row_spans); // Always get the last segment, regardless of spans structure. - } else { - $current_row_column_count = 0; - } - if ($current_row_column_count > 0 && !$this->skip_empty_cells) { - $this->current_row = array_fill(0, $current_row_column_count, ''); - } + // We just skipped over one or multiple empty row(s). Keep current reader state and return empty cells. + return self::adjustRowOutput($this->current_row); + } + + // From here on out, successive next() calls must start with a read() for the next row. + $this->reader_points_at_new_row = false; + + // Handle self-closing row tags (e.g. ) caused by e.g. usage of thick borders in adjacent cells. + $this->worksheet_reader->moveToElement(); // Necessary for isEmptyElement to work correctly. + if ($this->worksheet_reader->isEmptyElement) { return self::adjustRowOutput($this->current_row); } @@ -341,7 +329,6 @@ public function next() // tag: Finish row reading. case 'row': if ($this->worksheet_reader->isClosingTag()) { - $this->row_open = false; break 2; } break; diff --git a/tests/EmptyRowsTest.php b/tests/EmptyRowsTest.php new file mode 100644 index 0000000..80f1120 --- /dev/null +++ b/tests/EmptyRowsTest.php @@ -0,0 +1,37 @@ +open(self::TEST_FILE); + $output_cells = array(); + while ($row = $reader->next()) { + $output_cells[] = $row[1]; // All values to check for are in the 2nd column. Ignore all other columns. + } + $reader->close(); + + self::assertSame( + array('', 'row 2', '', '', 'row 5'), + $output_cells, + 'The retrieved sheet content was not as expected.' + ); + } +}