Skip to content

Commit

Permalink
ITST-20737: XLSX reader incorrectly handles empty <row> elements - Fix
Browse files Browse the repository at this point in the history
- Refactor $row_open logic into $reader_points_at_new_row logic to simplify the code and reduce code duplication.
- Add test EmptyRowsTest to cover the failing behavior.
- Sidechange: Extend __construct() phpdoc with missing note regarding OutputColumnNames parameter.
  • Loading branch information
adirfische committed Jul 14, 2021
1 parent d36339d commit 209c567
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 50 deletions.
87 changes: 37 additions & 50 deletions lib/Reader.php
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,8 @@ class Reader implements Iterator, Countable
/** @var bool Internal storage for the result of the valid() method related to the Iterator interface. */
private $valid = false;

/** @var bool Whether the reader is currently looking at an element within a <row> node. */
private $row_open = false;
/** @var bool Whether the reader is currently pointing at a starting <row> node. */
private $reader_points_at_new_row = false;

/** @var int Current row number in the file. */
private $row_number = 0;
Expand All @@ -73,6 +73,8 @@ class Reader implements Iterator, Countable
* If true, row content will not contain empty cells
* - SharedStringsConfiguration (SharedStringsConfiguration)
* Configuration options to control shared string reading and caching behaviour
* - OutputColumnNames (bool)
* If true, output columns will use Excel-style column names (A-ZZ) instead of numbers as column keys.
*
* @throws Exception
* @throws RuntimeException
Expand Down Expand Up @@ -236,7 +238,7 @@ public function rewind()
$this->worksheet_reader->open($this->worksheet_path);

$this->valid = true;
$this->row_open = false;
$this->reader_points_at_new_row = false;
$this->current_row = false;
$this->row_number = 0;
}
Expand Down Expand Up @@ -267,59 +269,45 @@ public function current()
public function next()
{
$this->row_number++;

$this->current_row = array();

// Walk through the document until the beginning of the first spreadsheet row.
if (!$this->row_open) {
while ($this->valid = $this->worksheet_reader->read()) {
if (!$this->worksheet_reader->matchesElement('row')) {
continue;
}

$this->row_open = true;

/* Getting the row spanning area (stored as e.g. "1:12", or more rarely, "1:3 6:8 11:12")
* so that the last cells will be present, even if empty. */
$row_spans = $this->worksheet_reader->getAttributeNsId('spans');

if ($row_spans) {
$row_spans = explode(':', $row_spans);
$current_row_column_count = array_pop($row_spans); // Always get the last segment, regardless of spans structure.
} else {
$current_row_column_count = 0;
}

// If configured: Return empty strings for empty values
if ($current_row_column_count > 0 && !$this->skip_empty_cells) {
$this->current_row = array_fill(0, $current_row_column_count, '');
}

// Do not read further than here if the current 'row' node is not the one to be read
if ((int) $this->worksheet_reader->getAttributeNsId('r') !== $this->row_number) {
return self::adjustRowOutput($this->current_row);
}
break;
// Ensure that the read pointer is pointing at an opening <row> element.
while (!$this->reader_points_at_new_row && $this->valid = $this->worksheet_reader->read()) {
if ($this->worksheet_reader->matchesElement('row')) {
$this->reader_points_at_new_row = true;
}

}
if (!$this->reader_points_at_new_row) {
// No (further) rows found for reading.
if (!$this->row_open) {
return array();
}
return array();
}

/* Get the row spanning area (stored as e.g. "1:12", or more rarely, "1:3 6:8 11:12")
* so that the last cells will be present, even if empty. */
$row_spans = $this->worksheet_reader->getAttributeNsId('spans');
if ($row_spans) {
$row_spans = explode(':', $row_spans);
$current_row_column_count = array_pop($row_spans); // Always get the last segment, regardless of spans structure.
} else {
$current_row_column_count = 0;
}

// If configured: Return empty strings for empty values
if ($current_row_column_count > 0 && !$this->skip_empty_cells) {
$this->current_row = array_fill(0, $current_row_column_count, '');
}

// Do not read further than here if the current 'row' node is not the one to be read
if ((int) $this->worksheet_reader->getAttributeNsId('r') !== $this->row_number) {
$row_spans = $this->worksheet_reader->getAttributeNsId('spans');
if ($row_spans) {
$row_spans = explode(':', $row_spans);
$current_row_column_count = array_pop($row_spans); // Always get the last segment, regardless of spans structure.
} else {
$current_row_column_count = 0;
}
if ($current_row_column_count > 0 && !$this->skip_empty_cells) {
$this->current_row = array_fill(0, $current_row_column_count, '');
}
// We just skipped over one or multiple empty row(s). Keep current reader state and return empty cells.
return self::adjustRowOutput($this->current_row);
}

// From here on out, successive next() calls must start with a read() for the next row.
$this->reader_points_at_new_row = false;

// Handle self-closing row tags (e.g. <row r="1"/>) caused by e.g. usage of thick borders in adjacent cells.
$this->worksheet_reader->moveToElement(); // Necessary for isEmptyElement to work correctly.
if ($this->worksheet_reader->isEmptyElement) {
return self::adjustRowOutput($this->current_row);
}

Expand All @@ -341,7 +329,6 @@ public function next()
// </row> tag: Finish row reading.
case 'row':
if ($this->worksheet_reader->isClosingTag()) {
$this->row_open = false;
break 2;
}
break;
Expand Down
37 changes: 37 additions & 0 deletions tests/EmptyRowsTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
<?php

namespace Aspera\Spreadsheet\XLSX\Tests;

require_once __DIR__ . '/../vendor/autoload.php';

use Exception;
use PHPUnit\Framework\TestCase;
use Aspera\Spreadsheet\XLSX\Reader;

class EmptyRowsTest extends TestCase
{
const TEST_FILE = __DIR__ . '/input_files/empty_rows_test.xlsx';

/**
* Check if empty rows are detected and reported correctly.
* Includes test of self-closing row elements caused by e.g. the usage of thick borders in adjacent cells.
*
* @throws Exception
*/
public function testCellContent()
{
$reader = new Reader();
$reader->open(self::TEST_FILE);
$output_cells = array();
while ($row = $reader->next()) {
$output_cells[] = $row[1]; // All values to check for are in the 2nd column. Ignore all other columns.
}
$reader->close();

self::assertSame(
array('', 'row 2', '', '', 'row 5'),
$output_cells,
'The retrieved sheet content was not as expected.'
);
}
}

0 comments on commit 209c567

Please sign in to comment.