Skip to content

Commit

Permalink
Optimize CSV validation and header mapping logic if header is enabled
Browse files Browse the repository at this point in the history
This commit makes several changes to the CSV validation and header mapping logic. First, a method to set IDs has been added to Column.php. Handling of headers in CSV files has also been updated to better support files both with and without headers. In addition, the mapping of columns by header name has been moved from Schema.php to CsvFile.php to better align with the data flow. Lastly, the unnecessary exception thrown by the getColumn method of Schema.php has been removed. Unit tests have been updated to reflect these changes.
  • Loading branch information
SmetDenis committed Mar 30, 2024
1 parent 22b4951 commit 2027c76
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 64 deletions.
5 changes: 5 additions & 0 deletions src/Csv/Column.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ public function validateCell(string $cellValue, int $line = Error::UNDEFINED_LIN
return $this->getValidator()->validateCell($cellValue, $line);
}

/**
 * Overwrites this column's id with the given index.
 *
 * CsvFile::getColumnsMappedByHeader() calls this with the position of the matching column
 * in the real CSV header.
 *
 * @param int $realIndex Index to store as the column id
 */
public function setId(int $realIndex): void
{
$this->id = $realIndex;
}

private function prepareRuleSet(string $schemaKey): array
{
$rules = [];
Expand Down
35 changes: 31 additions & 4 deletions src/Csv/CsvFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ public function getHeader(): array
if ($this->structure->isHeader() && !$this->isEmpty) {
// TODO: add handler for empty file
// League\Csv\SyntaxError : The header record does not exist or is empty at offset: `0
$this->header = $this->reader->getHeader();
$this->header = $this->getRecordsChunk(0, 1)->first();
} else {
$this->header = \range(0, \count($this->getRecordsChunk(0, 1)->first()) - 1);
}
}

Expand All @@ -74,12 +76,12 @@ public function getHeader(): array

public function getRecords(): \Iterator
{
return $this->reader->getRecords($this->getHeader());
return $this->reader->getRecords([]);
}

public function getRecordsChunk(int $offset = 0, int $limit = -1): TabularDataReader
{
return Statement::create(null, $offset, $limit)->process($this->reader, $this->getHeader());
return Statement::create(null, $offset, $limit)->process($this->reader, []); // No headers is required!
}

public function validate(bool $quickStop = false): ErrorSuite
Expand All @@ -92,13 +94,38 @@ public function getRealColumNumber(): int
return \count($this->getRecordsChunk(0, 1)->first());
}

/**
 * Returns the schema object held by this CSV file.
 *
 * The same instance is returned (no clone), so changes made through it are visible here.
 */
public function getSchema(): Schema
{
return $this->schema;
}

/**
 * Maps schema columns onto the real column positions of the CSV file.
 *
 * Walks the header returned by getHeader() and looks every entry up in the schema.
 * Header entries without a schema column are skipped. Every matched column gets its id
 * set to the real position, so the returned array may have gaps in its keys.
 *
 * @return Column[] Schema columns keyed by their position in the real CSV header
 */
public function getColumnsMappedByHeader(): array
{
    $map = [];

    foreach ($this->getHeader() as $realIndex => $realColumn) {
        // PHP types array keys as int|string, while Column::setId() accepts int only.
        // The header is expected to be a positional list, so the cast does not change the value;
        // it keeps static analysis clean and avoids a TypeError under strict typing.
        $realIndex = (int)$realIndex;

        $schemaColumn = $this->schema->getColumn($realColumn);
        if ($schemaColumn === null) {
            continue; // The real column is not described in the schema
        }

        $schemaColumn->setId($realIndex);
        $map[$realIndex] = $schemaColumn;
    }

    return $map;
}

private function prepareReader(): LeagueReader
{
$reader = LeagueReader::createFromPath($this->csvFilename)
->setDelimiter($this->structure->getDelimiter())
->setEnclosure($this->structure->getEnclosure())
->setEscape($this->structure->getQuoteChar())
->setHeaderOffset($this->structure->isHeader() ? 0 : null);
->setHeaderOffset(null); // It's important to set it to null to optimize memory usage!

if ($this->structure->isBom()) {
$reader->includeInputBOM();
Expand Down
28 changes: 5 additions & 23 deletions src/Schema.php
Original file line number Diff line number Diff line change
Expand Up @@ -86,25 +86,6 @@ public function getColumns(): array
return $this->columns;
}

/**
* @return Column[]|null[]
* @phan-suppress PhanPartialTypeMismatchReturn
*/
public function getColumnsMappedByHeader(array $header): array
{
$map = [];

if ($this->getCsvStructure()->isHeader()) {
foreach ($header as $headerName) {
$map[$headerName] = $this->columns[$headerName] ?? null;
}
} else {
return $this->getColumns();
}

return $map;
}

public function getColumn(int|string $columNameOrId): ?Column
{
if (\is_int($columNameOrId)) {
Expand All @@ -113,10 +94,6 @@ public function getColumn(int|string $columNameOrId): ?Column
$column = $this->getColumns()[$columNameOrId] ?? null;
}

if ($column === null) {
throw new Exception("Column \"{$columNameOrId}\" not found in schema \"{$this->filename}\"");
}

return $column;
}

Expand Down Expand Up @@ -154,6 +131,11 @@ public function getData(): AbstractData
return clone $this->data;
}

/**
 * Returns the keys of the array produced by getColumns().
 *
 * NOTE(review): the unit tests expect these keys to be the column names declared in the
 * schema; confirm how getColumns() keys columns that have no name.
 */
public function getSchemaHeader(): array
{
return \array_keys($this->getColumns());
}

/**
* @return Column[]
*/
Expand Down
6 changes: 4 additions & 2 deletions src/Utils.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ public static function debug(int|string $message): void

public static function debugSpeed(string $messPrefix, int $lines, float $startTimer): void
{
$kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000);
self::debug("{$messPrefix} <blue>" . \number_format($kiloLines) . 'K</blue> lines/sec');
if (\defined('DEBUG_MODE')) {
$kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000);
self::debug("{$messPrefix} <blue>" . \number_format($kiloLines) . 'K</blue> lines/sec');
}
}

public static function kebabToCamelCase(string $input): string
Expand Down
33 changes: 17 additions & 16 deletions src/Validators/ValidatorCsv.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,13 @@ private function validateHeader(bool $quickStop = false): ErrorSuite
private function validateLines(bool $quickStop = false): ErrorSuite
{
$errors = new ErrorSuite();
$realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader());
$mappedColumns = $this->csv->getColumnsMappedByHeader();
$isHeaderEnabled = $this->schema->getCsvStructure()->isHeader();

foreach ($realColumns as $column) {
$columValues = [];
if ($column === null) {
continue;
}
foreach ($mappedColumns as $columnIndex => $column) {
$messPrefix = "<i>Column</i> \"{$column->getHumanName()}\" -"; // System message prefix. Debug only!

$messPrefix = "<i>Column</i> \"{$column->getHumanName()}\" -";
$columValues = [];

Utils::debug("{$messPrefix} Column start");
$colValidator = $column->getValidator();
Expand All @@ -138,30 +136,34 @@ private function validateLines(bool $quickStop = false): ErrorSuite
$lineCounter = 0;
$startTimer = \microtime(true);
foreach ($this->csv->getRecords() as $line => $record) {
if ($isHeaderEnabled && $line === 0) {
continue;
}

$lineCounter++;
$lineNum = (int)$line + 1;

if ($isRules) { // Time optimization
if (!isset($record[$column->getKey()])) {
if (!isset($record[$columnIndex])) {
$errors->addError(
new Error(
'csv.column',
"Column index:{$column->getKey()} not found",
"Column index:{$columnIndex} not found",
$column->getHumanName(),
$lineNum,
),
);
} else {
$errors->addErrorSuit($colValidator->validateCell($record[$column->getKey()], $lineNum));
$errors->addErrorSuit($colValidator->validateCell($record[$columnIndex], $lineNum));
}

if ($quickStop && $errors->count() > 0) {
return $errors;
}
}

if ($isAggRules && isset($record[$column->getKey()])) { // Time & memory optimization
$columValues[] = ValidatorColumn::prepareValue($record[$column->getKey()], $aggInputType);
if ($isAggRules && isset($record[$columnIndex])) { // Time & memory optimization
$columValues[] = ValidatorColumn::prepareValue($record[$columnIndex], $aggInputType);
}
}
Utils::debug("{$messPrefix} Lines <yellow>" . \number_format($lineCounter) . '</yellow>');
Expand Down Expand Up @@ -213,10 +215,9 @@ private function validateColumn(bool $quickStop): ErrorSuite
$errors = new ErrorSuite();

if ($this->schema->getCsvStructure()->isHeader()) {
$realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader());
$schemaColumns = $this->schema->getColumns();

$notFoundColums = \array_diff(\array_keys($schemaColumns), \array_keys($realColumns));
$realColumns = $this->csv->getHeader();
$schemaColumns = $this->schema->getSchemaHeader();
$notFoundColums = \array_diff($schemaColumns, $realColumns);

if (\count($notFoundColums) > 0) {
$error = new Error(
Expand Down
13 changes: 7 additions & 6 deletions tests/Csv/CsvFileTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public function testReadCsvFileWithoutHeader(): void
$csv = new CsvFile(Tools::CSV_SIMPLE_NO_HEADER, Tools::SCHEMA_SIMPLE_NO_HEADER);
isSame(Tools::CSV_SIMPLE_NO_HEADER, $csv->getCsvFilename());

isSame([], $csv->getHeader());
isSame([0, 1], $csv->getHeader());

isSame([
['1', 'true'],
Expand All @@ -50,18 +50,19 @@ public function testReadCsvFileWithHeader(): void
isSame(['seq', 'bool', 'exact'], $csv->getHeader());

isSame([
['seq' => '1', 'bool' => 'true', 'exact' => '1'],
['seq' => '2', 'bool' => 'true', 'exact' => '1'],
['seq' => '3', 'bool' => 'false', 'exact' => '1'],
['seq', 'bool', 'exact'],
['1', 'true', '1'],
['2', 'true', '1'],
['3', 'false', '1'],
], $this->fetchRows($csv->getRecords()));

isSame(
[['seq' => '2', 'bool' => 'true', 'exact' => '1']],
[['1', 'true', '1']],
$this->fetchRows($csv->getRecordsChunk(1, 1)),
);

isSame(
[['seq' => '2', 'bool' => 'true', 'exact' => '1'], ['seq' => '3', 'bool' => 'false', 'exact' => '1']],
[['1', 'true', '1'], ['2', 'true', '1']],
$this->fetchRows($csv->getRecordsChunk(1, 2)),
);
}
Expand Down
6 changes: 0 additions & 6 deletions tests/SchemaTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -115,18 +115,12 @@ public function testColumnByNameAndId(): void

public function testGetUndefinedColumnById(): void
{
$this->expectExceptionMessage(
'Column "1000" not found in schema "' . Tools::SCHEMA_EXAMPLE_EMPTY . '"',
);
$schemaFull = new Schema(Tools::SCHEMA_EXAMPLE_EMPTY);
isNull($schemaFull->getColumn(1000));
}

public function testGetUndefinedColumnByName(): void
{
$this->expectExceptionMessage(
'Column "undefined_column" not found in schema "' . Tools::SCHEMA_EXAMPLE_EMPTY . '"',
);
$schemaFull = new Schema(Tools::SCHEMA_EXAMPLE_EMPTY);
isNull($schemaFull->getColumn('undefined_column'));
}
Expand Down
81 changes: 74 additions & 7 deletions tests/Validators/CsvValidatorTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,6 @@ public function testInvalidWithoutHeader(): void
isSame(
<<<'TEXT'
"csv.header" at line 1. Real number of columns is less than schema: 2 < 3.
"csv.column" at line 1, column "2:". Column index:2 not found.
"csv.column" at line 2, column "2:". Column index:2 not found.
"csv.column" at line 3, column "2:". Column index:2 not found.

TEXT,
\strip_tags((string)$csv->validate()),
Expand Down Expand Up @@ -82,7 +79,7 @@ public function testCellRule(): void

$csv = new CsvFile(Tools::CSV_COMPLEX, Tools::getRule('integer', 'not_empty', true));
isSame(
'"not_empty" at line 19, column "0:integer". Value is empty.' . "\n",
'"not_empty" at line 19, column "3:integer". Value is empty.' . "\n",
\strip_tags((string)$csv->validate()),
);
}
Expand All @@ -94,7 +91,7 @@ public function testAggregateRule(): void

$csv = new CsvFile(Tools::DEMO_CSV, Tools::getAggregateRule('City', 'is_unique', true));
isSame(
'"ag:is_unique" at line 1, column "0:City". Column has non-unique values. Unique: 9, total: 10.' . "\n",
'"ag:is_unique" at line 1, column "1:City". Column has non-unique values. Unique: 9, total: 10.' . "\n",
\strip_tags((string)$csv->validate()),
);

Expand All @@ -109,7 +106,7 @@ public function testAggregateRuleCombo(): void

$csv = new CsvFile(Tools::DEMO_CSV, Tools::getAggregateRule('Float', 'sum', 20));
isSame(
'"ag:sum" at line <red>1</red>, column "0:Float". The sum of numbers in the column is ' .
'"ag:sum" at line <red>1</red>, column "2:Float". The sum of numbers in the column is ' .
'"<c>4691.3235</c>", which is not equal than the expected "<green>20</green>".' . "\n",
(string)$csv->validate(),
);
Expand Down Expand Up @@ -143,10 +140,12 @@ public function testQuickStop(): void
public function testErrorToArray(): void
{
$csv = new CsvFile(Tools::CSV_COMPLEX, Tools::getRule('yn', 'is_email', true));
// dump($csv);

isSame([
'ruleCode' => 'is_email',
'message' => 'Value "<c>N</c>" is not a valid email',
'columnName' => '0:yn',
'columnName' => '2:yn',
'line' => 2,
], $csv->validate(true)->get(0)->toArray());
}
Expand All @@ -169,4 +168,72 @@ public function testFilenamePattern(): void
$csv = new CsvFile(Tools::CSV_COMPLEX, ['filename_pattern' => '/.*\.csv$/']);
isSame('', (string)$csv->validate());
}

/**
 * With the header enabled, schema columns are matched against the real CSV header,
 * so a CSV column missing from the schema leaves a gap in the mapped indexes.
 */
public function testHeaderMatchingIfHeaderEnabled(): void
{
    // "Birthday" is present in the CSV file but deliberately left out of the schema.
    $schemaNames = ['Name', 'City', 'Float', 'Favorite color'];

    $schemaDefinition = ['csv' => ['header' => true], 'columns' => []];
    foreach ($schemaNames as $schemaName) {
        $schemaDefinition['columns'][] = ['name' => $schemaName];
    }

    $csvFile = new CsvFile(Tools::DEMO_CSV, $schemaDefinition);

    isSame(['Name', 'City', 'Float', 'Birthday', 'Favorite color'], $csvFile->getHeader());
    isSame($schemaNames, $csvFile->getSchema()->getSchemaHeader());

    $columnsByIndex = $csvFile->getColumnsMappedByHeader();

    // Index 3 is "Birthday" in the file, and it has no schema column.
    isSame('not_set', $columnsByIndex[3] ?? 'not_set');
    isSame([0, 1, 2, 4], \array_keys($columnsByIndex));

    $actualNames = [];
    foreach ($columnsByIndex as $realIndex => $mappedColumn) {
        isSame($realIndex, $mappedColumn->getId());
        $actualNames[] = [$mappedColumn->getName(), $mappedColumn->getHumanName()];
    }

    $expectedNames = [
        ['Name', '0:Name'],
        ['City', '1:City'],
        ['Float', '2:Float'],
        ['Favorite color', '4:Favorite color'], // 4 is important here
    ];
    isSame($expectedNames, $actualNames);
}

/**
 * With the header disabled, the CSV header is a list of positions and schema columns
 * are matched by position, so the mapped indexes are contiguous.
 */
public function testHeaderMatchingIfHeaderDisabled(): void
{
    // "Birthday" is deliberately left out of the schema, which therefore has four columns.
    $schemaNames = ['Name', 'City', 'Float', 'Favorite color'];

    $schemaDefinition = ['csv' => ['header' => false], 'columns' => []];
    foreach ($schemaNames as $schemaName) {
        $schemaDefinition['columns'][] = ['name' => $schemaName];
    }

    $csvFile = new CsvFile(Tools::DEMO_CSV, $schemaDefinition);

    isSame([0, 1, 2, 3, 4], $csvFile->getHeader());
    isSame($schemaNames, $csvFile->getSchema()->getSchemaHeader());

    $columnsByIndex = $csvFile->getColumnsMappedByHeader();

    // The schema has only four columns, so the fifth CSV column stays unmapped.
    isSame('not_set', $columnsByIndex[4] ?? 'not_set');
    isSame([0, 1, 2, 3], \array_keys($columnsByIndex));

    $actualNames = [];
    foreach ($columnsByIndex as $realIndex => $mappedColumn) {
        isSame($realIndex, $mappedColumn->getId());
        $actualNames[] = [$mappedColumn->getName(), $mappedColumn->getHumanName()];
    }

    $expectedNames = [
        ['Name', '0:Name'],
        ['City', '1:City'],
        ['Float', '2:Float'],
        ['Favorite color', '3:Favorite color'], // 3 is important here
    ];
    isSame($expectedNames, $actualNames);
}
}

0 comments on commit 2027c76

Please sign in to comment.