From 2027c767608b563fa9ead83fc89fc72e5b51f0e1 Mon Sep 17 00:00:00 2001 From: SmetDenis Date: Sun, 31 Mar 2024 00:48:50 +0400 Subject: [PATCH] Optimize CSV validation and header mapping logic if header is enabled This commit addresses several changes in the CSV validation and header mapping logic. Firstly, a method to set IDs has been added in Column.php. Also, handling of headers in CSV files has been updated to better support files with and without headers. In addition, mapping of columns by header names has been moved to CsvFile.php from Schema.php for better alignment with data flow. Lastly, unnecessary exception throwing in getColumn method of Schema.php has been removed. Unit tests have also been updated to reflect these changes. --- src/Csv/Column.php | 5 ++ src/Csv/CsvFile.php | 35 ++++++++++-- src/Schema.php | 28 ++------- src/Utils.php | 6 +- src/Validators/ValidatorCsv.php | 33 +++++------ tests/Csv/CsvFileTest.php | 13 +++-- tests/SchemaTest.php | 6 -- tests/Validators/CsvValidatorTest.php | 81 ++++++++++++++++++++++++--- 8 files changed, 143 insertions(+), 64 deletions(-) diff --git a/src/Csv/Column.php b/src/Csv/Column.php index 40265ca9..f3e361b9 100644 --- a/src/Csv/Column.php +++ b/src/Csv/Column.php @@ -107,6 +107,11 @@ public function validateCell(string $cellValue, int $line = Error::UNDEFINED_LIN return $this->getValidator()->validateCell($cellValue, $line); } + public function setId(int $realIndex): void + { + $this->id = $realIndex; + } + private function prepareRuleSet(string $schemaKey): array { $rules = []; diff --git a/src/Csv/CsvFile.php b/src/Csv/CsvFile.php index 96320a43..795de775 100644 --- a/src/Csv/CsvFile.php +++ b/src/Csv/CsvFile.php @@ -65,7 +65,9 @@ public function getHeader(): array if ($this->structure->isHeader() && !$this->isEmpty) { // TODO: add handler for empty file // League\Csv\SyntaxError : The header record does not exist or is empty at offset: `0 - $this->header = $this->reader->getHeader(); + $this->header = $this->getRecordsChunk(0, 1)->first(); + } else { + $this->header = \range(0, \count($this->getRecordsChunk(0, 1)->first()) - 1); } } @@ -74,12 +76,12 @@ public function getHeader(): array public function getRecords(): \Iterator { - return $this->reader->getRecords($this->getHeader()); + return $this->reader->getRecords([]); } public function getRecordsChunk(int $offset = 0, int $limit = -1): TabularDataReader { - return Statement::create(null, $offset, $limit)->process($this->reader, $this->getHeader()); + return Statement::create(null, $offset, $limit)->process($this->reader, []); // No headers is required! } public function validate(bool $quickStop = false): ErrorSuite @@ -92,13 +94,38 @@ public function getRealColumNumber(): int return \count($this->getRecordsChunk(0, 1)->first()); } + public function getSchema(): Schema + { + return $this->schema; + } + + /** + * @return Column[] + */ + public function getColumnsMappedByHeader(): array + { + $map = []; + + $realHeader = $this->getHeader(); + foreach ($realHeader as $realIndex => $realColumn) { + $schemaColumn = $this->schema->getColumn($realColumn); + + if ($schemaColumn !== null) { + $schemaColumn->setId($realIndex); + $map[$realIndex] = $schemaColumn; + } + } + + return $map; + } + private function prepareReader(): LeagueReader { $reader = LeagueReader::createFromPath($this->csvFilename) ->setDelimiter($this->structure->getDelimiter()) ->setEnclosure($this->structure->getEnclosure()) ->setEscape($this->structure->getQuoteChar()) - ->setHeaderOffset($this->structure->isHeader() ? 0 : null); + ->setHeaderOffset(null); // It's important to set it to null to optimize memory usage! if ($this->structure->isBom()) { $reader->includeInputBOM(); diff --git a/src/Schema.php b/src/Schema.php index a105d83d..17bcf9d2 100644 --- a/src/Schema.php +++ b/src/Schema.php @@ -86,25 +86,6 @@ public function getColumns(): array return $this->columns; } - /** - * @return Column[]|null[] - * @phan-suppress PhanPartialTypeMismatchReturn - */ - public function getColumnsMappedByHeader(array $header): array - { - $map = []; - - if ($this->getCsvStructure()->isHeader()) { - foreach ($header as $headerName) { - $map[$headerName] = $this->columns[$headerName] ?? null; - } - } else { - return $this->getColumns(); - } - - return $map; - } - public function getColumn(int|string $columNameOrId): ?Column { if (\is_int($columNameOrId)) { @@ -113,10 +94,6 @@ public function getColumn(int|string $columNameOrId): ?Column $column = $this->getColumns()[$columNameOrId] ?? null; } - if ($column === null) { - throw new Exception("Column \"{$columNameOrId}\" not found in schema \"{$this->filename}\""); - } - return $column; } @@ -154,6 +131,11 @@ public function getData(): AbstractData return clone $this->data; } + public function getSchemaHeader(): array + { + return \array_keys($this->getColumns()); + } + /** * @return Column[] */ diff --git a/src/Utils.php b/src/Utils.php index 7c624892..df65e85d 100644 --- a/src/Utils.php +++ b/src/Utils.php @@ -63,8 +63,10 @@ public static function debug(int|string $message): void public static function debugSpeed(string $messPrefix, int $lines, float $startTimer): void { - $kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000); - self::debug("{$messPrefix} " . \number_format($kiloLines) . 'K lines/sec'); + if (\defined('DEBUG_MODE')) { + $kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000); + self::debug("{$messPrefix} " . \number_format($kiloLines) . 'K lines/sec'); + } } public static function kebabToCamelCase(string $input): string diff --git a/src/Validators/ValidatorCsv.php b/src/Validators/ValidatorCsv.php index adaf67f8..5a0a941f 100644 --- a/src/Validators/ValidatorCsv.php +++ b/src/Validators/ValidatorCsv.php @@ -106,15 +106,13 @@ private function validateHeader(bool $quickStop = false): ErrorSuite private function validateLines(bool $quickStop = false): ErrorSuite { $errors = new ErrorSuite(); - $realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader()); + $mappedColumns = $this->csv->getColumnsMappedByHeader(); + $isHeaderEnabled = $this->schema->getCsvStructure()->isHeader(); - foreach ($realColumns as $column) { - $columValues = []; - if ($column === null) { - continue; - } + foreach ($mappedColumns as $columnIndex => $column) { + $messPrefix = "Column \"{$column->getHumanName()}\" -"; // System message prefix. Debug only! - $messPrefix = "Column \"{$column->getHumanName()}\" -"; + $columValues = []; Utils::debug("{$messPrefix} Column start"); $colValidator = $column->getValidator(); @@ -138,21 +136,25 @@ private function validateLines(bool $quickStop = false): ErrorSuite $lineCounter = 0; $startTimer = \microtime(true); foreach ($this->csv->getRecords() as $line => $record) { + if ($isHeaderEnabled && $line === 0) { + continue; + } + $lineCounter++; $lineNum = (int)$line + 1; if ($isRules) { // Time optimization - if (!isset($record[$column->getKey()])) { + if (!isset($record[$columnIndex])) { $errors->addError( new Error( 'csv.column', - "Column index:{$column->getKey()} not found", + "Column index:{$columnIndex} not found", $column->getHumanName(), $lineNum, ), ); } else { - $errors->addErrorSuit($colValidator->validateCell($record[$column->getKey()], $lineNum)); + $errors->addErrorSuit($colValidator->validateCell($record[$columnIndex], $lineNum)); } if ($quickStop && $errors->count() > 0) { @@ -160,8 +162,8 @@ private function validateLines(bool $quickStop = false): ErrorSuite } } - if ($isAggRules && isset($record[$column->getKey()])) { // Time & memory optimization - $columValues[] = ValidatorColumn::prepareValue($record[$column->getKey()], $aggInputType); + if ($isAggRules && isset($record[$columnIndex])) { // Time & memory optimization + $columValues[] = ValidatorColumn::prepareValue($record[$columnIndex], $aggInputType); } } Utils::debug("{$messPrefix} Lines " . \number_format($lineCounter) . ''); @@ -213,10 +215,9 @@ private function validateColumn(bool $quickStop): ErrorSuite $errors = new ErrorSuite(); if ($this->schema->getCsvStructure()->isHeader()) { - $realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader()); - $schemaColumns = $this->schema->getColumns(); - - $notFoundColums = \array_diff(\array_keys($schemaColumns), \array_keys($realColumns)); + $realColumns = $this->csv->getHeader(); + $schemaColumns = $this->schema->getSchemaHeader(); + $notFoundColums = \array_diff($schemaColumns, $realColumns); if (\count($notFoundColums) > 0) { $error = new Error( diff --git a/tests/Csv/CsvFileTest.php b/tests/Csv/CsvFileTest.php index aa4c46ef..d8a824f2 100644 --- a/tests/Csv/CsvFileTest.php +++ b/tests/Csv/CsvFileTest.php @@ -29,7 +29,7 @@ public function testReadCsvFileWithoutHeader(): void $csv = new CsvFile(Tools::CSV_SIMPLE_NO_HEADER, Tools::SCHEMA_SIMPLE_NO_HEADER); isSame(Tools::CSV_SIMPLE_NO_HEADER, $csv->getCsvFilename()); - isSame([], $csv->getHeader()); + isSame([0, 1], $csv->getHeader()); isSame([ ['1', 'true'], @@ -50,18 +50,19 @@ public function testReadCsvFileWithHeader(): void isSame(['seq', 'bool', 'exact'], $csv->getHeader()); isSame([ - ['seq' => '1', 'bool' => 'true', 'exact' => '1'], - ['seq' => '2', 'bool' => 'true', 'exact' => '1'], - ['seq' => '3', 'bool' => 'false', 'exact' => '1'], + ['seq', 'bool', 'exact'], + ['1', 'true', '1'], + ['2', 'true', '1'], + ['3', 'false', '1'], ], $this->fetchRows($csv->getRecords())); isSame( - [['seq' => '2', 'bool' => 'true', 'exact' => '1']], + [['1', 'true', '1']], $this->fetchRows($csv->getRecordsChunk(1, 1)), ); isSame( - [['seq' => '2', 'bool' => 'true', 'exact' => '1'], ['seq' => '3', 'bool' => 'false', 'exact' => '1']], + [['1', 'true', '1'], ['2', 'true', '1']], $this->fetchRows($csv->getRecordsChunk(1, 2)), ); } diff --git a/tests/SchemaTest.php b/tests/SchemaTest.php index 20426e68..e43391da 100644 --- a/tests/SchemaTest.php +++ b/tests/SchemaTest.php @@ -115,18 +115,12 @@ public function testColumnByNameAndId(): void public function testGetUndefinedColumnById(): void { - $this->expectExceptionMessage( - 'Column "1000" not found in schema "' . Tools::SCHEMA_EXAMPLE_EMPTY . '"', - ); $schemaFull = new Schema(Tools::SCHEMA_EXAMPLE_EMPTY); isNull($schemaFull->getColumn(1000)); } public function testGetUndefinedColumnByName(): void { - $this->expectExceptionMessage( - 'Column "undefined_column" not found in schema "' . Tools::SCHEMA_EXAMPLE_EMPTY . '"', - ); $schemaFull = new Schema(Tools::SCHEMA_EXAMPLE_EMPTY); isNull($schemaFull->getColumn('undefined_column')); } diff --git a/tests/Validators/CsvValidatorTest.php b/tests/Validators/CsvValidatorTest.php index 15e6e97f..ceec7d3e 100644 --- a/tests/Validators/CsvValidatorTest.php +++ b/tests/Validators/CsvValidatorTest.php @@ -42,9 +42,6 @@ public function testInvalidWithoutHeader(): void isSame( <<<'TEXT' "csv.header" at line 1. Real number of columns is less than schema: 2 < 3. - "csv.column" at line 1, column "2:". Column index:2 not found. - "csv.column" at line 2, column "2:". Column index:2 not found. - "csv.column" at line 3, column "2:". Column index:2 not found. TEXT, \strip_tags((string)$csv->validate()), @@ -82,7 +79,7 @@ public function testCellRule(): void $csv = new CsvFile(Tools::CSV_COMPLEX, Tools::getRule('integer', 'not_empty', true)); isSame( - '"not_empty" at line 19, column "0:integer". Value is empty.' . "\n", + '"not_empty" at line 19, column "3:integer". Value is empty.' . "\n", \strip_tags((string)$csv->validate()), ); } @@ -94,7 +91,7 @@ public function testAggregateRule(): void $csv = new CsvFile(Tools::DEMO_CSV, Tools::getAggregateRule('City', 'is_unique', true)); isSame( - '"ag:is_unique" at line 1, column "0:City". Column has non-unique values. Unique: 9, total: 10.' . "\n", + '"ag:is_unique" at line 1, column "1:City". Column has non-unique values. Unique: 9, total: 10.' . "\n", \strip_tags((string)$csv->validate()), ); @@ -109,7 +106,7 @@ public function testAggregateRuleCombo(): void $csv = new CsvFile(Tools::DEMO_CSV, Tools::getAggregateRule('Float', 'sum', 20)); isSame( - '"ag:sum" at line 1, column "0:Float". The sum of numbers in the column is ' . + '"ag:sum" at line 1, column "2:Float". The sum of numbers in the column is ' . '"4691.3235", which is not equal than the expected "20".' . "\n", (string)$csv->validate(), ); @@ -143,10 +140,12 @@ public function testQuickStop(): void public function testErrorToArray(): void { $csv = new CsvFile(Tools::CSV_COMPLEX, Tools::getRule('yn', 'is_email', true)); + // dump($csv); + isSame([ 'ruleCode' => 'is_email', 'message' => 'Value "N" is not a valid email', - 'columnName' => '0:yn', + 'columnName' => '2:yn', 'line' => 2, ], $csv->validate(true)->get(0)->toArray()); } @@ -169,4 +168,72 @@ public function testFilenamePattern(): void $csv = new CsvFile(Tools::CSV_COMPLEX, ['filename_pattern' => '/.*\.csv$/']); isSame('', (string)$csv->validate()); } + + public function testHeaderMatchingIfHeaderEnabled(): void + { + $columns = [ + ['name' => 'Name'], + ['name' => 'City'], + ['name' => 'Float'], + // ['name' => 'Birthday'], // We skip it for tests + ['name' => 'Favorite color'], + ]; + + $csv = new CsvFile(Tools::DEMO_CSV, ['csv' => ['header' => true], 'columns' => $columns]); + + isSame(['Name', 'City', 'Float', 'Birthday', 'Favorite color'], $csv->getHeader()); + isSame(['Name', 'City', 'Float', 'Favorite color'], $csv->getSchema()->getSchemaHeader()); + + $mappedColumns = $csv->getColumnsMappedByHeader(); + isSame('not_set', $mappedColumns[3] ?? 'not_set'); + + isSame([0, 1, 2, 4], \array_keys($mappedColumns)); + + $names = []; + foreach ($mappedColumns as $columnIndex => $column) { + isSame($columnIndex, $column->getId()); + $names[] = [$column->getName(), $column->getHumanName()]; + } + + isSame([ + ['Name', '0:Name'], + ['City', '1:City'], + ['Float', '2:Float'], + ['Favorite color', '4:Favorite color'], // 4 is important here + ], $names); + } + + public function testHeaderMatchingIfHeaderDisabled(): void + { + $columns = [ + ['name' => 'Name'], + ['name' => 'City'], + ['name' => 'Float'], + // ['name' => 'Birthday'], // We skip it for tests + ['name' => 'Favorite color'], + ]; + + $csv = new CsvFile(Tools::DEMO_CSV, ['csv' => ['header' => false], 'columns' => $columns]); + + isSame([0, 1, 2, 3, 4], $csv->getHeader()); + isSame(['Name', 'City', 'Float', 'Favorite color'], $csv->getSchema()->getSchemaHeader()); + + $mappedColumns = $csv->getColumnsMappedByHeader(); + isSame('not_set', $mappedColumns[4] ?? 'not_set'); + + isSame([0, 1, 2, 3], \array_keys($mappedColumns)); + + $names = []; + foreach ($mappedColumns as $columnIndex => $column) { + isSame($columnIndex, $column->getId()); + $names[] = [$column->getName(), $column->getHumanName()]; + } + + isSame([ + ['Name', '0:Name'], + ['City', '1:City'], + ['Float', '2:Float'], + ['Favorite color', '3:Favorite color'], // 3 is important here + ], $names); + } }