Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize CSV validation and header mapping logic if header is enabled #115

Merged
merged 4 commits into from
Mar 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 24 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,11 @@ BENCH_ROWS_SRC ?= 2000
BENCH_CSV_PATH := ./build/bench/$(BENCH_COLS)_$(BENCH_ROWS_SRC)_000.csv
BENCH_CSV := --csv='$(BENCH_CSV_PATH)'
BENCH_FLAGS := --debug --profile --report=text -vvv
BENCH_SCHEMAS := --schema='./tests/Benchmarks/bench_*.yml'
BENCH_SCHEMAS_ALL := --schema='./tests/Benchmarks/bench_*.yml'
BENCH_SCHEMAS_0 := --schema='./tests/Benchmarks/bench_0_*.yml'
BENCH_SCHEMAS_1 := --schema='./tests/Benchmarks/bench_1_*.yml'
BENCH_SCHEMAS_2 := --schema='./tests/Benchmarks/bench_2_*.yml'
BENCH_SCHEMAS_3 := --schema='./tests/Benchmarks/bench_3_*.yml'


bench: ##@Benchmarks Run all benchmarks
Expand All @@ -109,23 +113,37 @@ bench-create-csv: ##@Benchmarks Create CSV file


bench-docker: ##@Benchmarks Run CSV file with Docker
@docker run --rm $(DOCKER_IMAGE) --ansi --version
@echo "::group::Quickest"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_0_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_0) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Minimum"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_1_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_1) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Realistic"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_2_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_2) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::All aggregations at once"
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) --schema='./tests/Benchmarks/bench_3_*.yml' $(BENCH_FLAGS)
-$(BLUEPRINT_DOCKER) $(BENCH_CSV) $(BENCH_SCHEMAS_3) $(BENCH_FLAGS)
@echo "::endgroup::"


bench-phar: ##@Benchmarks Run CSV file with Phar
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS) $(BENCH_FLAGS)
./build/csv-blueprint.phar --ansi --version
@echo "::group::Quickest"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_0) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Minimum"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_1) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::Realistic"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_2) $(BENCH_FLAGS)
@echo "::endgroup::"
@echo "::group::All aggregations at once"
-$(BLUEPRINT_PHAR) $(BENCH_CSV) $(BENCH_SCHEMAS_3) $(BENCH_FLAGS)
@echo "::endgroup::"


# Prints the tool version, then validates the benchmark CSV with the plain PHP binary.
# The schema option comes from BENCH_SCHEMAS_ALL (the glob matching every bench_*.yml);
# referencing a variable that is not defined would expand to nothing and silently
# run the benchmark without any --schema argument.
bench-php: ##@Benchmarks Run CSV file with classic PHP binary
	$(PHP_BIN) ./csv-blueprint --ansi --version
	-$(BLUEPRINT) $(BENCH_CSV) $(BENCH_SCHEMAS_ALL) $(BENCH_FLAGS)
5 changes: 5 additions & 0 deletions src/Csv/Column.php
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,11 @@ public function validateCell(string $cellValue, int $line = Error::UNDEFINED_LIN
return $this->getValidator()->validateCell($cellValue, $line);
}

/**
 * Sets this column's id to the given index.
 *
 * @param int $realIndex New value stored in the id property
 */
public function setId(int $realIndex): void
{
$this->id = $realIndex;
}

private function prepareRuleSet(string $schemaKey): array
{
$rules = [];
Expand Down
36 changes: 32 additions & 4 deletions src/Csv/CsvFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,9 @@ public function getHeader(): array
if ($this->structure->isHeader() && !$this->isEmpty) {
// TODO: add handler for empty file
// League\Csv\SyntaxError : The header record does not exist or is empty at offset: `0
$this->header = $this->reader->getHeader();
$this->header = $this->getRecordsChunk(0, 1)->first();
} else {
$this->header = \range(0, \count($this->getRecordsChunk(0, 1)->first()) - 1);
}
}

Expand All @@ -74,12 +76,12 @@ public function getHeader(): array

public function getRecords(): \Iterator
{
return $this->reader->getRecords($this->getHeader());
return $this->reader->getRecords([]);
}

public function getRecordsChunk(int $offset = 0, int $limit = -1): TabularDataReader
{
return Statement::create(null, $offset, $limit)->process($this->reader, $this->getHeader());
return Statement::create(null, $offset, $limit)->process($this->reader, []); // No headers is required!
}

public function validate(bool $quickStop = false): ErrorSuite
Expand All @@ -92,13 +94,39 @@ public function getRealColumNumber(): int
return \count($this->getRecordsChunk(0, 1)->first());
}

/**
 * Returns the schema held by this CSV file object.
 *
 * @return Schema The instance stored in $this->schema (the same object, not a copy)
 */
public function getSchema(): Schema
{
return $this->schema;
}

/**
 * Builds a map of schema columns keyed by their position in the header of the CSV file.
 *
 * Each entry of the array returned by getHeader() is looked up in the schema.
 * Entries for which the schema returns no column are left out of the result.
 * Every matched column also has its id set to the integer position it was found at.
 *
 * @return Column[] Matched schema columns, indexed by integer header position
 */
public function getColumnsMappedByHeader(): array
{
    $columnsByPosition = [];

    foreach ($this->getHeader() as $position => $headerName) {
        $matched = $this->schema->getColumn($headerName);
        if ($matched === null) {
            continue; // The schema has no column for this header entry.
        }

        $position = (int)$position;
        $matched->setId($position);
        $columnsByPosition[$position] = $matched;
    }

    return $columnsByPosition;
}

private function prepareReader(): LeagueReader
{
$reader = LeagueReader::createFromPath($this->csvFilename)
->setDelimiter($this->structure->getDelimiter())
->setEnclosure($this->structure->getEnclosure())
->setEscape($this->structure->getQuoteChar())
->setHeaderOffset($this->structure->isHeader() ? 0 : null);
->setHeaderOffset(null); // It's important to set it to null to optimize memory usage!

if ($this->structure->isBom()) {
$reader->includeInputBOM();
Expand Down
28 changes: 5 additions & 23 deletions src/Schema.php
Original file line number Diff line number Diff line change
Expand Up @@ -86,25 +86,6 @@ public function getColumns(): array
return $this->columns;
}

/**
* @return Column[]|null[]
* @phan-suppress PhanPartialTypeMismatchReturn
*/
public function getColumnsMappedByHeader(array $header): array
{
$map = [];

if ($this->getCsvStructure()->isHeader()) {
foreach ($header as $headerName) {
$map[$headerName] = $this->columns[$headerName] ?? null;
}
} else {
return $this->getColumns();
}

return $map;
}

public function getColumn(int|string $columNameOrId): ?Column
{
if (\is_int($columNameOrId)) {
Expand All @@ -113,10 +94,6 @@ public function getColumn(int|string $columNameOrId): ?Column
$column = $this->getColumns()[$columNameOrId] ?? null;
}

if ($column === null) {
throw new Exception("Column \"{$columNameOrId}\" not found in schema \"{$this->filename}\"");
}

return $column;
}

Expand Down Expand Up @@ -154,6 +131,11 @@ public function getData(): AbstractData
return clone $this->data;
}

/**
 * Returns the keys of the array produced by getColumns().
 *
 * NOTE(review): the keys are presumably the schema column names — confirm against getColumns().
 *
 * @return array Keys of getColumns(), in the order that array yields them
 */
public function getSchemaHeader(): array
{
return \array_keys($this->getColumns());
}

/**
* @return Column[]
*/
Expand Down
6 changes: 4 additions & 2 deletions src/Utils.php
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ public static function debug(int|string $message): void

public static function debugSpeed(string $messPrefix, int $lines, float $startTimer): void
{
$kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000);
self::debug("{$messPrefix} <blue>" . \number_format($kiloLines) . 'K</blue> lines/sec');
if (\defined('DEBUG_MODE')) {
$kiloLines = \round(($lines / (\microtime(true) - $startTimer)) / 1000);
self::debug("{$messPrefix} <blue>" . \number_format($kiloLines) . 'K</blue> lines/sec');
}
}

public static function kebabToCamelCase(string $input): string
Expand Down
33 changes: 17 additions & 16 deletions src/Validators/ValidatorCsv.php
Original file line number Diff line number Diff line change
Expand Up @@ -106,15 +106,13 @@ private function validateHeader(bool $quickStop = false): ErrorSuite
private function validateLines(bool $quickStop = false): ErrorSuite
{
$errors = new ErrorSuite();
$realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader());
$mappedColumns = $this->csv->getColumnsMappedByHeader();
$isHeaderEnabled = $this->schema->getCsvStructure()->isHeader();

foreach ($realColumns as $column) {
$columValues = [];
if ($column === null) {
continue;
}
foreach ($mappedColumns as $columnIndex => $column) {
$messPrefix = "<i>Column</i> \"{$column->getHumanName()}\" -"; // System message prefix. Debug only!

$messPrefix = "<i>Column</i> \"{$column->getHumanName()}\" -";
$columValues = [];

Utils::debug("{$messPrefix} Column start");
$colValidator = $column->getValidator();
Expand All @@ -138,30 +136,34 @@ private function validateLines(bool $quickStop = false): ErrorSuite
$lineCounter = 0;
$startTimer = \microtime(true);
foreach ($this->csv->getRecords() as $line => $record) {
if ($isHeaderEnabled && $line === 0) {
continue;
}

$lineCounter++;
$lineNum = (int)$line + 1;

if ($isRules) { // Time optimization
if (!isset($record[$column->getKey()])) {
if (!isset($record[$columnIndex])) {
$errors->addError(
new Error(
'csv.column',
"Column index:{$column->getKey()} not found",
"Column index:{$columnIndex} not found",
$column->getHumanName(),
$lineNum,
),
);
} else {
$errors->addErrorSuit($colValidator->validateCell($record[$column->getKey()], $lineNum));
$errors->addErrorSuit($colValidator->validateCell($record[$columnIndex], $lineNum));
}

if ($quickStop && $errors->count() > 0) {
return $errors;
}
}

if ($isAggRules && isset($record[$column->getKey()])) { // Time & memory optimization
$columValues[] = ValidatorColumn::prepareValue($record[$column->getKey()], $aggInputType);
if ($isAggRules && isset($record[$columnIndex])) { // Time & memory optimization
$columValues[] = ValidatorColumn::prepareValue($record[$columnIndex], $aggInputType);
}
}
Utils::debug("{$messPrefix} Lines <yellow>" . \number_format($lineCounter) . '</yellow>');
Expand Down Expand Up @@ -213,10 +215,9 @@ private function validateColumn(bool $quickStop): ErrorSuite
$errors = new ErrorSuite();

if ($this->schema->getCsvStructure()->isHeader()) {
$realColumns = $this->schema->getColumnsMappedByHeader($this->csv->getHeader());
$schemaColumns = $this->schema->getColumns();

$notFoundColums = \array_diff(\array_keys($schemaColumns), \array_keys($realColumns));
$realColumns = $this->csv->getHeader();
$schemaColumns = $this->schema->getSchemaHeader();
$notFoundColums = \array_diff($schemaColumns, $realColumns);

if (\count($notFoundColums) > 0) {
$error = new Error(
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_0_quickest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_0_quickest_agg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,7 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- aggregate_rules:
- name: id
aggregate_rules:
count: 0
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_0_quickest_combo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
aggregate_rules:
count: 0
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_1_mini.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
is_int: true
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_1_mini_agg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,8 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- aggregate_rules:
- name: id
aggregate_rules:
average: 0
count: 0
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_1_mini_combo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
is_int: true
aggregate_rules:
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_2_realistic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
length_max: 100
is_int: true
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_2_realistic_agg.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- aggregate_rules:
- name: id
aggregate_rules:
is_unique: true
sorted: [ desc, natural ]
count: 0
Expand Down
6 changes: 2 additions & 4 deletions tests/Benchmarks/bench_2_realistic_combo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,9 @@

filename_pattern: /.csv$/i

csv:
header: false

columns:
- rules:
- name: id
rules:
not_empty: true
length_max: 100
is_int: true
Expand Down
Loading