From 7148505a84f109d58da62ca837e13e1c9c4ef405 Mon Sep 17 00:00:00 2001
From: Neil O'Toole <neilotoole@apache.org>
Date: Tue, 14 Mar 2023 23:04:49 -0600
Subject: [PATCH] CSV: check for mismatch between field count and explicitly
 specified column names (#147)

* csv: check for mismatch between field count and explicitly specified column names

* CHANGELOG update
---
 CHANGELOG.md          |  9 +++++++++
 drivers/csv/import.go | 36 ++++++++++++++++++++++++------------
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 826e7a1ea..698fc3fc1 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,14 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [v0.24.3] - 2023-03-14
+
+### Added
+
+- When a CSV source has explicit column names (via `--opts cols=A,B,C`), `sq` now verifies
+  that the field count of the first CSV data record matches the number of explicit column names.
+
+
 ## [v0.24.2] - 2023-03-13
 
 ### Fixed
@@ -145,6 +153,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#89]: Bug with SQL generated for joins.
 
 
+[v0.24.3]: https://github.com/neilotoole/sq/compare/v0.24.2...v0.24.3
 [v0.24.2]: https://github.com/neilotoole/sq/compare/v0.24.1...v0.24.2
 [v0.24.1]: https://github.com/neilotoole/sq/compare/v0.24.0...v0.24.1
 [v0.24.0]: https://github.com/neilotoole/sq/compare/v0.23.0...v0.24.0
diff --git a/drivers/csv/import.go b/drivers/csv/import.go
index c8336d141..1abb0b02b 100644
--- a/drivers/csv/import.go
+++ b/drivers/csv/import.go
@@ -364,10 +364,9 @@ func getColNames(cr *csv.Reader, src *source.Source, readAheadRecs *[][]string)
 		return headerRec, nil
 	}
 
-	// The CSV file does not have a header record. We will generate
-	// col names [A,B,C...]. To do so, we need to know how many fields
-	// there are in the first record.
-	firstDataRecord, err := cr.Read()
+	// Read ahead the first record. We need this to determine the number
+	// of columns.
+	firstRec, err := cr.Read()
 	if err == io.EOF { //nolint:errorlint
 		return nil, errz.Errorf("data source %s is empty", src.Handle)
 	}
@@ -375,13 +374,27 @@ func getColNames(cr *csv.Reader, src *source.Source, readAheadRecs *[][]string)
 		return nil, errz.Wrapf(err, "read from data source %s", src.Handle)
 	}
 
-	// firstRecord contains actual data, so append it to initialRecs.
-	*readAheadRecs = append(*readAheadRecs, firstDataRecord)
+	// firstRec contains actual data, so append it to readAheadRecs.
+	*readAheadRecs = append(*readAheadRecs, firstRec)
+
+	// If we have explicit column names, we still need to verify the
+	// column name count against the data.
+	if len(explicitColNames) > 0 {
+		if len(explicitColNames) != len(firstRec) {
+			return nil, errz.Errorf("mismatch: source has %d explicit column names specified, but first data record has %d fields", //nolint:lll
+				len(explicitColNames), len(firstRec))
+		}
+
+		return explicitColNames, nil
+	}
 
+	// At this point the CSV file has no header record, and no explicit
+	// column names were specified. We will generate col names [A,B,C...].
+	// To do so, we need to know how many fields are in the first record.
 	// If no column names yet, we generate them based on the number
-	// of fields in firstDataRecord.
-	generatedColNames := make([]string, len(firstDataRecord))
-	for i := range firstDataRecord {
+	// of fields in firstRec.
+	generatedColNames := make([]string, len(firstRec))
+	for i := range firstRec {
 		generatedColNames[i] = stringz.GenerateAlphaColName(i, false)
 	}
 
@@ -429,13 +442,12 @@ func getDelimFromOptions(opts options.Options) (r rune, ok bool, err error) {
 		return 0, false, nil
 	}
 
-	const key = "delim"
-	_, ok = opts[key]
+	_, ok = opts[options.OptDelim]
 	if !ok {
 		return 0, false, nil
 	}
 
-	val := opts.Get(key)
+	val := opts.Get(options.OptDelim)
 	if val == "" {
 		return 0, false, nil
 	}