From 5edca43b1c8e0fed88f70a52e5d4999f9fdbb506 Mon Sep 17 00:00:00 2001
From: Tristan Nelson <85176679+vtgreen@users.noreply.github.com>
Date: Tue, 23 Sep 2025 13:36:52 -0500
Subject: [PATCH 1/2] Add new output-stats flag to output to file

- Added --output-stats (-o) flag to output performance statistics to a file
- Added --output-format flag with JSON and text options
- Made --output-format only valid when used with --output-stats
- Provided clear descriptions in the help output
- Maintained existing stderr logging behavior
---
 scrubcsv/src/main.rs | 73 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 69 insertions(+), 4 deletions(-)

diff --git a/scrubcsv/src/main.rs b/scrubcsv/src/main.rs
index 4bfe58a..c37f50d 100644
--- a/scrubcsv/src/main.rs
+++ b/scrubcsv/src/main.rs
@@ -9,8 +9,8 @@ use log::debug;
 use regex::{bytes::Regex as BytesRegex, Regex};
 use std::{
     borrow::Cow,
-    fs,
-    io::{self, prelude::*},
+    fs::{self, File},
+    io::{self, prelude::*, BufWriter},
     path::PathBuf,
     process,
     time::Instant,
@@ -33,6 +33,25 @@ use crate::util::CharSpecifier;
 /// by `csv`.
 const BUFFER_SIZE: usize = 256 * 1024;
 
+/// Output format for statistics.
+#[derive(Debug, Clone)]
+enum OutputFormat {
+    Json,
+    Text,
+}
+
+impl std::str::FromStr for OutputFormat {
+    type Err = String;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "json" => Ok(OutputFormat::Json),
+            "text" => Ok(OutputFormat::Text),
+            _ => Err(format!("Invalid output format '{}'. Supported formats: json, text", s)),
+        }
+    }
+}
+
 /// Our command-line arguments.
 #[derive(Debug, StructOpt)]
 #[structopt(
@@ -103,6 +122,17 @@ struct Opt {
     #[structopt(short = "q", long = "quiet")]
     quiet: bool,
 
+    /// Output performance statistics to the specified file in addition to stderr.
+    /// Stats include total rows processed, bad rows found, processing time, and throughput.
+    /// Use with --output-format to specify the output format (default: json).
+    #[structopt(short = "o", long = "output-stats")]
+    output_stats: Option<PathBuf>,
+
+    /// Format for the statistics output file. Only valid with --output-stats.
+    /// 'json' outputs structured data, 'text' outputs human-readable format.
+    #[structopt(long = "output-format", default_value = "json", requires = "output-stats")]
+    output_format: OutputFormat,
+
     /// Character used to quote entries. May be set to "none" to ignore all
     /// quoting.
     #[structopt(value_name = "CHAR", long = "quote", default_value = "\"")]
@@ -349,9 +379,10 @@ fn run() -> Result<()> {
     wtr.flush().context("error writing records")?;
 
     // Print out some information about our run.
+    let ellapsed = start_time.elapsed().as_secs_f64();
+    let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64;
+
     if !opt.quiet {
-        let ellapsed = start_time.elapsed().as_secs_f64();
-        let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64;
         eprintln!(
             "{} rows ({} bad) in {:.2} seconds, {}/sec",
             rows,
@@ -361,6 +392,40 @@ fn run() -> Result<()> {
         );
     }
 
+    // Write stats to output file if specified
+    if let Some(output_path) = &opt.output_stats {
+        let mut file = BufWriter::new(
+            File::create(output_path)
+                .with_context(|_| format!("cannot create output file {}", output_path.display()))?
+        );
+
+        let stats_content = match opt.output_format {
+            OutputFormat::Json => {
+                format!(
+                    r#"{{"rows": {}, "bad_rows": {}, "elapsed_seconds": {:.2}, "bytes_per_second": {}}}"#,
+                    rows,
+                    bad_rows,
+                    ellapsed,
+                    bytes_per_second
+                )
+            }
+            OutputFormat::Text => {
+                format!(
+                    "{} rows ({} bad) in {:.2} seconds, {}/sec",
+                    rows,
+                    bad_rows,
+                    ellapsed,
+                    bytes_per_second.file_size(file_size_opts::BINARY)?
+                )
+            }
+        };
+
+        writeln!(file, "{}", stats_content)
+            .with_context(|_| format!("cannot write to output file {}", output_path.display()))?;
+        file.flush()
+            .with_context(|_| format!("cannot flush output file {}", output_path.display()))?;
+    }
+
     // If more than 10% of rows are bad, assume something has gone horribly
     // wrong.
     if bad_rows.checked_mul(10).expect("multiplication overflow") > rows {

From 3e096acb30d35dc4edfb1395f381a44a961fb2a4 Mon Sep 17 00:00:00 2001
From: Tristan Nelson <85176679+vtgreen@users.noreply.github.com>
Date: Wed, 24 Sep 2025 10:38:46 -0500
Subject: [PATCH 2/2] adding tests

---
 scrubcsv/src/main.rs    |  7 ++--
 scrubcsv/tests/tests.rs | 93 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 3 deletions(-)

diff --git a/scrubcsv/src/main.rs b/scrubcsv/src/main.rs
index c37f50d..d2636c3 100644
--- a/scrubcsv/src/main.rs
+++ b/scrubcsv/src/main.rs
@@ -130,8 +130,8 @@ struct Opt {
 
     /// Format for the statistics output file. Only valid with --output-stats.
     /// 'json' outputs structured data, 'text' outputs human-readable format.
-    #[structopt(long = "output-format", default_value = "json", requires = "output-stats")]
-    output_format: OutputFormat,
+    #[structopt(long = "output-format", requires = "output-stats")]
+    output_format: Option<OutputFormat>,
 
     /// Character used to quote entries. May be set to "none" to ignore all
     /// quoting.
@@ -399,7 +399,8 @@ fn run() -> Result<()> {
                 .with_context(|_| format!("cannot create output file {}", output_path.display()))?
         );
 
-        let stats_content = match opt.output_format {
+        let format = opt.output_format.unwrap_or(OutputFormat::Json);
+        let stats_content = match format {
             OutputFormat::Json => {
                 format!(
                     r#"{{"rows": {}, "bad_rows": {}, "elapsed_seconds": {:.2}, "bytes_per_second": {}}}"#,
diff --git a/scrubcsv/tests/tests.rs b/scrubcsv/tests/tests.rs
index 05cd114..097b30f 100644
--- a/scrubcsv/tests/tests.rs
+++ b/scrubcsv/tests/tests.rs
@@ -256,3 +256,96 @@ a,b,c
 "#
     );
 }
+
+#[test]
+fn output_stats_json_format() {
+    let testdir = TestDir::new("scrubcsv", "output_stats_json");
+    testdir.create_file(
+        "input.csv",
+        "\
+a,b,c
+1,2,3
+4,5,6
+",
+    );
+    let output = testdir
+        .cmd()
+        .args(&["--output-stats", "stats.json", "--output-format", "json"])
+        .arg("input.csv")
+        .expect_success();
+
+    // Check that CSV output is still correct
+    assert_eq!(
+        output.stdout_str(),
+        "\
+a,b,c
+1,2,3
+4,5,6
+"
+    );
+
+    // Check that stats were written to file
+    let stats_content = std::fs::read_to_string(testdir.path("stats.json"))
+        .expect("Failed to read stats.json");
+    assert!(stats_content.contains(r#""rows": 3"#));
+    assert!(stats_content.contains(r#""bad_rows": 0"#));
+    assert!(stats_content.contains(r#""elapsed_seconds""#));
+    assert!(stats_content.contains(r#""bytes_per_second""#));
+
+    // Verify it's valid JSON structure
+    assert!(stats_content.starts_with("{"));
+    assert!(stats_content.trim_end().ends_with("}"));
+}
+
+#[test]
+fn output_stats_text_format() {
+    let testdir = TestDir::new("scrubcsv", "output_stats_text");
+    testdir.create_file(
+        "input.csv",
+        "\
+name,age
+Alice,25
+Bob,30
+",
+    );
+    let output = testdir
+        .cmd()
+        .args(&["--output-stats", "stats.txt", "--output-format", "text"])
+        .arg("input.csv")
+        .expect_success();
+
+    // Check that CSV output is still correct
+    assert_eq!(
+        output.stdout_str(),
+        "\
+name,age
+Alice,25
+Bob,30
+"
+    );
+
+    // Check that stats were written to file in text format
+    let stats_content = std::fs::read_to_string(testdir.path("stats.txt"))
+        .expect("Failed to read stats.txt");
+    assert!(stats_content.contains("3 rows (0 bad)"));
+    assert!(stats_content.contains("seconds"));
+    assert!(stats_content.contains("/sec"));
+
+    // Verify it matches the stderr format pattern
+    assert!(!stats_content.starts_with("{")); // Not JSON
+}
+
+#[test]
+fn output_format_requires_output_stats() {
+    let testdir = TestDir::new("scrubcsv", "output_format_dependency");
+    testdir.create_file("input.csv", "a,b,c\n1,2,3\n");
+
+    // This should fail because --output-format requires --output-stats
+    let output = testdir
+        .cmd()
+        .args(&["--output-format", "json"])
+        .arg("input.csv")
+        .expect_failure();
+
+    assert!(output.stderr_str().contains("--output-stats"));
+}