diff --git a/scrubcsv/src/main.rs b/scrubcsv/src/main.rs
index 4bfe58a..d2636c3 100644
--- a/scrubcsv/src/main.rs
+++ b/scrubcsv/src/main.rs
@@ -9,8 +9,8 @@ use log::debug;
 use regex::{bytes::Regex as BytesRegex, Regex};
 use std::{
     borrow::Cow,
-    fs,
-    io::{self, prelude::*},
+    fs::{self, File},
+    io::{self, prelude::*, BufWriter},
     path::PathBuf,
     process,
     time::Instant,
@@ -33,6 +33,25 @@ use crate::util::CharSpecifier;
 /// by `csv`.
 const BUFFER_SIZE: usize = 256 * 1024;
 
+/// Output format for the statistics file written by `--output-stats`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum OutputFormat {
+    Json,
+    Text,
+}
+
+impl std::str::FromStr for OutputFormat {
+    type Err = String;
+    /// Parses `json` or `text` (any letter case); other values yield an error message.
+    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
+        match s.to_lowercase().as_str() {
+            "json" => Ok(OutputFormat::Json),
+            "text" => Ok(OutputFormat::Text),
+            _ => Err(format!("Invalid output format '{}'. Supported formats: json, text", s)),
+        }
+    }
+}
+
 /// Our command-line arguments.
 #[derive(Debug, StructOpt)]
 #[structopt(
@@ -103,6 +122,17 @@ struct Opt {
     #[structopt(short = "q", long = "quiet")]
     quiet: bool,
 
+    /// Output performance statistics to the specified file in addition to stderr.
+    /// Stats include total rows processed, bad rows found, processing time, and throughput.
+    /// Use with --output-format to specify the output format (default: json).
+    #[structopt(short = "o", long = "output-stats")]
+    output_stats: Option<PathBuf>,
+
+    /// Format for the statistics output file. Only valid with --output-stats.
+    /// 'json' outputs structured data, 'text' outputs human-readable format.
+    #[structopt(long = "output-format", requires = "output-stats")]
+    output_format: Option<OutputFormat>,
+
     /// Character used to quote entries. May be set to "none" to ignore all
     /// quoting.
     #[structopt(value_name = "CHAR", long = "quote", default_value = "\"")]
@@ -349,9 +379,10 @@ fn run() -> Result<()> {
     wtr.flush().context("error writing records")?;
 
     // Print out some information about our run.
+    let ellapsed = start_time.elapsed().as_secs_f64();
+    let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64;
+
     if !opt.quiet {
-        let ellapsed = start_time.elapsed().as_secs_f64();
-        let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64;
         eprintln!(
             "{} rows ({} bad) in {:.2} seconds, {}/sec",
             rows,
@@ -361,6 +392,45 @@ fn run() -> Result<()> {
         );
     }
 
+    // Also write the run statistics to a file when --output-stats was given. This
+    // sits outside the `!opt.quiet` check, so --quiet does not suppress the file.
+    if let Some(output_path) = &opt.output_stats {
+        let mut file = BufWriter::new(
+            File::create(output_path)
+                .with_context(|_| format!("cannot create output file {}", output_path.display()))?
+        );
+
+        // Match on the field in place, without bindings, so nothing is moved out of
+        // `opt`. JSON is the default when --output-format is not given.
+        let stats_content = match opt.output_format {
+            Some(OutputFormat::Json) | None => {
+                format!(
+                    r#"{{"rows": {}, "bad_rows": {}, "elapsed_seconds": {:.2}, "bytes_per_second": {}}}"#,
+                    rows,
+                    bad_rows,
+                    ellapsed,
+                    bytes_per_second
+                )
+            }
+            Some(OutputFormat::Text) => {
+                // Same format string as the summary printed to stderr.
+                format!(
+                    "{} rows ({} bad) in {:.2} seconds, {}/sec",
+                    rows,
+                    bad_rows,
+                    ellapsed,
+                    bytes_per_second.file_size(file_size_opts::BINARY)?
+                )
+            }
+        };
+
+        writeln!(file, "{}", stats_content)
+            .with_context(|_| format!("cannot write to output file {}", output_path.display()))?;
+        // Flush explicitly so a failed write is reported instead of being lost on drop.
+        file.flush()
+            .with_context(|_| format!("cannot flush output file {}", output_path.display()))?;
+    }
+
     // If more than 10% of rows are bad, assume something has gone horribly
     // wrong.
if bad_rows.checked_mul(10).expect("multiplication overflow") > rows { diff --git a/scrubcsv/tests/tests.rs b/scrubcsv/tests/tests.rs index 05cd114..097b30f 100644 --- a/scrubcsv/tests/tests.rs +++ b/scrubcsv/tests/tests.rs @@ -256,3 +256,96 @@ a,b,c "# ); } + +#[test] +fn output_stats_json_format() { + let testdir = TestDir::new("scrubcsv", "output_stats_json"); + testdir.create_file( + "input.csv", + "\ +a,b,c +1,2,3 +4,5,6 +", + ); + let output = testdir + .cmd() + .args(&["--output-stats", "stats.json", "--output-format", "json"]) + .arg("input.csv") + .expect_success(); + + // Check that CSV output is still correct + assert_eq!( + output.stdout_str(), + "\ +a,b,c +1,2,3 +4,5,6 +" + ); + + // Check that stats were written to file + let stats_content = std::fs::read_to_string(testdir.path("stats.json")) + .expect("Failed to read stats.json"); + assert!(stats_content.contains(r#""rows": 3"#)); + assert!(stats_content.contains(r#""bad_rows": 0"#)); + assert!(stats_content.contains(r#""elapsed_seconds""#)); + assert!(stats_content.contains(r#""bytes_per_second""#)); + + // Verify it's valid JSON structure + assert!(stats_content.starts_with("{")); + assert!(stats_content.trim_end().ends_with("}")); +} + +#[test] +fn output_stats_text_format() { + let testdir = TestDir::new("scrubcsv", "output_stats_text"); + testdir.create_file( + "input.csv", + "\ +name,age +Alice,25 +Bob,30 +", + ); + let output = testdir + .cmd() + .args(&["--output-stats", "stats.txt", "--output-format", "text"]) + .arg("input.csv") + .expect_success(); + + // Check that CSV output is still correct + assert_eq!( + output.stdout_str(), + "\ +name,age +Alice,25 +Bob,30 +" + ); + + // Check that stats were written to file in text format + let stats_content = std::fs::read_to_string(testdir.path("stats.txt")) + .expect("Failed to read stats.txt"); + assert!(stats_content.contains("3 rows (0 bad)")); + assert!(stats_content.contains("seconds")); + assert!(stats_content.contains("/sec")); + + // 
Verify it matches the stderr format pattern + assert!(!stats_content.starts_with("{")); // Not JSON +} + +#[test] +fn output_format_requires_output_stats() { + let testdir = TestDir::new("scrubcsv", "output_format_dependency"); + testdir.create_file("input.csv", "a,b,c\n1,2,3\n"); + + // This should fail because --output-format requires --output-stats + let output = testdir + .cmd() + .args(&["--output-format", "json"]) + .arg("input.csv") + .expect_failure(); + + assert!(output.stderr_str().contains("--output-stats")); +}