Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 70 additions & 4 deletions scrubcsv/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ use log::debug;
use regex::{bytes::Regex as BytesRegex, Regex};
use std::{
borrow::Cow,
fs,
io::{self, prelude::*},
fs::{self, File},
io::{self, prelude::*, BufWriter},
path::PathBuf,
process,
time::Instant,
Expand All @@ -33,6 +33,25 @@ use crate::util::CharSpecifier;
/// by `csv`.
const BUFFER_SIZE: usize = 256 * 1024;

/// Output format for the statistics file written by `--output-stats`.
///
/// Values are parsed from the `--output-format` command-line argument through
/// the [`std::str::FromStr`] implementation below; matching ignores ASCII
/// case, so `json`, `JSON` and `Json` are all accepted.
///
/// The type is a fieldless enum, so it derives `Copy`, `PartialEq` and `Eq`
/// in addition to `Debug` and `Clone`: callers can read it out of the parsed
/// options without moving the field, and tests can compare values directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OutputFormat {
    /// Statistics rendered as a JSON object.
    Json,
    /// Statistics rendered as a human-readable summary line.
    Text,
}

impl std::str::FromStr for OutputFormat {
    /// Human-readable message naming the rejected value and the accepted ones.
    type Err = String;

    /// Parses `"json"` or `"text"`, ignoring ASCII case.
    ///
    /// # Errors
    ///
    /// Returns a message of the form
    /// `Invalid output format '<input>'. Supported formats: json, text`
    /// for any other input (including the empty string). The input is echoed
    /// back exactly as it was given.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Both keywords are pure ASCII, so an in-place ASCII-insensitive
        // comparison accepts the same inputs as lowercasing first, without
        // allocating a temporary `String`.
        if s.eq_ignore_ascii_case("json") {
            Ok(OutputFormat::Json)
        } else if s.eq_ignore_ascii_case("text") {
            Ok(OutputFormat::Text)
        } else {
            Err(format!(
                "Invalid output format '{}'. Supported formats: json, text",
                s
            ))
        }
    }
}

/// Our command-line arguments.
#[derive(Debug, StructOpt)]
#[structopt(
Expand Down Expand Up @@ -103,6 +122,17 @@ struct Opt {
#[structopt(short = "q", long = "quiet")]
quiet: bool,

/// Output performance statistics to the specified file in addition to stderr.
/// Stats include total rows processed, bad rows found, processing time, and throughput.
/// Use with --output-format to specify the output format (default: json).
#[structopt(short = "o", long = "output-stats")]
output_stats: Option<PathBuf>,

/// Format for the statistics output file. Only valid with --output-stats.
/// 'json' outputs structured data, 'text' outputs human-readable format.
#[structopt(long = "output-format", requires = "output-stats")]
output_format: Option<OutputFormat>,

/// Character used to quote entries. May be set to "none" to ignore all
/// quoting.
#[structopt(value_name = "CHAR", long = "quote", default_value = "\"")]
Expand Down Expand Up @@ -349,9 +379,10 @@ fn run() -> Result<()> {
wtr.flush().context("error writing records")?;

// Print out some information about our run.
let ellapsed = start_time.elapsed().as_secs_f64();
let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64;

if !opt.quiet {
let ellapsed = start_time.elapsed().as_secs_f64();
let bytes_per_second = (rdr.position().byte() as f64 / ellapsed) as i64;
eprintln!(
"{} rows ({} bad) in {:.2} seconds, {}/sec",
rows,
Expand All @@ -361,6 +392,41 @@ fn run() -> Result<()> {
);
}

// Write stats to output file if specified
if let Some(output_path) = &opt.output_stats {
let mut file = BufWriter::new(
File::create(output_path)
.with_context(|_| format!("cannot create output file {}", output_path.display()))?
);

let format = opt.output_format.unwrap_or(OutputFormat::Json);
let stats_content = match format {
OutputFormat::Json => {
format!(
r#"{{"rows": {}, "bad_rows": {}, "elapsed_seconds": {:.2}, "bytes_per_second": {}}}"#,
rows,
bad_rows,
ellapsed,
bytes_per_second
)
}
OutputFormat::Text => {
format!(
"{} rows ({} bad) in {:.2} seconds, {}/sec",
rows,
bad_rows,
ellapsed,
bytes_per_second.file_size(file_size_opts::BINARY)?
)
}
};

writeln!(file, "{}", stats_content)
.with_context(|_| format!("cannot write to output file {}", output_path.display()))?;
file.flush()
.with_context(|_| format!("cannot flush output file {}", output_path.display()))?;
}

// If more than 10% of rows are bad, assume something has gone horribly
// wrong.
if bad_rows.checked_mul(10).expect("multiplication overflow") > rows {
Expand Down
93 changes: 93 additions & 0 deletions scrubcsv/tests/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -256,3 +256,96 @@ a,b,c
"#
);
}

/// Runs with `--output-stats stats.json --output-format json` and checks both
/// the CSV on stdout and the contents of the generated stats file.
#[test]
fn output_stats_json_format() {
    // The input is already clean, so the same text is the expected stdout.
    let clean_csv = "\
a,b,c
1,2,3
4,5,6
";

    let testdir = TestDir::new("scrubcsv", "output_stats_json");
    testdir.create_file("input.csv", clean_csv);

    let output = testdir
        .cmd()
        .args(&["--output-stats", "stats.json", "--output-format", "json"])
        .arg("input.csv")
        .expect_success();

    // Writing stats must not disturb the CSV emitted on stdout.
    assert_eq!(output.stdout_str(), clean_csv);

    // The stats file must exist and mention every expected key.
    let stats_path = testdir.path("stats.json");
    let stats = std::fs::read_to_string(stats_path).expect("Failed to read stats.json");
    assert!(stats.contains(r#""rows": 3"#));
    assert!(stats.contains(r#""bad_rows": 0"#));
    assert!(stats.contains(r#""elapsed_seconds""#));
    assert!(stats.contains(r#""bytes_per_second""#));

    // Shape check only: the content opens and closes like a JSON object.
    assert!(stats.starts_with('{'));
    assert!(stats.trim_end().ends_with('}'));
}

/// Runs with `--output-stats stats.txt --output-format text` and checks both
/// the CSV on stdout and the contents of the generated stats file.
#[test]
fn output_stats_text_format() {
    // The input is already clean, so the same text is the expected stdout.
    let clean_csv = "\
name,age
Alice,25
Bob,30
";

    let testdir = TestDir::new("scrubcsv", "output_stats_text");
    testdir.create_file("input.csv", clean_csv);

    let output = testdir
        .cmd()
        .args(&["--output-stats", "stats.txt", "--output-format", "text"])
        .arg("input.csv")
        .expect_success();

    // Writing stats must not disturb the CSV emitted on stdout.
    assert_eq!(output.stdout_str(), clean_csv);

    // The stats file must exist and contain the pieces of the summary line.
    let stats_path = testdir.path("stats.txt");
    let stats = std::fs::read_to_string(stats_path).expect("Failed to read stats.txt");
    assert!(stats.contains("3 rows (0 bad)"));
    assert!(stats.contains("seconds"));
    assert!(stats.contains("/sec"));

    // Text output must not look like the JSON variant.
    assert!(!stats.starts_with('{'));
}

/// Passing `--output-format` without `--output-stats` must be rejected, and
/// the error on stderr must name the missing `--output-stats` flag.
#[test]
fn output_format_requires_output_stats() {
    let testdir = TestDir::new("scrubcsv", "output_format_dependency");
    testdir.create_file("input.csv", "a,b,c\n1,2,3\n");

    // Only the dependent flag is supplied, so the run is expected to fail.
    let output = testdir
        .cmd()
        .args(&["--output-format", "json"])
        .arg("input.csv")
        .expect_failure();

    let stderr = output.stderr_str();
    assert!(stderr.contains("--output-stats"));
}
Loading