From 4b09a27ac66076c33a23aab9877190e5f064ff50 Mon Sep 17 00:00:00 2001 From: zleyyij Date: Thu, 25 Apr 2024 17:46:19 -0600 Subject: [PATCH] feat: improved resilience --- src/scripts/parser/src/lib.rs | 30 ++++++++++++++---------------- src/scripts/parser/src/parser.rs | 21 +++++++++------------ 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/src/scripts/parser/src/lib.rs b/src/scripts/parser/src/lib.rs index 3789f37..89f57e5 100644 --- a/src/scripts/parser/src/lib.rs +++ b/src/scripts/parser/src/lib.rs @@ -2,12 +2,10 @@ mod lexer; mod parser; -use encoding::all::{UTF_16BE, UTF_16LE, UTF_8}; -use encoding::codec::utf_8::from_utf8; -use encoding::{all::ISO_8859_1, Encoding}; +use encoding::all::UTF_8; +use encoding::Encoding; use lexer::lexer::lex_csv; use parser::parser::deserialize_csv; -use std::borrow::Cow; use std::collections::HashMap; use wasm_bindgen::prelude::*; @@ -20,23 +18,22 @@ pub fn parse_csv_wasm(raw_csv: &[u8]) -> JsValue { fn parse_csv(raw_csv: &[u8]) -> HashMap> { // translate the csv from ISO-8859-1 to a UTF 8 strings - let transcoded_csv = - transcode_csv(raw_csv); + let transcoded_csv = transcode_csv(raw_csv); let lexed_csv: Vec> = lex_csv(&transcoded_csv).unwrap(); let parsed_csv: HashMap> = deserialize_csv(lexed_csv); return parsed_csv; } /// Ever since HWINFO 8.0 (), logs are encoded with -/// unicode, switching from ISO-8559-1 encoding. +/// unicode, switching from ISO-8859-1 encoding. /// Take a buffer of presumably UTF-8 encoded bytes, and transcode them to a standard rust String. 
#[inline] fn transcode_csv(unencoded_csv: &[u8]) -> String { // see if it's valid utf 8, for some reason the encoding crate handles this better than the standard library's implementation - match UTF_16BE.decode(unencoded_csv, encoding::DecoderTrap::Strict) { + match UTF_8.decode(unencoded_csv, encoding::DecoderTrap::Strict) { Ok(s) => return s, Err(e) => { - console_log!("warning: the provided file is not valid UTF 8: interpreting with UTF-8 failed with error {e:?}, falling back to UTF-8 with replacement"); + console_log!("warning: The provided file is not valid UTF 8: interpreting with UTF-8 failed with error {e:?}, falling back to UTF-8 with replacement."); // match ISO_8859_1.decode(iso_8859_1_csv, encoding::DecoderTrap::Strict) { // Ok(s) => return s, // Err(e) => { @@ -44,9 +41,11 @@ fn transcode_csv(unencoded_csv: &[u8]) -> String { // return UTF_8.decode(iso_8859_1_csv, encoding::DecoderTrap::Replace).unwrap(); // } // this is fine because Replace should be infallible(within reason) - return UTF_8.decode(unencoded_csv, encoding::DecoderTrap::Replace).unwrap() - } + return UTF_8 + .decode(unencoded_csv, encoding::DecoderTrap::Replace) + .unwrap(); } + } // } // if let Err(e) = UTF_8.decode(iso_8859_1_csv, encoding::DecoderTrap::Strict) { // console_log!("Warning: input file contains invalid UTF-8: {e:?}"); @@ -66,7 +65,6 @@ mod tests { use std::fs::File; use std::io::Read; - // ISO-8859-1 encoding is really close to UTF-8, so more general characters are fine fn gen_mock_csv(num_rows: usize, num_columns: usize) -> (String, HashMap>) { // todo!() let mock_spreadsheet: Vec>> = @@ -135,9 +133,9 @@ mod tests { #[test] fn parse_csv_from_file() { // TODO - let mut file_handle = File::open("/Users/arc/Downloads/help.csv").unwrap(); - let mut file_vec = Vec::new(); - file_handle.read_to_end(&mut file_vec).unwrap(); - parse_csv(&file_vec); + // let mut file_handle = File::open("/Users/arc/Downloads/HWINFO.CSV").unwrap(); + // let mut file_vec = Vec::new(); + // 
file_handle.read_to_end(&mut file_vec).unwrap(); // parse_csv(&file_vec); } } diff --git a/src/scripts/parser/src/parser.rs b/src/scripts/parser/src/parser.rs index 85ec536..dd61d96 100644 --- a/src/scripts/parser/src/parser.rs +++ b/src/scripts/parser/src/parser.rs @@ -79,14 +79,7 @@ pub mod parser { processed_column.push(parsed_val); } Err(_) => { - #[cfg(wasm)] - { - console_log!("Failed to parse entry {entry} into a float, skipping"); - } - #[cfg(not(wasm))] - { - println!("Failed to parse entry {entry} into a float, skipping column {}", column[0]); - } + console_log!("Failed to parse entry {entry} into a float, skipping"); } } } @@ -102,8 +95,8 @@ pub mod parser { // .chars().count() is used here instead of .len() because .len() // breaks for multibyte chars [insertion_key.chars().count() - 3..] - .iter() - .collect::() + .iter() + .collect::() } else { insertion_key.to_string() }; @@ -167,13 +160,18 @@ pub mod parser { /// in a hwinfo csv, translate to another vec of vectors, where /// the output is a list of columns fn rows_to_columns(input: Vec>) -> Vec> { + // Tracks whether the malformed-column warning has already been logged, so it is emitted at most once + let mut sent_warning = false; let mut columnar_input: Vec> = vec![Vec::with_capacity(input.len()); input[0].len()]; for row in input { for (index, item) in row.iter().enumerate() { // sometimes there are other columns that are longer than the first column if columnar_input.len() - 1 < index { - console_log!("The last column in this CSV file is malformed, skipping. Please open in a spreadsheet to view"); + if !sent_warning { + console_log!("warning: The last column in this CSV file is malformed, skipping. Please open in a spreadsheet to view."); + sent_warning = true; + } } else { columnar_input[index].push(*item); } @@ -293,7 +291,6 @@ pub mod parser { expected_output.insert("🦆 (2)".to_owned(), vec![2.0, 2.0]); assert_eq!(deserialize_csv(mock_csv), expected_output); - } } }