
Commit

feat: improved resilience
zleyyij committed Apr 25, 2024
1 parent 2a76d14 commit 4b09a27
Showing 2 changed files with 23 additions and 28 deletions.
30 changes: 14 additions & 16 deletions src/scripts/parser/src/lib.rs
@@ -2,12 +2,10 @@
mod lexer;
mod parser;

-use encoding::all::{UTF_16BE, UTF_16LE, UTF_8};
-use encoding::codec::utf_8::from_utf8;
-use encoding::{all::ISO_8859_1, Encoding};
+use encoding::all::UTF_8;
+use encoding::Encoding;
use lexer::lexer::lex_csv;
use parser::parser::deserialize_csv;
-use std::borrow::Cow;
use std::collections::HashMap;
use wasm_bindgen::prelude::*;

@@ -20,33 +18,34 @@ pub fn parse_csv_wasm(raw_csv: &[u8]) -> JsValue {

fn parse_csv(raw_csv: &[u8]) -> HashMap<String, Vec<f64>> {
// translate the csv from ISO-8859-1 to a UTF 8 strings
-let transcoded_csv =
-transcode_csv(raw_csv);
+let transcoded_csv = transcode_csv(raw_csv);
let lexed_csv: Vec<Vec<&str>> = lex_csv(&transcoded_csv).unwrap();
let parsed_csv: HashMap<String, Vec<f64>> = deserialize_csv(lexed_csv);
return parsed_csv;
}

/// Ever since HWINFO 8.0 (<https://www.hwinfo.com/version-history/>), logs are encoded with
/// unicode, switching from ISO-8559-1 encoding.
/// Take a buffer of presumably UTF-8 encoded bytes, and transcode them to a standard rust String.
#[inline]
fn transcode_csv(unencoded_csv: &[u8]) -> String {
// see if it's valid utf 8, for some reason the encoding crate handles this better than the standard library's implementation
-match UTF_16BE.decode(unencoded_csv, encoding::DecoderTrap::Strict) {
+match UTF_8.decode(unencoded_csv, encoding::DecoderTrap::Strict) {
Ok(s) => return s,
Err(e) => {
console_log!("warning: the provided file is not valid UTF 8: interpreting with UTF-8 failed with error {e:?}, falling back to UTF-8 with replacement");
console_log!("warning: The provided file is not valid UTF 8: interpreting with UTF-8 failed with error {e:?}, falling back to UTF-8 with replacement.");
// match ISO_8859_1.decode(iso_8859_1_csv, encoding::DecoderTrap::Strict) {
// Ok(s) => return s,
// Err(e) => {
// console_log!("Unable to interpret as ISO-8559-1, falling back to UTF-8 with replacement, may god help us all (failed with {e:?}");
// return UTF_8.decode(iso_8859_1_csv, encoding::DecoderTrap::Replace).unwrap();
// }
// this is fine because Replace should be infallible(within reason)
-return UTF_8.decode(unencoded_csv, encoding::DecoderTrap::Replace).unwrap()
-}
+return UTF_8
+.decode(unencoded_csv, encoding::DecoderTrap::Replace)
+.unwrap();
+}
}
// }
// if let Err(e) = UTF_8.decode(iso_8859_1_csv, encoding::DecoderTrap::Strict) {
// console_log!("Warning: input file contains invalid UTF-8: {e:?}");
@@ -66,7 +65,6 @@ mod tests {
use std::fs::File;
use std::io::Read;

-// ISO-8859-1 encoding is really close to UTF-8, so more general characters are fine
fn gen_mock_csv(num_rows: usize, num_columns: usize) -> (String, HashMap<String, Vec<f64>>) {
// todo!()
let mock_spreadsheet: Vec<Vec<RefCell<String>>> =
@@ -135,9 +133,9 @@
#[test]
fn parse_csv_from_file() {
// TODO
-let mut file_handle = File::open("/Users/arc/Downloads/help.csv").unwrap();
-let mut file_vec = Vec::new();
-file_handle.read_to_end(&mut file_vec).unwrap();
-parse_csv(&file_vec);
+// let mut file_handle = File::open("/Users/arc/Downloads/HWINFO.CSV").unwrap();
+// let mut file_vec = Vec::new();
+// file_handle.read_to_end(&mut file_vec).unwrap();
+// parse_csv(&file_vec);
}
}
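For reference, the strict-then-replace decode that the new transcode_csv relies on can be exercised on its own. The sketch below is not part of the commit; it assumes the same `encoding` crate already imported above, and the name decode_lossy is purely illustrative.

use encoding::all::UTF_8;
use encoding::{DecoderTrap, Encoding};

/// Try a strict UTF-8 decode first; if the bytes are not valid UTF-8, decode
/// again with replacement characters instead of failing.
fn decode_lossy(bytes: &[u8]) -> String {
    match UTF_8.decode(bytes, DecoderTrap::Strict) {
        Ok(s) => s,
        Err(e) => {
            eprintln!("not valid UTF-8 ({e:?}), falling back to replacement");
            // Per the comment in the diff, the Replace trap should not fail
            // for UTF-8, so unwrap is used here.
            UTF_8.decode(bytes, DecoderTrap::Replace).unwrap()
        }
    }
}

fn main() {
    // 0xFF never appears in well-formed UTF-8, so the strict pass fails and
    // the fallback yields "ab\u{FFFD}".
    println!("{}", decode_lossy(&[b'a', b'b', 0xFF]));
}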
21 changes: 9 additions & 12 deletions src/scripts/parser/src/parser.rs
@@ -79,14 +79,7 @@ pub mod parser {
processed_column.push(parsed_val);
}
Err(_) => {
-#[cfg(wasm)]
-{
-console_log!("Failed to parse entry {entry} into a float, skipping");
-}
-#[cfg(not(wasm))]
-{
-println!("Failed to parse entry {entry} into a float, skipping column {}", column[0]);
-}
+console_log!("Failed to parse entry {entry} into a float, skipping");
}
}
}
@@ -102,8 +95,8 @@
// .chars().count() is used here instead of .len() because .len()
// breaks for multibyte chars
[insertion_key.chars().count() - 3..]
.iter()
.collect::<String>()
} else {
insertion_key.to_string()
};
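To illustrate the comment above about .chars().count() versus .len(): .len() counts bytes, so it over-counts for multibyte characters such as the duck emoji used in the tests. A small standalone example, not from the commit:

fn main() {
    let key = "🦆 (2)";
    assert_eq!(key.len(), 8); // byte length: the emoji alone is 4 bytes
    assert_eq!(key.chars().count(), 5); // character count
    // Taking the last three characters, analogous to the suffix handling above:
    let suffix: String = key.chars().skip(key.chars().count() - 3).collect();
    assert_eq!(suffix, "(2)");
}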
@@ -167,13 +160,18 @@
/// in a hwinfo csv, translate to another vec of vectors, where
/// the output is a list of columns
fn rows_to_columns(input: Vec<Vec<&str>>) -> Vec<Vec<&str>> {
+// I don't know how to make it warn that the last column is malformed only once
+let mut sent_warning = false;
let mut columnar_input: Vec<Vec<&str>> =
vec![Vec::with_capacity(input.len()); input[0].len()];
for row in input {
for (index, item) in row.iter().enumerate() {
// sometimes there are other columns that are longer than the first column
if columnar_input.len() - 1 < index {
console_log!("The last column in this CSV file is malformed, skipping. Please open in a spreadsheet to view");
if !sent_warning {
console_log!("warning: The last column in this CSV file is malformed, skipping. Please open in a spreadsheet to view.");
sent_warning = true;
}
} else {
columnar_input[index].push(*item);
}
@@ -293,7 +291,6 @@ pub mod parser {
expected_output.insert("🦆 (2)".to_owned(), vec![2.0, 2.0]);

assert_eq!(deserialize_csv(mock_csv), expected_output);

}
}
}
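Below is a simplified, self-contained sketch of the warn-once transpose that rows_to_columns now performs, with println! standing in for the crate's console_log! macro; the module wrapper and the rest of the parser are omitted here.

fn rows_to_columns(input: Vec<Vec<&str>>) -> Vec<Vec<&str>> {
    let mut sent_warning = false;
    // one output column per cell in the first row
    let mut columns: Vec<Vec<&str>> = vec![Vec::with_capacity(input.len()); input[0].len()];
    for row in input {
        for (index, item) in row.iter().enumerate() {
            // cells beyond the width of the first row are dropped,
            // and the warning is printed at most once
            if index >= columns.len() {
                if !sent_warning {
                    println!("warning: the last column in this CSV file is malformed, skipping");
                    sent_warning = true;
                }
            } else {
                columns[index].push(*item);
            }
        }
    }
    columns
}

fn main() {
    // the second row has one cell too many; it is dropped with a single warning
    let rows = vec![vec!["a", "b"], vec!["1", "2", "extra"]];
    assert_eq!(rows_to_columns(rows), vec![vec!["a", "1"], vec!["b", "2"]]);
}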
