From 476f47e704c7d041c45c36d84bcb38515d213f45 Mon Sep 17 00:00:00 2001 From: Andy Kurnia Date: Sat, 30 Mar 2024 11:07:42 +0800 Subject: [PATCH] reproduce magpie kwg --- src/build.rs | 71 +++++++++++++++++++++++++++++++++++------------ src/main_build.rs | 61 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 17 deletions(-) diff --git a/src/build.rs b/src/build.rs index 1185332..67cad29 100644 --- a/src/build.rs +++ b/src/build.rs @@ -191,7 +191,7 @@ struct StatesDefragger<'a> { } impl StatesDefragger<'_> { - fn defrag(&mut self, mut p: u32) { + fn defrag<const WOLGES_MODE: bool>(&mut self, mut p: u32) { loop { let prev = self.prev_indexes[p as usize]; if prev == 0 { @@ -202,32 +202,53 @@ impl StatesDefragger<'_> { if self.destination[p as usize] != 0 { return; } + let mut initial_num_written = self.num_written; // temp value to break self-cycles. self.destination[p as usize] = !0; let mut write_p = p; + if !WOLGES_MODE { + // non-wolges mode reserves the space first. + loop { + self.num_written += 1; + p = self.states[p as usize].next_index; + if p == 0 { + break; + } + } + p = write_p; + } let mut num = 0u32; loop { num += 1; let a = self.states[p as usize].arc_index; if a != 0 { - self.defrag(a); + self.defrag::<WOLGES_MODE>(a); } p = self.states[p as usize].next_index; if p == 0 { break; } } + if WOLGES_MODE { + initial_num_written = self.num_written; + } self.destination[write_p as usize] = 0; for ofs in 0..num { // prefer earlier index, so dawg part does not point to gaddag part if self.destination[write_p as usize] != 0 { break; } - self.destination[write_p as usize] = self.num_written + ofs; + if WOLGES_MODE || ofs == 0 { + self.destination[write_p as usize] = initial_num_written + ofs; + // non-wolges mode does not merge tail nodes. + } write_p = self.states[write_p as usize].next_index; } // Always += num even if some nodes are necessarily duplicated due to sharing by different prev_nodes. 
- self.num_written += num; + if WOLGES_MODE { + // non-wolges mode already reserves the space. + self.num_written += num; + } } // encoding: little endian of @@ -268,8 +289,8 @@ impl StatesDefragger<'_> { 0, ); match build_format { - BuildFormat::DawgOnly => (), - BuildFormat::Gaddawg => { + BuildFormat::DawgOnly | BuildFormat::DawgOnlyMagpie => (), + BuildFormat::Gaddawg | BuildFormat::GaddawgMagpie => { self.write_node( &mut ret[4..], gaddag_start_state, @@ -322,6 +343,8 @@ fn gen_prev_indexes(states: &[State]) -> Vec<u32> { pub enum BuildFormat { DawgOnly, Gaddawg, + DawgOnlyMagpie, + GaddawgMagpie, } // machine_words must be sorted and unique. @@ -345,13 +368,14 @@ pub fn build( states_finder: &mut states_finder, }; let dawg_start_state = match build_format { - BuildFormat::DawgOnly | BuildFormat::Gaddawg => { - state_maker.make_dawg(machine_words, 0, false) - } + BuildFormat::DawgOnly + | BuildFormat::Gaddawg + | BuildFormat::DawgOnlyMagpie + | BuildFormat::GaddawgMagpie => state_maker.make_dawg(machine_words, 0, false), }; let gaddag_start_state = match build_format { - BuildFormat::DawgOnly => 0, - BuildFormat::Gaddawg => state_maker.make_dawg( + BuildFormat::DawgOnly | BuildFormat::DawgOnlyMagpie => 0, + BuildFormat::Gaddawg | BuildFormat::GaddawgMagpie => state_maker.make_dawg( &gen_machine_drowwords(machine_words), dawg_start_state, true, @@ -360,19 +384,32 @@ pub fn build( let mut states_defragger = StatesDefragger { states: &states, - prev_indexes: &gen_prev_indexes(&states), + prev_indexes: &match build_format { + BuildFormat::DawgOnly | BuildFormat::Gaddawg => gen_prev_indexes(&states), + BuildFormat::DawgOnlyMagpie | BuildFormat::GaddawgMagpie => vec![0u32; states.len()], + }, destination: &mut vec![0u32; states.len()], num_written: match build_format { - BuildFormat::DawgOnly => 1, - BuildFormat::Gaddawg => 2, + BuildFormat::DawgOnly | BuildFormat::DawgOnlyMagpie => 1, + BuildFormat::Gaddawg | BuildFormat::GaddawgMagpie => 2, }, }; 
states_defragger.destination[0] = !0; // useful for empty lexicon - states_defragger.defrag(dawg_start_state); match build_format { - BuildFormat::DawgOnly => (), + BuildFormat::DawgOnly | BuildFormat::Gaddawg => { + states_defragger.defrag::<true>(dawg_start_state) + } + BuildFormat::DawgOnlyMagpie | BuildFormat::GaddawgMagpie => { + states_defragger.defrag::<false>(dawg_start_state) + } + } + match build_format { + BuildFormat::DawgOnly | BuildFormat::DawgOnlyMagpie => (), BuildFormat::Gaddawg => { - states_defragger.defrag(gaddag_start_state); + states_defragger.defrag::<true>(gaddag_start_state); + } + BuildFormat::GaddawgMagpie => { + states_defragger.defrag::<false>(gaddag_start_state); } } states_defragger.destination[0] = 0; // useful for empty lexicon diff --git a/src/main_build.rs b/src/main_build.rs index b53550a..1564bde 100644 --- a/src/main_build.rs +++ b/src/main_build.rs @@ -249,6 +249,66 @@ fn do_lang alphabet::Alphabet>( )?)?; Ok(true) } + "-kwg-magpie" => { + make_writer(&args[3])?.write_all(&build::build( + build::BuildFormat::GaddawgMagpie, + &read_machine_words( + &alphabet::AlphabetReader::new_for_words(&make_alphabet()), + &read_to_string(&mut make_reader(&args[2])?)?, + )?, + )?)?; + Ok(true) + } + "-kwg-magpie-dawg" => { + make_writer(&args[3])?.write_all(&build::build( + build::BuildFormat::DawgOnlyMagpie, + &read_machine_words( + &alphabet::AlphabetReader::new_for_words(&make_alphabet()), + &read_to_string(&mut make_reader(&args[2])?)?, + )?, + )?)?; + Ok(true) + } + "-kwg-magpie-alpha" => { + make_writer(&args[3])?.write_all(&build::build( + build::BuildFormat::DawgOnlyMagpie, + &build::make_alphagrams(&read_machine_words( + &alphabet::AlphabetReader::new_for_words(&make_alphabet()), + &read_to_string(&mut make_reader(&args[2])?)?, + )?), + )?)?; + Ok(true) + } + "-kwg-magpie-score" => { + make_writer(&args[3])?.write_all(&build::build( + build::BuildFormat::GaddawgMagpie, + &read_machine_words( + &alphabet::AlphabetReader::new_for_word_scores(&make_alphabet()), 
+ &read_to_string(&mut make_reader(&args[2])?)?, + )?, + )?)?; + Ok(true) + } + "-kwg-magpie-score-dawg" => { + make_writer(&args[3])?.write_all(&build::build( + build::BuildFormat::DawgOnlyMagpie, + &read_machine_words( + &alphabet::AlphabetReader::new_for_word_scores(&make_alphabet()), + &read_to_string(&mut make_reader(&args[2])?)?, + )?, + )?)?; + Ok(true) + } + "-kwg-magpie-score-alpha" => { + make_writer(&args[3])?.write_all(&build::build( + build::BuildFormat::DawgOnlyMagpie, + &build::make_alphagrams(&read_machine_words( + &alphabet::AlphabetReader::new_for_word_scores(&make_alphabet()), + &read_to_string(&mut make_reader(&args[2])?)?, + )?), + )?)?; + Ok(true) + } "-macondo" => { let alphabet = make_alphabet(); let kwg = kwg::Kwg::from_bytes_alloc(&std::fs::read(&args[2])?); @@ -304,6 +364,7 @@ fn main() -> error::Returns<()> { english-kwg-score-alpha CSW21.txt CSW21.kad english-kwg-score-dawg CSW21.txt outfile.dwg same as above but with representative same-score tiles + (english-kwg can also be english-kwg-magpie for bigger magpie-style kwg) (english can also be catalan, french, german, norwegian, polish, slovene, spanish, yupik) input/output files can be \"-\" (not advisable for binary files)"