Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

redo internals of State #185

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
project(lttoolbox
VERSION 3.7.13
VERSION 3.7.14
LANGUAGES CXX C
)
set(VERSION ${PROJECT_VERSION})
Expand Down
2 changes: 2 additions & 0 deletions lttoolbox/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ set(LIBLTTOOLBOX_HEADERS
node.h
pattern_list.h
regexp_compiler.h
reusable_state.h
serialiser.h
sorted_vector.h
sorted_vector.hpp
Expand Down Expand Up @@ -53,6 +54,7 @@ set(LIBLTTOOLBOX_SOURCES
node.cc
pattern_list.cc
regexp_compiler.cc
reusable_state.cc
sorted_vector.cc
state.cc
string_utils.cc
Expand Down
54 changes: 40 additions & 14 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,20 @@ FSTProcessor::filterFinals(const State& state, UStringView casefrom)
uppercase, firstupper, 0);
}

/**
 * Produce the output string for the finals of a ReusableState, deriving
 * the capitalisation flags from a reference string.
 *
 * Unless dictionaryCase is set, `firstupper` is true when the first
 * character of casefrom is uppercase, and `uppercase` is true when
 * casefrom has more than one character and both its first and its last
 * characters are uppercase. When dictionaryCase is set, both flags stay
 * false.
 *
 * @param state    the state whose finals are filtered
 * @param casefrom the string the case flags are computed from; an empty
 *                 string is tolerated and leaves both flags false
 * @return whatever state.filterFinals() returns for the processor's
 *         all_finals, alphabet, escaped_chars, displayWeightsMode,
 *         maxAnalyses and maxWeightClasses, plus the computed flags
 */
UString
FSTProcessor::filterFinals(const ReusableState& state, UStringView casefrom)
{
  bool firstupper = false;
  bool uppercase = false;
  // Only inspect characters when there is at least one: indexing
  // position 0 of an empty string view is out of bounds
  // (assumes UStringView behaves like a standard string view).
  if (!dictionaryCase && casefrom.size() > 0) {
    firstupper = u_isupper(casefrom[0]);
    // Note: the *last* character is the one compared here, not the second.
    uppercase = (casefrom.size() > 1 &&
                 firstupper && u_isupper(casefrom[casefrom.size()-1]));
  }
  return state.filterFinals(all_finals, alphabet, escaped_chars,
                            displayWeightsMode, maxAnalyses, maxWeightClasses,
                            uppercase, firstupper, 0);
}

void
FSTProcessor::writeEscaped(UStringView str, UFILE *output)
{
Expand Down Expand Up @@ -793,6 +807,7 @@ void
FSTProcessor::initBiltrans()
{
// Set-up for bilingual transfer: perform the same initialisation as
// generation, then additionally register '*' in escaped_chars.
initGeneration();
escaped_chars.insert('*');
}


Expand Down Expand Up @@ -886,7 +901,8 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
bool last_incond = false;
bool last_postblank = false;
bool last_preblank = false;
State current_state = initial_state;
ReusableState current_state;
current_state.init(&root);
UString lf; // analysis (lexical form and tags)
UString sf; // surface form
UString lf_spcmp; // space compound analysis
Expand Down Expand Up @@ -1141,7 +1157,7 @@ FSTProcessor::analysis(InputFile& input, UFILE *output)
}
}

current_state = initial_state;
current_state.init(&root);
lf.clear();
sf.clear();
last_start = input_buffer.getPos();
Expand Down Expand Up @@ -1343,7 +1359,8 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
generation_wrapper_null_flush(input, output, mode);
}

State current_state = initial_state;
ReusableState current_state;
current_state.init(&root);
UString sf;

outOfWord = false;
Expand Down Expand Up @@ -1412,8 +1429,8 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
bool firstupper = false, uppercase = false;
if(!dictionaryCase)
{
uppercase = sf.size() > 1 && u_isupper(sf[1]);
firstupper= u_isupper(sf[0]);
uppercase = firstupper && sf.size() > 1 && u_isupper(sf[1]);
}

if(mode == gm_tagged || mode == gm_tagged_nm)
Expand Down Expand Up @@ -1468,7 +1485,7 @@ FSTProcessor::generation(InputFile& input, UFILE *output, GenerationMode mode)
}
}

current_state = initial_state;
current_state.init(&root);
sf.clear();
}
else if(u_isspace(val) && sf.size() == 0)
Expand Down Expand Up @@ -1525,7 +1542,8 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
size_t cur_word = 0;
size_t cur_pos = 0;
size_t match_pos = 0;
State current_state = initial_state;
ReusableState current_state;
current_state.init(&root);
UString last_match;
int space_diff = 0;

Expand Down Expand Up @@ -1705,7 +1723,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
firstupper = false;
have_first = false;
have_second = false;
current_state = initial_state;
current_state.init(&root);
}
}
}
Expand Down Expand Up @@ -2043,28 +2061,31 @@ void
FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
{
std::vector<int32_t> symbols;
ReusableState current_state;
current_state.init(&root);
while (!input.eof()) {
nextBilingualWord(input, output, symbols, mode);
if (symbols.empty()) continue;

State current_state = initial_state;
current_state.reinit(&root);

bool firstupper = (symbols[0] > 0 && u_isupper(symbols[0]));
bool uppercase = (firstupper && symbols.size() > 1 &&
symbols[1] > 0 && u_isupper(symbols[1]));

bool seenTags = false;
size_t queue_start = 0;
UString result;
std::vector<UString> result;
for (size_t i = 0; i < symbols.size(); i++) {
seenTags = seenTags || alphabet.isTag(symbols[i]);
current_state.step_case(symbols[i], beCaseSensitive(current_state));
if (current_state.isFinal(all_finals)) {
queue_start = i;
result = current_state.filterFinals(all_finals, alphabet, escaped_chars,
displayWeightsMode, maxAnalyses,
maxWeightClasses, uppercase,
firstupper, 0);
result = current_state.filterFinalsArray(all_finals, alphabet,
escaped_chars,
displayWeightsMode, maxAnalyses,
maxWeightClasses, uppercase,
firstupper, 0);
}
}
// if there are no tags, we only return complete matches
Expand All @@ -2081,7 +2102,12 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
write(source, output);

if (!result.empty()) {
write(compose(result, source.substr(queue_pos)), output);
UString queue = source.substr(queue_pos);
for (auto& piece : result) {
u_fputc('/', output);
write(piece, output);
write(queue, output);
}
} else {
u_fputc('/', output);
u_fputc((mode == gm_all ? '#' : '@'), output);
Expand Down
11 changes: 9 additions & 2 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <lttoolbox/buffer.h>
#include <lttoolbox/my_stdio.h>
#include <lttoolbox/state.h>
#include <lttoolbox/reusable_state.h>
#include <lttoolbox/trans_exe.h>
#include <lttoolbox/input_file.h>
#include <libxml/xmlreader.h>
Expand Down Expand Up @@ -328,6 +329,7 @@ class FSTProcessor
* Assumes that casefrom is non-empty
*/
UString filterFinals(const State& state, UStringView casefrom);
UString filterFinals(const ReusableState& state, UStringView casefrom);

/**
* Write a string to an output stream,
Expand Down Expand Up @@ -450,11 +452,11 @@ class FSTProcessor
*
* @return running with --case-sensitive or state size exceeds max
*/
bool beCaseSensitive(const State& state) {
bool beCaseSensitive(size_t size) {
if(caseSensitive) {
return true;
}
else if(state.size() < max_case_insensitive_state_size) {
else if(size < max_case_insensitive_state_size) {
return false; // ie. do case-folding
}
else {
Expand All @@ -467,6 +469,11 @@ class FSTProcessor
}
}

/**
 * Overload for a State: takes the state's size() and hands it to
 * beCaseSensitive(size_t), returning that call's result.
 */
bool beCaseSensitive(const State& s) {
  const size_t state_size = s.size();
  return beCaseSensitive(state_size);
}
/**
 * Overload for a ReusableState: takes the state's size() and hands it
 * to beCaseSensitive(size_t), returning that call's result.
 */
bool beCaseSensitive(const ReusableState& s) {
  const size_t state_size = s.size();
  return beCaseSensitive(state_size);
}

public:

/*
Expand Down
3 changes: 3 additions & 0 deletions lttoolbox/node.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <map>

class State;
class ReusableState;
class Node;


Expand All @@ -35,6 +36,7 @@ class Dest
double *out_weight;

friend class State;
friend class ReusableState;
friend class Node;

void copy(Dest const &d)
Expand Down Expand Up @@ -112,6 +114,7 @@ class Node
{
private:
friend class State;
friend class ReusableState;

/**
* The outgoing transitions of this node.
Expand Down
Loading
Loading