Skip to content

Commit

Permalink
Add filterFinalsArray() and refactor biltrans
Browse files Browse the repository at this point in the history
This allows the biltrans code to be cleaner, since it no longer has to deal
with splitting and re-merging the result string, and it also makes the code
handle tags containing slashes more correctly.
  • Loading branch information
mr-martian committed Sep 12, 2024
1 parent 439a306 commit 042b085
Show file tree
Hide file tree
Showing 7 changed files with 147 additions and 192 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cmake_minimum_required(VERSION 3.12 FATAL_ERROR)
cmake_policy(VERSION ${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION})
project(lttoolbox
VERSION 3.7.14
VERSION 3.7.15
LANGUAGES CXX C
)
set(VERSION ${PROJECT_VERSION})
Expand Down
160 changes: 45 additions & 115 deletions lttoolbox/fst_processor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1711,8 +1711,7 @@ FSTProcessor::transliteration(InputFile& input, UFILE *output)
}

bool
FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue,
bool delim, bool mark)
FSTProcessor::step_biltrans(UStringView word, std::vector<UString>& result, UString& queue)
{
State current_state = initial_state;
bool firstupper = u_isupper(word[0]);
Expand All @@ -1723,13 +1722,11 @@ FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue,
current_state.step(val, beCaseSensitive(current_state));
}
if (current_state.isFinal(all_finals)) {
result.clear();
if (delim) result += '^';
if (mark) result += '=';
result += current_state.filterFinals(all_finals, alphabet,
escaped_chars,
displayWeightsMode, maxAnalyses, maxWeightClasses,
uppercase, firstupper, 0).substr(1);
current_state.filterFinalsArray(result,
all_finals, alphabet,
escaped_chars,
displayWeightsMode, maxAnalyses, maxWeightClasses,
uppercase, firstupper, 0);
}
if (current_state.size() == 0) {
if (!result.empty()) queue.append(symbol);
Expand All @@ -1742,7 +1739,7 @@ FSTProcessor::step_biltrans(UStringView word, UString& result, UString& queue,
UString
FSTProcessor::biltransfull(UStringView input_word, bool with_delim)
{
UString result;
std::vector<UString> result;
unsigned int start_point = 1;
unsigned int end_point = input_word.size()-2;
UString queue;
Expand All @@ -1766,7 +1763,7 @@ FSTProcessor::biltransfull(UStringView input_word, bool with_delim)
}

auto word = input_word.substr(start_point, end_point-start_point);
bool exists = step_biltrans(word, result, queue, with_delim, mark);
bool exists = step_biltrans(word, result, queue);
if (!exists) {
if (with_delim) return "^@"_u + US(input_word.substr(1));
else return "@"_u + US(input_word);
Expand All @@ -1778,23 +1775,7 @@ FSTProcessor::biltransfull(UStringView input_word, bool with_delim)
}
// attach unmatched queue automatically

if(!queue.empty())
{
UString result_with_queue = compose(result, queue);
if(with_delim)
{
result_with_queue += '$';
}
return result_with_queue;
}
else
{
if(with_delim)
{
result += '$';
}
return result;
}
return compose(result, queue, with_delim, mark);
}


Expand All @@ -1803,7 +1784,7 @@ UString
FSTProcessor::biltrans(UStringView input_word, bool with_delim)
{
State current_state = initial_state;
UString result;
std::vector<UString> result;
unsigned int start_point = 1;
unsigned int end_point = input_word.size()-2;
UString queue;
Expand All @@ -1827,55 +1808,32 @@ FSTProcessor::biltrans(UStringView input_word, bool with_delim)
}

UStringView word = input_word.substr(start_point, end_point-start_point);
bool exists = step_biltrans(word, result, queue, with_delim, mark);
bool exists = step_biltrans(word, result, queue);
if (!exists) {
if (with_delim) return "^@"_u + US(input_word.substr(1));
else return "@"_u + US(input_word);
}

// attach unmatched queue automatically

if(!queue.empty())
{
UString result_with_queue = compose(result, queue);
if(with_delim)
{
result_with_queue += '$';
}
return result_with_queue;
}
else
{
if(with_delim)
{
result += '$';
}
return result;
}
return compose(result, queue, with_delim, mark);
}

UString
FSTProcessor::compose(UStringView lexforms, UStringView queue) const
FSTProcessor::compose(const std::vector<UString>& lexforms, UStringView queue,
bool delim, bool mark) const
{
UString result;
result.reserve(lexforms.size() + 2 * queue.size());
result += '/';

for(unsigned int i = 1; i< lexforms.size(); i++)
{
if(lexforms[i] == '\\')
{
result += '\\';
i++;
}
else if(lexforms[i] == '/')
{
result.append(queue);
}
result += lexforms[i];
}

result += queue;
if (delim) result += '^';
if (mark) result += '=';
bool first = true;
for (auto& it : lexforms) {
if (!first) result += '/';
first = false;
result += it;
result += queue;
}
if (delim) result += '$';
return result;
}

Expand Down Expand Up @@ -2060,16 +2018,17 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)

bool seenTags = false;
size_t queue_start = 0;
UString result;
std::vector<UString> result;
for (size_t i = 0; i < symbols.size(); i++) {
seenTags = seenTags || alphabet.isTag(symbols[i]);
current_state.step_case(symbols[i], beCaseSensitive(current_state));
if (current_state.isFinal(all_finals)) {
queue_start = i;
result = current_state.filterFinals(all_finals, alphabet, escaped_chars,
displayWeightsMode, maxAnalyses,
maxWeightClasses, uppercase,
firstupper, 0);
current_state.filterFinalsArray(result,
all_finals, alphabet, escaped_chars,
displayWeightsMode, maxAnalyses,
maxWeightClasses, uppercase,
firstupper, 0);
}
}
// if there are no tags, we only return complete matches
Expand All @@ -2084,11 +2043,11 @@ FSTProcessor::bilingual(InputFile& input, UFILE *output, GenerationMode mode)
}

write(source, output);
u_fputc('/', output);

if (!result.empty()) {
write(compose(result, source.substr(queue_pos)), output);
} else {
u_fputc('/', output);
u_fputc((mode == gm_all ? '#' : '@'), output);
write(source, output);
}
Expand All @@ -2100,7 +2059,8 @@ std::pair<UString, int>
FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim)
{
State current_state = initial_state;
UString result;
std::vector<UString> result;
std::vector<UString> temp;
unsigned int start_point = 1;
unsigned int end_point = input_word.size()-2;
UString queue;
Expand Down Expand Up @@ -2142,17 +2102,10 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim)
}
if(current_state.isFinal(all_finals))
{
result.clear();
if (with_delim) {
result += '^';
}
if (mark) {
result += '=';
}
result += current_state.filterFinals(all_finals, alphabet,
escaped_chars,
displayWeightsMode, maxAnalyses, maxWeightClasses,
uppercase, firstupper, 0).substr(1);
current_state.filterFinalsArray(result, all_finals, alphabet,
escaped_chars,
displayWeightsMode, maxAnalyses, maxWeightClasses,
uppercase, firstupper, 0);
}

if(current_state.size() == 0)
Expand All @@ -2166,13 +2119,12 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim)
// word is not present
if(with_delim)
{
result = "^@"_u + US(input_word.substr(1));
return {"^@"_u + US(input_word.substr(1)), 0};
}
else
{
result = "@"_u + US(input_word);
return {"@"_u + US(input_word), 0};
}
return std::pair<UString, int>(result, 0);
}
}
}
Expand All @@ -2185,43 +2137,25 @@ FSTProcessor::biltransWithQueue(UStringView input_word, bool with_delim)
// word is not present
if(with_delim)
{
result = "^@"_u + US(input_word.substr(1));
return {"^@"_u + US(input_word.substr(1)), 0};
}
else
{
result = "@"_u + US(input_word);
return {"@"_u + US(input_word), 0};
}
return {result, 0};
}



// attach unmatched queue automatically

if(!queue.empty())
{
UString result_with_queue = compose(result, queue);
if(with_delim)
{
result_with_queue += '$';
}
return {result_with_queue, queue.size()};
}
else
{
if(with_delim)
{
result += '$';
}
return {result, 0};
}
return {compose(result, queue, with_delim, mark), queue.size()};
}

UString
FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)
{
State current_state = initial_state;
UString result;
std::vector<UString> result;
unsigned int start_point = 1;
unsigned int end_point = input_word.size()-2;
bool mark = false;
Expand All @@ -2245,17 +2179,13 @@ FSTProcessor::biltransWithoutQueue(UStringView input_word, bool with_delim)

auto word = input_word.substr(start_point, end_point-start_point);
UString queue;
bool exists = step_biltrans(word, result, queue, with_delim, mark);
bool exists = step_biltrans(word, result, queue);
if (!exists || !queue.empty()) {
if (with_delim) return "^@"_u + US(input_word.substr(1));
else return "@"_u + US(input_word);
}

if(with_delim)
{
result += '$';
}
return result;
return compose(result, ""_u, with_delim, mark);
}


Expand Down
6 changes: 3 additions & 3 deletions lttoolbox/fst_processor.h
Original file line number Diff line number Diff line change
Expand Up @@ -429,9 +429,9 @@ class FSTProcessor
void analysis_wrapper_null_flush(InputFile& input, UFILE *output);
void generation_wrapper_null_flush(InputFile& input, UFILE *output,
GenerationMode mode);
UString compose(UStringView lexforms, UStringView queue) const;
bool step_biltrans(UStringView word, UString& result, UString& queue,
bool delim, bool mark);
UString compose(const std::vector<UString>& lexforms, UStringView queue,
bool delim = false, bool mark = false) const;
bool step_biltrans(UStringView word, std::vector<UString>& result, UString& queue);

void procNodeICX();
void procNodeRCX();
Expand Down
Loading

0 comments on commit 042b085

Please sign in to comment.