Skip to content

Commit

Permalink
some gendir fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
flammie committed Jan 26, 2024
1 parent 9804307 commit 98d95b7
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 46 deletions.
12 changes: 2 additions & 10 deletions am-shared/tools-spellcheckers-dir-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -70,22 +70,14 @@ noinst_DATA=$(GT_COMMON_SPELLER_HFST)
quit\n" | $(HFST_XFST) -p $(MORE_VERBOSITY) $(HFST_FORMAT)

# Copy the tmp transducer to the final one. This allows local overrides.
%.hfst: .generated/%.tmp.hfst
# Both the source ($<, the .tmp.hfst) and the copy ($@) live in .generated/;
# cp -f overwrites any existing final file.
.generated/%.hfst: .generated/%.tmp.hfst
$(AM_V_CP)cp -f $< $@

# Invert the final fst, to enable symmetric yaml tests and easy manual testing:
analyser-%.hfst: generator-%.hfst
# The generator is read from .generated/ and the inverted fst is written
# beside it under the analyser- name with the same stem.
.generated/analyser-%.hfst: .generated/generator-%.hfst
$(AM_V_INVERT)$(HFST_INVERT) $(MORE_VERBOSITY) $(HFST_FLAGS) -i $< -o $@

####### Other targets: ###########
# Remove generated transducers (*.hfst, *.xfst, *.zhfst), easteregg.* files,
# packaged outputs (*.oxt, *.xpi, *.zip), the '3', *.service and build trees,
# and the default edit-distance regex files from the current directory.
# The leading '-' lets make carry on when an rm command fails.
clean-local:
-rm -f *.hfst *.xfst
-rm -f *.hfst *.xfst *.zhfst easteregg.* *.oxt *.xpi *.zip
-rm -rf 3 *.service build
-rm -f editdist.default.regex editdist.all.default.regex
# Only remove $(CORPUSNAME).* when CORPUSNAME is non-empty, so the glob can
# never degenerate to a bare '.*'.
if ! [ "x$(CORPUSNAME)" = "x" ] ; then \
rm -f $(CORPUSNAME).* ; \
fi

# Keep these intermediate targets when building using --debug:
.SECONDARY: editdist.all.default.hfst \
Expand Down
30 changes: 15 additions & 15 deletions am-shared/tools-spellcheckers-fstbased-desktop-dir-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ UW_SPELLER_SRC=generator-desktopspeller-gt-norm-base.hfst
# Input for the tag-weighting rule: the unit-weighted fst when
# ENABLE_CORPUS_WEIGHTS is 'yes', otherwise the base fst.
# NOTE(review): '[[ ... ]]' is not POSIX sh, so this relies on the shell that
# make uses for $(shell ...) being bash-compatible - confirm.
tag_weighted_dep=$(shell \
if [[ $(ENABLE_CORPUS_WEIGHTS) == 'yes' ]] ; \
then \
echo "generator-desktopspeller-gt-norm-unit_weighted.hfst"; \
echo ".generated/generator-desktopspeller-gt-norm-unit_weighted.hfst"; \
else \
echo "generator-desktopspeller-gt-norm-base.hfst"; \
echo ".generated/generator-desktopspeller-gt-norm-base.hfst"; \
fi)

####### Automake targets: ########
Expand All @@ -40,13 +40,13 @@ endif # CAN_HFST
endif # WANT_SPELLERS

#### 1. Copy base fst from parent dir:
generator-desktopspeller-gt-norm-base.hfst: generator-fstspeller-gt-norm.hfst
# Not a plain copy: the recipe converts the fstspeller generator to the
# openfst-tropical format while writing the desktop speller base fst.
.generated/generator-desktopspeller-gt-norm-base.hfst: .generated/generator-fstspeller-gt-norm.hfst
$(AM_V_FST2FST)$(HFST_FST2FST) --format=openfst-tropical -i $< -o $@

#### 2. Add corpus-based frequency weights (all non-hits will disappear from
#### the fst):
generator-desktopspeller-gt-norm-freq_weighted.hfst: \
generator-desktopspeller-gt-norm-base.hfst \
.generated/generator-desktopspeller-gt-norm-freq_weighted.hfst: \
.generated/generator-desktopspeller-gt-norm-base.hfst \
$(SURFWEIGHTS)
$(AM_V_COMPOSE)$(HFST_COMPOSE) $(HFST_FLAGS) -F \
$< $(SURFWEIGHTS) \
Expand All @@ -60,9 +60,9 @@ generator-desktopspeller-gt-norm-freq_weighted.hfst: \
####
#### Future plan: replace standard union | with priority union .P., when it
#### works properly for weighted fst's.
generator-desktopspeller-gt-norm-unit_weighted.hfst: \
generator-desktopspeller-gt-norm-freq_weighted.hfst \
unitweighted.hfst
.generated/generator-desktopspeller-gt-norm-unit_weighted.hfst: \
.generated/generator-desktopspeller-gt-norm-freq_weighted.hfst \
.generated/unitweighted.hfst
$(AM_V_HXFST)$(PRINTF) "\
set encode-weights ON \n\
read regex \
Expand All @@ -74,36 +74,36 @@ generator-desktopspeller-gt-norm-unit_weighted.hfst: \

#### 4. Add tag-based weights, for adjusting weights according to morphology
#### and other tag-based penalties like words that should not be suggested:
# $< is whatever $(tag_weighted_dep) expands to, so the fst being reweighted
# depends on ENABLE_CORPUS_WEIGHTS. The tag weights are read from
# $(srcdir)/weights/$(TAGWEIGHTS), which is also a prerequisite so that
# editing it triggers a rebuild.
generator-desktopspeller-gt-norm-tag_weighted.hfst: $(tag_weighted_dep) \
.generated/generator-desktopspeller-gt-norm-tag_weighted.hfst: $(tag_weighted_dep) \
$(srcdir)/weights/$(TAGWEIGHTS)
$(AM_V_REWEIGHT)$(HFST_REWEIGHT) $(MORE_VERBOSITY) $(HFST_FLAGS) \
-T $(srcdir)/weights/$(TAGWEIGHTS) --arcs-only -i $< \
-o $@

#### 5. Finally, remove the surface word boundary symbol and do other
#### general cleanup:
# The generated xfst script composes
#   remove-usage-tags .o. $< (the tag-weighted fst) .o. remove-word-boundary,
# unions the result with the easter-egg fst, and saves the stack to $@.
# The filter and easter-egg fsts are referenced by path inside the script, so
# they are listed as prerequisites as well.
generator-desktopspeller-gt-norm.tmp.hfst: \
generator-desktopspeller-gt-norm-tag_weighted.hfst \
.generated/generator-desktopspeller-gt-norm.tmp.hfst: \
.generated/generator-desktopspeller-gt-norm-tag_weighted.hfst \
filters/remove-word-boundary.hfst \
filters/remove-usage-tags.hfst \
easteregg.default.desktop.hfst
.generated/easteregg.default.desktop.hfst
$(AM_V_HXFST)$(PRINTF) "\
set encode-weights ON \n\
read regex [ \
@\"filters/remove-usage-tags.hfst\" \
.o. @\"$<\" \
.o. @\"filters/remove-word-boundary.hfst\" ] \
| @\"easteregg.default.desktop.hfst\" \
| @\".generated/easteregg.default.desktop.hfst\" \
; \n\
save stack $@\n\
quit\n" | $(HFST_XFST) -p $(MORE_VERBOSITY)

# Copy the tmp transducer to the final one. This allows local overrides.
%.hfst: %.tmp.hfst
# Source ($<) and destination ($@) are both inside .generated/; cp -f
# overwrites an existing final file.
.generated/%.hfst: .generated/%.tmp.hfst
$(AM_V_CP)cp -f $< $@

# Invert the final fst, to enable symmetric yaml tests and easy manual testing:
analyser-desktopspeller-gt-norm.hfst: generator-desktopspeller-gt-norm.hfst
# The inverted generator is piped through $(HFST_PRUNE_ALPHABET) and
# $(HFST_REMOVE_EPSILONS); only the last stage writes the target file.
.generated/analyser-desktopspeller-gt-norm.hfst: .generated/generator-desktopspeller-gt-norm.hfst
$(AM_V_INVERT)$(HFST_INVERT) $(MORE_VERBOSITY) $(HFST_FLAGS) -i $< \
| $(HFST_PRUNE_ALPHABET) $(MORE_VERBOSITY) \
| $(HFST_REMOVE_EPSILONS) $(MORE_VERBOSITY) -o $@
Expand Down
28 changes: 14 additions & 14 deletions am-shared/tools-spellcheckers-fstbased-desktop-hfst-dir-include.am
Original file line number Diff line number Diff line change
Expand Up @@ -247,21 +247,21 @@ easteregg.%.desktop.suggtxt: easteregg.%.desktop.txt
| perl -pe 's/(.)\t(.+)/\2\t\1/' \
> $@

easteregg.%.desktop.errorth.hfst:
# Compile the hard-coded easter-egg string pair (tagged +Err/Orth) into an
# fst. The stem (%) is not used by the recipe.
# $(GENDIR) is order-only ('|'): it only has to exist. As a normal
# prerequisite, a directory's timestamp (which changes whenever a file is
# added to it) would make this target look out of date on every run.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/easteregg.%.desktop.errorth.hfst: | $(GENDIR)
	$(AM_V_GEN)echo \
	'n u v v i D s p e l l e r:D i v v u n s p e l l e r +N +Err/Orth' \
	| $(HFST_STRINGS2FST) -S $(HFST_FLAGS) \
	> $@

easteregg.%.desktop.analyser.hfst:
# Compile the hard-coded easter-egg analysis string into an fst. The stem (%)
# is not used by the recipe.
# $(GENDIR) is order-only ('|'): it only has to exist. As a normal
# prerequisite, a directory's timestamp (which changes whenever a file is
# added to it) would make this target look out of date on every run.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/easteregg.%.desktop.analyser.hfst: | $(GENDIR)
	$(AM_V_GEN)echo \
	'D i v v u n s p e l l e r +N' \
	| $(HFST_STRINGS2FST) -S $(HFST_FLAGS) \
	> $@

# Easter egg string acceptor:
# easteregg.%.desktop.temp.hfst: easteregg.%.desktop.txt
easteregg.%.desktop.hfst: easteregg.%.desktop.txt
# Compile the easter-egg text file into an fst under .generated/.
# $(GENDIR) is order-only ('|') so that only a change to the .txt source, not
# a change of the output directory's timestamp, triggers a rebuild.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/easteregg.%.desktop.hfst: easteregg.%.desktop.txt | $(GENDIR)
	$(AM_V_GEN)$(HFST_STRINGS2FST) $(HFST_FLAGS) -j < $< \
	> $@

Expand Down Expand Up @@ -289,50 +289,50 @@ editdist.%.regex: editdist.%.txt $(initial_letter_deps)
--output-file=$@ \
$(initial_letter_error_model_option)

editdist.%.hfst: editdist.%.regex
# Compile an edit-distance regex into an openfst-tropical fst under
# .generated/.
# $(GENDIR) is order-only ('|') so that only a change to the regex source, not
# a change of the output directory's timestamp, triggers a rebuild.
# A space now precedes the line continuation after '$<' so the file name can
# never be glued to the following option.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/editdist.%.hfst: editdist.%.regex | $(GENDIR)
	$(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $< \
	--format=openfst-tropical \
	-o $@

# Initial string edits, if enabled:
initial_letters.txt.%.hfst: initial_letters.%.txt
# Compile the initial-letter string list into an fst: lines starting with '#'
# and empty lines are dropped ('$$' is make's escape for the shell's '$'),
# and only the first two tab-separated fields are kept.
# $(GENDIR) is order-only ('|') so that only a change to the .txt source, not
# a change of the output directory's timestamp, triggers a rebuild.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/initial_letters.txt.%.hfst: initial_letters.%.txt | $(GENDIR)
	$(AM_V_STR2FST)grep -v '^#' $< | grep -v '^$$' | cut -f1-2 \
	| $(HFST_STRINGS2FST) $(HFST_FLAGS) -j -p \
	-o $@

initial_letters.regex.%.hfst: initial_letters.%.regex
# Compile the initial-letter regex into an openfst-tropical fst under
# .generated/.
# $(GENDIR) is order-only ('|') so that only a change to the regex source, not
# a change of the output directory's timestamp, triggers a rebuild.
# A space now precedes the line continuation after '$<' so the file name can
# never be glued to the following option.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/initial_letters.regex.%.hfst: initial_letters.%.regex | $(GENDIR)
	$(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $< \
	--format=openfst-tropical \
	-o $@

initial_letters.all.%.hfst: $(initial_letter_all_deps)
# Build the combined initial-letter fst; both the inputs and the build command
# come from variables defined elsewhere ($(initial_letter_all_deps),
# $(initial_letter_all_build)).
# $(GENDIR) is order-only ('|'): it must exist, but it neither triggers
# rebuilds through its timestamp nor shows up in $^ / $? should the build
# command use them.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/initial_letters.all.%.hfst: $(initial_letter_all_deps) | $(GENDIR)
	$(initial_letter_all_build)

# Final string edits, if enabled:
final_strings.txt.%.hfst: final_strings.%.txt
# Compile the final-string list into an openfst-tropical fst: lines starting
# with '#' and empty lines are dropped ('$$' is make's escape for the shell's
# '$'), and only the first two tab-separated fields are kept.
# $(GENDIR) is order-only ('|') so that only a change to the .txt source, not
# a change of the output directory's timestamp, triggers a rebuild.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/final_strings.txt.%.hfst: final_strings.%.txt | $(GENDIR)
	$(AM_V_STR2FST)grep -v '^#' $< | grep -v '^$$' | cut -f1-2 \
	| $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \
	--format=openfst-tropical \
	-o $@

final_strings.regex.%.hfst: final_strings.%.regex
# Compile the final-string regex into an openfst-tropical fst under
# .generated/.
# $(GENDIR) is order-only ('|') so that only a change to the regex source, not
# a change of the output directory's timestamp, triggers a rebuild.
# A space now precedes the line continuation after '$<' so the file name can
# never be glued to the following option.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/final_strings.regex.%.hfst: final_strings.%.regex | $(GENDIR)
	$(AM_V_RGX2FST)$(HFST_REGEXP2FST) -S $(HFST_FLAGS) -i $< \
	--format=openfst-tropical \
	-o $@

final_strings.all.%.hfst: $(final_strings_all_deps)
# Build the combined final-strings fst; both the inputs and the build command
# come from variables defined elsewhere ($(final_strings_all_deps),
# $(final_strings_all_build)).
# $(GENDIR) is order-only ('|'): it must exist, but it neither triggers
# rebuilds through its timestamp nor shows up in $^ / $? should the build
# command use them.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/final_strings.all.%.hfst: $(final_strings_all_deps) | $(GENDIR)
	$(final_strings_all_build)

# Helper fst:
anystar.hfst:
# Compile the regex "?*" into a helper fst. It has no real inputs, so it only
# needs to be built once.
# $(GENDIR) is order-only ('|'): as a normal prerequisite, the directory's
# changing timestamp would rebuild this file - and everything that depends on
# it - every time another file is written into the directory.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/anystar.hfst: | $(GENDIR)
	$(AM_V_RGX2FST)echo "?*" | $(HFST_REGEXP2FST) -o $@

# In-word list of strings known to be misspelled:
strings.txt.%.hfst: strings.%.txt anystar.hfst
# Pipeline: drop lines starting with '#' and empty lines ('$$' is make's
# escape for the shell's '$'), keep the first two tab-separated fields,
# compile them to an fst, then concatenate the anystar helper fst in front of
# and after it.
.generated/strings.txt.%.hfst: strings.%.txt .generated/anystar.hfst
$(AM_V_STR2FST)grep -v '^#' $< | grep -v '^$$' | cut -f1-2 \
| $(HFST_STRINGS2FST) $(HFST_FLAGS) -j \
| $(HFST_CONCATENATE) anystar.hfst - \
| $(HFST_CONCATENATE) - anystar.hfst \
| $(HFST_CONCATENATE) .generated/anystar.hfst - \
| $(HFST_CONCATENATE) - .generated/anystar.hfst \
-o $@

# strings regex file:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ corpus_size_limit_command=$(shell \
echo ""; \
fi)

%.hfst: weights/%.att
# Convert a weights/*.att text file into an openfst-tropical fst under
# .generated/.
# The target directory was misspelled '.genrated', so this rule could never
# produce the .generated/*.hfst files that other rules ask for.
# $(GENDIR) is order-only ('|') so that only a change to the .att source, not
# a change of the output directory's timestamp, triggers a rebuild.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/%.hfst: weights/%.att | $(GENDIR)
	$(AM_V_GEN)$(HFST_TXT2FST) $(HFST_FLAGS) -f openfst-tropical $< -o $@

# sort the clean corpus:
Expand Down Expand Up @@ -74,13 +74,13 @@ corpus_size_limit_command=$(shell \
> $@

# build an fst of surface forms with tropical weights for each word form:
%.surfs.hfst: %.tropical.txt
# Compile the weighted surface-form list into an openfst-tropical fst under
# .generated/. The input is fed on stdin by redirection instead of 'cat |'.
# $(GENDIR) is order-only ('|') so that only a change to the .txt source, not
# a change of the output directory's timestamp, triggers a rebuild.
# NOTE(review): assumes $(GENDIR) names the .generated directory - confirm.
.generated/%.surfs.hfst: %.tropical.txt | $(GENDIR)
	$(AM_V_STR2FST)$(HFST_STRINGS2FST) -j $(HFST_FLAGS) -f openfst-tropical -o $@ < $<

# Build an fst with surface form weights that also handles compounds:
%.surfweights.hfst: %.surfs.hfst \
word-boundary.hfst
# NOTE(review): .generated/word-boundary.hfst is listed as a prerequisite, but
# the active recipe only minimizes $< (the surfs fst); the word-boundary fst
# appears to be needed only by the commented-out repetition step that follows.
.generated/%.surfweights.hfst: .generated/%.surfs.hfst \
.generated/word-boundary.hfst
$(AM_V_HMINIM)$(HFST_MINIMIZE) $(HFST_FLAGS) -i $< -o $@
# Commented out the repetition build step - we only want to promote the compounds
# actually found in the corpus.
Expand All @@ -90,7 +90,7 @@ corpus_size_limit_command=$(shell \
# | $(HFST_MINIMIZE) -o $@

# Remove word forms covered by the corpus:
unitweighted_limited.hfst: generator-%-gt-norm-freq_weighted.hfst \
.generated/unitweighted_limited.hfst: .generated/generator-%-gt-norm-freq_weighted.hfst \
$(UW_SPELLER_SRC)
$(AM_V_FST2FST)$(HFST_FST2FST) --format=foma -i $(UW_SPELLER_SRC) -o $@.tmpfoma.hfst
$(AM_V_FST2FST)$(HFST_FST2FST) --format=foma -i $< -o $<.tmpfoma.hfst
Expand All @@ -103,8 +103,8 @@ unitweighted_limited.hfst: generator-%-gt-norm-freq_weighted.hfst \

# Add the unit weight to each unit in compounds, both dynamic and lexical:
#unitweighted.hfst: unitweighted_limited.hfst
unitweighted.hfst: $(UW_SPELLER_SRC) \
$(UNITWEIGHT)
.generated/unitweighted.hfst: $(UW_SPELLER_SRC) \
$(UNITWEIGHT) $(GENDIR)
$(AM_V_REWEIGHT)$(HFST_FST2FST) --format=openfst-tropical -i $< \
| $(HFST_REWEIGHT) $(HFST_FLAGS) \
-e -a $$(cat $(UNITWEIGHT)) \
Expand Down

0 comments on commit 98d95b7

Please sign in to comment.