-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
script for collecting gec candidates with checker
- Loading branch information
Showing
1 changed file
with
76 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
#!/bin/bash
# Extract candidates for a grammar error corpus based on positive matches in
# the current grammar checker:
# https://giellalt.github.io/proof/gramcheck/extracting-precision-sentences.html
#
# Usage: SCRIPT LANGCODE [CORPUS-DIR [VARIANT]]
#
# Reads:  $GTLANGS/lang-LANGCODE (tokeniser and .zcheck archive), the corpus
#         directory, and optionally ./taglist.txt (whitespace-separated tags).
# Writes: candidates-LANGCODE.json, then either one candidates-TAG.yaml per
#         tag listed in taglist.txt, or a single candidates-LANGCODE.yaml.
#
# Strict mode (set -e / pipefail) is deliberately not enabled: grep exits 1
# when it selects no lines, which is a legitimate outcome for this script.
#set -x

# Print each argument as one line on stderr and abort with status 1.
die() {
  printf '%s\n' "$@" >&2
  exit 1
}

# Print the usage text on stderr and abort with status 1.
usage() {
  {
    echo "Usage: $0 LANGCODE [CORPUS-DIR [VARIANT]]"
    echo
    echo "LANGCODE should be three-letter name of langs- and corpus- repos."
    echo "if CORPUS is not given, is extracted from corpus-LANGCODE."
    echo "if VARIANT is not given, divvun-checker uses the dafault variant."
    echo
    echo "Environment variable GTLANGS must point to root of giellalt github"
  } >&2
  exit 1
}

# Write a YAML test file from checker output lines read on stdin.
#   $1 - output file (overwritten)
#   $2 - language code used to build the "Variant:" entry
# Each test string is the text between the last two double quotes of an input
# line (presumably the checked sentence -- confirm against divvun-checker's
# JSON layout). Escaped quotes inside that text are not handled.
write_yaml_tests() {
  local outfile=$1
  local langcode=$2
  {
    printf -- "---\nConfig:\n Spec: ../pipespec.xml\n"
    printf " Variant: %sgram-dev\n\n" "$langcode"
    printf "Tests:\n"
    rev | cut -d '"' -f 2 | rev | sed -e 's/^/ - "/' -e 's/$/"/'
  } > "$outfile"
}

if [[ $# -lt 1 ]]; then
  usage
fi

# Without GTLANGS every path below would silently resolve under "/".
if [[ -z "${GTLANGS:-}" ]]; then
  die "Environment variable GTLANGS must point to root of giellalt github"
fi

LANGCODE=$1
shift
LANGDIR=$GTLANGS/lang-$LANGCODE/
if [[ ! -d "$LANGDIR" ]]; then
  die "missing $LANGDIR"
fi

if [[ $# -ge 1 ]]; then
  CORPUSDIR=$1
  shift
else
  CORPUSDIR=$GTLANGS/corpus-$LANGCODE/converted/
fi
if [[ ! -d "$CORPUSDIR" ]]; then
  die "missing $CORPUSDIR"
fi

# Optional checker variant, kept as an argument array so that it expands to
# either nothing or exactly two words ("-n" and the variant name).
variant_args=()
if [[ $# -ge 1 ]]; then
  variant_args=(-n "$1")
  shift
fi

tokeniser="$LANGDIR/tools/tokenisers/tokeniser-disamb-gt-desc.pmhfst"
zcheck="$LANGDIR/tools/grammarcheckers/$LANGCODE.zcheck"

if [[ ! -f "$tokeniser" ]]; then
  die "missing $tokeniser" \
    "$LANGDIR must be built with --enable-tokenisers"
fi
if [[ ! -f "$zcheck" ]]; then
  die "missing $zcheck" \
    "$LANGDIR must be built with --enable-grammarchecker"
fi

# Fail early instead of producing an empty candidates file.
for tool in ccat hfst-tokenise divvun-checker; do
  command -v "$tool" > /dev/null || die "missing required tool: $tool"
done

# sed step 1: " . " / " ? " / " ! " -> punctuation, space, line break, so the
#             checker receives one sentence per line. The newline is inserted
#             directly (no placeholder character), so the input text is never
#             clobbered; "\n" in the replacement requires GNU sed.
# sed step 2: drop the space that precedes ":", ";" and ",".
# grep:       discard checker output lines that contain an empty "errs" array.
ccat -l "$LANGCODE" "$CORPUSDIR" \
  | hfst-tokenise -i "$tokeniser" \
  | sed -e 's/ \([.?!] \)/\1\n/g' -e 's/ \([:;,]\)/\1/g' \
  | divvun-checker -a "$zcheck" "${variant_args[@]}" \
  | grep -F -v '"errs":[]' > "candidates-$LANGCODE.json"
echo "intermediate results saved in candidates-$LANGCODE.json"

if [[ -f taglist.txt ]]; then
  # Tags are whitespace-separated. read -a splits on whitespace without glob
  # expansion; with -d '' it consumes the whole file and returns non-zero at
  # end of input, which is expected here.
  tags=()
  read -r -d '' -a tags < taglist.txt || true
  for t in "${tags[@]}"; do
    grep -F -- "$t" "candidates-$LANGCODE.json" \
      | write_yaml_tests "candidates-$t.yaml" "$LANGCODE"
    echo "yaml test candidates for $t saved in candidates-$t.yaml"
  done
else
  write_yaml_tests "candidates-$LANGCODE.yaml" "$LANGCODE" \
    < "candidates-$LANGCODE.json"
  echo "yaml test candidates saved in candidates-$LANGCODE.yaml"
fi