Skip to content

Commit

Permalink
count restricted if can plus gec separately
Browse files Browse the repository at this point in the history
  • Loading branch information
flammie committed Jan 26, 2024
1 parent 31bc5e1 commit f7da699
Showing 1 changed file with 39 additions and 12 deletions.
51 changes: 39 additions & 12 deletions scripts/corpus-stats.bash
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,43 @@ fi


#######################################
# Build (or reuse cached) text, token and frequency files for one corpus
# directory, then print its size and run the analyser coverage evaluation.
# Globals:   GTLANGS (read), GTHOME (read)
# Arguments: $1 - label: printed as a heading and used as output file prefix
#            $2 - language code: passed to ccat -l, selects lang-$2 tools
#            $3 - directory holding the converted corpus
# Outputs:   heading, wc counts and freq-evals.py output on stdout;
#            creates $1.text, $1.tokens and $1.freqs in the current
#            directory and hands $1.missinglist to freq-evals.py via -m
# Returns:   0 when the corpus is skipped, else the status of freq-evals.py
#######################################
corpus_stats() {
    local label=$1
    local lang=$2
    local corpus=$3
    local tokeniser="$GTLANGS/lang-$lang/tools/tokenisers/tokeniser-disamb-gt-desc.pmhfst"
    local analyser="$GTLANGS/lang-$lang/src/fst/analyser-gt-desc.hfstol"

    echo "$label"
    # Each intermediate file doubles as a cache: delete it to force a rebuild.
    if ! test -f "$label.text" ; then
        # Not every corpus variant is checked out everywhere (e.g. the
        # restricted one). Skip instead of caching an empty .text file
        # that later runs would mistake for real data.
        if ! test -d "$corpus" ; then
            printf 'skipping %s: no corpus directory %s\n' "$label" "$corpus" >&2
            return 0
        fi
        ccat -l "$lang" "$corpus" > "$label.text"
    fi
    if ! test -f "$label.tokens" ; then
        hfst-tokenise "$tokeniser" < "$label.text" > "$label.tokens"
    fi
    if ! test -f "$label.freqs" ; then
        # Most frequent token first.
        sort < "$label.tokens" | uniq -c | sort -nr > "$label.freqs"
    fi
    printf "paragraphs tokens characters\n"
    # -m counts characters; plain wc would report bytes in the third column,
    # which does not match the header for non-ASCII text.
    wc -lwm "$label.text"
    python "$GTHOME/scripts/freq-evals.py" -a "$analyser" -i "$label.freqs" \
        -m "$label.missinglist"
}

# One report per language code given on the command line.
for ll in "$@" ; do
    # Freely available corpus first, then the restricted one
    # (corpus-$ll-x-closed), counted separately.
    for copyright in "" -x-closed ; do
        corpus_stats "$ll$copyright" "$ll" \
            "$GTLANGS/corpus-$ll$copyright/converted/"
    done
    # Grammar-error-correction subcorpora live inside the open corpus.
    for gecs in goldstandard correct-no-gs ; do
        corpus_stats "$ll$gecs" "$ll" "$GTLANGS/corpus-$ll/$gecs/converted/"
    done
done

0 comments on commit f7da699

Please sign in to comment.