-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
findaffix.X
executable file
·373 lines (369 loc) · 11.9 KB
/
findaffix.X
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
!!POUNDBANG!!
#
# $Id: findaffix.X,v 1.23 2015-02-08 00:35:41-08 geoff Exp $
#
# Copyright 1992, 1993, 1999, 2001, 2005, Geoff Kuenning, Claremont, CA
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. All modifications to the source code must be clearly marked as
# such. Binary redistributions based on modified source code
# must be clearly marked as modified versions in the documentation
# and/or other materials provided with the distribution.
# 4. The code that causes the 'ispell -v' command to display a prominent
# link to the official ispell Web site may not be removed.
# 5. The name of Geoff Kuenning may not be used to endorse or promote
# products derived from this software without specific prior
# written permission.
#
# THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
#
# Find possible affixes for use with ispell
#
# Usage:
#
# findaffix [-p | -s] [-f] [-c] [-m min] [-M max] [-e elim] [-l low] \
# [-t tabchar] [files]
#
# Each common prefix (-p) or suffix (-s, default) is presented, along
# with statistics to indicate how useful such an affix might be in
# reducing the size of the input file. Only those affixes which
# produce a legal root (one found in the original input) are reported.
#
# If the "-c" option is not given, the output lines are in the
# following format:
#
# strip/add/count/bytes
#
# where "strip" is the string that should be stripped from a root
# word before adding the affix, "add" is the affix to be added, "count"
# is a count of the number of times that this "strip/add" combination
# appears, and "bytes" is an estimate of the number of bytes that
# will be saved in the raw dictionary file if this combination is
# added to the affix file. The field separator in the output will
# normally be the tab character specified by the "-t" switch; the
# default is a slash ("/").
#
# If the "-c" ("clean output") option is given, the appearance of
# the output is made cleaner by changing it to:
#
# -strip+add<tab>count<tab>bytes
#
# where "strip," "add," "count," and "bytes" are as before, and "<tab>"
# represents the ASCII tab character.
#
# The method used to generate possible affixes will also generate
# longer affixes which have common headers or trailers. For example,
# the two words "moth" and "mother" will generate not only the obvious
# substition "+er" but also "-h+her" and "-th+ther" (and possibly
# even longer ones, depending on the value of "min"). To prevent
# cluttering the output with such affixes, any affix pair that shares
# a common header (or, for prefixes, trailer) string longer than
# "elim" characters (default 1) will be suppressed. You may want to
# set "elim" to a value greater than 1 if your language has string
# characters; usually the need for this parameter will become obvious
# when you examine the output of your findaffix run.
#
# Normally, the output is sorted on the "bytes" field. If the "-f"
# flag is given, the output is sorted according to the "count" field.
#
# No affix longer than "max" characters (default 8) will be reported.
# Smaller values of "max" will make the script run faster.
#
# Affixes which appear fewer than "low" times (default 10) are
# suppressed. This significantly reduces the size of the output file.
#
# Affixes which generate stems shorter than "min" characters (default 3)
# are suppressed. (A stem is the word after the "strip" string has
# been removed, and before the "add" string has been added.) This
# reduces both the running time and the size of the output file. "Min"
# should only be set to 1 if you have a *lot* of free time and disk
# space.
#
# The script requires a non-blank field-separator character for internal
# use. Normally, this character is a slash ("/"), but if the slash
# appears as a character in the input word list, a different character
# can be specified with the "-t" switch.
#
# If the input files are ispell dictionaries, they should be expanded
# before being fed to this script.
#
# If the input files contains characters other than [A-Za-z], they
# should be translated to lowercase before being fed to this script.
#
# $Log: findaffix.X,v $
# Revision 1.23 2015-02-08 00:35:41-08 geoff
# Be a bit more paranoid about creating temporary files.
#
# Revision 1.22 2005/04/27 01:18:34 geoff
# Work around idiotic POSIX incompatibilities in sort. Add secure
# temp-file handling.
#
# Revision 1.21 2005/04/14 14:39:33 geoff
# Use /tmp as the default temp directory
#
# Revision 1.20 2005/04/14 14:38:23 geoff
# Update license. Protect against modernized (i.e., incompatible) and
# internationalized sort commands.
#
# Revision 1.19 2001/09/06 00:30:28 geoff
# Many changes from Eli Zaretskii to support DJGPP compilation.
#
# Revision 1.18 2001/07/25 21:51:46 geoff
# Minor license update.
#
# Revision 1.17 2001/07/23 20:24:03 geoff
# Update the copyright and the license.
#
# Revision 1.16 1999/01/07 01:22:55 geoff
# Update the copyright.
#
# Revision 1.15 1994/01/25 07:11:29 geoff
# Get rid of all old RCS log lines in preparation for the 3.1 release.
#
#
# In one of the most incredibly stupid decisions of all time, some
# genius decided to break backwards compatibility by "deprecating" the
# old-style sort switches even though it was trivial to recognize both
# styles. The result is that that thousands of people (like me) will
# have to rewrite shell scripts to tolerate that stupidity. (It's not
# that the new syntax is bad--it's definitely easier to understand.
# But that doesn't excuse breaking compatibility.)
#
# Detect whether sort accepts old-style switches.
if sort +0 -1 /dev/null >/dev/null 2>&1
then
CRETIN_SORT=false
else
CRETIN_SORT=true
fi
#
# The following is necessary so that some internationalized versions of
# sort(1) don't confuse things by sorting into a nonstandard order.
#
LANG=C
LOCALE=C
LC_ALL=C
LC_COLLATE=C
LC_CTYPE=C
export LANG LOCALE LC_COLLATE LC_CTYPE
#
# The following aren't strictly necessary, but I've been made paranoid
# by problems with the stuff above. It can't hurt to set them to a
# sensible value.
LC_MESSAGES=C
LC_MONETARY=C
LC_NUMERIC=C
LC_TIME=C
export LC_MESSAGES LC_MONETARY LC_NUMERIC LC_TIME
TDIR=${TMPDIR-/tmp}
TEMPDIR=`mktemp -d ${TDIR}/faffXXXXXXXXXX 2>/dev/null` || (umask 077; mkdir "$TDIR/faff$$" || (echo "Can't create temp directory: ${TDIR}/faff$$" 1>&2; exit 1); TEMPDIR="$TDIR/faff$$")
TMP=${TEMPDIR}/faff.
SORTTMP="-T ${TDIR}" # !!SORTTMP!!
USAGE='Usage: findaffix [-p | -s] [-f] [-c] [-e elim] [-m min] [-M max] [-l low] [-t tabch] [files]'
LOOP='
i = len - maxlim + 1
if (i < minstem + 1)
i = minstem + 1
for ( ; i <= len; i++)
print substr ($0, 1, i - 1) tabch substr ($0, i) tabch len
print $0 tabch tabch len'
ELIM='$1!=$2 \
{
if (substr ($1, 1, elimlen) != substr ($2, 1, elimlen))
print
}'
maxlim=8
minstem=3
elimlen=1
lowcount=10
cleanout=no
if $CRETIN_SORT
then
finalsortopts='-k 4rn,4 -k 3rn,3 -k 2,2 -k 1,1'
else
finalsortopts='+3rn -4 +2rn -3 +1 -2 +0 -1'
fi
tabch=/
while :
do
case "$1" in
-p)
LOOP='
lim = len - minstem
if (lim > maxlim)
lim = maxlim
for (i = 1; i <= lim; i++)
print substr ($0, i + 1) tabch substr ($0, 1, i) tabch len
print $0 tabch tabch len'
ELIM='$1!=$2 \
{
if (substr ($1, length ($1), elimlen) \
!= substr ($2, length ($2), elimlen))
print
}'
shift
;;
-s)
shift
;;
-f)
if $CRETIN_SORT
then
finalsortopts='-k 3rn,3 -k 4rn,4 -k 2,2 -k 1,1'
else
finalsortopts='+2rn -3 +3rn -4 +1 -2 +0 -1'
fi
shift
;;
-c)
cleanout=yes
shift
;;
-e)
elimlen=$2
shift; shift
;;
-m)
minstem=$2
shift; shift
;;
-M)
maxlim=$2
shift; shift
;;
-l)
lowcount=$2
shift; shift
;;
-t)
tabch="$2"
shift; shift
;;
-*)
echo "$USAGE" 1>&2
exit 1
;;
*)
break
;;
esac
done
trap "rm -rf $TEMPDIR; exit 1" 1 2 15
trap "rm -rf $TEMPDIR; exit 0" 13
#
# We are ready to do the work. First, we collect all input, translate it
# to lowercase, sort it (dropping duplications), and save it for later.
#
if [ $# -ne 0 ]
then
cat "$@" | tr '[A-Z]' '[a-z]'
else
tr '[A-Z]' '[a-z]'
fi \
| sort -u $SORTTMP > ${TMP}a
#
# Now the monstrous pipeline. The awk command produces several lines for
# each input word. Each line contains a possible stem (first field),
# a possible affix, and the length of the original word. The loop which
# does this was placed into the LOOP variable by the code above (q.v.).
#
# The first sort puts this output into an order appropriate for feeding
# to 'join'. The join command then combines stems and affixes, and for
# each puts out an affix to strip, an affix to add, and the length of
# the word before and after modification.
#
# From here on out the job is relatively easy. The second 'awk' gets rid
# of lines that have the same strip and add affixes, and also eliminates
# lines where the strip and add affix have a common leading (for suffixes)
# or trailing (for prefixes) substring, or where the strip affix is longer
# than the add affix (this is all done by the $ELIM variable, which is also
# set up by the code above. The second sort collects identical affixes;
# the third 'awk' functions like 'uniq -c', replacing duplicate affixes
# with a count and summing the estimate of bytes saved. It also eliminates
# any affixes which appear less frequently than the minimum ("lowcount").
# Finally, the third sort ($finalsortopts) rearranges the list in the chosen
# sort order.
#
if $CRETIN_SORT
then
sortopts1='-k 1,1 -k 2'
sortopts2='-k 2,2 -k 1,1'
else
sortopts1='+0 -1 +1'
sortopts2='+1 -2 +0 -1'
fi
awk "BEGIN{minstem=$minstem; maxlim=$maxlim; tabch="'"'"$tabch"'"}
{
len = length ($0)
if (len < 2)
next
'"$LOOP"'
}' < ${TMP}a \
| sort "-t$tabch" $sortopts1 $SORTTMP -o ${TMP}a
join "-t$tabch" -o 1.2 2.2 2.3 ${TMP}a ${TMP}a \
| awk "-F$tabch" "BEGIN{elimlen=$elimlen}$ELIM" \
| sort "-t$tabch" $sortopts2 $SORTTMP \
| awk "-F$tabch" 'BEGIN{tabch="'"$tabch"'"; lowcount='"$lowcount"'}
{
if ($1 == last1 && $2 == last2)
{
count++
totchars += $3
}
else
{
if ((last1 != "" || last2 != "") && count >= lowcount)
print last1 tabch last2 tabch count tabch totchars
count = 1
last1 = $1
last2 = $2
totchars = $3
}
}
END {
if ((last1 != "" || last2 != "") && count >= lowcount)
print last1 tabch last2 tabch count tabch totchars
}' \
| sort "-t$tabch" $finalsortopts $SORTTMP \
| if [ "$cleanout" = "yes" ]
then
case "$tabch" in
/)
sedsub=/
sedsep=';'
;;
.|\*|\[|\^|\$|\\)
sedsub="\\$tabch"
sedsep=/
;;
*)
sedsub="$tabch"
sedsep=/
;;
esac
exec sed -e "s$sedsep$sedsub$sedsep ${sedsep}g" \
-e 's/ /+/' -e 's/^/-/' \
-e 's/^-+/+/' -e 's/+ / /'
else
exec cat
fi
rm -rf $TEMPDIR