-
Notifications
You must be signed in to change notification settings - Fork 3
/
template.awk
475 lines (406 loc) · 16 KB
/
template.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
#!/usr/bin/awk -f
# This AWK library provides 5 functions for working with UTF-8 strings:
#
# - wcscolumns(string): Returns the number of columns needed to display a
# string, but unlike "wcswidth" and "wcwidth" which are written to function
# identically to their POSIX counterparts, this function always returns a
# value greater than or equal to 0.
# - wcsexpand(string, tab_stop): Expand tabs to spaces in a wide
# character-aware manner.
# - wcstruncate(string, columns): Returns a string truncated to span a limited
# number of columns.
# - wcswidth(string): Returns the number of columns needed to display a string.
# - wcwidth(character): Returns the number of columns needed to display a
# character.
#
# More detailed explanations of how these functions work can be found in
# comments immediately preceding their definitions.
#
# To minimize the likelihood of name conflicts, all global variables used by
# this code begin with "WCWIDTH_...", all internal functions begin with
# "_wcwidth_...", and all arguments / function-local variables that are not
# arguments begin with a "_". The library will work regardless of when it's
# loaded relative to the scripts that use it, but one "reference to
# uninitialized variable" warning will be generated by GAWK's linter if the
# library is loaded after its caller AND if the caller uses a library function
# in a "BEGIN" block.
#
# Author: Eric Pruitt (https://www.codevat.com)
# License: 2-Clause BSD (http://opensource.org/licenses/BSD-2-Clause)
# Project Page: https://github.com/ericpruitt/wcwidth.awk
# ---
# Determine the number of columns needed to display a string. This function
# differs from the "wcswidth" function in its handling of non-printable
# characters; instead of making the function abort and immediately return -1,
# non-printable ASCII characters are ignored while all others are treated as
# having a width of 1 because they will typically be rendered as a
# single-column ".notdef" glyph
# (https://www.microsoft.com/typography/otspec/recom.htm).
#
# While the global WCWIDTH_POSIX_MODE is true (it is switched on by "wcswidth"
# for the duration of its call), the first character without a known width
# makes this function stop and return -1 instead.
#
# RSTART and RLENGTH are saved before the per-character loop and restored
# afterwards, so callers do not observe the "match" calls made in that loop.
#
# Arguments:
# - _str: A string of any length. In AWK interpreters that are not multi-byte
#   safe, this argument is interpreted as a UTF-8 encoded string.
#
# Returns: The number of columns needed to display the string. Outside of
# POSIX mode this value will always be greater than or equal to 0.
#
function wcscolumns(_str,    _length, _max, _min, _offset, _rl, _rs, _total,
  _wchar, _width) {
    _total = 0

    if (!WCWIDTH_INITIALIZED) {
        _wcwidth_initialize_library()
    }

    # Emptiness is always tested with `_str == ""` rather than `!_str`: a
    # value that looks numeric (for example a field containing "0") is false
    # in a boolean context even though it is one column wide.
    if (WCWIDTH_MULTIBYTE_SAFE) {
        # Optimization for Latin and whatever else I could fit on one line.
        # Every character in this class is one column wide, so the whole
        # string is counted up front and the narrow characters are deleted.
        # U+00A0 (no-break space), the start of the second range, is written
        # as an octal sequence so that it cannot be mistaken for, or silently
        # converted into, an ordinary space.
        _total = length(_str)
        gsub(/[ -~\302\240-¬®-˿Ͱ-ͷͺ-Ϳ΄-ΊΌΎ-ΡΣ-҂Ҋ-ԯԱ-Ֆՙ-՟ա-և։֊־׀׃׆א-תװ-״]+/, "", _str)

        if (_str == "") {
            return _total
        }

        # Optimization for common wide CJK characters. Based on data from
        # http://corpus.leeds.ac.uk/list.html, this covers ~95% of all
        # characters used on Chinese and Japanese sites. U+3099 is a combining
        # character, so it has been replaced with an octal sequence to keep
        # terminal screens from getting munged. The fullwidth forms U+FF01
        # through U+FF60 are written as octal sequences as well so the range
        # stays identical to the "\357(\274...|\275...)" alternative of
        # WCWIDTH_WIDE_CJK_RUNES_REGEX.
        #
        # What is left in "_str" was counted as one column each above; take
        # that back, then add two columns per wide character removed here.
        _length = length(_str)
        _total -= _length
        gsub(/[가-힣一-鿕\357\274\201-\357\275\240ぁ-ゖ\343\202\231-ヿ]+/, "", _str)
        _total += (_length - length(_str)) * 2

        # The characters that remain are examined one at a time below.
        _offset = 1
    }

    if (_str == "") {
        return _total
    }

    _rs = RSTART
    _rl = RLENGTH

    while (1) {
        if (!WCWIDTH_MULTIBYTE_SAFE) {
            # Optimization for ASCII text: count everything, strip the leading
            # run of printable ASCII, then un-count whatever was not stripped.
            _total += length(_str)
            sub(/^[\040-\176]+/, "", _str)

            if (_str == "") {
                break
            }

            _total -= length(_str)

            # Optimization for a subset of the "Latin and whatever" characters
            # mentioned above. Experimenting showed that performance in MAWK
            # eventually begins drop off rapidly for the French corpus as the
            # regex complexity increases.
            #
            # The matched text consists of 2-byte sequences interleaved with
            # printable ASCII; deleting the non-ASCII bytes and halving their
            # number gives the count of 2-byte characters, and what is left is
            # plain ASCII.
            if (match(_str, /^([\303-\313][\200-\277][ -~]*)+/)) {
                _wchar = substr(_str, RSTART, RLENGTH)
                _total += gsub(/[^ -~]/, "", _wchar) / 2 + length(_wchar)

                if (RLENGTH == length(_str)) {
                    break
                }

                _str = substr(_str, RSTART + RLENGTH)
            }

            # Optimization for common wide CJK characters. The regular
            # expression used here covers the exact same range as the regex for
            # multi-byte safe interpreters. Each wide character is 3 bytes long
            # and 2 columns wide, hence the "/ 3 * 2".
            if (match(_str, WCWIDTH_WIDE_CJK_RUNES_REGEX)) {
                _wchar = substr(_str, RSTART, RLENGTH)
                _total += gsub(/[^ -~]/, "", _wchar) / 3 * 2 + length(_wchar)

                if (RLENGTH == length(_str)) {
                    break
                }

                _str = substr(_str, RSTART + RLENGTH)
            }

            # No shortcut applied: peel a single code point off the front.
            match(_str, WCWIDTH_UTF8_ANCHORED_RUNE_REGEX)
            _wchar = substr(_str, RSTART, RLENGTH)
            _str = RLENGTH == length(_str) ? "" : substr(_str, RLENGTH + 1)
        } else if (_offset > length(_str)) {
            break
        } else {
            _wchar = substr(_str, _offset++, 1)
        }

        if (_wchar in WCWIDTH_CACHE) {
            _width = WCWIDTH_CACHE[_wchar]
        } else if (!WCWIDTH_TABLE_LENGTH) {
            # The width table is built on first use; the helper also returns
            # the width of the character that triggered the build.
            _width = _wcwidth_unpack_data(_wchar)
        } else {
            # Do a binary search to find the width of the character. The
            # first probe is made at the position where the previous search
            # ended (WCWIDTH_SEARCH_CURSOR); -1 means "not in any range".
            _min = 0
            _max = WCWIDTH_TABLE_LENGTH - 1
            _width = -1

            do {
                if (_wchar < WCWIDTH_RANGE_START[WCWIDTH_SEARCH_CURSOR]) {
                    _max = WCWIDTH_SEARCH_CURSOR - 1
                } else if (_wchar > WCWIDTH_RANGE_END[WCWIDTH_SEARCH_CURSOR]) {
                    _min = WCWIDTH_SEARCH_CURSOR + 1
                } else {
                    _width = WCWIDTH_RANGE_WIDTH[WCWIDTH_SEARCH_CURSOR]
                    break
                }
                WCWIDTH_SEARCH_CURSOR = int((_min + _max) / 2)
            } while (_min <= _max)

            WCWIDTH_CACHE[_wchar] = _width
        }

        if (_width != -1) {
            _total += _width
        } else if (WCWIDTH_POSIX_MODE) {
            _total = -1
            break
        } else {
            # Ignore non-printable ASCII characters: a value of length 1
            # counts only when it sorts above "\177", while anything longer
            # (a multi-byte sequence) always counts as one column.
            _total += length(_wchar) == 1 ? _wchar > "\177" : 1
        }
    }

    RLENGTH = _rl
    RSTART = _rs
    return _total
}
# Replace every tab in a string with the number of spaces needed to reach the
# next tab stop, measuring text in display columns rather than characters.
# Column counting starts at zero, i.e. the first character of the string is
# assumed to be the first character of the line or the first character
# following a tab.
#
# Arguments:
# - _str: The string to expand.
# - _tab_stop: The maximum width of tabs. This must be an integer greater than
#   zero.
#
# Returns: A string with all tabs replaced with spaces.
#
function wcsexpand(_str, _tab_stop,    _column, _mark, _tab_index, _tab_width)
{
    _column = 0

    # "_mark" is the 1-based position of the first character that has not yet
    # been measured.
    _mark = 1

    # An alternate implementation of this function used split(..., ..., "\t"),
    # but that approach was generally slower.
    while ((_tab_index = index(_str, "\t")) != 0) {
        # Measure the text between the previous tab position and this tab.
        _column += wcscolumns(substr(_str, _mark, _tab_index - _mark))

        _tab_width = _tab_stop - (_column % _tab_stop)
        sub(/\t/, sprintf("%*s", _tab_width, ""), _str)

        # The padding just inserted begins where the tab used to be, so it is
        # measured together with the text that precedes the next tab.
        _mark = _tab_index
    }

    return _str
}
# Truncate a string so that it spans a limited number of columns.
#
# RSTART and RLENGTH are saved before the per-character loop and restored
# afterwards.
#
# Arguments:
# - _str: A string of any length. In AWK interpreters that are not multi-byte
#   safe, this argument is interpreted as a UTF-8 encoded string.
# - _columns: Maximum number of columns the resulting text may span. Values
#   less than or equal to zero produce an empty string.
#
# Returns: "_str" truncated as needed.
#
function wcstruncate(_str, _columns,    _result, _rl, _rs, _wchar, _width)
{
    # Like "wcscolumns", this function must work when it is called before the
    # library's own BEGIN block has run; the globals consulted below are only
    # meaningful after initialization.
    if (!WCWIDTH_INITIALIZED) {
        _wcwidth_initialize_library()
    }

    _columns = 0 + _columns

    # Nothing fits into zero or fewer columns. Returning here also keeps a
    # negative count from being spliced into the interval expression below.
    if (_columns <= 0) {
        return ""
    }

    # Use "substr" for strings composed of 1-column characters. U+00A0
    # (no-break space), the start of the second range, is written as an octal
    # sequence so that it cannot be mistaken for, or silently converted into,
    # an ordinary space.
    if (_str !~ /[^\040-\176]/ || (WCWIDTH_MULTIBYTE_SAFE &&
      _str !~ /[^ -~\302\240-¬®-˿Ͱ-ͷͺ-Ϳ΄-ΊΌΎ-ΡΣ-҂Ҋ-ԯԱ-Ֆՙ-՟ա-և։֊־׀׃׆א-תװ-״]/)) {
        return length(_str) > _columns ? substr(_str, 1, _columns) : _str
    }

    # The individual widths of characters need not be checked when
    # `(length(_str) * 2) <= _columns` because a character may only span 2
    # columns at most. Interpreters that are not multi-byte safe cannot count
    # characters with "length", so the same test is expressed as "at most
    # _columns / 2 code points" using an interval expression when those are
    # supported.
    if ((WCWIDTH_MULTIBYTE_SAFE && (length(_str) * 2) <= _columns) ||
        (!WCWIDTH_MULTIBYTE_SAFE && WCWIDTH_INTERVAL_EXPRESSIONS_SUPPORTED &&
         _str ~ ("^" WCWIDTH_UTF8_RUNE_REGEX "{," int(_columns / 2) "}$"))) {
        return _str
    }

    _rl = RLENGTH
    _rs = RSTART
    _result = ""

    # `_str != ""` is used instead of a bare `_str` because a value that looks
    # numeric (e.g. an input record consisting of a tab followed by "0") is
    # false in a boolean context even though it is not empty.
    while (_columns > 0 && _str != "") {
        if (_str ~ /^[\040-\176]/) {
            # Printable ASCII is always one column wide.
            _wchar = substr(_str, 1, 1)
            _str = substr(_str, 2)
            _width = 1
        } else if (WCWIDTH_MULTIBYTE_SAFE) {
            _wchar = substr(_str, 1, 1)
            _str = substr(_str, 2)
            _width = wcscolumns(_wchar)
        } else if (match(_str, WCWIDTH_UTF8_RUNE_REGEX)) {
            _wchar = substr(_str, RSTART, RLENGTH)
            _str = substr(_str, RSTART + RLENGTH)
            _width = wcscolumns(_wchar)
        } else {
            # The rune regex matched nothing. Consume one unit anyway so the
            # loop always makes progress and never reuses the previous
            # iteration's "_wchar" and "_width".
            _wchar = substr(_str, 1, 1)
            _str = substr(_str, 2)
            _width = wcscolumns(_wchar)
        }

        # A character that would overshoot the limit drives "_columns" below
        # zero: it is not appended and the loop condition then ends the loop.
        _columns -= _width
        if (_columns >= 0) {
            _result = _result _wchar
        }
    }

    RLENGTH = _rl
    RSTART = _rs
    return _result
}
# A reimplementation of the POSIX function of the same name to determine the
# number of columns needed to display a string. The measuring itself is done
# by "wcscolumns"; this wrapper only switches it into POSIX mode for the
# duration of the call.
#
# Arguments:
# - _str: A string of any length. In AWK interpreters that are not multi-byte
#   safe, this argument is interpreted as a UTF-8 encoded string.
#
# Returns: The number of columns needed to display the string if all of its
# characters are printable, and -1 if any of them is not.
#
function wcswidth(_str,    _width) {
    # In POSIX mode "wcscolumns" stops at the first character without a known
    # width and reports -1.
    WCWIDTH_POSIX_MODE = 1
    _width = wcscolumns(_str)

    # POSIX mode is always switched back off, whatever its previous value was.
    WCWIDTH_POSIX_MODE = 0

    return _width
}
# A reimplementation of the POSIX function of the same name to determine the
# number of columns needed to display a single character.
#
# RSTART and RLENGTH are preserved across the call.
#
# Arguments:
# - _wchar: A single character. In AWK interpreters that are not multi-byte
#   safe, this argument may consist of multiple characters that together
#   represent a single UTF-8 encoded code point.
#
# Returns: The number of columns needed to display the character if it is
# printable and -1 if it is not. If the argument does not contain exactly one
# character (or UTF-8 code point), -1 is returned.
#
function wcwidth(_wchar,    _result, _rl, _rs)
{
    # Like "wcscolumns", this function must work when it is called before the
    # library's own BEGIN block has run. Without this guard the globals tested
    # below would still be unset and every argument would be rejected.
    if (!WCWIDTH_INITIALIZED) {
        _wcwidth_initialize_library()
    }

    # Force the argument to be a string. A value that looks numeric (a number,
    # or input text such as "0") would otherwise be false in a boolean context
    # and be mistaken for an empty argument, here or further down the call
    # chain.
    _wchar = _wchar ""
    _result = -1

    if (_wchar == "") {
        # An empty string is an invalid argument.
    } else if (WCWIDTH_MULTIBYTE_SAFE && length(_wchar) == 1) {
        _result = wcswidth(_wchar)
    } else if (!WCWIDTH_MULTIBYTE_SAFE) {
        _rs = RSTART
        _rl = RLENGTH

        # Accept the argument only if one code point spans the whole string.
        if (match(_wchar, WCWIDTH_UTF8_ANCHORED_RUNE_REGEX) &&
            RLENGTH == length(_wchar)) {
            _result = wcswidth(_wchar)
        }

        RSTART = _rs
        RLENGTH = _rl
    }

    return _result
}
# ---
# The functions beyond this point are intended only for internal use and should
# be treated as implementation details.
# ---
# Start-up hook of the library: puts POSIX mode into its default "off" state
# and initializes the library's global state before any input is read.
BEGIN {
# Silence "defined but never called directly" warnings generated when using
# GAWK's linter.
# The condition is the constant 0, so none of these calls is ever executed;
# they only need to appear in the program text.
if (0) {
wcscolumns()
wcsexpand()
wcstruncate()
wcswidth()
wcwidth()
}
# POSIX mode is only switched on temporarily by "wcswidth".
WCWIDTH_POSIX_MODE = 0
# The initializer returns immediately if WCWIDTH_CACHE already has entries.
_wcwidth_initialize_library()
}
# Initialize global state used by this library. The function is idempotent:
# once WCWIDTH_CACHE contains an entry, later calls return immediately.
#
# Globals assigned:
# - WCWIDTH_CACHE: Seeded with one element so later calls can detect that
#   initialization already happened.
# - WCWIDTH_INTERVAL_EXPRESSIONS_SUPPORTED: True if "{,N}" works in regular
#   expressions.
# - WCWIDTH_MULTIBYTE_SAFE: True if "length" counts characters, not bytes.
# - WCWIDTH_UTF8_RUNE_REGEX, WCWIDTH_UTF8_ANCHORED_RUNE_REGEX and
#   WCWIDTH_WIDE_CJK_RUNES_REGEX: Byte-level patterns, only assigned when the
#   interpreter is not multi-byte safe.
# - WCWIDTH_POSIX_MODE (normalized to 0 or 1), WCWIDTH_TABLE_LENGTH (reset to
#   0) and WCWIDTH_INITIALIZED.
#
function _wcwidth_initialize_library(    _entry, _nul)
{
    # This method of checking for initialization will not generate a "reference
    # to uninitialized variable" when using GAWK's linter.
    for (_entry in WCWIDTH_CACHE) {
        return
    }

    # Give the cache an element so the loop above finds one next time.
    split("X", WCWIDTH_CACHE)

    WCWIDTH_INTERVAL_EXPRESSIONS_SUPPORTED = "XXXX" ~ /^X{,4}$/
    WCWIDTH_MULTIBYTE_SAFE = length("宽") == 1

    if (!WCWIDTH_MULTIBYTE_SAFE) {
        # The width table is built by composing UTF-8 sequences byte by byte
        # with sprintf("%c"); verify that this yields the expected character.
        if (sprintf("%c%c%c", 229, 174, 189) != "宽") {
            # NOTE(review): this value is overwritten by the unconditional
            # assignment at the end of the function, so the failure is only
            # reported through the message below.
            WCWIDTH_INITIALIZED = -1
            print "wcwidth: the AWK interpreter is not multi-byte safe and" \
                  " its sprintf implementation does not support manual" \
                  " composition of UTF-8 sequences." >> "/dev/fd/2"
            close("/dev/fd/2")
        }

        # One UTF-8 encoded code point: a single byte from \001 to \177, one
        # of the 2-, 3- and 4-byte forms or, failing all of those, any single
        # character so that the pattern always consumes something.
        WCWIDTH_UTF8_RUNE_REGEX = "(" \
            "[\001-\177]|" \
            "[\302-\336\337][\200-\277]|" \
            "\340[\240-\277][\200-\277]|" \
            "[\341-\354\356\357][\200-\277][\200-\277]|" \
            "\355[\200-\237][\200-\277]|" \
            "\360[\220-\277][\200-\277][\200-\277]|" \
            "[\361-\363][\200-\277][\200-\277][\200-\277]|" \
            "\364[\200-\217][\200-\277][\200-\277]|" \
            "." \
        ")"

        WCWIDTH_UTF8_ANCHORED_RUNE_REGEX = "^" WCWIDTH_UTF8_RUNE_REGEX

        # Byte-level counterpart of the wide CJK character class used by
        # "wcscolumns" on multi-byte safe interpreters: a leading run of
        # 3-byte wide characters, each optionally followed by printable ASCII.
        # The alternatives cover, in order: U+3041-U+3096, U+3099-U+30FF,
        # U+4E00-U+4FFF, U+5000-U+8FFF, U+9000-U+9FD5, U+AC00-U+AFFF,
        # U+B000-U+CFFF, U+D000-U+D7A3 and U+FF01-U+FF60.
        #
        # The second-byte restriction \260-\277 applies only to lead byte
        # \352 (U+AC00 is \352\260\200). Applying it to \353 and \354 as well
        # would leave U+B000-U+BBFF and U+C000-U+CBFF out of the fast path.
        WCWIDTH_WIDE_CJK_RUNES_REGEX = "^((" \
            "\343(\201[\201-\277]|\202[\200-\226])|" \
            "\343(\202[\231-\277]|\203[\200-\277])|" \
            "\344([\270-\277][\200-\277])|" \
            "[\345-\350]([\200-\277][\200-\277])|" \
            "\351([\200-\276][\200-\277]|\277[\200-\225])|" \
            "\352[\260-\277][\200-\277]|" \
            "[\353\354][\200-\277][\200-\277]|" \
            "\355([\200-\235][\200-\277]|\236[\200-\243])|" \
            "\357(\274[\201-\277]|\275[\200-\240])" \
            ")[ -~]*" \
        ")+"
    }

    # Kludges to support AWK implementations that allow NUL bytes inside of
    # strings: the anchored rune pattern gets an explicit alternative for NUL
    # and the width of NUL is pre-seeded as 0.
    if (length((_nul = sprintf("%c", 0)))) {
        if (!WCWIDTH_MULTIBYTE_SAFE) {
            WCWIDTH_UTF8_ANCHORED_RUNE_REGEX = \
                WCWIDTH_UTF8_ANCHORED_RUNE_REGEX "|^" _nul
        }
        WCWIDTH_CACHE[_nul] = 0
    }

    WCWIDTH_POSIX_MODE = WCWIDTH_POSIX_MODE ? 1 : 0

    # A table length of 0 makes "wcscolumns" unpack the width data on the
    # first lookup that misses the cache.
    WCWIDTH_TABLE_LENGTH = 0
    WCWIDTH_INITIALIZED = 1
}
# Populate the data structures that contain character width information. For
# convenience, this function accepts a character and returns its width.
#
# The packed data is split on "," into entries, and each entry is split into
# three numbers: width, first code point and last code point. The results are
# stored in WCWIDTH_RANGE_WIDTH, WCWIDTH_RANGE_START and WCWIDTH_RANGE_END,
# indexed from 0, and the number of entries is stored in WCWIDTH_TABLE_LENGTH.
#
# Arguments:
# - _wchar: A single character as described in the "wcwidth" documentation.
#
# Returns: The width of the character i.e. `wcwidth(_wchar)`. The value is -1
# if no entry contains the character. It is also stored in WCWIDTH_CACHE.
#
function _wcwidth_unpack_data(_wchar, _a, _b, _c, _data, _end, _entry,
_parts, _ranges, _start, _width, _width_of_wchar_argument) {
# The assignment below is a placeholder in this template; the line following
# it is replaced with the packed width data.
_data = \
# [WIDTH DATA]: This part of the function will be filled in automatically.
_width_of_wchar_argument = -1
WCWIDTH_TABLE_LENGTH = split(_data, _ranges, ",")
for (_entry = 0; _entry < WCWIDTH_TABLE_LENGTH; _entry++) {
# NOTE(review): this "split" has no separator argument and therefore uses the
# caller's current FS; presumably the fields are separated by blanks - confirm
# against the data generator.
split(_ranges[_entry + 1], _parts)
# Adding 0 converts each field to a number.
_width = 0 + _parts[1]
_start = 0 + _parts[2]
_end = 0 + _parts[3]
if (WCWIDTH_MULTIBYTE_SAFE || _end < 128) {
_start = sprintf("%c", _start)
_end = sprintf("%c", _end)
} else {
# Sequences for code points U+0080 and up must be composed manually
# if the interpreter is not multi-byte safe.
# Re-use of the length encoding addended values for both endpoints
# only works if both characters consist of the same number of
# bytes. This is enforced by the width data generator.
#
# _a, _b and _c are the values added to the first three bytes. The first
# byte that is actually used receives the UTF-8 lead marker (240, 224 or
# 192), the bytes after it receive the continuation marker 128, and the
# unused leading bytes receive the placeholder 32, which is stripped again
# below.
_a = _start >= 65536 ? 240 : 32
_b = _a != 32 ? 128 : _start >= 2048 ? 224 : 32
_c = _b != 32 ? 128 : _start >= 64 ? 192 : 32
_start = sprintf("%c%c%c%c",
_a + int(_start / 262144) % 64,
_b + int(_start / 4096) % 64,
_c + int(_start / 64) % 64,
128 + _start % 64 \
)
_end = sprintf("%c%c%c%c",
_a + int(_end / 262144) % 64,
_b + int(_end / 4096) % 64,
_c + int(_end / 64) % 64,
128 + _end % 64 \
)
# Drop the placeholder bytes: one when only _a is unused, plus one more for
# each of _b and _c that is unused as well.
if (_a == 32) {
_end = substr(_end, 2 + (_b == 32) + (_c == 32))
_start = substr(_start, 2 + (_b == 32) + (_c == 32))
}
}
# While the table is being built, look up the argument as well. The search
# cursor is set to the index of the most recently processed entry whose end
# is not below the argument.
if (_wchar <= _end) {
if (_wchar >= _start) {
_width_of_wchar_argument = _width
}
WCWIDTH_SEARCH_CURSOR = _entry
}
WCWIDTH_RANGE_WIDTH[_entry] = _width
WCWIDTH_RANGE_START[_entry] = _start
WCWIDTH_RANGE_END[_entry] = _end
}
return (WCWIDTH_CACHE[_wchar] = _width_of_wchar_argument)
}