diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md index d571063c..6fa4e40a 100644 --- a/ARCHITECTURE.md +++ b/ARCHITECTURE.md @@ -109,7 +109,7 @@ The main entry point is `src/terminal/init.lua`, which exposes the `terminal` mo - Holds version metadata and high-level helpers: - `terminal.size()` – wrapper around `system.termsize`. - `terminal.bell()` / `terminal.bell_seq()` – terminal bell. - - `terminal.preload_widths()` – preloads characters into the width cache for box drawing and progress spinners. + - `terminal.preload_widths()` – detects the terminal’s ambiguous character width (for East Asian width). Call after init if you use `terminal.draw` or `terminal.progress`. - Manages initialization/shutdown and integration with `system`: - Console flags, non-blocking input, code page, alternate screen buffer. - Sleep function wiring for async usage. @@ -237,12 +237,13 @@ Terminal UI must align and truncate text by **display columns**, not by bytes or ### 5.1 Display width -- **`terminal.text.width`** provides the width primitives: - - **`utf8cwidth(char)`** – width in columns of a single character (string or codepoint). Uses a cache when available; otherwise falls back to `system.utf8cwidth`. - - **`utf8swidth(str)`** – total display width of a string in columns. -- **Width cache:** Not all characters have a fixed width (e.g. East Asian ambiguous). The library maintains a cache of **tested** widths. To populate it: - - **`terminal.text.width.test(str)`** – writes characters invisibly, measures cursor movement, and records each character’s width. Call during startup or when you first display unknown glyphs. - - **`terminal.preload_widths(str)`** – convenience that tests the library’s own box-drawing and progress characters plus any optional `str`. Call once after `terminal.initialize` if you use `terminal.draw` or `terminal.progress`. 
+- **`terminal.text.width`** provides the width primitives (delegates to LuaSystem >= 0.7.0): + - **`utf8cwidth(char)`** – width in columns of a single character (string or codepoint). Uses **`system.utf8cwidth(char, ambiguous_width)`**. + - **`utf8swidth(str)`** – total display width of a string in columns. Uses **`system.utf8swidth(str, ambiguous_width)`**. +- **Ambiguous width:** East Asian ambiguous characters can be 1 or 2 columns. The library probes **one** ambiguous character the first time `detect_ambiguous_width()` runs (typically via `terminal.preload_widths()` right after `terminal.initialize`) and stores the result in **`terminal.text.width.ambiguous_width`** (1 or 2); until then, and whenever the probe cannot run, a width of 1 is assumed. All width calls pass this value to LuaSystem. + - **`terminal.text.width.detect_ambiguous_width()`** – probes the terminal (when initialized and TTY) and sets `ambiguous_width`; idempotent. Called automatically by `preload_widths` and by `test` / `test_write`. + - **`terminal.preload_widths(str)`** – calls `detect_ambiguous_width()`. Call once after `terminal.initialize` if you use `terminal.draw` or `terminal.progress`. The optional `str` is ignored (kept for API compatibility). + - **`terminal.text.width.test(str)`** / **`test_write(str)`** – ensure detection has run, then return `utf8swidth(str)` (`test_write` also writes `str` to the terminal). No per-character cache or probing. - Use **`terminal.size()`** to get terminal dimensions (rows × columns) so you can fit text to the visible area. **Rule of thumb:** For correct alignment and truncation, always reason in **columns**. Use `utf8swidth` to measure strings and `utf8cwidth` for per-character width when implementing substrings or cursors. @@ -289,7 +290,7 @@ Key methods for display and layout: - **Simple truncation or fixed-width slice:** use **`utils.utf8sub_col(str, 1, max_col)`** (and optionally ellipsis). - **Editable single/multi-line text with cursor and word wrap:** use **EditLine** and **`EditLine:format(...)`**. 
-- **Measuring or testing width:** use **`terminal.text.width.utf8swidth`** / **`utf8cwidth`** and **`terminal.text.width.test`** / **`terminal.preload_widths`** as above. +- **Measuring or testing width:** use **`terminal.text.width.utf8swidth`** / **`utf8cwidth`**; call **`terminal.preload_widths()`** after init to detect ambiguous width. All terminal output must go through **`terminal.output`** (e.g. `terminal.output.write`), not raw `print` or `io.write`, so that the library’s stream and any patching behave correctly. diff --git a/spec/19-text_width_spec.lua b/spec/19-text_width_spec.lua new file mode 100644 index 00000000..deee39b7 --- /dev/null +++ b/spec/19-text_width_spec.lua @@ -0,0 +1,170 @@ +local helpers = require "spec.helpers" + + +describe("terminal.text.width", function() + + local width + + setup(function() + helpers.load() + width = require("terminal.text.width") + end) + + + teardown(function() + helpers.unload() + end) + + + + describe("utf8cwidth()", function() + + it("returns 1 for ASCII characters", function() + assert.are.equal(1, width.utf8cwidth(65)) + assert.are.equal(1, width.utf8cwidth("A")) + assert.are.equal(1, width.utf8cwidth(" ")) + end) + + + it("accepts string or codepoint and returns same width", function() + assert.are.equal(width.utf8cwidth("x"), width.utf8cwidth(0x78)) + end) + + + it("returns 2 for fullwidth characters (e.g. CJK)", function() + assert.are.equal(2, width.utf8cwidth("你")) + assert.are.equal(2, width.utf8cwidth(0x4F60)) + end) + + + it("uses ambiguous_width for ambiguous characters", function() + local mid = require("utf8").char(0x00B7) + width.set_ambiguous_width(1) + assert.are.equal(1, width.utf8cwidth(mid), "ambiguous_width=1 should give 1") + width.set_ambiguous_width(2) + local w2 = width.utf8cwidth(mid) + assert.is_true(w2 == 1 or w2 == 2, "ambiguous_width=2 should give 1 or 2, got " .. 
tostring(w2)) + width.set_ambiguous_width(1) + end) + + + it("errors on invalid type", function() + assert.has_error(function() + width.utf8cwidth({}) + end, "expected string or number, got table") + end) + + end) + + + + describe("utf8swidth()", function() + + it("returns 0 for empty string", function() + assert.are.equal(0, width.utf8swidth("")) + end) + + + it("returns correct width for ASCII string", function() + assert.are.equal(5, width.utf8swidth("Hello")) + end) + + + it("returns correct width for double-width characters", function() + assert.are.equal(4, width.utf8swidth("你好")) + end) + + + it("returns correct width for mixed ASCII and wide", function() + assert.are.equal(6, width.utf8swidth("Hi你好")) + end) + + + it("respects set ambiguous_width", function() + local mid = require("utf8").char(0x00B7) + width.set_ambiguous_width(1) + local w1 = width.utf8swidth(mid) + width.set_ambiguous_width(2) + local w2 = width.utf8swidth(mid) + assert.are.equal(1, w1) + assert.is_true(w2 == 1 or w2 == 2, "ambiguous_width=2 should give 1 or 2, got " .. 
tostring(w2)) + width.set_ambiguous_width(1) + end) + + end) + + + + describe("set_ambiguous_width()", function() + + it("accepts only 1 or 2", function() + width.set_ambiguous_width(1) + width.set_ambiguous_width(2) + assert.has_error(function() + width.set_ambiguous_width(0) + end, "ambiguous_width must be 1 or 2, got 0") + assert.has_error(function() + width.set_ambiguous_width(3) + end, "ambiguous_width must be 1 or 2, got 3") + end) + + end) + + + + describe("detect_ambiguous_width()", function() + + it("returns 1 when terminal not ready (no write)", function() + width.ambiguous_width = nil + local w = width.detect_ambiguous_width() + assert.are.equal(1, w) + assert.are.equal(1, width.ambiguous_width) + end) + + + it("is idempotent when ambiguous_width already set", function() + width.set_ambiguous_width(2) + local w = width.detect_ambiguous_width() + assert.are.equal(2, w) + width.set_ambiguous_width(1) + end) + + end) + + + + describe("test()", function() + + it("returns same value as utf8swidth for given string", function() + width.set_ambiguous_width(1) + local str = "hello" + assert.are.equal(width.utf8swidth(str), width.test(str)) + end) + + + it("returns 0 for empty or nil", function() + assert.are.equal(0, width.test("")) + assert.are.equal(0, width.test(nil)) + end) + + end) + + + + describe("test_write()", function() + + it("returns width of written string", function() + local str = "ab" + local w = width.test_write(str) + assert.are.equal(2, w) + end) + + + it("returns 0 for empty or nil", function() + assert.are.equal(0, width.test_write("")) + assert.are.equal(0, width.test_write(nil)) + end) + + end) + +end) diff --git a/src/terminal/init.lua b/src/terminal/init.lua index 2a77aeb0..64c219b2 100644 --- a/src/terminal/init.lua +++ b/src/terminal/init.lua @@ -83,16 +83,15 @@ end ---- Preload known characters into the width-cache. --- Typically this should be called right after initialization. 
It will check default --- characters in use by this library, and the optional specified characters in `str`. --- Characters loaded will be the `terminal.draw.box_fmt` formats, and the `progress` spinner sprites. --- Uses `terminal.text.width.test` to test the widths of the characters. --- @tparam[opt] string str additional character string to preload +--- Detect the terminal's ambiguous character width (for East Asian width). +-- Call once after `initialize` so that `terminal.text.width.utf8cwidth` and +-- `utf8swidth` use the correct width (1 or 2) for ambiguous-width characters. +-- Optional `str` is ignored; kept for API compatibility. +-- @tparam[opt] string str ignored; kept for backward compatibility -- @return true -- @within Initialization function M.preload_widths(str) - text.width.test((str or "") .. M.progress._spinner_fmt_chars() .. M.draw._box_fmt_chars()) + text.width.detect_ambiguous_width() return true end diff --git a/src/terminal/output.lua b/src/terminal/output.lua index 72ff52ee..64bef348 100644 --- a/src/terminal/output.lua +++ b/src/terminal/output.lua @@ -42,6 +42,14 @@ end +--- Returns the current output stream (e.g. for isatty checks). +-- @treturn file the stream set by `set_stream` or the default +function M.get_stream() + return t +end + + + --- Writes to the stream. -- This is a safer write-function than the standard Lua one. -- It doesn't add add tabs between arguments, and it doesn't add a newline at the end (like `print` does). diff --git a/src/terminal/text/width.lua b/src/terminal/text/width.lua index f7e8baa3..fe2d10ba 100644 --- a/src/terminal/text/width.lua +++ b/src/terminal/text/width.lua @@ -4,15 +4,16 @@ -- Especially the ['ambiguous width'](https://www.unicode.org/Public/UCD/latest/ucd/EastAsianWidth.txt) -- characters can be displayed with different -- widths especially when used with East Asian languages. 
--- The only way to truly know their display width is to write them to the terminal --- and measure the cursor position change. -- --- This module implements a cache of character widths as they have been measured. +-- This module delegates width calculation to LuaSystem (>= 0.7.0), which provides +-- `system.utf8cwidth(char, ambiguous_width)` and `system.utf8swidth(str, ambiguous_width)`. +-- A single ambiguous-width character is probed at initialization (e.g. via +-- `terminal.preload_widths`) and the result is stored globally; all width calls +-- use that value for ambiguous characters. -- --- To populate the cache with tested widths use `test` and `test_write`. --- --- To check width, using the cached widths, use `utf8cwidth` and `utf8swidth`. Any --- character not in the cache will be passed to `system.utf8cwidth` to determine the width. +-- To ensure the terminal's ambiguous width is detected, call `terminal.preload_widths()` +-- after `terminal.initialize()`. Width functions are safe before that: they use +-- a default ambiguous width of 1. -- @module terminal.text.width local M = {} @@ -20,18 +21,22 @@ package.loaded["terminal.text.width"] = M -- Register the module early to avoid local t = require "terminal" local sys = require "system" -local sys_utf8cwidth = sys.utf8cwidth local utf8 = require("utf8") -- explicit lua-utf8 library call, for <= Lua 5.3 compatibility +--- Stored width for ambiguous-width characters (1 or 2). Set by `detect_ambiguous_width`. +-- When nil, width functions use 1 (safe default). Do not set directly; use +-- `detect_ambiguous_width` or `set_ambiguous_width`. +M.ambiguous_width = nil -local char_widths = {} -- registry to keep track of already tested widths +local function ambiguous_width() + return M.ambiguous_width or 1 +end --- Returns the width of a character in columns, matches `system.utf8cwidth` signature. 
--- This will check the cache of recorded widths first, and if not found, --- use `system.utf8cwidth` to determine the width. It will not test the width. +-- Delegates to `system.utf8cwidth(char, ambiguous_width)`. -- @tparam string|number char the character (string or codepoint) to check -- @treturn number the width of the first character in columns function M.utf8cwidth(char) @@ -40,168 +45,119 @@ function M.utf8cwidth(char) elseif type(char) ~= "number" then error("expected string or number, got " .. type(char), 2) end - return char_widths[utf8.char(char)] or sys_utf8cwidth(char) + return sys.utf8cwidth(char, ambiguous_width()) end --- Returns the width of a string in columns, matches `system.utf8swidth` signature. --- It will use the cached widths, if no cached width is available it falls back on `system.utf8cwidth`. --- It will not test the width. +-- Delegates to `system.utf8swidth(str, ambiguous_width)`. -- @tparam string str the string to check -- @treturn number the width of the string in columns function M.utf8swidth(str) - local w = 0 - for pos, char in utf8.codes(str) do - w = w + (char_widths[utf8.char(char)] or sys_utf8cwidth(char)) - end - return w + return sys.utf8swidth(str, ambiguous_width()) end ---- Returns the width of the string, by test writing. --- Characters will be written 'invisible', so it does not show on the terminal, but it does need --- room to print them. The cursor is returned to its original position. --- It will read many character-widths at once, and hence is a lot faster than checking --- each character individually. The width of each character measured is recorded in the cache. --- --- - the text stack is used to set the brightness to 0 before, and restore colors/attributes after the test. --- - the test will be done at the current cursor position, and hence content there might be overwritten. Since --- a character is either 1 or 2 columns wide. The content of those 2 columns might have to be restored. 
--- @tparam string str the string of characters to test --- @treturn[1] number width in columns of the string +--- Sets the ambiguous character width used for all width calculations. +-- Normally called by `detect_ambiguous_width`; exposed for tests or overrides. +-- @tparam number aw 1 or 2 +-- @within Initialization +function M.set_ambiguous_width(aw) + assert(aw == 1 or aw == 2, "ambiguous_width must be 1 or 2, got " .. tostring(aw)) + M.ambiguous_width = aw +end + + + +--- Detects the terminal's width for ambiguous-width characters by probing one character. +-- Uses cursor-position report (CPR); only runs when terminal is initialized and +-- stdout/stderr is a TTY. Does not write to the terminal otherwise. Idempotent: +-- if `ambiguous_width` is already set, returns immediately. +-- @treturn[1] number the detected width (1 or 2), or 1 if detection was skipped -- @treturn[2] nil --- @treturn[2] string error message --- @within Testing -function M.test(str) - local size = 50 -- max number of characters to do in 1 terminal write - local test = {} - local dup = {} - local width = 0 - for pos, char in utf8.codes(str) do - char = utf8.char(char) -- convert back to utf8 string - local cw = char_widths[char] - if cw then - -- we already know the width - width = width + cw - elseif not dup[char] then - -- we have no width, and it is not yet in the test list, so add it - test[#test+1] = char - dup[char] = true - end +-- @treturn[2] string error message only when probe was attempted and failed +-- @within Initialization +function M.detect_ambiguous_width() + if M.ambiguous_width ~= nil then + return M.ambiguous_width end - if #test == 0 then - return width -- nothing to test, return the width + if not t.ready() then + M.ambiguous_width = 1 + return 1 end - t.text.stack.push({ brightness = 0 }) -- set color to "hidden" - - local r, c = t.cursor.position.get() -- retrieve current position - local setpos = t.cursor.position.set_seq(r, c) -- string to restore cursor to 
current position - local getpos = t.cursor.position.query_seq() -- string to inject query for current position - local chunk = {} - local chars = {} - for i = 1, #test do -- process in chunks of max size - chars[#chars+1] = test[i] - local s = test[i] -- the character - .. getpos -- query for new position - .. setpos -- restore cursor to current position - chunk[#chunk+1] = s - if #chunk == size or i == #test then - -- handle the chunk - t.output.write(table.concat(chunk) .. " " .. setpos) -- write the chunk - local positions, err = t.input.read_query_answer("^\27%[(%d+);(%d+)R$", #chunk) - if not positions then - t.text.stack.pop() -- restore color (drop hidden) - return nil, err - end - - -- record sizes reported - for j, pos in ipairs(positions) do - local w = pos[2] - c - if w < 0 then - -- cursor wrapped to next line - local _, cols = t.size() - w = w + cols - end - char_widths[chars[j]] = w - end - - chunk = {} -- clear for next chunk - chars = {} - end + -- Probe only when output is a TTY to avoid unnecessary write + if not sys.isatty(t.output.get_stream()) then + M.ambiguous_width = 1 + return 1 end - t.text.stack.pop() -- restore color (drop hidden) - return M.test(str) -- re-run to get the total width, since all widths are known now -end + -- Probe one ambiguous character (U+00B7 MIDDLE DOT) + local probe_char = utf8.char(0x00B7) + t.text.stack.push({ brightness = 0 }) + local r, c = t.cursor.position.get() + if not r or not c then + t.text.stack.pop() + M.ambiguous_width = 1 + return 1 + end + local setpos = t.cursor.position.set_seq(r, c) + local getpos = t.cursor.position.query_seq() + t.output.write(probe_char .. getpos .. setpos) + t.output.flush() ---- Returns the width of the string, and writes it to the terminal. --- Writes the string to the terminal, visible, whilst at the same time injecting cursor-position queries --- to detect the width of the unknown characters in the string. 
--- It will read many character-widths at once, and hence is a lot faster than checking --- each character individually. --- The width of each character measured is recorded in the cache. --- @tparam string str the string of characters to write and test --- @treturn number the width of the string in columns --- @within Testing -function M.test_write(str) - local chunk = {} -- every character, pre/post fixed with a query if needed - local chars = {} -- array chars to test - local width = 0 - - do -- parse the string to test - local getpos = t.cursor.position.query_seq() -- string to inject; query for current position - local dups = {} - - for pos, char in utf8.codes(str) do - char = utf8.char(char) -- convert back to utf8 string - local cw = char_widths[char] - local query = "" - if cw then - -- we already know the width - width = width + cw - elseif not dups[char] then - -- we have no width, and it is not yet in the test list, so add the query - query = getpos - chars[#chars+1] = char - dups[char] = true - end - chunk[#chunk+1] = query .. char .. query - end - end + local positions = t.input.read_query_answer("^\27%[(%d+);(%d+)R$", 1) + t.text.stack.pop() - t.output.write(table.concat(chunk)) - if #chars == 0 then - return width -- nothing to test, return the width + if not positions or #positions < 1 then + M.ambiguous_width = 1 + return 1 end - local positions, err = t.input.read_query_answer("^\27%[(%d+);(%d+)R$", #chars * 2) - if not positions then - return nil, err + local _, cols = t.size() + local col_after = tonumber(positions[1][2]) + local w = col_after - c + if w < 0 then + w = w + (cols or 80) end + M.ambiguous_width = (w == 2) and 2 or 1 + return M.ambiguous_width +end + + + +--- Returns the width of the string in columns. +-- Ensures ambiguous width has been detected (if terminal is ready and TTY), then +-- returns the same as `utf8swidth(str)`. Kept for API compatibility; no longer +-- probes or caches per-character widths. 
+-- Never returns `nil, err`: when the probe cannot run or fails, an ambiguous +-- width of 1 is used. +-- @tparam[opt] string str the string to measure; `nil` is treated as `""` +-- @treturn number width in columns of the string +-- @within Testing +function M.test(str) + M.detect_ambiguous_width() + return M.utf8swidth(str or "") +end + - -- record sizes reported - for j, pos in ipairs(positions) do - local char = chars[j] - local col_start = pos[j*2 - 1][2] - local col_end = pos[j*2][2] - local w = col_end - col_start - if w < 0 then - -- cursor wrapped to next line - local _, cols = t.size() - w = w + cols - end - char_widths[char] = w - end - -- re-run to get the total width, since all widths are known now, - -- but this time do not write the string, just return the width - return M.test(str) +--- Writes the string to the terminal and returns its width in columns. +-- Ensures ambiguous width has been detected, writes the string, then returns +-- `utf8swidth(str)`. Kept for API compatibility; no longer probes per-character. +-- @tparam string str the string to write and measure +-- @treturn number the width of the string in columns +-- @within Testing +function M.test_write(str) + M.detect_ambiguous_width() + t.output.write(str or "") + return M.utf8swidth(str or "") end + return M diff --git a/terminal-scm-1.rockspec b/terminal-scm-1.rockspec index c768ae55..293e0036 100644 --- a/terminal-scm-1.rockspec +++ b/terminal-scm-1.rockspec @@ -25,7 +25,7 @@ description = { dependencies = { "lua >= 5.1, < 5.6", - "luasystem >= 0.6.3", + "luasystem >= 0.7.0", "utf8 >= 1.3.0", }