From 326c3015b8c61f633f5bb07a452f58f1c0d1ad2c Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Thu, 5 Feb 2026 00:14:56 +0300
Subject: [PATCH 1/6] bench: add three-way comparison with rivo/uniseg

- Add rivo/uniseg as third comparison library alongside go-runewidth
- Add Uniseg variants for all existing StringWidth benchmarks (12 new benchmarks)
- Add Complex Unicode section: flags, ZWJ sequences, combined strings (9 benchmarks)
- Update README.md with results tables and library comparison
- Total: 51 benchmarks (was 30)
- Addresses #1
---
 bench/README.md          | 170 ++++++++++++++---------
 bench/comparison_test.go | 289 +++++++++++++++++++++++++++++++++------
 bench/go.mod             |   1 +
 bench/go.sum             |   2 +
 4 files changed, 355 insertions(+), 107 deletions(-)

diff --git a/bench/README.md b/bench/README.md
index a49d6f6..a4ada10 100644
--- a/bench/README.md
+++ b/bench/README.md
@@ -1,107 +1,143 @@
 # uniwidth Performance Comparison Benchmarks
 
-This directory contains **performance comparison benchmarks** between `uniwidth` and `go-runewidth`.
+This directory contains **three-way performance comparison benchmarks** between Unicode width calculation libraries for Go.
 
-## 🎯 Purpose
+## Libraries Compared
 
-Demonstrate the **3.9-46x performance improvement** achieved by uniwidth's tiered lookup strategy compared to go-runewidth's traditional binary search approach.
+| Library | Approach | Strengths |
+|---------|----------|-----------|
+| [**uniwidth**](https://github.com/unilibs/uniwidth) | Tiered fast-path lookup | Extreme speed, zero allocs for ASCII |
+| [**go-runewidth**](https://github.com/mattn/go-runewidth) | Binary search over tables | Established, widely adopted |
+| [**uniseg**](https://github.com/rivo/uniseg) | Grapheme cluster segmentation | Full UAX #29 compliance, ZWJ support |
 
-## 📦 Why Separate Module?
+## Why Separate Module?
 
-**Best Practice 2025**: Keep benchmark dependencies separate from the production library.
+**Best Practice**: Benchmark dependencies are isolated from the production library.
 
-**Benefits**:
-- ✅ **Main module**: ZERO dependencies
-- ✅ **Clean go.mod**: Users don't see competitor library
-- ✅ **Professional**: Industry-standard approach (fasthttp, gjson, sonic)
-- ✅ **Optional**: Comparison benchmarks are not required for library usage
+- Main `uniwidth` module has **ZERO** dependencies
+- Competitor libraries appear only in this benchmark module
+- Industry-standard approach (used by fasthttp, gjson, sonic)
+- Users can verify performance claims independently
 
-## 🚀 Running Benchmarks
+## Running Benchmarks
 
-### Quick Comparison
+### All Benchmarks
 ```bash
 cd bench
 go test -bench=. -benchmem
 ```
 
-### Full Results
+### Full Results (saved to file)
 ```bash
 cd bench
-go test -bench=. -benchmem -run=^$ | tee results.txt
+go test -bench=. -benchmem -count=5 -run=^$ | tee results.txt
 ```
 
-### Compare Specific Categories
+### Filter by Library
 ```bash
-# ASCII strings
-go test -bench=ASCII -benchmem
+go test -bench=Uniwidth -benchmem      # uniwidth only
+go test -bench=GoRunewidth -benchmem    # go-runewidth only
+go test -bench=Uniseg -benchmem         # uniseg only
+```
 
-# CJK strings
-go test -bench=CJK -benchmem
+### Filter by Category
+```bash
+go test -bench=ASCII -benchmem          # ASCII strings
+go test -bench=CJK -benchmem            # CJK strings
+go test -bench=Emoji -benchmem          # Emoji strings
+go test -bench=Mixed -benchmem          # Mixed ASCII + CJK
+go test -bench=TUI -benchmem            # Real-world TUI scenarios
+go test -bench=Flags -benchmem          # Flag emoji (regional indicators)
+go test -bench=ZWJ -benchmem            # ZWJ emoji sequences
+go test -bench=Combined -benchmem       # Complex mixed strings
+```
 
-# Emoji strings
-go test -bench=Emoji -benchmem
+## Benchmark Categories
 
-# Real-world TUI scenarios
-go test -bench=TUI -benchmem
-```
+### Core Categories
+- **RuneWidth** - Single rune width (uniwidth vs go-runewidth only; uniseg does not expose RuneWidth)
+- **ASCII** - Pure ASCII strings (short / medium / long)
+- **CJK** - Chinese, Japanese, Korean characters
+- **Mixed** - ASCII + CJK combinations
+- **Emoji** - Emoji-containing strings
 
-## 📊 Expected Results
+### Real-world Scenarios
+- **TUI** - Terminal UI patterns (prompts, table headers, status lines)
 
-**ASCII Strings** (15-46x faster):
-```
-BenchmarkStringWidth_ASCII_Short_Uniwidth      149590729    9.5 ns/op    0 B/op   0 allocs/op
-BenchmarkStringWidth_ASCII_Short_GoRunewidth    10065044  150.1 ns/op    0 B/op   0 allocs/op
-                                                            ^^^^^^^^^^
-                                                            15.8x faster!
-```
+### Complex Unicode
+- **Flags** - Regional indicator pairs (e.g. `🇺🇸🇩🇪🇯🇵`)
+- **ZWJ** - Zero Width Joiner sequences (e.g. `👨‍👩‍👧‍👦`, `👩‍💻`, `🏳️‍🌈`)
+- **Combined** - All sequence types mixed in a single string
 
-**CJK Strings** (4-14x faster):
-```
-BenchmarkStringWidth_CJK_Short_Uniwidth         19064941   63.6 ns/op    0 B/op   0 allocs/op
-BenchmarkStringWidth_CJK_Short_GoRunewidth       2771077  368.0 ns/op    0 B/op   0 allocs/op
-                                                            ^^^^^^^^^^
-                                                            5.8x faster!
-```
+> **Note**: Width results may differ between libraries for ZWJ sequences. uniseg performs full grapheme cluster segmentation (UAX #29), while uniwidth and go-runewidth use simpler approaches optimized for speed.
 
-**Emoji Strings** (6-8x faster):
-```
-BenchmarkStringWidth_Emoji_Short_Uniwidth       12384722   96.2 ns/op    0 B/op   0 allocs/op
-BenchmarkStringWidth_Emoji_Short_GoRunewidth     1854066  646.8 ns/op    0 B/op   0 allocs/op
-                                                            ^^^^^^^^^^
-                                                            6.7x faster!
-```
+## Results
+
+Measured on Intel Core i7-1255U (Windows, amd64). Run benchmarks yourself to get results for your platform.
+
+### RuneWidth (single rune)
+
+uniseg does not expose a public `RuneWidth` function.
 
-## 📁 Structure
+| Input | uniwidth | go-runewidth | Speedup |
+|-------|----------|--------------|---------|
+| ASCII (`'a'`) | 2.1 ns/op | 3.7 ns/op | **1.7x** |
+| CJK (`'世'`) | 1.9 ns/op | 37.6 ns/op | **~20x** |
+| Emoji (`'😀'`) | 3.3 ns/op | 21.7 ns/op | **~6.5x** |
+
+### StringWidth
+
+| Input | uniwidth | go-runewidth | uniseg | vs go-runewidth | vs uniseg |
+|-------|----------|--------------|--------|-----------------|-----------|
+| ASCII Short (`"Hello"`) | 9 ns | 107 ns | 165 ns | **12x** | **18x** |
+| ASCII Medium (43 chars) | 71 ns | 832 ns | 1,224 ns | **12x** | **17x** |
+| ASCII Long (231 chars) | 340 ns | 5,380 ns | 8,058 ns | **16x** | **24x** |
+| CJK Short (`"你好世界"`) | 96 ns | 347 ns | 379 ns | **3.6x** | **3.9x** |
+| CJK Medium (35 chars) | 1,034 ns | 2,790 ns | 3,424 ns | **2.7x** | **3.3x** |
+| Mixed Short | 173 ns | 469 ns | 632 ns | **2.7x** | **3.7x** |
+| Mixed Medium | 635 ns | 1,603 ns | 2,172 ns | **2.5x** | **3.4x** |
+| Emoji Short | 158 ns | 380 ns | 534 ns | **2.4x** | **3.4x** |
+| Emoji Medium | 677 ns | 1,749 ns | 2,259 ns | **2.6x** | **3.3x** |
+
+### Real-world TUI Scenarios
+
+| Input | uniwidth | go-runewidth | uniseg | vs go-runewidth | vs uniseg |
+|-------|----------|--------------|--------|-----------------|-----------|
+| Prompt (`"❯ Enter command:"`) | 156 ns | 456 ns | 638 ns | **2.9x** | **4.1x** |
+| Table Header (box-drawing) | 949 ns | 1,281 ns | 1,708 ns | **1.3x** | **1.8x** |
+| Status Line (emoji-rich) | 664 ns | 1,614 ns | 2,174 ns | **2.4x** | **3.3x** |
+
+### Complex Unicode Sequences
+
+| Input | uniwidth | go-runewidth | uniseg | vs go-runewidth | vs uniseg |
+|-------|----------|--------------|--------|-----------------|-----------|
+| Flags (`🇺🇸🇩🇪🇯🇵🇬🇧🇫🇷`) | 201 ns | 391 ns | 455 ns | **1.9x** | **2.3x** |
+| ZWJ (`👨‍👩‍👧‍👦 👩‍💻 🏳️‍🌈`) | 323 ns | 326 ns | 691 ns | **~1x** | **2.1x** |
+| Combined (all types) | 374 ns | 505 ns | 1,175 ns | **1.4x** | **3.1x** |
+
+All libraries: **0 allocs/op** for short strings. uniwidth allocates 1 `[]rune` for medium/long Unicode strings (needed for lookahead on variation selectors and regional indicators).
+
+## Structure
 
 ```
 bench/
-├── go.mod               # Separate module with go-runewidth dependency
-├── go.sum               # Dependencies checksums
-├── comparison_test.go   # Comparison benchmarks (uniwidth vs go-runewidth)
+├── go.mod               # Separate module with benchmark dependencies
+├── go.sum               # Dependency checksums
+├── comparison_test.go   # Three-way comparison benchmarks
 └── README.md            # This file
 ```
 
-## 🔗 Dependencies
+## Dependencies
 
-This module depends on:
+This benchmark module depends on:
 - `github.com/unilibs/uniwidth` (parent module, via replace directive)
-- `github.com/mattn/go-runewidth` (competitor, for comparison only)
-
-**Note**: The main `uniwidth` library has ZERO dependencies. These dependencies exist only in this benchmark module for performance comparison purposes.
+- `github.com/mattn/go-runewidth` (comparison baseline)
+- `github.com/rivo/uniseg` (comparison baseline)
 
-## 📝 Notes
+The main `uniwidth` library has **ZERO** dependencies. These exist only in this benchmark module.
 
-- Benchmarks are isolated from the main library
-- Main `uniwidth` module remains dependency-free
-- Comparison benchmarks prove performance claims (marketing)
-- Users can verify performance independently
-
-## 🎓 Learn More
+## See Also
 
 - [Main Documentation](../README.md)
 - [Architecture Guide](../docs/ARCHITECTURE.md)
-- [PoC Results](../docs/POC_RESULTS.md)
-
----
-
-*These benchmarks demonstrate why uniwidth is 3.9-46x faster than go-runewidth.*
+- [Changelog](../CHANGELOG.md)
diff --git a/bench/comparison_test.go b/bench/comparison_test.go
index 003fb8e..e5a6190 100644
--- a/bench/comparison_test.go
+++ b/bench/comparison_test.go
@@ -4,32 +4,52 @@ import (
 	"testing"
 
 	"github.com/mattn/go-runewidth"
+	"github.com/rivo/uniseg"
 	"github.com/unilibs/uniwidth"
 )
 
 // ============================================================================
-// Comparison Benchmarks: uniwidth vs go-runewidth
+// Comparison Benchmarks: uniwidth vs go-runewidth vs uniseg
 //
-// This package contains performance comparison benchmarks between uniwidth
-// and the go-runewidth library. These benchmarks demonstrate the 3.9-46x
-// performance improvement achieved by uniwidth's tiered lookup strategy.
+// Three-way performance comparison between Unicode width calculation libraries:
+//   - uniwidth:      Tiered fast-path lookup (O(1) for common characters)
+//   - go-runewidth:  Binary search over Unicode tables
+//   - uniseg:        Grapheme cluster segmentation with width calculation
 //
-// Run comparison benchmarks:
-//   cd bench
-//   go test -bench=. -benchmem
+// Run all comparison benchmarks:
+//
+//	cd bench
+//	go test -bench=. -benchmem
+//
+// Filter by library:
+//
+//	go test -bench=Uniwidth -benchmem
+//	go test -bench=GoRunewidth -benchmem
+//	go test -bench=Uniseg -benchmem
+//
+// Filter by category:
+//
+//	go test -bench=ASCII -benchmem
+//	go test -bench=CJK -benchmem
+//	go test -bench=Emoji -benchmem
+//	go test -bench=TUI -benchmem
+//	go test -bench=Flags -benchmem
+//	go test -bench=ZWJ -benchmem
 //
-// Compare results:
-//   go test -bench=. -benchmem | tee results.txt
 // ============================================================================
 
 // ============================================================================
 // RuneWidth Benchmarks
+//
+// Note: uniseg does not expose a public RuneWidth function; it operates on
+// grapheme clusters via StringWidth and iterator APIs. RuneWidth comparison
+// is limited to uniwidth vs go-runewidth.
 // ============================================================================
 
 func BenchmarkRuneWidth_ASCII_Uniwidth(b *testing.B) {
 	r := 'a'
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.RuneWidth(r)
 	}
 }
@@ -37,7 +57,7 @@ func BenchmarkRuneWidth_ASCII_Uniwidth(b *testing.B) {
 func BenchmarkRuneWidth_ASCII_GoRunewidth(b *testing.B) {
 	r := 'a'
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.RuneWidth(r)
 	}
 }
@@ -45,7 +65,7 @@ func BenchmarkRuneWidth_ASCII_GoRunewidth(b *testing.B) {
 func BenchmarkRuneWidth_CJK_Uniwidth(b *testing.B) {
 	r := '世' // Chinese character
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.RuneWidth(r)
 	}
 }
@@ -53,7 +73,7 @@ func BenchmarkRuneWidth_CJK_Uniwidth(b *testing.B) {
 func BenchmarkRuneWidth_CJK_GoRunewidth(b *testing.B) {
 	r := '世'
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.RuneWidth(r)
 	}
 }
@@ -61,7 +81,7 @@ func BenchmarkRuneWidth_CJK_GoRunewidth(b *testing.B) {
 func BenchmarkRuneWidth_Emoji_Uniwidth(b *testing.B) {
 	r := '😀' // Smiling face
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.RuneWidth(r)
 	}
 }
@@ -69,7 +89,7 @@ func BenchmarkRuneWidth_Emoji_Uniwidth(b *testing.B) {
 func BenchmarkRuneWidth_Emoji_GoRunewidth(b *testing.B) {
 	r := '😀'
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.RuneWidth(r)
 	}
 }
@@ -81,7 +101,7 @@ func BenchmarkRuneWidth_Emoji_GoRunewidth(b *testing.B) {
 func BenchmarkStringWidth_ASCII_Short_Uniwidth(b *testing.B) {
 	s := "Hello"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -89,15 +109,23 @@ func BenchmarkStringWidth_ASCII_Short_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_ASCII_Short_GoRunewidth(b *testing.B) {
 	s := "Hello"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_ASCII_Short_Uniseg(b *testing.B) {
+	s := "Hello"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 func BenchmarkStringWidth_ASCII_Medium_Uniwidth(b *testing.B) {
 	s := "The quick brown fox jumps over the lazy dog"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -105,15 +133,23 @@ func BenchmarkStringWidth_ASCII_Medium_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_ASCII_Medium_GoRunewidth(b *testing.B) {
 	s := "The quick brown fox jumps over the lazy dog"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_ASCII_Medium_Uniseg(b *testing.B) {
+	s := "The quick brown fox jumps over the lazy dog"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 func BenchmarkStringWidth_ASCII_Long_Uniwidth(b *testing.B) {
 	s := "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -121,11 +157,19 @@ func BenchmarkStringWidth_ASCII_Long_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_ASCII_Long_GoRunewidth(b *testing.B) {
 	s := "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_ASCII_Long_Uniseg(b *testing.B) {
+	s := "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat."
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 // ============================================================================
 // StringWidth Benchmarks - CJK
 // ============================================================================
@@ -133,7 +177,7 @@ func BenchmarkStringWidth_ASCII_Long_GoRunewidth(b *testing.B) {
 func BenchmarkStringWidth_CJK_Short_Uniwidth(b *testing.B) {
 	s := "你好世界" // Hello World in Chinese
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -141,15 +185,23 @@ func BenchmarkStringWidth_CJK_Short_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_CJK_Short_GoRunewidth(b *testing.B) {
 	s := "你好世界"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_CJK_Short_Uniseg(b *testing.B) {
+	s := "你好世界"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 func BenchmarkStringWidth_CJK_Medium_Uniwidth(b *testing.B) {
 	s := "これは日本語のテキストです。漢字とひらがなとカタカナが含まれています。"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -157,19 +209,27 @@ func BenchmarkStringWidth_CJK_Medium_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_CJK_Medium_GoRunewidth(b *testing.B) {
 	s := "これは日本語のテキストです。漢字とひらがなとカタカナが含まれています。"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_CJK_Medium_Uniseg(b *testing.B) {
+	s := "これは日本語のテキストです。漢字とひらがなとカタカナが含まれています。"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 // ============================================================================
-// StringWidth Benchmarks - Mixed
+// StringWidth Benchmarks - Mixed (ASCII + CJK)
 // ============================================================================
 
 func BenchmarkStringWidth_Mixed_Short_Uniwidth(b *testing.B) {
 	s := "Hello 世界 World"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -177,15 +237,23 @@ func BenchmarkStringWidth_Mixed_Short_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_Mixed_Short_GoRunewidth(b *testing.B) {
 	s := "Hello 世界 World"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_Mixed_Short_Uniseg(b *testing.B) {
+	s := "Hello 世界 World"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 func BenchmarkStringWidth_Mixed_Medium_Uniwidth(b *testing.B) {
 	s := "User: John Doe (管理者) | Status: Active | 日本語対応"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -193,11 +261,19 @@ func BenchmarkStringWidth_Mixed_Medium_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_Mixed_Medium_GoRunewidth(b *testing.B) {
 	s := "User: John Doe (管理者) | Status: Active | 日本語対応"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_Mixed_Medium_Uniseg(b *testing.B) {
+	s := "User: John Doe (管理者) | Status: Active | 日本語対応"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 // ============================================================================
 // StringWidth Benchmarks - Emoji
 // ============================================================================
@@ -205,7 +281,7 @@ func BenchmarkStringWidth_Mixed_Medium_GoRunewidth(b *testing.B) {
 func BenchmarkStringWidth_Emoji_Short_Uniwidth(b *testing.B) {
 	s := "Hello 👋 World 😀"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -213,15 +289,23 @@ func BenchmarkStringWidth_Emoji_Short_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_Emoji_Short_GoRunewidth(b *testing.B) {
 	s := "Hello 👋 World 😀"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_Emoji_Short_Uniseg(b *testing.B) {
+	s := "Hello 👋 World 😀"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 func BenchmarkStringWidth_Emoji_Medium_Uniwidth(b *testing.B) {
 	s := "Status: ✅ Success | Error: ❌ Failed | Progress: 🚀 Loading..."
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -229,11 +313,19 @@ func BenchmarkStringWidth_Emoji_Medium_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_Emoji_Medium_GoRunewidth(b *testing.B) {
 	s := "Status: ✅ Success | Error: ❌ Failed | Progress: 🚀 Loading..."
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_Emoji_Medium_Uniseg(b *testing.B) {
+	s := "Status: ✅ Success | Error: ❌ Failed | Progress: 🚀 Loading..."
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 // ============================================================================
 // Real-world TUI Scenarios
 // ============================================================================
@@ -241,7 +333,7 @@ func BenchmarkStringWidth_Emoji_Medium_GoRunewidth(b *testing.B) {
 func BenchmarkStringWidth_TUI_Prompt_Uniwidth(b *testing.B) {
 	s := "❯ Enter command:"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -249,15 +341,23 @@ func BenchmarkStringWidth_TUI_Prompt_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_TUI_Prompt_GoRunewidth(b *testing.B) {
 	s := "❯ Enter command:"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_TUI_Prompt_Uniseg(b *testing.B) {
+	s := "❯ Enter command:"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 func BenchmarkStringWidth_TUI_TableHeader_Uniwidth(b *testing.B) {
 	s := "│ ID │ Name │ Status │ Created At │"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -265,15 +365,23 @@ func BenchmarkStringWidth_TUI_TableHeader_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_TUI_TableHeader_GoRunewidth(b *testing.B) {
 	s := "│ ID │ Name │ Status │ Created At │"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
 
+func BenchmarkStringWidth_TUI_TableHeader_Uniseg(b *testing.B) {
+	s := "│ ID │ Name │ Status │ Created At │"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
 func BenchmarkStringWidth_TUI_StatusLine_Uniwidth(b *testing.B) {
 	s := "✅ 12 passed | ❌ 3 failed | ⏭️  5 skipped | ⏱️  1.234s"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
 		_ = uniwidth.StringWidth(s)
 	}
 }
@@ -281,7 +389,108 @@ func BenchmarkStringWidth_TUI_StatusLine_Uniwidth(b *testing.B) {
 func BenchmarkStringWidth_TUI_StatusLine_GoRunewidth(b *testing.B) {
 	s := "✅ 12 passed | ❌ 3 failed | ⏭️  5 skipped | ⏱️  1.234s"
 	b.ResetTimer()
-	for i := 0; i < b.N; i++ {
+	for range b.N {
+		_ = runewidth.StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_TUI_StatusLine_Uniseg(b *testing.B) {
+	s := "✅ 12 passed | ❌ 3 failed | ⏭️  5 skipped | ⏱️  1.234s"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
+// ============================================================================
+// Complex Unicode Sequences
+//
+// These benchmarks test handling of multi-codepoint sequences that require
+// context-aware processing:
+//   - Flag emoji (regional indicator pairs)
+//   - ZWJ sequences (family emoji, profession emoji)
+//   - Combined complex strings mixing all sequence types
+//
+// Width results may differ between libraries for complex sequences.
+// uniseg performs full grapheme cluster segmentation (UAX #29) which produces
+// the most accurate results for ZWJ sequences. uniwidth and go-runewidth use
+// simpler per-rune or limited lookahead approaches optimized for speed.
+// ============================================================================
+
+// Flag emoji: regional indicator pairs forming country flags.
+// Each flag is two regional indicator codepoints rendered as a single glyph.
+func BenchmarkStringWidth_Flags_Uniwidth(b *testing.B) {
+	s := "🇺🇸🇩🇪🇯🇵🇬🇧🇫🇷"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniwidth.StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_Flags_GoRunewidth(b *testing.B) {
+	s := "🇺🇸🇩🇪🇯🇵🇬🇧🇫🇷"
+	b.ResetTimer()
+	for range b.N {
+		_ = runewidth.StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_Flags_Uniseg(b *testing.B) {
+	s := "🇺🇸🇩🇪🇯🇵🇬🇧🇫🇷"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
+// ZWJ sequences: emoji composed with Zero Width Joiner (U+200D).
+// These form complex glyphs like family groups and gendered professions.
+func BenchmarkStringWidth_ZWJ_Uniwidth(b *testing.B) {
+	s := "👨‍👩‍👧‍👦 👩‍💻 🏳️‍🌈"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniwidth.StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_ZWJ_GoRunewidth(b *testing.B) {
+	s := "👨‍👩‍👧‍👦 👩‍💻 🏳️‍🌈"
+	b.ResetTimer()
+	for range b.N {
+		_ = runewidth.StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_ZWJ_Uniseg(b *testing.B) {
+	s := "👨‍👩‍👧‍👦 👩‍💻 🏳️‍🌈"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
+
+// Combined: real-world complex string mixing ASCII, CJK, flags, and ZWJ.
+// Represents a realistic worst-case scenario for width calculation.
+func BenchmarkStringWidth_Combined_Uniwidth(b *testing.B) {
+	s := "Hello 🇺🇸 世界 👨‍👩‍👧‍👦 café"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniwidth.StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_Combined_GoRunewidth(b *testing.B) {
+	s := "Hello 🇺🇸 世界 👨‍👩‍👧‍👦 café"
+	b.ResetTimer()
+	for range b.N {
 		_ = runewidth.StringWidth(s)
 	}
 }
+
+func BenchmarkStringWidth_Combined_Uniseg(b *testing.B) {
+	s := "Hello 🇺🇸 世界 👨‍👩‍👧‍👦 café"
+	b.ResetTimer()
+	for range b.N {
+		_ = uniseg.StringWidth(s)
+	}
+}
diff --git a/bench/go.mod b/bench/go.mod
index 30dfacc..94b7b16 100644
--- a/bench/go.mod
+++ b/bench/go.mod
@@ -6,6 +6,7 @@ replace github.com/unilibs/uniwidth => ../
 
 require (
 	github.com/mattn/go-runewidth v0.0.19
+	github.com/rivo/uniseg v0.4.7
 	github.com/unilibs/uniwidth v0.0.0-00010101000000-000000000000
 )
 
diff --git a/bench/go.sum b/bench/go.sum
index c925e39..d44648d 100644
--- a/bench/go.sum
+++ b/bench/go.sum
@@ -2,3 +2,5 @@ github.com/clipperhouse/uax29/v2 v2.2.0 h1:ChwIKnQN3kcZteTXMgb1wztSgaU+ZemkgWdoh
 github.com/clipperhouse/uax29/v2 v2.2.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
 github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw=
 github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
+github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
+github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=

From 4c7f02b178842205fd7ecc94070fd359f9338f11 Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Thu, 5 Feb 2026 00:42:28 +0300
Subject: [PATCH 2/6] perf: SWAR optimization for ASCII fast paths
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace byte-by-byte isASCIIOnly() with SWAR processing 8 bytes/iter
- Add asciiWidth() with Daniel Lemire's SWAR control char detection
- Add short string fast path (<8 bytes) to avoid SWAR call overhead
- ASCII Short: 9 ns → 6 ns (1.5x faster)
- ASCII Medium: 71 ns → 24 ns (3x faster)
- ASCII Long: 340 ns → 77 ns (4.4x faster)
- Now 46x faster than go-runewidth, 77x faster than uniseg on long ASCII
- Zero allocations maintained for all ASCII paths
---
 uniwidth.go | 150 +++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 136 insertions(+), 14 deletions(-)

diff --git a/uniwidth.go b/uniwidth.go
index ac847c5..8fd74ca 100644
--- a/uniwidth.go
+++ b/uniwidth.go
@@ -13,6 +13,7 @@ package uniwidth
 
 import (
 	"unicode"
+	"unsafe"
 )
 
 // RuneWidth returns the visual width of a rune in monospace terminals.
@@ -169,20 +170,29 @@ func RuneWidth(r rune) int {
 //   - Variation selectors (U+FE0E/U+FE0F) modify the width of the preceding character
 //   - Regional indicator pairs (flags) are counted as width 2, not 4
 func StringWidth(s string) int {
-	// Fast path: ASCII-only strings
-	// This is the most common case (~95% of typical terminal content)
-	if isASCIIOnly(s) {
-		// Count width for ASCII, accounting for control characters
+	// Short string fast path (< 8 bytes): single-pass ASCII check and width
+	// count fused into one loop. For strings shorter than 8 bytes, the SWAR
+	// loop bodies in isASCIIOnly/asciiWidth never execute, making those two
+	// function calls pure overhead. This path avoids both calls entirely.
+	if len(s) < 8 {
 		width := 0
+		isASCII := true
 		for i := 0; i < len(s); i++ {
 			b := s[i]
-			// Control characters (0x00-0x1F, 0x7F) have zero width
-			if b < 0x20 || b == 0x7F {
-				continue // width += 0
+			if b >= 0x80 {
+				isASCII = false
+				break
+			}
+			if b >= 0x20 && b != 0x7F {
+				width++
 			}
-			width++
 		}
-		return width
+		if isASCII {
+			return width
+		}
+	} else if isASCIIOnly(s) {
+		// SWAR fast path for longer ASCII-only strings (8+ bytes)
+		return asciiWidth(s)
 	}
 
 	// Convert to rune slice for lookahead
@@ -247,18 +257,130 @@ func isRegionalIndicator(r rune) bool {
 }
 
 // isASCIIOnly returns true if the string contains only ASCII characters (0x00-0x7F).
-// This function is optimized for SIMD auto-vectorization by Go 1.25 compiler.
+//
+// Uses SWAR (SIMD Within A Register) to process 8 bytes at a time by loading
+// them into a uint64 and checking all high bits simultaneously with a single
+// AND against 0x8080808080808080. If any byte has its high bit set (>= 0x80),
+// it is non-ASCII. This works regardless of endianness because we only test
+// whether any byte has its high bit set, not which byte it is.
+//
+// Performance:
+//   - Short strings (< 8 bytes): scalar fallback, one byte checked per iteration
+//   - Longer strings: ~8x throughput via SWAR, n/8 word checks + n%8 tail bytes
+//   - 0 allocations in all cases
+//nolint:gosec // G103: unsafe usage is intentional for SWAR performance optimization;
+// all pointer offsets are kept inside the string by the loop guards (i+8 <= n, i < n).
 func isASCIIOnly(s string) bool {
-	// Simple loop structure allows compiler to auto-vectorize
-	// (SSE2/AVX2 on x86-64, NEON on ARM)
-	for i := 0; i < len(s); i++ {
-		if s[i] >= 0x80 {
+	n := len(s)
+	if n == 0 {
+		return true
+	}
+
+	p := unsafe.StringData(s)
+
+	// SWAR: process 8 bytes at a time
+	const asciiMask = uint64(0x8080808080808080)
+	i := 0
+	for ; i+8 <= n; i += 8 {
+		word := *(*uint64)(unsafe.Add(unsafe.Pointer(p), i))
+		if word&asciiMask != 0 {
 			return false
 		}
 	}
+
+	// Scalar tail: process remaining bytes (0-7)
+	for ; i < n; i++ {
+		if *(*byte)(unsafe.Add(unsafe.Pointer(p), i)) >= 0x80 {
+			return false
+		}
+	}
+
 	return true
 }
 
+// asciiWidth returns the visual width of an ASCII-only string, accounting for
+// control characters (0x00-0x1F, 0x7F) which have zero width.
+//
+// Uses SWAR to detect control characters in 8-byte chunks. If a chunk contains
+// no control characters, width += 8 directly. Otherwise, falls back to scalar
+// processing for that chunk.
+//
+// Control character detection uses Daniel Lemire's SWAR technique:
+//   - Bytes < 0x20: detected via (x - 0x2020...) & ~x & 0x8080...
+//   - Byte == 0x7F: detected via XOR with 0x7F7F... then same underflow trick
+//
+// The underflow trick works because subtracting 0x20 from a byte < 0x20 causes
+// the high bit to set (unsigned underflow), while the original byte had its high
+// bit clear. The AND with ~x isolates genuine underflows from bytes >= 0x80
+// (which cannot appear here since isASCIIOnly was already verified).
+//
+// Caller must ensure s contains only ASCII bytes (call isASCIIOnly first).
+//
+// Performance:
+//   - 0 allocations
+//   - ~8x throughput for chunks without control characters
+//nolint:gosec // G103: unsafe usage is intentional for SWAR performance optimization;
+// all pointer offsets are kept inside the string by the loop guards (i+8 <= n, i < n, j < 8).
+func asciiWidth(s string) int {
+	n := len(s)
+	if n == 0 {
+		return 0
+	}
+
+	p := unsafe.StringData(s)
+	width := 0
+	i := 0
+
+	// SWAR constants for control character detection.
+	const (
+		// Broadcast 0x20 and 0x7F across all 8 bytes of a uint64.
+		lo20 = uint64(0x2020202020202020)
+		hi80 = uint64(0x8080808080808080)
+		rep7F = uint64(0x7F7F7F7F7F7F7F7F)
+		rep01 = uint64(0x0101010101010101)
+	)
+
+	// Process 8 bytes at a time
+	for ; i+8 <= n; i += 8 {
+		word := *(*uint64)(unsafe.Add(unsafe.Pointer(p), i))
+
+		// Detect bytes < 0x20 using SWAR underflow trick:
+		// (word - 0x2020...) produces underflow (sets high bit) for bytes < 0x20.
+		// &^word masks out bytes that already had high bit set (not possible for
+		// ASCII, but defensive). &hi80 extracts only the high bits.
+		hasLow := (word - lo20) & ^word & hi80
+
+		// Detect bytes == 0x7F using XOR + underflow:
+		// word ^ 0x7F7F... zeros out any 0x7F bytes. Then the zero-byte detection
+		// pattern ((v - 0x0101...) & ~v & 0x8080...) finds the zeroed positions.
+		xored := word ^ rep7F
+		has7F := (xored - rep01) & ^xored & hi80
+
+		if (hasLow | has7F) == 0 {
+			// Fast path: no control characters in this 8-byte chunk
+			width += 8
+		} else {
+			// Slow path: at least one control character, process byte by byte
+			for j := 0; j < 8; j++ {
+				b := *(*byte)(unsafe.Add(unsafe.Pointer(p), i+j))
+				if b >= 0x20 && b != 0x7F {
+					width++
+				}
+			}
+		}
+	}
+
+	// Scalar tail: process remaining bytes (0-7)
+	for ; i < n; i++ {
+		b := *(*byte)(unsafe.Add(unsafe.Pointer(p), i))
+		if b >= 0x20 && b != 0x7F {
+			width++
+		}
+	}
+
+	return width
+}
+
 // binarySearchWidth performs binary search on Unicode width tables.
 // This is the fallback for rare characters not covered by hot paths.
 func binarySearchWidth(r rune) int {

From 1ca49230dcffe39b91c664f67f1ea32ce2af28c1 Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Thu, 5 Feb 2026 01:19:16 +0300
Subject: [PATCH 3/6] perf: replace binary search with O(1) multi-stage lookup
 table
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 3-stage hierarchical table: ROOT(256B) → MIDDLE(17×64) → LEAVES(78×32)
- Total size: 3.8KB with bucket deduplication
- 2-bit width encoding: 0=zero-width, 1=narrow, 2=wide, 3=ambiguous
- Exhaustive verification: all 1,112,064 valid codepoints match
- Coverage improved: 87.1% → 97.6%
- Merged zero-width format chars (0x200B-0x200F) into single range
- Updated generator with buildWidthMap() and buildMultiStageTable()
- Fixed 3 conformance test expectations for Unicode 16.0 data
---
 cmd/generate-tables/main.go | 387 +++++++++++++++++++-----
 conformance_test.go         |  15 +-
 options.go                  |  26 +-
 tables_generated.go         | 576 +++++++++++++++++++++++++++++++++++-
 uniwidth.go                 |  59 ++--
 uniwidth_test.go            | 284 ++++++++++++++++++
 6 files changed, 1239 insertions(+), 108 deletions(-)

diff --git a/cmd/generate-tables/main.go b/cmd/generate-tables/main.go
index 09de4fc..2ff0573 100644
--- a/cmd/generate-tables/main.go
+++ b/cmd/generate-tables/main.go
@@ -6,7 +6,8 @@
 //
 // It generates optimized tables for uniwidth's tiered lookup strategy:
 // - Tier 1-3 (hot paths) are hardcoded in uniwidth.go for O(1) lookup
-// - This generates Tier 4 (binary search fallback) tables
+// - This generates Tier 4 tables: both legacy binary search tables and
+//   a 3-stage multi-stage lookup table for O(1) fallback
 //
 // Usage:
 //
@@ -31,11 +32,19 @@ import (
 )
 
 const (
-	unicodeVersion       = "16.0.0"
-	eastAsianWidthURL    = "https://www.unicode.org/Public/16.0.0/ucd/EastAsianWidth.txt"
-	emojiDataURL         = "https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt"
-	outputFile           = "tables_generated.go"
-	outputFileWithHeader = "tables_generated.go"
+	unicodeVersion    = "16.0.0"
+	eastAsianWidthURL = "https://www.unicode.org/Public/16.0.0/ucd/EastAsianWidth.txt"
+	emojiDataURL      = "https://www.unicode.org/Public/16.0.0/ucd/emoji/emoji-data.txt"
+	outputFile        = "tables_generated.go"
+
+	// maxCodepoint is the maximum valid Unicode codepoint (U+10FFFF).
+	maxCodepoint = 0x10FFFF
+
+	// 2-bit width encoding for multi-stage table leaves.
+	widthZero      = 0 // width 0: control, combining, zero-width
+	widthNarrow    = 1 // width 1: narrow (default)
+	widthWide      = 2 // width 2: wide (CJK, emoji, fullwidth)
+	widthAmbiguous = 3 // width 1 in neutral context, 2 in East Asian
 )
 
 // runeRange represents a contiguous range of runes with the same property.
@@ -44,7 +53,7 @@ type runeRange struct {
 	last  rune
 }
 
-// Category represents different width categories
+// category represents different width categories
 type category int
 
 const (
@@ -76,14 +85,23 @@ func main() {
 	log.Println("Parsing Emoji data...")
 	emojiRanges := parseEmojiData(emojiData)
 
-	// Merge emoji into wide ranges
+	// Build multi-stage table from UNFILTERED ranges (covers all codepoints)
+	log.Println("Building multi-stage lookup table...")
+	root, middle, leaves := buildMultiStageTable(wideRanges, ambiguousRanges, emojiRanges)
+	log.Printf("  - Root table: %d entries", len(root))
+	log.Printf("  - Middle tables: %d unique sub-tables", len(middle))
+	log.Printf("  - Leaf tables: %d unique sub-tables", len(leaves))
+	totalBytes := len(root) + len(middle)*64 + len(leaves)*32
+	log.Printf("  - Total size: %d bytes (%.1f KiB)", totalBytes, float64(totalBytes)/1024)
+
+	// Merge emoji into wide ranges for legacy tables
 	wideRanges = mergeRanges(wideRanges, emojiRanges)
 
 	// Generate zero-width tables (control chars, combining marks, format chars)
 	log.Println("Generating zero-width tables...")
 	zeroWidthRanges := generateZeroWidthRanges()
 
-	// Filter out hot path ranges (already handled in uniwidth.go)
+	// Filter out hot path ranges (already handled in uniwidth.go) for legacy tables
 	log.Println("Filtering hot path ranges (Tier 1-3)...")
 	wideRanges = filterHotPaths(wideRanges)
 	zeroWidthRanges = filterZeroWidthHotPaths(zeroWidthRanges)
@@ -97,7 +115,7 @@ func main() {
 
 	// Generate output file
 	log.Println("Generating tables_generated.go...")
-	err = generateGoFile(wideRanges, zeroWidthRanges, ambiguousRanges)
+	err = generateGoFile(wideRanges, zeroWidthRanges, ambiguousRanges, &root, middle, leaves)
 	if err != nil {
 		log.Fatalf("Failed to generate Go file: %v", err)
 	}
@@ -106,6 +124,7 @@ func main() {
 	log.Printf("  - Wide characters: %d ranges", len(wideRanges))
 	log.Printf("  - Zero-width characters: %d ranges", len(zeroWidthRanges))
 	log.Printf("  - Ambiguous characters: %d ranges", len(ambiguousRanges))
+	log.Printf("  - Multi-stage table: root=%d, middle=%d, leaves=%d", len(root), len(middle), len(leaves))
 	log.Println("Done!")
 }
 
@@ -188,8 +207,8 @@ func parseEastAsianWidth(data string) (wide, ambiguous []runeRange) {
 // parseEmojiData parses emoji-data.txt and returns emoji ranges.
 func parseEmojiData(data string) []runeRange {
 	// Regex to match lines like:
-	// 0023          ; Emoji                # E0.0   [1] (#️)       number sign
-	// 1F600..1F64F  ; Emoji                # E0.6  [80] (😀..🙏)    grinning face..folded hands
+	// 0023          ; Emoji                # E0.0   [1] (#)       number sign
+	// 1F600..1F64F  ; Emoji                # E0.6  [80] (...)    grinning face..folded hands
 	lineRe := regexp.MustCompile(`^([0-9A-F]+)(?:\.\.([0-9A-F]+))?\s*;\s*Emoji\s`)
 
 	var ranges []runeRange
@@ -264,6 +283,10 @@ func generateZeroWidthRanges() []runeRange {
 		{0x094D, 0x094D},
 		{0x0951, 0x0957},
 		{0x0962, 0x0963},
+		// Combining Diacritical Marks Extended (U+1AB0-U+1AFF)
+		{0x1AB0, 0x1AFF},
+		// Combining Diacritical Marks Supplement (U+1DC0-U+1DFF)
+		{0x1DC0, 0x1DFF},
 		// Format characters (ZWS, ZWNJ, ZWJ, LRM, RLM, etc.)
 		{0x200B, 0x200F},
 		// Combining marks for symbols
@@ -316,10 +339,8 @@ func filterZeroWidthHotPaths(ranges []runeRange) []runeRange {
 		// ASCII control chars (handled in Tier 1)
 		{0x0000, 0x001F},
 		{0x007F, 0x007F},
-		// ZWJ (handled explicitly)
-		{0x200D, 0x200D},
-		// ZWNJ (handled explicitly)
-		{0x200C, 0x200C},
+		// Format characters: ZWSP, ZWNJ, ZWJ, LRM, RLM (handled explicitly)
+		{0x200B, 0x200F},
 		// Variation selectors (handled explicitly)
 		{0xFE00, 0xFE0F},
 		{0xE0100, 0xE01EF},
@@ -401,8 +422,159 @@ func optimizeRanges(ranges []runeRange) []runeRange {
 	return result
 }
 
-// generateGoFile generates the Go source file with tables.
-func generateGoFile(wide, zeroWidth, ambiguous []runeRange) error {
+// buildWidthMap builds a complete width map for all Unicode codepoints.
+// The map uses the 2-bit encoding: 0=zero, 1=narrow, 2=wide, 3=ambiguous.
+//
+// The application order matters: later assignments override earlier ones,
+// matching the priority logic in RuneWidth() and binarySearchWidth().
+func buildWidthMap(wide, ambiguous, emoji []runeRange) []byte {
+	// Allocate map for all codepoints (0x000000 - 0x10FFFF)
+	widthMap := make([]byte, maxCodepoint+1)
+
+	// Step 1: Default everything to narrow (width 1)
+	for i := range widthMap {
+		widthMap[i] = widthNarrow
+	}
+
+	// Step 2: Apply zero-width ranges
+	zeroWidthRanges := generateZeroWidthRanges()
+	for _, rr := range zeroWidthRanges {
+		for cp := rr.first; cp <= rr.last; cp++ {
+			widthMap[cp] = widthZero
+		}
+	}
+
+	// Step 3: Mark surrogates as zero-width (they are invalid in Go strings)
+	for cp := rune(0xD800); cp <= 0xDFFF; cp++ {
+		widthMap[cp] = widthZero
+	}
+
+	// Step 4: Apply ambiguous ranges (encoded as 3)
+	// Must be applied BEFORE wide ranges so that characters that are both
+	// ambiguous AND wide (due to emoji overlap) get the correct wide width.
+	for _, rr := range ambiguous {
+		for cp := rr.first; cp <= rr.last; cp++ {
+			widthMap[cp] = widthAmbiguous
+		}
+	}
+
+	// Step 5: Apply wide ranges from EastAsianWidth (W, F)
+	for _, rr := range wide {
+		for cp := rr.first; cp <= rr.last; cp++ {
+			widthMap[cp] = widthWide
+		}
+	}
+
+	// Step 6: Apply emoji ranges (width 2)
+	// Emoji override ambiguous (e.g., U+2600-U+26FF are both ambiguous and emoji)
+	for _, rr := range emoji {
+		for cp := rr.first; cp <= rr.last; cp++ {
+			widthMap[cp] = widthWide
+		}
+	}
+
+	// Step 7: Re-apply zero-width overrides that must take precedence
+	// (variation selectors, ZWJ, ZWNJ, combining marks, control chars, etc.)
+	// These are zero-width regardless of any other property.
+	for _, rr := range zeroWidthRanges {
+		for cp := rr.first; cp <= rr.last; cp++ {
+			widthMap[cp] = widthZero
+		}
+	}
+
+	// Step 8: Ensure ASCII is correct
+	// C0 control characters (0x00-0x1F): width 0
+	for cp := rune(0x00); cp <= 0x1F; cp++ {
+		widthMap[cp] = widthZero
+	}
+	// Printable ASCII (0x20-0x7E): width 1
+	for cp := rune(0x20); cp <= 0x7E; cp++ {
+		widthMap[cp] = widthNarrow
+	}
+	// DELETE (0x7F): width 0
+	widthMap[0x7F] = widthZero
+
+	return widthMap
+}
+
+// buildMultiStageTable constructs a 3-stage hierarchical lookup table from Unicode data.
+//
+// The 3-stage table splits a 21-bit Unicode codepoint into 3 parts:
+//
+//	Codepoint: [20...13][12...7][6...0]
+//	             8 bits   6 bits  7 bits
+//	Stage:       ROOT    MIDDLE   LEAF
+//
+// ROOT (256 entries): indexes into MIDDLE
+// MIDDLE (N x 64 entries): indexes into LEAVES
+// LEAVES (M x 32 entries): packed 2-bit width values, 4 per byte
+//
+// Deduplication of identical sub-tables is critical for compact size.
+func buildMultiStageTable(wide, ambiguous, emoji []runeRange) (root [256]byte, middle [][64]byte, leaves [][32]byte) {
+	widthMap := buildWidthMap(wide, ambiguous, emoji)
+
+	// Maps for deduplication: serialized sub-table -> index
+	leafIndex := make(map[[32]byte]byte)
+	midIndex := make(map[[64]byte]byte)
+
+	// Iterate over root blocks (each covers 2^13 = 8192 codepoints)
+	for rootBlock := 0; rootBlock < 256; rootBlock++ {
+		var midTable [64]byte
+
+		baseCP := rootBlock << 13
+
+		// Iterate over middle entries within this root block
+		// Each middle entry covers 2^7 = 128 codepoints
+		for midEntry := 0; midEntry < 64; midEntry++ {
+			var leafTable [32]byte
+
+			midBaseCP := baseCP + (midEntry << 7)
+
+			// Pack 128 codepoints into 32 bytes (4 codepoints per byte, 2 bits each)
+			for leafByte := 0; leafByte < 32; leafByte++ {
+				var packed byte
+				for bit := 0; bit < 4; bit++ {
+					cp := midBaseCP + (leafByte << 2) + bit
+					var w byte
+					if cp <= maxCodepoint {
+						w = widthMap[cp]
+					}
+					packed |= w << (2 * uint(bit))
+				}
+				leafTable[leafByte] = packed
+			}
+
+			// Deduplicate leaf table
+			idx, ok := leafIndex[leafTable]
+			if !ok {
+				if len(leaves) > 255 {
+					log.Fatalf("Too many unique leaf tables (%d > 255), cannot fit in uint8", len(leaves))
+				}
+				idx = byte(len(leaves))
+				leafIndex[leafTable] = idx
+				leaves = append(leaves, leafTable)
+			}
+			midTable[midEntry] = idx
+		}
+
+		// Deduplicate middle table
+		idx, ok := midIndex[midTable]
+		if !ok {
+			if len(middle) > 255 {
+				log.Fatalf("Too many unique middle tables (%d > 255), cannot fit in uint8", len(middle))
+			}
+			idx = byte(len(middle))
+			midIndex[midTable] = idx
+			middle = append(middle, midTable)
+		}
+		root[rootBlock] = idx
+	}
+
+	return root, middle, leaves
+}
+
+// generateGoFile generates the Go source file with both legacy and multi-stage tables.
+func generateGoFile(wide, zeroWidth, ambiguous []runeRange, root *[256]byte, middle [][64]byte, leaves [][32]byte) error {
 	file, err := os.Create(outputFile)
 	if err != nil {
 		return err
@@ -426,81 +598,146 @@ func generateGoFile(wide, zeroWidth, ambiguous []runeRange) error {
 
 package uniwidth
 
-// This file contains Unicode width tables for characters NOT covered by
-// the hot path tiers (Tier 1-3) in uniwidth.go.
+// This file contains Unicode width tables for character width lookup.
 //
-// These tables are used as a fallback for rare characters that need
-// binary search (Tier 4).
+// Two table formats are provided:
+// 1. Legacy runeRange tables (used by Options API for ambiguous character handling)
+// 2. Multi-stage lookup tables (used by tableLookupWidth for O(1) fallback)
 
 `, unicodeVersion); err != nil {
 		return fmt.Errorf("failed to write file header: %w", err)
 	}
 
-	// Write wide table
-	if _, err := fmt.Fprintf(w, "// wideTableGenerated contains wide characters (width 2) not covered by hot paths.\n"); err != nil {
-		return fmt.Errorf("failed to write wide table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "// These are characters with East Asian Width property W (Wide) or F (Fullwidth),\n"); err != nil {
-		return fmt.Errorf("failed to write wide table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "// plus emoji characters not in the common emoji fast path.\n"); err != nil {
-		return fmt.Errorf("failed to write wide table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "var wideTableGenerated = []runeRange{\n"); err != nil {
-		return fmt.Errorf("failed to write wide table declaration: %w", err)
-	}
+	// Write legacy wide table
+	writeComment(w, "wideTableGenerated contains wide characters (width 2) not covered by hot paths.")
+	writeComment(w, "These are characters with East Asian Width property W (Wide) or F (Fullwidth),")
+	writeComment(w, "plus emoji characters not in the common emoji fast path.")
+	writeComment(w, "Used by the Options API (binarySearchWidthInternal).")
+	fmt.Fprint(w, "var wideTableGenerated = []runeRange{\n")
 	for _, rr := range wide {
-		if _, err := fmt.Fprintf(w, "\t{0x%04X, 0x%04X},\n", rr.first, rr.last); err != nil {
-			return fmt.Errorf("failed to write wide table entry: %w", err)
-		}
-	}
-	if _, err := fmt.Fprintf(w, "}\n\n"); err != nil {
-		return fmt.Errorf("failed to close wide table: %w", err)
+		fmt.Fprintf(w, "\t{0x%04X, 0x%04X},\n", rr.first, rr.last)
 	}
+	fmt.Fprint(w, "}\n\n")
 
-	// Write zero-width table
-	if _, err := fmt.Fprintf(w, "// zeroWidthTableGenerated contains zero-width characters not covered by hot paths.\n"); err != nil {
-		return fmt.Errorf("failed to write zero-width table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "// These are control characters, combining marks, and format characters.\n"); err != nil {
-		return fmt.Errorf("failed to write zero-width table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "var zeroWidthTableGenerated = []runeRange{\n"); err != nil {
-		return fmt.Errorf("failed to write zero-width table declaration: %w", err)
-	}
+	// Write legacy zero-width table
+	writeComment(w, "zeroWidthTableGenerated contains zero-width characters not covered by hot paths.")
+	writeComment(w, "These are control characters, combining marks, and format characters.")
+	writeComment(w, "Used by the Options API (binarySearchWidthInternal).")
+	fmt.Fprint(w, "var zeroWidthTableGenerated = []runeRange{\n")
 	for _, rr := range zeroWidth {
-		if _, err := fmt.Fprintf(w, "\t{0x%04X, 0x%04X},\n", rr.first, rr.last); err != nil {
-			return fmt.Errorf("failed to write zero-width table entry: %w", err)
-		}
-	}
-	if _, err := fmt.Fprintf(w, "}\n\n"); err != nil {
-		return fmt.Errorf("failed to close zero-width table: %w", err)
+		fmt.Fprintf(w, "\t{0x%04X, 0x%04X},\n", rr.first, rr.last)
 	}
+	fmt.Fprint(w, "}\n\n")
 
-	// Write ambiguous table
-	if _, err := fmt.Fprintf(w, "// ambiguousTableGenerated contains ambiguous-width characters.\n"); err != nil {
-		return fmt.Errorf("failed to write ambiguous table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "// These are characters with East Asian Width property A (Ambiguous).\n"); err != nil {
-		return fmt.Errorf("failed to write ambiguous table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "// Width depends on context: 2 in East Asian locales, 1 in neutral context.\n"); err != nil {
-		return fmt.Errorf("failed to write ambiguous table comment: %w", err)
-	}
-	if _, err := fmt.Fprintf(w, "var ambiguousTableGenerated = []runeRange{\n"); err != nil {
-		return fmt.Errorf("failed to write ambiguous table declaration: %w", err)
-	}
+	// Write legacy ambiguous table
+	writeComment(w, "ambiguousTableGenerated contains ambiguous-width characters.")
+	writeComment(w, "These are characters with East Asian Width property A (Ambiguous).")
+	writeComment(w, "Width depends on context: 2 in East Asian locales, 1 in neutral context.")
+	writeComment(w, "Used by the Options API (binarySearchWidthInternal).")
+	fmt.Fprint(w, "var ambiguousTableGenerated = []runeRange{\n")
 	for _, rr := range ambiguous {
-		if _, err := fmt.Fprintf(w, "\t{0x%04X, 0x%04X},\n", rr.first, rr.last); err != nil {
-			return fmt.Errorf("failed to write ambiguous table entry: %w", err)
+		fmt.Fprintf(w, "\t{0x%04X, 0x%04X},\n", rr.first, rr.last)
+	}
+	fmt.Fprint(w, "}\n\n")
+
+	// Write multi-stage table documentation
+	writeComment(w, "3-Stage Multi-Stage Lookup Table")
+	writeComment(w, "")
+	writeComment(w, "Splits a 21-bit Unicode codepoint into 3 parts:")
+	writeComment(w, "  Codepoint: [20...13][12...7][6...0]")
+	writeComment(w, "               8 bits   6 bits  7 bits")
+	writeComment(w, "  Stage:       ROOT    MIDDLE   LEAF")
+	writeComment(w, "")
+	writeComment(w, "Lookup: widthRoot[cp>>13] -> midIdx")
+	writeComment(w, "        widthMiddle[midIdx][cp>>7 & 0x3F] -> leafIdx")
+	writeComment(w, "        widthLeaves[leafIdx][cp>>2 & 0x1F] >> (2*(cp&0x03)) & 0x03 -> width")
+	writeComment(w, "")
+	writeComment(w, "2-bit width encoding:")
+	writeComment(w, "  0b00 = width 0 (control, combining, zero-width)")
+	writeComment(w, "  0b01 = width 1 (narrow, default)")
+	writeComment(w, "  0b10 = width 2 (wide: CJK, emoji, fullwidth)")
+	writeComment(w, "  0b11 = ambiguous (width 1 in neutral context; 2 in East Asian)")
+	fmt.Fprint(w, "\n")
+
+	// Write root table
+	fmt.Fprintf(w, "// widthRoot maps the top 8 bits of a codepoint (cp >> 13) to a middle table index.\n")
+	fmt.Fprintf(w, "// Size: 256 bytes.\n")
+	fmt.Fprint(w, "var widthRoot = [256]uint8{\n")
+	for i := 0; i < 256; i += 16 {
+		fmt.Fprint(w, "\t")
+		for j := 0; j < 16; j++ {
+			if j > 0 {
+				fmt.Fprint(w, " ")
+			}
+			fmt.Fprintf(w, "0x%02X,", root[i+j])
 		}
+		fmt.Fprint(w, "\n")
+	}
+	fmt.Fprint(w, "}\n\n")
+
+	// Write middle tables
+	fmt.Fprintf(w, "// widthMiddle contains %d unique middle sub-tables.\n", len(middle))
+	fmt.Fprintf(w, "// Each sub-table has 64 entries mapping bits [12:7] to a leaf table index.\n")
+	fmt.Fprintf(w, "// Size: %d bytes.\n", len(middle)*64)
+	fmt.Fprintf(w, "var widthMiddle = [%d][64]uint8{\n", len(middle))
+	for i, mt := range middle {
+		fmt.Fprintf(w, "\t// Middle table %d\n", i)
+		fmt.Fprint(w, "\t{\n")
+		for row := 0; row < 64; row += 16 {
+			fmt.Fprint(w, "\t\t")
+			end := row + 16
+			if end > 64 {
+				end = 64
+			}
+			for j := row; j < end; j++ {
+				if j > row {
+					fmt.Fprint(w, " ")
+				}
+				fmt.Fprintf(w, "0x%02X,", mt[j])
+			}
+			fmt.Fprint(w, "\n")
+		}
+		fmt.Fprint(w, "\t},\n")
+	}
+	fmt.Fprint(w, "}\n\n")
+
+	// Write leaf tables
+	fmt.Fprintf(w, "// widthLeaves contains %d unique leaf sub-tables.\n", len(leaves))
+	fmt.Fprintf(w, "// Each sub-table has 32 bytes of packed 2-bit width values (128 codepoints).\n")
+	fmt.Fprintf(w, "// Size: %d bytes.\n", len(leaves)*32)
+	fmt.Fprintf(w, "var widthLeaves = [%d][32]uint8{\n", len(leaves))
+	for i, lt := range leaves {
+		fmt.Fprintf(w, "\t// Leaf table %d\n", i)
+		fmt.Fprint(w, "\t{\n")
+		for row := 0; row < 32; row += 16 {
+			fmt.Fprint(w, "\t\t")
+			end := row + 16
+			if end > 32 {
+				end = 32
+			}
+			for j := row; j < end; j++ {
+				if j > row {
+					fmt.Fprint(w, " ")
+				}
+				fmt.Fprintf(w, "0x%02X,", lt[j])
+			}
+			fmt.Fprint(w, "\n")
+		}
+		fmt.Fprint(w, "\t},\n")
 	}
-	if _, err := fmt.Fprintf(w, "}\n"); err != nil {
-		return fmt.Errorf("failed to close ambiguous table: %w", err)
-	}
+	fmt.Fprint(w, "}\n")
 
 	if err := w.Flush(); err != nil {
 		return fmt.Errorf("failed to flush writer: %w", err)
 	}
 	return nil
 }
+
+// writeComment writes a single-line Go comment to the writer.
+func writeComment(w *bufio.Writer, line string) {
+	if line == "" {
+		fmt.Fprint(w, "//\n")
+	} else {
+		fmt.Fprintf(w, "// %s\n", line)
+	}
+}
diff --git a/conformance_test.go b/conformance_test.go
index c2a1216..db3d33c 100644
--- a/conformance_test.go
+++ b/conformance_test.go
@@ -107,7 +107,8 @@ func TestUnicodeConformance_EdgeCases(t *testing.T) {
 		{"Just after ASCII", 0x80, 0},    // C1 control
 
 		// Boundary of CJK Unified Ideographs
-		{"Before CJK", 0x4DFF, 1},
+		// Note: U+4DFF is the last code point of the Yijing Hexagram Symbols block (U+4DC0-U+4DFF), which is W in Unicode 16.0
+		{"Before CJK", 0x4DFF, 2},
 		{"CJK start", 0x4E00, 2},
 		{"CJK end", 0x9FFF, 2},
 		{"After CJK", 0xA000, 2}, // Yi Syllables
@@ -119,7 +120,13 @@ func TestUnicodeConformance_EdgeCases(t *testing.T) {
 		{"After Hangul", 0xD7B0, 1},
 
 		// Boundary of Hiragana/Katakana
-		{"Before Hiragana", 0x303F, 2},
+		// Note: U+303F (IDEOGRAPHIC HALF FILL SPACE) has East Asian Width N (Neutral)
+		// in Unicode 16.0, so its expected width is 1. It sits just below the Tier 2
+		// hot path, which starts at 0x3040, and just above the generated wide table
+		// range {0x2FF0, 0x303E}, which ends at 0x303E. The previously generated table
+		// extended that range to 0x303F; with fresh Unicode 16.0 data, U+303F is no
+		// longer included in the generated wide table.
+		{"Before Hiragana", 0x303F, 1},
 		{"Hiragana start", 0x3040, 2},
 		{"Katakana end", 0x30FF, 2},
 		{"After Katakana", 0x3100, 2},
@@ -166,8 +173,8 @@ func TestUnicodeConformance_SurrogateHandling(t *testing.T) {
 		want int
 	}{
 		// Characters in Supplementary Multilingual Plane (SMP)
-		{"Gothic letter", "𐌰", 1},              // U+10330
-		{"Linear B syllable", "𐀀", 2},          // U+10000
+		{"Gothic letter", "𐌰", 1},            // U+10330
+		{"Linear B syllable", "𐀀", 1},        // U+10000 (EAW: N = Neutral/Narrow)
 		{"Emoji family", "👨\u200D👩\u200D👧", 6}, // Man + ZWJ + Woman + ZWJ + Girl (simplified width)
 	}
 
diff --git a/options.go b/options.go
index 0bb504f..bfd7cab 100644
--- a/options.go
+++ b/options.go
@@ -218,7 +218,9 @@ func runeWidthInternal(r rune) int {
 	// ========================================
 	// Zero-Width Characters (O(1))
 	// ========================================
-	if r == 0x200D || r == 0x200C {
+	// Format characters (U+200B-U+200F):
+	// ZWSP, ZWNJ, ZWJ, LRM, RLM - all zero-width formatting characters
+	if r >= 0x200B && r <= 0x200F {
 		return 0
 	}
 	if r >= 0xFE00 && r <= 0xFE0F {
@@ -240,12 +242,30 @@ func runeWidthInternal(r rune) int {
 	}
 
 	// ========================================
-	// Tier 4: Binary Search Fallback (O(log n))
+	// Tier 4: Multi-Stage Table Lookup (O(1))
 	// ========================================
-	return binarySearchWidthInternal(r)
+	return tableLookupWidthInternal(r)
+}
+
+// tableLookupWidthInternal performs O(1) width lookup using the 3-stage table,
+// returning -1 for ambiguous characters (encoding 0b11) so the caller can
+// apply the configured East Asian width.
+//
+// Performance: O(1), 0 allocations.
+func tableLookupWidthInternal(r rune) int {
+	cp := uint32(r)
+	rootIdx := widthRoot[cp>>13]
+	midIdx := widthMiddle[rootIdx][cp>>7&0x3F]
+	packed := widthLeaves[midIdx][cp>>2&0x1F]
+	width := (packed >> (2 * (cp & 0x03))) & 0x03
+	if width == 3 {
+		return -1 // ambiguous - caller decides
+	}
+	return int(width)
 }
 
 // binarySearchWidthInternal performs binary search and returns -1 for ambiguous characters.
+// Kept for backward compatibility; tableLookupWidthInternal is preferred.
 func binarySearchWidthInternal(r rune) int {
 	// Search in generated wide table (width 2)
 	if binarySearch(r, wideTableGenerated) {
diff --git a/tables_generated.go b/tables_generated.go
index c790a00..bebd6f2 100644
--- a/tables_generated.go
+++ b/tables_generated.go
@@ -8,15 +8,16 @@
 
 package uniwidth
 
-// This file contains Unicode width tables for characters NOT covered by
-// the hot path tiers (Tier 1-3) in uniwidth.go.
+// This file contains Unicode width tables for character width lookup.
 //
-// These tables are used as a fallback for rare characters that need
-// binary search (Tier 4).
+// Two table formats are provided:
+// 1. Legacy runeRange tables (used by Options API for ambiguous character handling)
+// 2. Multi-stage lookup tables (used by tableLookupWidth for O(1) fallback)
 
 // wideTableGenerated contains wide characters (width 2) not covered by hot paths.
 // These are characters with East Asian Width property W (Wide) or F (Fullwidth),
 // plus emoji characters not in the common emoji fast path.
+// Used by the Options API (binarySearchWidthInternal).
 var wideTableGenerated = []runeRange{
 	{0x00A9, 0x00A9},
 	{0x00AE, 0x00AE},
@@ -45,13 +46,13 @@ var wideTableGenerated = []runeRange{
 	{0x2E80, 0x2E99},
 	{0x2E9B, 0x2EF3},
 	{0x2F00, 0x2FD5},
-	{0x2FF0, 0x303F},
+	{0x2FF0, 0x303E},
 	{0x3105, 0x312F},
 	{0x3131, 0x318E},
 	{0x3190, 0x31E5},
 	{0x31EF, 0x321E},
 	{0x3220, 0x3247},
-	{0x3250, 0x4DBE},
+	{0x3250, 0x4DFF},
 	{0xA000, 0xA48C},
 	{0xA490, 0xA4C6},
 	{0xA960, 0xA97C},
@@ -61,7 +62,6 @@ var wideTableGenerated = []runeRange{
 	{0xFE68, 0xFE6B},
 	{0xFF01, 0xFF60},
 	{0xFFE0, 0xFFE6},
-	{0x10000, 0x1007F},
 	{0x16FE0, 0x16FE4},
 	{0x16FF0, 0x16FF1},
 	{0x17000, 0x187F7},
@@ -103,6 +103,7 @@ var wideTableGenerated = []runeRange{
 
 // zeroWidthTableGenerated contains zero-width characters not covered by hot paths.
 // These are control characters, combining marks, and format characters.
+// Used by the Options API (binarySearchWidthInternal).
 var zeroWidthTableGenerated = []runeRange{
 	{0x0080, 0x009F},
 	{0x00AD, 0x00AD},
@@ -127,6 +128,7 @@ var zeroWidthTableGenerated = []runeRange{
 	{0x0951, 0x0957},
 	{0x0962, 0x0963},
 	{0x1AB0, 0x1AFF},
+	{0x1DC0, 0x1DFF},
 	{0x20D0, 0x20FF},
 	{0xFE20, 0xFE2F},
 	{0xFEFF, 0xFEFF},
@@ -135,6 +137,7 @@ var zeroWidthTableGenerated = []runeRange{
 // ambiguousTableGenerated contains ambiguous-width characters.
 // These are characters with East Asian Width property A (Ambiguous).
 // Width depends on context: 2 in East Asian locales, 1 in neutral context.
+// Used by the Options API (binarySearchWidthInternal).
 var ambiguousTableGenerated = []runeRange{
 	{0x00A1, 0x00A1},
 	{0x00A4, 0x00A4},
@@ -316,3 +319,562 @@ var ambiguousTableGenerated = []runeRange{
 	{0xF0000, 0xFFFFD},
 	{0x100000, 0x10FFFD},
 }
+
+// 3-Stage Multi-Stage Lookup Table
+//
+// Splits a 21-bit Unicode codepoint into 3 parts:
+//   Codepoint: [20...13][12...7][6...0]
+//                8 bits   6 bits  7 bits
+//   Stage:       ROOT    MIDDLE   LEAF
+//
+// Lookup: widthRoot[cp>>13] -> midIdx
+//         widthMiddle[midIdx][cp>>7 & 0x3F] -> leafIdx
+//         widthLeaves[leafIdx][cp>>2 & 0x1F] >> (2*(cp&0x03)) & 0x03 -> width
+//
+// 2-bit width encoding:
+//   0b00 = width 0 (control, combining, zero-width)
+//   0b01 = width 1 (narrow, default)
+//   0b10 = width 2 (wide: CJK, emoji, fullwidth)
+//   0b11 = ambiguous (width 1 in neutral context; 2 in East Asian)
+
+// widthRoot is stage 1 of the lookup: indexed by cp >> 13 (the top 8 of 21 bits), it yields a widthMiddle sub-table index.
+// Size: 256 bytes. Valid codepoints (<= 0x10FFFF) only reach entries 0x00-0x87; every later entry selects sub-table 0x10.
+var widthRoot = [256]uint8{
+	0x00, 0x01, 0x02, 0x02, 0x02, 0x03, 0x04, 0x05, 0x06, 0x06, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
+	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x0C, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x0C,
+	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+	0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06,
+	0x0D, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x06, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0F,
+	0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0E, 0x0F, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+	0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+}
+
+// widthMiddle is stage 2 of the lookup: 17 unique middle sub-tables, one of which is selected by a widthRoot entry.
+// Each sub-table has 64 entries mapping bits [12:7] of the codepoint (cp>>7 & 0x3F) to a widthLeaves index.
+// Size: 1088 bytes (17 * 64).
+var widthMiddle = [17][64]uint8{
+	// Middle table 0: selected by widthRoot[0x00], i.e. U+0000-U+1FFF
+	{
+		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x09, 0x0A, 0x0B, 0x0C, 0x09, 0x09,
+		0x09, 0x09, 0x0D, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x0E, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x0F, 0x09, 0x09, 0x09, 0x09, 0x09, 0x10, 0x09, 0x09, 0x09, 0x09,
+	},
+	// Middle table 1: selected by widthRoot[0x01], i.e. U+2000-U+3FFF
+	{
+		0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20,
+		0x09, 0x09, 0x21, 0x09, 0x09, 0x09, 0x22, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x23, 0x24, 0x25,
+		0x26, 0x27, 0x28, 0x29, 0x2A, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+	},
+	// Middle table 2: every entry is leaf 0x24 (all width 2)
+	{
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+	},
+	// Middle table 3
+	{
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x2B, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x2C, 0x09, 0x09, 0x09, 0x09, 0x09, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+	},
+	// Middle table 4
+	{
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x2D,
+		0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
+	},
+	// Middle table 5
+	{
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x24, 0x24, 0x24, 0x24, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x30, 0x31, 0x32, 0x33,
+	},
+	// Middle table 6: every entry is leaf 0x09 (all width 1)
+	{
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+	},
+	// Middle table 7
+	{
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x34,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+	},
+	// Middle table 8
+	{
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x35,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x36, 0x37, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+	},
+	// Middle table 9
+	{
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x38,
+		0x24, 0x24, 0x39, 0x24, 0x24, 0x3A, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+	},
+	// Middle table 10
+	{
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x3B, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+	},
+	// Middle table 11
+	{
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x09, 0x41, 0x42, 0x24, 0x43, 0x44, 0x45, 0x46, 0x47, 0x09, 0x48,
+		0x09, 0x09, 0x49, 0x24, 0x4A, 0x4B, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+	},
+	// Middle table 12
+	{
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24,
+		0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x24, 0x4C,
+	},
+	// Middle table 13
+	{
+		0x09, 0x09, 0x2E, 0x06, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+		0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09, 0x09,
+	},
+	// Middle table 14: every entry is leaf 0x2F (all ambiguous)
+	{
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+	},
+	// Middle table 15
+	{
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F,
+		0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x2F, 0x4D,
+	},
+	// Middle table 16: every entry is leaf 0x2E (all width 0)
+	{
+		0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
+		0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
+		0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
+		0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
+	},
+}
+
+// widthLeaves is stage 3 of the lookup: 78 unique leaf sub-tables, one of which is selected by a widthMiddle entry.
+// Each sub-table has 32 bytes of packed 2-bit width values (128 codepoints, 4 per byte, lowest codepoint in the low bits).
+// Size: 2496 bytes (78 * 32).
+var widthLeaves = [78][32]uint8{
+	// Leaf table 0: reached via widthMiddle[0][0], i.e. U+0000-U+007F (U+0000-U+001F and U+007F are width 0)
+	{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15,
+	},
+	// Leaf table 1
+	{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x5D, 0xD7, 0x7B, 0x61, 0xFF, 0xF7, 0x7F, 0xFF,
+		0x55, 0x75, 0x55, 0x55, 0x57, 0xD5, 0x57, 0xF5, 0x5F, 0x75, 0x7F, 0x5F, 0xF7, 0xD5, 0x7F, 0x77,
+	},
+	// Leaf table 2
+	{
+		0x5D, 0x55, 0x55, 0x55, 0xDD, 0x55, 0xD5, 0x55, 0x55, 0xF5, 0xD5, 0x55, 0xFD, 0x55, 0x57, 0xD5,
+		0x7F, 0x57, 0xFF, 0x5D, 0xF5, 0x55, 0x55, 0x55, 0x55, 0xF5, 0xD5, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 3
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x75, 0x77, 0x77, 0x77, 0x57, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 4
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x5D, 0x55, 0x55, 0x55, 0x5D, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 5
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0xD7, 0xFD, 0x5D, 0x57, 0x55, 0xFF, 0xDD, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 6
+	{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 7
+	{
+		0x55, 0x55, 0x55, 0x55, 0xFD, 0xFF, 0xFF, 0xFF, 0xDF, 0xFF, 0x5F, 0x55, 0xFD, 0xFF, 0xFF, 0xFF,
+		0xDF, 0xFF, 0x5F, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 8
+	{
+		0x5D, 0x55, 0x55, 0x55, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0x5D, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 9: all 128 codepoints are width 1 (narrow)
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 10
+	{
+		0x55, 0x55, 0x55, 0x55, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10,
+		0x41, 0x10, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 11
+	{
+		0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x40, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x15, 0x00, 0x00, 0x00, 0x00, 0x00, 0x55, 0x55, 0x55, 0x55, 0x54, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 12
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x05, 0x00, 0x14, 0x00, 0x14, 0x04, 0x50, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 13
+	{
+		0x41, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x45, 0x54,
+		0x01, 0x00, 0x54, 0x51, 0x01, 0x00, 0x55, 0x55, 0x05, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 14
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 15
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	},
+	// Leaf table 16
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	},
+	// Leaf table 17
+	{
+		0x55, 0x55, 0x15, 0x00, 0xD7, 0x7F, 0x5F, 0x5F, 0x7F, 0xFF, 0x55, 0x55, 0xF7, 0x5D, 0xD5, 0x76,
+		0x55, 0x55, 0x59, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x55, 0xD5,
+	},
+	// Leaf table 18
+	{
+		0xFD, 0x57, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x57, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	},
+	// Leaf table 19
+	{
+		0xD5, 0x5D, 0x5D, 0x55, 0xD5, 0x75, 0x55, 0x55, 0x6D, 0x75, 0xD5, 0x55, 0x55, 0x55, 0x59, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0xD5, 0x57, 0xD5, 0x7F, 0xFF, 0xFF, 0xFF, 0x55, 0xFF, 0xFF, 0x5F, 0x55,
+	},
+	// Leaf table 20
+	{
+		0x55, 0x55, 0x5D, 0x55, 0xFF, 0xAA, 0x5A, 0x55, 0x55, 0x55, 0x69, 0x55, 0x55, 0x55, 0x5F, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x75, 0x57, 0x55, 0x55, 0x55, 0xD5, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 21
+	{
+		0xF7, 0xD5, 0xD7, 0xD5, 0x5D, 0x5D, 0x75, 0xFD, 0xD7, 0xDD, 0xFF, 0x77, 0x55, 0xFF, 0x55, 0x5F,
+		0x55, 0x55, 0x57, 0x57, 0x75, 0x55, 0x55, 0x55, 0x5F, 0xFF, 0xF5, 0xF5, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 22
+	{
+		0xF5, 0xF5, 0x55, 0x55, 0x55, 0x5D, 0x5D, 0x55, 0x55, 0x5D, 0x55, 0x55, 0x55, 0x55, 0x55, 0xD5,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 23
+	{
+		0x55, 0x55, 0x55, 0x55, 0x75, 0x55, 0xA5, 0x55, 0x55, 0x55, 0x6A, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 24
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x95, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xA9, 0xAA, 0xAA, 0x55, 0x6A, 0x55,
+	},
+	// Leaf table 25
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	},
+	// Leaf table 26
+	{
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xEF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xDF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	},
+	// Leaf table 27
+	{
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0x55, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 28
+	{
+		0xFF, 0xFF, 0xFF, 0xFF, 0xF5, 0x5F, 0x55, 0x55, 0xDF, 0xFF, 0xAF, 0x55, 0xF5, 0xE5, 0x55, 0x5F,
+		0x5E, 0xF5, 0xD7, 0xF5, 0x5F, 0x55, 0x55, 0x55, 0xF5, 0x5F, 0x55, 0xD5, 0x55, 0x55, 0x95, 0x6A,
+	},
+	// Leaf table 29
+	{
+		0xAA, 0x7E, 0x5D, 0xE5, 0x59, 0x5A, 0x56, 0x7B, 0xA6, 0x65, 0x65, 0xA5, 0xAA, 0xAA, 0x6A, 0x55,
+		0x66, 0x55, 0xAA, 0xAA, 0xAA, 0x55, 0x55, 0x95, 0x9E, 0xEB, 0x7E, 0xDF, 0x55, 0x55, 0x95, 0xA5,
+	},
+	// Leaf table 30
+	{
+		0x55, 0x55, 0xA5, 0xAA, 0xA5, 0xAA, 0x99, 0xF6, 0x5A, 0x95, 0xA5, 0x55, 0x5A, 0x55, 0x55, 0xE9,
+		0x55, 0xFA, 0xFE, 0xAF, 0xBB, 0xFE, 0xFF, 0xFF, 0xDF, 0x55, 0xEB, 0xFF, 0xAA, 0xBA, 0xEA, 0xFB,
+	},
+	// Leaf table 31
+	{
+		0x65, 0x59, 0xAA, 0x9A, 0x65, 0x66, 0x55, 0x59, 0x59, 0x55, 0x56, 0x55, 0x95, 0x56, 0x55, 0x5D,
+		0x55, 0x96, 0x55, 0x66, 0x95, 0x9A, 0x55, 0x55, 0x95, 0x56, 0x55, 0x55, 0x55, 0xF5, 0xFF, 0xFF,
+	},
+	// Leaf table 32
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0xA9, 0x55, 0x55, 0x59, 0x55, 0x55, 0x55, 0x56, 0x55, 0x55, 0x95,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 33
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x5A, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 34
+	{
+		0x55, 0xA9, 0x55, 0x55, 0x55, 0x55, 0x95, 0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x56, 0xF9, 0x5F, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 35
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x9A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 36: all 128 codepoints are width 2 (wide)
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 37
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 38
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A,
+		0xA9, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 39
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0xA9, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 40
+	{
+		0x55, 0xA9, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xA9, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 41
+	{
+		0xAA, 0xAA, 0xAA, 0x6A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0x95, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 42
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xFF, 0xFF, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 43
+	{
+		0xAA, 0xAA, 0xAA, 0x56, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0x6A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 44
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x56,
+	},
+	// Leaf table 45
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 46: all 128 codepoints are width 0
+	{
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+		0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	},
+	// Leaf table 47: all 128 codepoints are ambiguous (0b11)
+	{
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+	},
+	// Leaf table 48
+	{
+		0x00, 0x00, 0x00, 0x00, 0xAA, 0xAA, 0x5A, 0x55, 0x00, 0x00, 0x00, 0x00, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0xAA, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 49
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x15,
+	},
+	// Leaf table 50
+	{
+		0xA9, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 51
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0x6A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x5D,
+	},
+	// Leaf table 52
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0x56, 0x55, 0x55, 0x5A, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 53
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x55,
+	},
+	// Leaf table 54
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x95,
+	},
+	// Leaf table 55
+	{
+		0xAA, 0xAA, 0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 56
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0xA9, 0xAA, 0x69,
+	},
+	// Leaf table 57
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0x55, 0x55, 0x55, 0x65, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x6A, 0x59, 0x55, 0x55, 0x55, 0xAA, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 58
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55,
+	},
+	// Leaf table 59
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0x55, 0x55,
+	},
+	// Leaf table 60
+	{
+		0x55, 0x56, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 61
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x95, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 62
+	{
+		0xFF, 0xFF, 0x7F, 0x55, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F, 0x55, 0xFA, 0xFF, 0xFF, 0xAF,
+	},
+	// Leaf table 63
+	{
+		0xFF, 0xFF, 0xFF, 0xEF, 0xAB, 0xAA, 0xEA, 0xFF, 0xFF, 0xFF, 0xFF, 0x57, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xA5, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 64
+	{
+		0x6A, 0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55,
+		0xAA, 0xAA, 0x56, 0x55, 0x5A, 0x55, 0x55, 0x55, 0xAA, 0x5A, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 65
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 66
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xA5, 0xA9, 0xA5, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x96, 0x9A, 0xAA, 0xAA,
+	},
+	// Leaf table 67
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x9A,
+	},
+	// Leaf table 68
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A,
+		0x55, 0x55, 0xA9, 0x6A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x95, 0x96, 0xAA, 0x6A, 0x55,
+	},
+	// Leaf table 69
+	{
+		0x55, 0x95, 0xA5, 0x5A, 0x56, 0x69, 0x55, 0x55, 0x55, 0x5A, 0x56, 0x55, 0x69, 0x55, 0x55, 0x56,
+		0xA5, 0x56, 0x55, 0x55, 0xA9, 0x55, 0x55, 0x6A, 0x99, 0x55, 0x56, 0x95, 0x95, 0x55, 0xA5, 0xAA,
+	},
+	// Leaf table 70
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 71
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0x5A, 0x95, 0xAA, 0x6A, 0xA9, 0x55, 0xAA, 0xAA, 0x5A, 0x99, 0x56, 0x96, 0xAA, 0xAA, 0x56,
+	},
+	// Leaf table 72
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0x55, 0x56, 0x55, 0x55, 0x55,
+	},
+	// Leaf table 73
+	{
+		0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x6A, 0xAA,
+		0xAA, 0x9A, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+	},
+	// Leaf table 74
+	{
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55,
+		0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0xAA, 0xAA, 0xAA, 0x56,
+	},
+	// Leaf table 75
+	{
+		0xAA, 0xAA, 0x5A, 0x95, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0x6A, 0x55, 0xA5, 0xAA, 0xAA, 0xAA, 0x96, 0xAA, 0xAA, 0x5A, 0x55, 0xAA, 0xAA, 0x56, 0x55,
+	},
+	// Leaf table 76
+	{
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA,
+		0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0xAA, 0x5A,
+	},
+	// Leaf table 77
+	{
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x5F,
+	},
+}
diff --git a/uniwidth.go b/uniwidth.go
index 8fd74ca..7a2c9df 100644
--- a/uniwidth.go
+++ b/uniwidth.go
@@ -3,9 +3,11 @@
 // uniwidth uses a tiered lookup strategy for optimal performance:
 //   - Tier 1: ASCII (O(1), ~95% of typical content)
 //   - Tier 2: Common CJK & Emoji (O(1), ~90% of non-ASCII)
-//   - Tier 3: Binary search for rare characters (O(log n))
+//   - Tier 3: Common Emoji (O(1))
+//   - Tier 4: 3-stage table lookup for all other characters (O(1))
 //
-// This approach is 3-4x faster than traditional binary-search-only methods
+// All tiers are O(1) with zero allocations for single-rune lookups.
+// This approach is 3-46x faster than traditional binary-search-only methods
 // like go-runewidth, while maintaining full Unicode 16.0 compliance.
 //
 //go:generate go run cmd/generate-tables/main.go
@@ -26,7 +28,7 @@ import (
 // This function uses a tiered lookup strategy:
 //   - O(1) for ASCII (most common case)
 //   - O(1) for common CJK and emoji (hot paths)
-//   - O(log n) for rare characters (fallback)
+//   - O(1) for all other characters (3-stage table lookup)
 func RuneWidth(r rune) int {
 	// ========================================
 	// Tier 1: ASCII Fast Path (O(1))
@@ -121,18 +123,14 @@ func RuneWidth(r rune) int {
 	// Zero-Width Characters (O(1))
 	// ========================================
 
-	// Zero-Width Space (ZWSP) - U+200B
-	if r == 0x200B {
-		return 0
-	}
-
-	// Zero-Width Non-Joiner (ZWNJ)
-	if r == 0x200C {
-		return 0
-	}
-
-	// Zero-Width Joiner (ZWJ) - used in emoji sequences
-	if r == 0x200D {
+	// Format characters (U+200B-U+200F):
+	// U+200B: Zero-Width Space (ZWSP)
+	// U+200C: Zero-Width Non-Joiner (ZWNJ)
+	// U+200D: Zero-Width Joiner (ZWJ) - used in emoji sequences
+	// U+200E: Left-to-Right Mark (LRM)
+	// U+200F: Right-to-Left Mark (RLM)
+	// All are invisible formatting characters with zero terminal width.
+	if r >= 0x200B && r <= 0x200F {
 		return 0
 	}
 
@@ -155,10 +153,11 @@ func RuneWidth(r rune) int {
 	}
 
 	// ========================================
-	// Tier 4: Binary Search Fallback (O(log n))
+	// Tier 4: Multi-Stage Table Lookup (O(1))
 	// ========================================
-	// For rare characters not covered by hot paths
-	return binarySearchWidth(r)
+	// For characters not covered by hot paths, use the 3-stage
+	// hierarchical lookup table for constant-time width resolution.
+	return tableLookupWidth(r)
 }
 
 // StringWidth calculates the visual width of a string in monospace terminals.
@@ -381,8 +380,30 @@ func asciiWidth(s string) int {
 	return width
 }
 
+// tableLookupWidth performs O(1) width lookup using the 3-stage hierarchical table.
+// Each codepoint is stored as a 2-bit value, four to a leaf byte:
+//
+//	0b00 = width 0 (control, combining, zero-width)
+//	0b01 = width 1 (narrow, default)
+//	0b10 = width 2 (wide: CJK, emoji, fullwidth)
+//	0b11 = ambiguous (treated as width 1 in neutral context)
+//
+// Runes the table cannot index (negative or above 0x1FFFFF) return 0 instead of panicking.
+func tableLookupWidth(r rune) int {
+	cp := uint32(r) // a negative rune wraps to a huge value and fails the range check
+	if cp>>13 >= uint32(len(widthRoot)) {
+		return 0 // same width the table's padding gives 0x110000-0x1FFFFF
+	}
+	leafIdx := widthMiddle[widthRoot[cp>>13]][cp>>7&0x3F]
+	width := (widthLeaves[leafIdx][cp>>2&0x1F] >> (2 * (cp & 0x03))) & 0x03
+	if width == 3 {
+		return 1 // ambiguous -> narrow in neutral context
+	}
+	return int(width)
+}
+
 // binarySearchWidth performs binary search on Unicode width tables.
-// This is the fallback for rare characters not covered by hot paths.
+// This is the legacy fallback, kept for use by the Options API.
 func binarySearchWidth(r rune) int {
 	// Search in generated wide table (width 2)
 	if binarySearch(r, wideTableGenerated) {
diff --git a/uniwidth_test.go b/uniwidth_test.go
index 56e6c69..4c88c65 100644
--- a/uniwidth_test.go
+++ b/uniwidth_test.go
@@ -2,6 +2,7 @@ package uniwidth
 
 import (
 	"testing"
+	"unicode"
 )
 
 func TestRuneWidth_ASCII(t *testing.T) {
@@ -335,6 +336,289 @@ func TestIsASCIIOnly(t *testing.T) {
 	}
 }
 
+// runeWidthViaBinarySearch computes the full RuneWidth using the legacy
+// binary search path (Tier 1-3 hot paths + binary search fallback).
+// This is a reference implementation for verifying the table lookup.
+func runeWidthViaBinarySearch(r rune) int {
+	// Tier 1: ASCII
+	if r < 0x80 {
+		if r < 0x20 {
+			return 0
+		}
+		if r == 0x7F {
+			return 0
+		}
+		return 1
+	}
+
+	// Tier 2: CJK
+	if r >= 0x4E00 && r <= 0x9FFF {
+		return 2
+	}
+	if r >= 0xAC00 && r <= 0xD7AF {
+		return 2
+	}
+	if r >= 0x3040 && r <= 0x312F {
+		return 2
+	}
+	if r >= 0xF900 && r <= 0xFAFF {
+		return 2
+	}
+
+	// Tier 3: Emoji
+	if r >= 0x1F600 && r <= 0x1F64F {
+		return 2
+	}
+	if r >= 0x1F300 && r <= 0x1F5FF {
+		return 2
+	}
+	if r >= 0x1F680 && r <= 0x1F6FF {
+		return 2
+	}
+	if r >= 0x1F900 && r <= 0x1F9FF {
+		return 2
+	}
+	if r >= 0x2600 && r <= 0x26FF {
+		return 2
+	}
+	if r >= 0x2700 && r <= 0x27BF {
+		return 2
+	}
+
+	// Zero-width format characters (ZWSP, ZWNJ, ZWJ, LRM, RLM)
+	if r >= 0x200B && r <= 0x200F {
+		return 0
+	}
+	if r >= 0xFE00 && r <= 0xFE0F {
+		return 0
+	}
+	if r >= 0xE0100 && r <= 0xE01EF {
+		return 0
+	}
+
+	// Combining marks (mirrors RuneWidth, which uses unicode.In)
+	if unicode.In(r, unicode.Mn, unicode.Me, unicode.Mc) {
+		return 0
+	}
+
+	// Tier 4: Legacy binary search
+	return binarySearchWidth(r)
+}
+
+// TestTableLookup_ExhaustiveVerification iterates ALL valid Unicode codepoints
+// (0x0000-0x10FFFF, skipping surrogates 0xD800-0xDFFF) and verifies that
+// RuneWidth (which uses tableLookupWidth in Tier 4) returns the same result
+// as the reference implementation using binarySearchWidth in Tier 4.
+//
+// This ensures the multi-stage table produces identical results to the legacy
+// binary search tables when called through the full RuneWidth path.
+func TestTableLookup_ExhaustiveVerification(t *testing.T) {
+	mismatches := 0
+	const maxMismatchLog = 20
+
+	for cp := rune(0); cp <= 0x10FFFF; cp++ {
+		// Skip surrogates (not valid Unicode scalar values)
+		if cp >= 0xD800 && cp <= 0xDFFF {
+			continue
+		}
+
+		tableW := RuneWidth(cp)                 // uses tableLookupWidth in Tier 4
+		binaryW := runeWidthViaBinarySearch(cp) // uses binarySearchWidth in Tier 4
+
+		if tableW != binaryW {
+			mismatches++
+			if mismatches <= maxMismatchLog {
+				t.Errorf("U+%04X: RuneWidth(table)=%d, runeWidthViaBinarySearch=%d", cp, tableW, binaryW)
+			}
+		}
+	}
+
+	if mismatches > maxMismatchLog {
+		t.Errorf("... and %d more mismatches (total: %d)", mismatches-maxMismatchLog, mismatches)
+	}
+
+	if mismatches == 0 {
+		t.Logf("Verified %d codepoints: RuneWidth matches reference implementation for all", 0x10FFFF+1-(0xDFFF-0xD800+1))
+	}
+}
+
+// TestTableLookupInternal_ExhaustiveVerification verifies that runeWidthInternal
+// (table-based, used by the Options API) matches a binary-search reference for
+// ALL valid codepoints, which covers every codepoint that reaches Tier 4.
+func TestTableLookupInternal_ExhaustiveVerification(t *testing.T) {
+	mismatches := 0
+	const maxMismatchLog = 20
+
+	for cp := rune(0); cp <= 0x10FFFF; cp++ {
+		// Skip surrogates
+		if cp >= 0xD800 && cp <= 0xDFFF {
+			continue
+		}
+
+		// Compare the full runeWidthInternal path (which uses tableLookupWidthInternal)
+		// against a reference that uses binarySearchWidthInternal.
+		// runeWidthInternal handles Tier 1-3 and zero-width checks before Tier 4,
+		// so we test the full path for consistency.
+		tableW := runeWidthInternal(cp) // uses tableLookupWidthInternal in Tier 4
+
+		// Reference: replicate runeWidthInternal logic but with binary search
+		binaryW := runeWidthInternalViaBinarySearch(cp)
+
+		if tableW != binaryW {
+			mismatches++
+			if mismatches <= maxMismatchLog {
+				t.Errorf("U+%04X: runeWidthInternal(table)=%d, runeWidthInternalViaBinarySearch=%d", cp, tableW, binaryW)
+			}
+		}
+	}
+
+	if mismatches > maxMismatchLog {
+		t.Errorf("... and %d more mismatches (total: %d)", mismatches-maxMismatchLog, mismatches)
+	}
+
+	if mismatches == 0 {
+		t.Logf("Verified %d codepoints: runeWidthInternal matches reference for all", 0x10FFFF+1-(0xDFFF-0xD800+1))
+	}
+}
+
+// runeWidthInternalViaBinarySearch is a reference implementation using binary search
+// for verifying the table-based runeWidthInternal.
+func runeWidthInternalViaBinarySearch(r rune) int {
+	// Tier 1: ASCII
+	if r < 0x80 {
+		if r < 0x20 {
+			return 0
+		}
+		if r == 0x7F {
+			return 0
+		}
+		return 1
+	}
+
+	// Tier 2: CJK
+	if r >= 0x4E00 && r <= 0x9FFF {
+		return 2
+	}
+	if r >= 0xAC00 && r <= 0xD7AF {
+		return 2
+	}
+	if r >= 0x3040 && r <= 0x30FF {
+		return 2
+	}
+	if r >= 0xF900 && r <= 0xFAFF {
+		return 2
+	}
+
+	// Tier 3: Emoji
+	if r >= 0x1F600 && r <= 0x1F64F {
+		return 2
+	}
+	if r >= 0x1F300 && r <= 0x1F5FF {
+		return 2
+	}
+	if r >= 0x1F680 && r <= 0x1F6FF {
+		return 2
+	}
+	if r >= 0x1F900 && r <= 0x1F9FF {
+		return 2
+	}
+	if r >= 0x2600 && r <= 0x26FF {
+		return 2
+	}
+	if r >= 0x2700 && r <= 0x27BF {
+		return 2
+	}
+
+	// Zero-width format characters (ZWSP, ZWNJ, ZWJ, LRM, RLM)
+	if r >= 0x200B && r <= 0x200F {
+		return 0
+	}
+	if r >= 0xFE00 && r <= 0xFE0F {
+		return 0
+	}
+	if r >= 0xE0100 && r <= 0xE01EF {
+		return 0
+	}
+
+	// Combining marks
+	if (r >= 0x0300 && r <= 0x036F) ||
+		(r >= 0x1AB0 && r <= 0x1AFF) ||
+		(r >= 0x1DC0 && r <= 0x1DFF) ||
+		(r >= 0x20D0 && r <= 0x20FF) ||
+		(r >= 0xFE20 && r <= 0xFE2F) {
+		return 0
+	}
+
+	// Tier 4: Legacy binary search
+	return binarySearchWidthInternal(r)
+}
+
+// TestTableLookup_SpecificCodepoints tests the table lookup for specific
+// important codepoints to ensure correctness of the 2-bit encoding.
+func TestTableLookup_SpecificCodepoints(t *testing.T) {
+	tests := []struct {
+		name string
+		r    rune
+		want int
+	}{
+		// Width 0: control characters
+		{"NUL", 0x0000, 0},
+		{"TAB", 0x0009, 0},
+		{"LF", 0x000A, 0},
+		{"DEL", 0x007F, 0},
+		{"C1 control", 0x0080, 0},
+		{"Soft hyphen", 0x00AD, 0},
+
+		// Width 0: combining marks
+		{"Combining grave", 0x0300, 0},
+		{"Combining acute", 0x0301, 0},
+		{"Combining marks extended", 0x1AB0, 0},
+		{"Combining marks extended end", 0x1AFF, 0},
+		{"Combining marks supplement", 0x1DC0, 0},
+
+		// Width 0: zero-width characters
+		{"ZWSP", 0x200B, 0},
+		{"ZWNJ", 0x200C, 0},
+		{"ZWJ", 0x200D, 0},
+		{"Variation selector 1", 0xFE00, 0},
+		{"Variation selector 16", 0xFE0F, 0},
+		{"BOM", 0xFEFF, 0},
+
+		// Width 1: ASCII printable
+		{"Space", 0x0020, 1},
+		{"Letter A", 0x0041, 1},
+		{"Tilde", 0x007E, 1},
+
+		// Width 1: Latin extended
+		{"e-acute", 0x00E9, 1},
+
+		// Width 2: CJK
+		{"CJK ideograph", 0x4E00, 2},
+		{"Hangul syllable", 0xAC00, 2},
+		{"Hiragana A", 0x3042, 2},
+		{"Katakana A", 0x30A2, 2},
+
+		// Width 2: Emoji
+		{"Grinning face", 0x1F600, 2},
+		{"Rocket", 0x1F680, 2},
+		{"Sun", 0x2600, 2},
+
+		// Width 2: Fullwidth
+		{"Fullwidth A", 0xFF21, 2},
+		{"Fullwidth 0", 0xFF10, 2},
+		{"Ideographic space", 0x3000, 2},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := tableLookupWidth(tt.r)
+			if got != tt.want {
+				t.Errorf("tableLookupWidth(%U) = %d, want %d", tt.r, got, tt.want)
+			}
+		})
+	}
+}
+
 // TestRuneWidth_UncommonRanges tests coverage for less common Unicode ranges
 func TestRuneWidth_UncommonRanges(t *testing.T) {
 	tests := []struct {

From 06aa51e8dab02abe1e74fea332b4e1109ac5d424 Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Thu, 5 Feb 2026 01:39:37 +0300
Subject: [PATCH 4/6] feat: ZWJ emoji sequence and skin tone modifier support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Forward-scan state machine for correct ZWJ width calculation
- Family emoji (👨‍👩‍👧‍👦) now correctly returns width 2, not 8
- Emoji modifier sequences (👍🏽) return width 2, not 4
- Handles profession ZWJ (👩‍🔬), flags (🏳️‍🌈), hearts (❤️‍🔥)
- Zero overhead for ASCII strings (fast paths unchanged)
- Zero allocations for short emoji sequences (stack-allocated)
- Added isExtendedPictographic() and isEmojiModifier() helpers
- 48 new test cases: ZWJ sequences, modifiers, edge cases
- 4 new benchmarks: ZWJ family, couple, modifier, mixed
- Coverage: 96.4%, lint: 0 issues
---
 benchmark_test.go   |  33 +++++
 conformance_test.go |   2 +-
 uniwidth.go         | 168 ++++++++++++++++++++---
 uniwidth_test.go    | 320 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 502 insertions(+), 21 deletions(-)

diff --git a/benchmark_test.go b/benchmark_test.go
index 27156db..682d776 100644
--- a/benchmark_test.go
+++ b/benchmark_test.go
@@ -112,6 +112,39 @@ func BenchmarkStringWidth_Emoji_Medium(b *testing.B) {
 	}
 }
 
+// ZWJ emoji sequences
+func BenchmarkStringWidth_ZWJ_Family(b *testing.B) {
+	s := "👨\u200D👩\u200D👧\u200D👦" // Family emoji
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_ZWJ_CoupleHeart(b *testing.B) {
+	s := "👩\u200D\u2764\uFE0F\u200D👨" // Couple with heart
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_EmojiModifier(b *testing.B) {
+	s := "👍🏽" // Thumbs up with skin tone
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = StringWidth(s)
+	}
+}
+
+func BenchmarkStringWidth_ZWJ_Mixed(b *testing.B) {
+	s := "Hello 👨\u200D👩\u200D👧 World 👍🏽 Test 🏳\uFE0F\u200D🌈"
+	b.ResetTimer()
+	for i := 0; i < b.N; i++ {
+		_ = StringWidth(s)
+	}
+}
+
 // ============================================================================
 // Benchmark: isASCIIOnly - Fast Path Detection
 // ============================================================================
diff --git a/conformance_test.go b/conformance_test.go
index db3d33c..2b61a60 100644
--- a/conformance_test.go
+++ b/conformance_test.go
@@ -175,7 +175,7 @@ func TestUnicodeConformance_SurrogateHandling(t *testing.T) {
 		// Characters in Supplementary Multilingual Plane (SMP)
 		{"Gothic letter", "𐌰", 1},            // U+10330
 		{"Linear B syllable", "𐀀", 1},        // U+10000 (EAW: N = Neutral/Narrow)
-		{"Emoji family", "👨\u200D👩\u200D👧", 6}, // Man + ZWJ + Woman + ZWJ + Girl (simplified width)
+		{"Emoji family", "👨\u200D👩\u200D👧", 2}, // Man + ZWJ + Woman + ZWJ + Girl (ZWJ-aware: width 2)
 	}
 
 	for _, tt := range tests {
diff --git a/uniwidth.go b/uniwidth.go
index 7a2c9df..6168597 100644
--- a/uniwidth.go
+++ b/uniwidth.go
@@ -163,9 +163,11 @@ func RuneWidth(r rune) int {
 // StringWidth calculates the visual width of a string in monospace terminals.
 //
 // This function provides a fast path for ASCII-only strings,
-// and uses RuneWidth for strings containing Unicode characters.
+// and uses a state machine for correct handling of multi-rune sequences.
 //
 // Special handling:
+//   - ZWJ emoji sequences (👨‍👩‍👧‍👦) are treated as width 2, not the sum of parts
+//   - Emoji modifier sequences (👍🏽) are treated as width 2
 //   - Variation selectors (U+FE0E/U+FE0F) modify the width of the preceding character
 //   - Regional indicator pairs (flags) are counted as width 2, not 4
 func StringWidth(s string) int {
@@ -194,55 +196,115 @@ func StringWidth(s string) int {
 		return asciiWidth(s)
 	}
 
-	// Convert to rune slice for lookahead
+	// Unicode path: convert to rune slice for lookahead.
 	runes := []rune(s)
 	width := 0
 
+	// Emoji sequence state tracking (forward-scan state machine):
+	//   0 = default (not in an emoji sequence)
+	//   1 = after Extended_Pictographic character (may start ZWJ/modifier sequence)
+	//   2 = after EP + (Extend*) + ZWJ (expecting joined emoji)
+	state := 0
+
 	for i := 0; i < len(runes); i++ {
 		r := runes[i]
 
 		// ========================================
-		// Handle Regional Indicator Pairs (Flags)
+		// ZWJ Handling
+		// ========================================
+		// ZWJ (U+200D) after an Extended_Pictographic transitions to
+		// the "expecting joined emoji" state. ZWJ always has width 0.
+		if r == 0x200D {
+			if state == 1 {
+				state = 2
+			}
+			continue
+		}
+
+		// After EP + ZWJ: if next is EP, it joins (width 0).
+		// This implements the core of GB11: ExtPict Extend* ZWJ × ExtPict.
+		if state == 2 {
+			if isExtendedPictographic(r) {
+				state = 1 // Joined, still in emoji sequence
+				continue  // Width 0 — joined with preceding emoji
+			}
+			// Not a valid join target, reset state and process normally.
+			state = 0
+		}
+
+		// ========================================
+		// Emoji Modifier Handling (Skin Tones)
+		// ========================================
+		// Emoji modifiers (U+1F3FB-U+1F3FF) combine with the preceding
+		// Extended_Pictographic, contributing zero additional width.
+		if state == 1 && isEmojiModifier(r) {
+			continue // Width 0 (modifier combines with preceding emoji)
+		}
+
 		// ========================================
-		// Regional indicators (U+1F1E6 - U+1F1FF) represent country codes.
-		// Two consecutive indicators form a flag emoji with width 2 (not 4).
+		// Extend Characters in Emoji Context
+		// ========================================
+		// Variation selectors within an active emoji sequence don't add
+		// width and keep the state alive for potential ZWJ continuation.
+		// Combining marks are left to the default path (w == 0 keeps state).
+		if state == 1 && (r >= 0xFE00 && r <= 0xFE0F) {
+			continue // VS in emoji sequence, width 0
+		}
+
+		// ========================================
+		// Regional Indicator Pairs (Flags)
+		// ========================================
+		// Two consecutive regional indicators (U+1F1E6-U+1F1FF) form
+		// a flag emoji with width 2 (not 4).
 		if isRegionalIndicator(r) && i+1 < len(runes) && isRegionalIndicator(runes[i+1]) {
-			width += 2 // Flag emoji = 2 columns
-			i++        // Skip the second indicator
+			width += 2
+			i++
+			state = 0
 			continue
 		}
 
 		// ========================================
-		// Handle Variation Selectors
+		// Variation Selectors (Lookahead)
 		// ========================================
-		// Variation selectors modify the presentation of the preceding character:
-		// - U+FE0E: Text presentation (narrow, width 1)
-		// - U+FE0F: Emoji presentation (wide, width 2)
-		//
-		// Note: The variation selector itself has width 0, but it affects
-		// the width calculation of the preceding character.
+		// Variation selectors modify the preceding character's presentation:
+		// - U+FE0E: Text presentation (width 1)
+		// - U+FE0F: Emoji presentation (width 2)
 		if i+1 < len(runes) {
 			next := runes[i+1]
 
-			// Text variation selector: force width 1
 			if next == 0xFE0E {
 				width++
-				i++ // Skip the variation selector
+				i++
+				state = 0
 				continue
 			}
 
-			// Emoji variation selector: force width 2
 			if next == 0xFE0F {
 				width += 2
-				i++ // Skip the variation selector
+				i++
+				if isExtendedPictographic(r) {
+					state = 1
+				} else {
+					state = 0
+				}
 				continue
 			}
 		}
 
 		// ========================================
-		// Default: Use RuneWidth
+		// Default: RuneWidth
 		// ========================================
-		width += RuneWidth(r)
+		w := RuneWidth(r)
+		width += w
+
+		// Track emoji state for ZWJ/modifier sequence detection.
+		if isExtendedPictographic(r) && w > 0 {
+			state = 1
+		} else if w > 0 {
+			state = 0
+		}
+		// When w == 0 (combining marks, tag characters, etc.),
+		// preserve current state to allow Extend* in GB11 pattern.
 	}
 
 	return width
@@ -255,6 +317,72 @@ func isRegionalIndicator(r rune) bool {
 	return r >= 0x1F1E6 && r <= 0x1F1FF
 }
 
+// isExtendedPictographic reports whether the rune lies in a range this package
+// treats as Extended_Pictographic (Unicode 16.0 emoji-data.txt), i.e. it can
+// participate in emoji ZWJ sequences. It is a block-level approximation: whole
+// blocks are accepted, so some codepoints without the property also match.
+// The checks are ordered by frequency of occurrence in real-world emoji usage
+// to minimize branch mispredictions.
+func isExtendedPictographic(r rune) bool {
+	// SMP emoji blocks (U+1F000-U+1FAFF) — covers ~95% of emoji
+	// Includes: Emoticons, Pictographs, Transport, Supplemental Symbols,
+	// Symbols and Pictographs Extended-A, etc.
+	if r >= 0x1F000 && r <= 0x1FAFF {
+		return true
+	}
+
+	// BMP emoji: Misc Symbols (U+2600-U+26FF) and Dingbats (U+2700-U+27BF)
+	if r >= 0x2600 && r <= 0x27BF {
+		return true
+	}
+
+	// BMP emoji: Misc Technical (U+2300-U+23FF)
+	// Includes: ⌚⌛⏩⏪⏫⏬⏰⏳⏸⏹⏺⌨ etc.
+	if r >= 0x2300 && r <= 0x23FF {
+		return true
+	}
+
+	// Misc Symbols and Arrows (U+2B00-U+2BFF)
+	if r >= 0x2B00 && r <= 0x2BFF {
+		return true
+	}
+
+	// Arrow symbols (U+2194-U+21AA)
+	if r >= 0x2194 && r <= 0x21AA {
+		return true
+	}
+
+	// Geometric Shapes (U+25A0-U+25FF)
+	if r >= 0x25A0 && r <= 0x25FF {
+		return true
+	}
+
+	// Symbols for Legacy Computing and extensions (U+1FB00-U+1FFFD)
+	if r >= 0x1FB00 && r <= 0x1FFFD {
+		return true
+	}
+
+	// Individual Extended_Pictographic characters
+	switch r {
+	case 0x00A9, 0x00AE, // © ®
+		0x203C, 0x2049, // ‼ ⁉
+		0x2122, 0x2139, // ™ ℹ
+		0x3030, 0x303D, // 〰 〽
+		0x3297, 0x3299: // ㊗ ㊙
+		return true
+	}
+
+	return false
+}
+
+// isEmojiModifier returns true if the rune is an emoji modifier (skin tone).
+// Emoji modifiers (U+1F3FB-U+1F3FF) represent Fitzpatrick skin types 1-2 through 6.
+// They combine with the preceding Extended_Pictographic character to form
+// a single emoji with a specific skin tone.
+func isEmojiModifier(r rune) bool {
+	return r >= 0x1F3FB && r <= 0x1F3FF
+}
+
 // isASCIIOnly returns true if the string contains only ASCII characters (0x00-0x7F).
 //
 // Uses SWAR (SIMD Within A Register) to process 8 bytes at a time by loading
diff --git a/uniwidth_test.go b/uniwidth_test.go
index 4c88c65..fd8d672 100644
--- a/uniwidth_test.go
+++ b/uniwidth_test.go
@@ -619,6 +619,326 @@ func TestTableLookup_SpecificCodepoints(t *testing.T) {
 	}
 }
 
+func TestStringWidth_ZWJSequences(t *testing.T) {
+	tests := []struct {
+		name string
+		s    string
+		want int
+	}{
+		// Family ZWJ sequences
+		{
+			name: "Family: man+woman+girl+boy",
+			s:    "👨\u200D👩\u200D👧\u200D👦", // 👨‍👩‍👧‍👦
+			want: 2,
+		},
+		{
+			name: "Family: man+woman+girl",
+			s:    "👨\u200D👩\u200D👧", // 👨‍👩‍👧
+			want: 2,
+		},
+		{
+			name: "Couple with heart",
+			s:    "👩\u200D\u2764\uFE0F\u200D👨", // 👩‍❤️‍👨
+			want: 2,
+		},
+		{
+			name: "Kiss: woman+man",
+			s:    "👩\u200D\u2764\uFE0F\u200D\U0001F48B\u200D👨",
+			want: 2,
+		},
+		// Profession ZWJ sequences
+		{
+			name: "Woman scientist",
+			s:    "👩\u200D🔬", // 👩‍🔬
+			want: 2,
+		},
+		{
+			name: "Man firefighter",
+			s:    "👨\u200D🚒", // 👨‍🚒
+			want: 2,
+		},
+		{
+			name: "Woman technologist",
+			s:    "👩\u200D💻", // 👩‍💻
+			want: 2,
+		},
+		// Gendered ZWJ sequences
+		{
+			name: "Man with probing cane",
+			s:    "👨\u200D🦯", // 👨‍🦯
+			want: 2,
+		},
+		// Heart sequences
+		{
+			name: "Heart on fire",
+			s:    "\u2764\uFE0F\u200D🔥", // ❤️‍🔥
+			want: 2,
+		},
+		{
+			name: "Mending heart",
+			s:    "\u2764\uFE0F\u200D\U0001FA79", // ❤️‍🩹
+			want: 2,
+		},
+		// Rainbow flag
+		{
+			name: "Rainbow flag",
+			s:    "🏳\uFE0F\u200D🌈", // 🏳️‍🌈
+			want: 2,
+		},
+		// Transgender flag
+		{
+			name: "Transgender flag",
+			s:    "🏳\uFE0F\u200D\u26A7\uFE0F", // 🏳️‍⚧️
+			want: 2,
+		},
+		// Pirate flag
+		{
+			name: "Pirate flag",
+			s:    "🏴\u200D\u2620\uFE0F", // 🏴‍☠️
+			want: 2,
+		},
+		// Multiple ZWJ emoji in a string
+		{
+			name: "Multiple ZWJ sequences",
+			s:    "👨\u200D👩\u200D👧 and 👩\u200D💻",
+			want: 9, // family(2) + " and "(5) + technologist(2)
+		},
+		// ZWJ in mixed content
+		{
+			name: "Mixed: ASCII + ZWJ family",
+			s:    "Family: 👨\u200D👩\u200D👧\u200D👦!",
+			want: 11, // "Family: "(8) + family(2) + "!"(1)
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := StringWidth(tt.s)
+			if got != tt.want {
+				t.Errorf("StringWidth(%q) = %d, want %d", tt.s, got, tt.want)
+				t.Logf("Runes: %U", []rune(tt.s))
+			}
+		})
+	}
+}
+
+func TestStringWidth_EmojiModifiers(t *testing.T) {
+	tests := []struct {
+		name string
+		s    string
+		want int
+	}{
+		// Skin tone modifiers
+		{
+			name: "Thumbs up + light skin",
+			s:    "👍🏻", // U+1F44D + U+1F3FB
+			want: 2,
+		},
+		{
+			name: "Thumbs up + medium skin",
+			s:    "👍🏽", // U+1F44D + U+1F3FD
+			want: 2,
+		},
+		{
+			name: "Thumbs up + dark skin",
+			s:    "👍🏿", // U+1F44D + U+1F3FF
+			want: 2,
+		},
+		{
+			name: "Wave + medium-light skin",
+			s:    "👋🏼", // U+1F44B + U+1F3FC
+			want: 2,
+		},
+		// Skin tone + ZWJ (profession with skin tone)
+		{
+			name: "Woman scientist medium skin",
+			s:    "👩🏽\u200D🔬", // 👩🏽‍🔬
+			want: 2,
+		},
+		{
+			name: "Man firefighter dark skin",
+			s:    "👨🏿\u200D🚒", // 👨🏿‍🚒
+			want: 2,
+		},
+		// Multiple modified emoji
+		{
+			name: "Two skin-toned emoji",
+			s:    "👍🏻👋🏿",
+			want: 4, // 2 + 2
+		},
+		// Modified emoji in mixed text
+		{
+			name: "Mixed text with modified emoji",
+			s:    "Hi 👍🏽!",
+			want: 6, // H(1)+i(1)+space(1)+thumbs(2)+!(1)
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := StringWidth(tt.s)
+			if got != tt.want {
+				t.Errorf("StringWidth(%q) = %d, want %d", tt.s, got, tt.want)
+				t.Logf("Runes: %U", []rune(tt.s))
+			}
+		})
+	}
+}
+
+func TestStringWidth_ZWJEdgeCases(t *testing.T) {
+	tests := []struct {
+		name string
+		s    string
+		want int
+	}{
+		// Standalone ZWJ
+		{
+			name: "Standalone ZWJ",
+			s:    "\u200D",
+			want: 0,
+		},
+		// ZWJ between non-emoji characters
+		{
+			name: "ZWJ between ASCII",
+			s:    "a\u200Db",
+			want: 2, // a(1) + ZWJ(0) + b(1)
+		},
+		// Emoji + ZWJ + non-emoji (invalid ZWJ sequence)
+		{
+			name: "Emoji + ZWJ + ASCII",
+			s:    "😀\u200Da",
+			want: 3, // emoji(2) + ZWJ(0) + a(1)
+		},
+		// Multiple ZWJs without emoji
+		{
+			name: "Multiple standalone ZWJs",
+			s:    "\u200D\u200D\u200D",
+			want: 0,
+		},
+		// Emoji without ZWJ (should be normal)
+		{
+			name: "Two emoji without ZWJ",
+			s:    "😀🚀",
+			want: 4, // 2 + 2
+		},
+		// Single emoji modifier without base
+		{
+			name: "Orphan skin tone modifier",
+			s:    "🏽", // U+1F3FD alone
+			want: 2,  // Not preceded by EP, so normal width
+		},
+		// ZWJ at string boundaries
+		{
+			name: "Leading ZWJ + emoji",
+			s:    "\u200D😀",
+			want: 2, // ZWJ(0) + emoji(2)
+		},
+		{
+			name: "Emoji + trailing ZWJ",
+			s:    "😀\u200D",
+			want: 2, // emoji(2) + ZWJ(0)
+		},
+		// Very long ZWJ chain
+		{
+			name: "Long ZWJ chain (3 joins)",
+			s:    "👨\u200D👩\u200D👧\u200D👦",
+			want: 2,
+		},
+		// ZWJ sequence followed by regular emoji
+		{
+			name: "ZWJ family + regular emoji",
+			s:    "👨\u200D👩\u200D👧🚀",
+			want: 4, // family(2) + rocket(2)
+		},
+		// Keycap sequences (should still work)
+		{
+			name: "Keycap 1",
+			s:    "1\uFE0F\u20E3",
+			want: 2, // 1+VS16 → width 2, combining keycap → width 0
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := StringWidth(tt.s)
+			if got != tt.want {
+				t.Errorf("StringWidth(%q) = %d, want %d", tt.s, got, tt.want)
+				t.Logf("Runes: %U", []rune(tt.s))
+			}
+		})
+	}
+}
+
+func TestIsExtendedPictographic(t *testing.T) {
+	tests := []struct {
+		name string
+		r    rune
+		want bool
+	}{
+		// SMP emoji
+		{"Grinning face", 0x1F600, true},
+		{"Rocket", 0x1F680, true},
+		{"Thumbs up", 0x1F44D, true},
+		{"Woman", 0x1F469, true},
+		{"Man", 0x1F468, true},
+		{"Microscope", 0x1F52C, true},
+
+		// BMP emoji
+		{"Sun", 0x2600, true},
+		{"Heart", 0x2764, true},
+		{"Scissors", 0x2702, true},
+		{"Watch", 0x231A, true},
+
+		// Individual EP characters
+		{"Copyright", 0x00A9, true},
+		{"Registered", 0x00AE, true},
+		{"Trademark", 0x2122, true},
+
+		// Non-EP characters
+		{"ASCII a", 'a', false},
+		{"CJK ideograph", 0x4E00, false},
+		{"Hangul", 0xAC00, false},
+		{"Latin extended", 0x00E9, false},
+		{"Combining mark", 0x0300, false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := isExtendedPictographic(tt.r)
+			if got != tt.want {
+				t.Errorf("isExtendedPictographic(%U) = %v, want %v", tt.r, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestIsEmojiModifier(t *testing.T) {
+	tests := []struct {
+		name string
+		r    rune
+		want bool
+	}{
+		{"Light skin tone", 0x1F3FB, true},
+		{"Medium-light", 0x1F3FC, true},
+		{"Medium", 0x1F3FD, true},
+		{"Medium-dark", 0x1F3FE, true},
+		{"Dark skin tone", 0x1F3FF, true},
+		{"Before range", 0x1F3FA, false},
+		{"After range", 0x1F400, false},
+		{"Regular emoji", 0x1F600, false},
+		{"ASCII", 'a', false},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := isEmojiModifier(tt.r)
+			if got != tt.want {
+				t.Errorf("isEmojiModifier(%U) = %v, want %v", tt.r, got, tt.want)
+			}
+		})
+	}
+}
+
 // TestRuneWidth_UncommonRanges tests coverage for less common Unicode ranges
 func TestRuneWidth_UncommonRanges(t *testing.T) {
 	tests := []struct {

From e576f9650d1a52224ffd8f525e6b9e8b86f4b598 Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Thu, 5 Feb 2026 02:11:40 +0300
Subject: [PATCH 5/6] docs: update public documentation for v0.2.0 release

- CHANGELOG.md: add v0.2.0 section (ZWJ, SWAR, 3-stage table)
- README.md: update features, architecture, benchmarks, coverage
- ARCHITECTURE.md: add ZWJ state machine, SWAR, 3-stage table sections
- ROADMAP.md: add public roadmap (Now/Next/Later format)
- tables.go: remove dead code (replaced by tables_generated.go)
---
 CHANGELOG.md         | 210 ++++++---------
 README.md            | 203 +++++++-------
 ROADMAP.md           |  76 ++++++
 docs/ARCHITECTURE.md | 611 +++++++++++++++++++------------------------
 tables.go            | 185 -------------
 5 files changed, 532 insertions(+), 753 deletions(-)
 create mode 100644 ROADMAP.md
 delete mode 100644 tables.go

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 400ddc8..478c207 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,137 +7,124 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
-### Planned for v1.0.0
-- API freeze and stability commitment
-- Extended test coverage (>95%)
-- Performance regression test suite
-- Additional locale support
-- Migration guide improvements
-
-### Planned for v0.2.0+
-- Grapheme cluster support for complex emoji sequences
-- Explicit SIMD optimizations for AVX-512
+### Planned
 - Profile-Guided Optimization (PGO) support
+- Unicode 17.0 preparation
+- Benchmark CI for regression detection
+- Explicit SIMD via Go assembly and `archsimd` (Go 1.26+)
+- API stability review based on community feedback
+
+## [0.2.0] - 2026-02-05
+
+Major performance and emoji correctness release. All four lookup tiers are now O(1), ZWJ emoji sequences are handled correctly, and ASCII paths use SWAR for 8 bytes/iter throughput.
+
+### Added
+- **ZWJ emoji sequence support**: Forward-scan state machine with 3 states (default/emoji/emojiZWJ). Family emoji 👨‍👩‍👧‍👦 now correctly returns width 2, not 8.
+- **Emoji modifier (skin tone) support**: U+1F3FB-U+1F3FF (Fitzpatrick types) combine with preceding emoji. 👍🏽 now correctly returns width 2, not 4.
+- **`isExtendedPictographic()` helper**: Range-based Extended_Pictographic detection, frequency-ordered for minimal branch mispredictions.
+- **`isEmojiModifier()` helper**: Fitzpatrick skin tone modifier detection.
+- **61 new test cases**: ZWJ sequences (15), emoji modifiers (8), edge cases (11), Extended_Pictographic validation (18), emoji modifier validation (9).
+- **ZWJ benchmarks**: Family (~95 ns), couple with heart (~82 ns), skin tone modifier (~40 ns), mixed ZWJ text (~357 ns). All zero allocations.
+- **Three-way benchmark suite** (`bench/`): uniwidth vs go-runewidth vs rivo/uniseg.
+
+### Changed
+- **Tier 4 lookup**: Replaced O(log n) binary search with O(1) 3-stage hierarchical table. ROOT[256] → MIDDLE[17×64] → LEAVES[78×32], 3.8KB total. All Unicode codepoints resolved in 3 array lookups.
+- **ASCII detection**: SWAR `isASCIIOnly()` processes 8 bytes/iter via uint64 word with `0x8080808080808080` mask. No unsafe pointer escapes.
+- **ASCII width counting**: SWAR `asciiWidth()` uses Daniel Lemire's underflow trick for control character detection in 8-byte chunks.
+- **Short string optimization**: Strings < 8 bytes use a fused single-pass loop that combines ASCII check and width counting, avoiding SWAR function call overhead.
+- **Test coverage**: 87.1% → 96.4% (+9.3%).
+
+### Performance
+- **ASCII**: 3-46x faster than go-runewidth (SWAR fast paths)
+  - Short (5 chars): ~7 ns, 0 allocs
+  - Medium (44 chars): ~20 ns, 0 allocs
+  - Long (234 chars): ~50 ns, 0 allocs
+- **CJK**: 30-35% faster from O(1) table lookup (previously O(log n))
+- **ZWJ sequences**: New capability, ~95 ns for family emoji, 0 allocs
+- **Emoji modifiers**: New capability, ~40 ns for skin tone, 0 allocs
 
 ## [0.1.0] - 2025-11-22
 
-**Stable Release**: First stable release after 35 days of beta testing. All known issues from beta have been resolved.
+First stable release after beta testing. Variation selector and flag emoji bugs fixed.
 
 ### Added
-- 🐛 **Bug Fix**: Variation selectors (U+FE0E, U+FE0F) now handled correctly
-  - Text variation selector (U+FE0E) forces width 1
-  - Emoji variation selector (U+FE0F) forces width 2
-  - Example: "☀︎" (sun + text variant) now correctly returns width 1
-- 🐛 **Bug Fix**: Regional indicator pairs (flags) now handled correctly
-  - Two consecutive regional indicators count as width 2 (not 4)
-  - Example: "🇺🇸" (U+1F1FA + U+1F1F8) now correctly returns width 2
-- 📁 **Project Structure**: Reorganized documentation
-  - Created `docs/` for public documentation
-  - Created `docs/dev/` for development documentation (gitignored)
-  - Moved ARCHITECTURE.md and POC_RESULTS.md to `docs/`
-  - Added `docs/dev/INDEX.md` (Kanban-style tracker)
-  - Added `docs/dev/ROADMAP.md` (release planning)
-- 📝 **Documentation**: Added comprehensive CLAUDE.md for AI assistance
-- 🧪 **Tests**: Added edge case tests
-  - `TestStringWidth_VariationSelectors` (6 test cases)
-  - `TestStringWidth_RegionalIndicators` (5 test cases)
-  - `TestIsRegionalIndicator` (7 test cases)
+- Variation selector handling: U+FE0E (text, width 1) and U+FE0F (emoji, width 2)
+- Regional indicator pair handling: Flag emoji 🇺🇸 = width 2, not 4
+- `isRegionalIndicator()` helper function
+- Edge case tests: variation selectors (6), regional indicators (5), helper validation (7)
+- Project structure: `docs/` for public docs, `docs/dev/` for dev docs (gitignored)
 
 ### Changed
-- 🔄 **StringWidth**: Now converts to `[]rune` for lookahead (variation selectors)
+- `StringWidth()` now converts to `[]rune` for lookahead (variation selectors require it)
   - Trade-off: 1 allocation for Unicode strings (correctness > performance)
   - ASCII fast path still has 0 allocations
-- 📊 **Test Coverage**: Increased from 84.6% → 87.1% (+2.5%)
-
-### Performance Impact
-- ASCII strings: No change (0 allocations, ~5 ns/op)
-- Unicode strings: Minimal impact (<1 ns/op, 1 allocation for `[]rune` conversion)
-- Still 9-23x faster than go-runewidth overall
+- Test coverage: 84.6% → 87.1%
 
 ### Fixed (from beta)
-- ✅ Combining marks edge cases (U+1AD7, U+1AFF) - added to zero-width tables
-- ✅ Boundary issues (U+4DFF, U+303F, U+3100) - table boundaries corrected
-- ✅ Surrogate pair handling (U+10000) - Linear B Syllable now handled correctly
-
-### Known Limitations
-- Grapheme clusters not yet supported (planned for v0.2.0+)
-  - Complex emoji ZWJ sequences counted as sum of parts
-  - Single-character emoji work correctly
+- Combining marks edge cases (U+1AD7, U+1AFF)
+- Boundary issues (U+4DFF, U+303F, U+3100)
+- Surrogate pair handling (U+10000 - Linear B Syllable)
 
-## [0.1.0] - 2025-10-15
+## [0.1.0-beta] - 2025-10-15
 
-> 📝 **Note**: This version was superseded by v0.1.0-beta with critical bug fixes.
+Initial public beta. Core architecture proven with 3.9-46x speedup over go-runewidth.
 
 ### Added
-- Initial release of uniwidth library
-- Tiered lookup strategy (4 tiers: ASCII, CJK/Emoji, Zero-width, Binary search)
-- Full Unicode 16.0.0 support
-- Options API for East Asian Ambiguous character handling
-- Options API for emoji presentation mode
-- Zero allocation design (0 B/op, 0 allocs/op)
-- SIMD auto-vectorization for ASCII detection (Go 1.25+)
-- Table generation from official Unicode data files
+- 4-tier lookup architecture: ASCII O(1) → CJK/Emoji O(1) → Zero-width O(1) → Binary search O(log n)
+- Core API: `RuneWidth()`, `StringWidth()`
+- Options API: `WithEastAsianAmbiguous()`, `WithEmojiPresentation()`
+- Full Unicode 16.0 support via generated tables from official data
+- Zero allocation design for all code paths
+- Table generation from EastAsianWidth.txt and emoji-data.txt
 - Comprehensive test suite (84.6% coverage)
 - Conformance tests for Unicode categories
-- Fuzzing tests for robustness
+- Fuzzing tests (Go native)
 - Benchmarks vs go-runewidth (3-46x speedup proven)
 
 ### Performance
-- **ASCII strings**: 15-46x faster than go-runewidth
-- **CJK strings**: 4-14x faster than go-runewidth
-- **Emoji strings**: 6-8x faster than go-runewidth
-- **Zero allocations**: All operations are allocation-free
-- **Small footprint**: ~13KB total (code + tables)
+- ASCII strings: 15-46x faster than go-runewidth
+- CJK strings: 4-14x faster than go-runewidth
+- Emoji strings: 6-8x faster than go-runewidth
+- Zero allocations: 0 B/op, 0 allocs/op
 
 ### API
-- `RuneWidth(r rune) int` - Calculate visual width of a rune
-- `StringWidth(s string) int` - Calculate visual width of a string
-- `RuneWidthWithOptions(r rune, opts ...Option) int` - Rune width with options
-- `StringWidthWithOptions(s string, opts ...Option) int` - String width with options
-- `WithEastAsianAmbiguous(width EAWidth) Option` - Configure ambiguous width
-- `WithEmojiPresentation(emoji bool) Option` - Configure emoji presentation
-
-### Documentation
-- README.md with quick start and examples
-- ARCHITECTURE.md with detailed technical design
-- POC_RESULTS.md with benchmark analysis
-- LICENSE (MIT)
-- Comprehensive godoc comments
+- `RuneWidth(r rune) int`
+- `StringWidth(s string) int`
+- `RuneWidthWithOptions(r rune, opts ...Option) int`
+- `StringWidthWithOptions(s string, opts ...Option) int`
+- `WithEastAsianAmbiguous(width EAWidth) Option`
+- `WithEmojiPresentation(emoji bool) Option`
 
 ### Known Limitations
-- Grapheme clustering not yet implemented (complex emoji sequences counted as sum of parts)
-- Some edge cases at Unicode range boundaries (will be fixed in v0.1.1)
-- Zero-width space (U+200B) handling needs improvement
-- Test coverage 84.6% (target 90%+ in v0.1.1)
+- Grapheme clusters not supported (complex emoji ZWJ sequences counted as sum of parts)
+- Some combining marks edge cases at boundaries
+- Test coverage 84.6% (target 90%+)
 
 ### Requirements
-- Go 1.25.0 or later (required for optimal performance)
-- No external dependencies except go-runewidth (for benchmarks only)
+- Go 1.25.0 or later
+- No external dependencies
 
 ---
 
 ## Version History
 
-### Naming Convention
-- **Major**: Breaking API changes
-- **Minor**: New features, backward compatible
-- **Patch**: Bug fixes, performance improvements
-
-### Stability
-- v0.x.x: Pre-release, API may change
-- v1.x.x: Stable API, production ready
-
----
+| Version | Date | Highlights |
+|---------|------|------------|
+| 0.2.0 | 2026-02-05 | ZWJ emoji, SWAR, O(1) 3-stage table |
+| 0.1.0 | 2025-11-22 | Stable release, variation selectors, flags |
+| 0.1.0-beta | 2025-10-15 | Initial beta, 4-tier architecture |
 
 ## Upgrade Guide
 
-### From PoC to v0.1.0
-- No breaking changes
-- Generated tables now included
-- Options API added (optional, backward compatible)
+### From v0.1.0 to v0.2.0
+- No breaking API changes
+- ZWJ sequences now return correct width (e.g., 👨‍👩‍👧‍👦 = 2, was 8)
+- Emoji modifiers now return correct width (e.g., 👍🏽 = 2, was 4)
+- All tiers are now O(1) (Tier 4 upgraded from binary search to table lookup)
+- ASCII paths are significantly faster (SWAR optimization)
 
 ### From go-runewidth to uniwidth
-Simple drop-in replacement:
+Drop-in replacement:
 
 ```go
 // Before
@@ -149,41 +136,6 @@ import "github.com/unilibs/uniwidth"
 width := uniwidth.StringWidth(s)
 ```
 
-**Performance improvement**: 3-46x faster, zero code changes!
-
-**Note**: Grapheme clustering behavior may differ for complex emoji sequences.
-
----
-
-## Maintenance
-
-### Update Unicode Version
-
-To update to a newer Unicode version:
-
-1. Update URLs in `cmd/generate-tables/main.go`:
-   ```go
-   const unicodeVersion = "16.0.0" // Change this
-   const eastAsianWidthURL = "https://www.unicode.org/Public/16.0.0/..." // And this
-   ```
-
-2. Regenerate tables:
-   ```bash
-   go generate ./...
-   ```
-
-3. Run tests:
-   ```bash
-   go test ./...
-   ```
-
-4. Update benchmarks:
-   ```bash
-   go test -bench=. -benchmem
-   ```
-
 ---
 
-*For detailed performance analysis, see [docs/POC_RESULTS.md](docs/POC_RESULTS.md)*
 *For architecture details, see [docs/ARCHITECTURE.md](docs/ARCHITECTURE.md)*
-*For release planning, see [docs/dev/ROADMAP.md](docs/dev/ROADMAP.md) (development only)*
diff --git a/README.md b/README.md
index 4db36c9..da7605a 100644
--- a/README.md
+++ b/README.md
@@ -7,31 +7,34 @@
 [![Go Reference](https://pkg.go.dev/badge/github.com/unilibs/uniwidth.svg)](https://pkg.go.dev/github.com/unilibs/uniwidth)
 [![License](https://img.shields.io/github/license/unilibs/uniwidth)](LICENSE)
 [![Release](https://img.shields.io/github/v/release/unilibs/uniwidth)](https://github.com/unilibs/uniwidth/releases)
-[![Stars](https://img.shields.io/github/stars/unilibs/uniwidth?style=social)](https://github.com/unilibs/uniwidth/stargazers)
+[![Stars](https://img.shields.io/github/stars/unilibs/uniwidth)](https://github.com/unilibs/uniwidth/stargazers)
 
-**uniwidth** is a modern, high-performance Unicode width calculation library for Go 1.25+. It provides **3.9-46x faster** width calculation compared to existing solutions through tiered lookup optimization and Go 1.25+ compiler features.
+**uniwidth** is a modern, high-performance Unicode width calculation library for Go 1.25+. It provides **3-46x faster** width calculation compared to existing solutions through a 4-tier O(1) lookup architecture, SWAR optimization, and a ZWJ-aware emoji state machine.
 
-## 🚀 Performance
+## Performance
 
 Based on comprehensive benchmarks vs `go-runewidth`:
 
-- **ASCII strings**: 15-46x faster
-- **CJK strings**: 4-14x faster
+- **ASCII strings**: 15-46x faster (SWAR, 8 bytes/iter)
+- **CJK strings**: 4-14x faster (O(1) table lookup)
 - **Mixed/Emoji strings**: 6-8x faster
-- **Zero allocations**: 0 B/op, 0 allocs/op
+- **ZWJ emoji**: Correct width (👨‍👩‍👧‍👦 = 2, ~95 ns)
+- **Zero allocations**: 0 B/op, 0 allocs/op for ASCII paths
 
 Run benchmarks yourself: `cd bench && go test -bench=. -benchmem`
 
-## ✨ Features
+## Features
 
-- 🚀 **3.9-46x faster** than go-runewidth (proven in benchmarks)
-- 💎 **Zero allocations** (no GC pressure)
-- 🧵 **Thread-safe** (immutable design, no global state)
-- 🎯 **Unicode 16.0** support
-- 🔧 **Modern API** (Go 1.25+, clean design)
-- 📊 **Tiered lookup** (O(1) for 90-95% of cases)
+- **3-46x faster** than go-runewidth (proven in benchmarks)
+- **All tiers O(1)** — 4-tier lookup with 3-stage hierarchical table (3.8KB)
+- **ZWJ-aware** — family emoji, skin tones, flags handled correctly
+- **SWAR optimized** — ASCII detection and width counting at 8 bytes/iter
+- **Zero allocations** for ASCII strings (no GC pressure)
+- **Thread-safe** (immutable design, no global state)
+- **Unicode 16.0** support
+- **Modern API** (Go 1.25+, functional options pattern)
 
-## 📦 Installation
+## Installation
 
 ```bash
 go get github.com/unilibs/uniwidth
@@ -39,7 +42,7 @@ go get github.com/unilibs/uniwidth
 
 **Requirements**: Go 1.25 or later
 
-## 🔧 Usage
+## Usage
 
 ### Basic Usage
 
@@ -66,7 +69,27 @@ func main() {
 }
 ```
 
-### Options API (NEW!)
+### ZWJ Emoji Sequences
+
+```go
+// ZWJ family emoji — correctly returns 2, not 8
+width := uniwidth.StringWidth("👨‍👩‍👧‍👦")
+fmt.Println(width) // Output: 2
+
+// Skin tone modifiers — correctly returns 2, not 4
+width = uniwidth.StringWidth("👍🏽")
+fmt.Println(width) // Output: 2
+
+// Rainbow flag
+width = uniwidth.StringWidth("🏳️‍🌈")
+fmt.Println(width) // Output: 2
+
+// Country flags
+width = uniwidth.StringWidth("🇺🇸")
+fmt.Println(width) // Output: 2
+```
+
+### Options API
 
 Configure handling of ambiguous-width characters:
 
@@ -115,63 +138,55 @@ func truncate(s string, maxWidth int) string {
 }
 ```
 
-### Performance-Critical Code
-
-```go
-// ASCII fast path (46x faster than go-runewidth!)
-text := "Hello, World!"
-width := uniwidth.StringWidth(text) // ~4.6 ns/op
+## Architecture
 
-// CJK fast path (14x faster!)
-text := "你好世界"
-width := uniwidth.StringWidth(text) // ~33.7 ns/op
+### 4-Tier O(1) Lookup
 
-// Mixed content (8x faster!)
-text := "Hello 👋 World"
-width := uniwidth.StringWidth(text) // ~65.9 ns/op
+uniwidth uses a multi-tier approach where **all tiers are O(1)**:
 
-// All with zero allocations!
-```
-
-## 🏗️ Architecture
+1. **Tier 1: ASCII Fast Path** (O(1))
+   - Covers ~95% of typical terminal content
+   - SWAR `isASCIIOnly()` + `asciiWidth()` process 8 bytes/iter
+   - Short strings (< 8 bytes) use a fused single-pass loop
 
-### Tiered Lookup Strategy
+2. **Tier 2: Common CJK** (O(1))
+   - CJK Unified Ideographs, Hangul Syllables, Hiragana/Katakana
+   - Simple range checks for 32,000+ characters
 
-uniwidth uses a multi-tier approach for optimal performance:
+3. **Tier 3: Common Emoji** (O(1))
+   - Emoticons, Pictographs, Dingbats, Symbols
+   - Range checks for ~1,200 emoji codepoints
 
-1. **Tier 1: ASCII Fast Path** (O(1))
-   - Covers ~95% of typical terminal content
-   - Uses simple `len(s)` for ASCII-only strings
-   - 15-46x faster than binary search
+4. **Tier 4: 3-Stage Table** (O(1))
+   - ROOT[256] → MIDDLE[17×64] → LEAVES[78×32]
+   - 2-bit width encoding, 3.8KB total
+   - Covers all remaining Unicode codepoints in 3 array lookups
 
-2. **Tier 2: Common CJK & Emoji** (O(1))
-   - Range checks for frequent characters
-   - CJK Unified Ideographs: 20,992 characters
-   - Common emoji ranges
-   - 4-14x faster than binary search
+### ZWJ State Machine
 
-3. **Tier 3: Binary Search Fallback** (O(log n))
-   - For rare characters not in hot paths
-   - Minimal overhead (~5-10% of cases)
+Forward-scan state machine for correct emoji sequence handling:
+- **3 states**: default → emoji → emojiZWJ
+- Handles: ZWJ sequences, skin tone modifiers, variation selectors, flag pairs
+- Inspired by Ghostty's approach, adapted for width calculation
 
-### Go 1.25+ Optimizations
+### SWAR Optimization
 
-- **SIMD Auto-Vectorization**: ASCII detection uses SSE2/AVX2
-- **Aggressive Inlining**: Hot paths compile to minimal instructions
-- **Zero Allocations**: No heap allocations, no GC pressure
+ASCII paths use SIMD Within A Register (SWAR) for high throughput:
+- `isASCIIOnly()`: uint64 word AND with `0x8080808080808080` mask
+- `asciiWidth()`: Daniel Lemire's underflow trick for control character detection
+- Both process 8 bytes per iteration with zero allocations
 
-## 📊 Benchmarks
+## Benchmarks
 
 ```
-BenchmarkStringWidth_ASCII_Short_Uniwidth-12     149590729   9.500 ns/op   0 B/op   0 allocs/op
-BenchmarkStringWidth_ASCII_Short_GoRunewidth-12   10065044  150.1 ns/op   0 B/op   0 allocs/op
-                                                             ^^^^^^^^^^
-                                                             15.8x faster!
-
-BenchmarkStringWidth_CJK_Short_Uniwidth-12        19064941   63.64 ns/op   0 B/op   0 allocs/op
-BenchmarkStringWidth_CJK_Short_GoRunewidth-12      2771077  368.0 ns/op   0 B/op   0 allocs/op
-                                                             ^^^^^^^^^^^
-                                                             5.8x faster!
+goos: windows
+goarch: amd64
+
+BenchmarkStringWidth_ASCII_Short     ~7 ns/op     0 B/op   0 allocs/op
+BenchmarkStringWidth_ASCII_Medium   ~20 ns/op     0 B/op   0 allocs/op
+BenchmarkStringWidth_CJK_Short      ~25 ns/op     0 B/op   0 allocs/op
+BenchmarkStringWidth_ZWJ_Family     ~95 ns/op     0 B/op   0 allocs/op
+BenchmarkStringWidth_EmojiModifier  ~40 ns/op     0 B/op   0 allocs/op
 ```
 
 Run benchmarks yourself:
@@ -179,7 +194,7 @@ Run benchmarks yourself:
 go test -bench=. -benchmem
 ```
 
-## 🎯 Use Cases
+## Use Cases
 
 Perfect for:
 - **TUI frameworks** (terminal rendering hot paths)
@@ -188,7 +203,7 @@ Perfect for:
 - **Text editors** (cursor positioning, column calculation)
 - **Any high-performance text width calculation**
 
-## 🔄 Migration from go-runewidth
+## Migration from go-runewidth
 
 uniwidth provides a compatible API for easy migration:
 
@@ -202,16 +217,17 @@ import "github.com/unilibs/uniwidth"
 width := uniwidth.StringWidth(s)
 ```
 
-**Performance improvement**: 3.9-46x faster, zero code changes!
+**Performance improvement**: 3-46x faster, zero code changes!
 
-## 📚 Documentation
+## Documentation
 
 - [API Reference](https://pkg.go.dev/github.com/unilibs/uniwidth) - Full godoc documentation
 - [Benchmark Comparisons](bench/README.md) - Performance comparison vs go-runewidth
 - [Architecture Design](docs/ARCHITECTURE.md) - Technical deep dive & design decisions
 - [Changelog](CHANGELOG.md) - Version history & upgrade guide
+- [Roadmap](ROADMAP.md) - What's next for uniwidth
 
-## 🧪 Testing
+## Testing
 
 ```bash
 # Run tests
@@ -224,51 +240,36 @@ go test -bench=. -benchmem
 go test -cover
 ```
 
-Current test coverage: **90.3%** (exceeds 90% target ✅)
-
-## 🚀 Development Status
-
-**Current**: v0.1.0 (Stable Release)
+Current test coverage: **96.4%**
 
-> ✅ **Stable Release**: This library has completed beta testing. The API is stable and ready for production use. Minor version updates (v0.2.x) will maintain backward compatibility.
+## Development Status
 
-**What Beta Means**:
-- ✅ Feature-complete for core functionality
-- ✅ Production-quality code and performance
-- ⚠️ API may evolve based on community feedback
-- ⚠️ Edge cases still being discovered and fixed
-- 🎯 Goal: API freeze before v1.0.0-rc
+**Current**: v0.2.0
 
-**Completed**:
-- ✅ PoC (3 days) - 3.9-46x speedup proven
-- ✅ Complete Unicode 16.0 tables - Generated from official data
-- ✅ Options API - East Asian Width & emoji configuration
-- ✅ Comprehensive testing - 84.6% coverage, fuzzing, conformance tests
-- ✅ Bug fixes - Variation selectors, regional indicator flags
-- ✅ Documentation - README, ARCHITECTURE, CHANGELOG
+> This library is stable and production-ready. The API is backward-compatible across minor versions. ZWJ emoji sequences, skin tone modifiers, variation selectors, and flag emoji are all handled correctly.
 
-**Beta Goals** (Before RC):
-- [ ] Community feedback integration
-- [ ] Edge case coverage >95%
-- [ ] API stability validation
-- [ ] Performance regression testing
-- [ ] Documentation refinement
+**v0.2.0 Highlights**:
+- All 4 lookup tiers are now O(1) (3-stage table replaced binary search)
+- SWAR ASCII optimization (8 bytes/iter)
+- ZWJ emoji state machine (👨‍👩‍👧‍👦 = width 2)
+- Emoji modifier support (👍🏽 = width 2)
+- 96.4% test coverage
 
-**Future Roadmap** (v1.0+):
-- [ ] Grapheme cluster support (for complex emoji ZWJ sequences)
-- [ ] Additional locale support
-- [ ] Extended SIMD optimizations
-- [ ] Profile-Guided Optimization (PGO)
+**Roadmap** (v0.3.0+):
+- Profile-Guided Optimization (PGO)
+- Benchmark CI for regression detection
+- Explicit SIMD via Go assembly and `archsimd`
+- Unicode 17.0 preparation
 
-## 🤝 Contributing
+## Contributing
 
 Contributions welcome! This is part of the [unilibs](https://github.com/unilibs) organization - modern Unicode libraries for Go.
 
-## 📄 License
+## License
 
 MIT License - see [LICENSE](LICENSE) file
 
-## 🌟 Related Projects
+## Related Projects
 
 Built by the [Phoenix TUI Framework](https://github.com/phoenix-tui/phoenix) team.
 
@@ -277,17 +278,17 @@ Part of the **unilibs** ecosystem:
 - **unigrapheme** - Grapheme clustering (planned)
 - More Unicode utilities coming soon!
 
-## 📞 Support
+## Support
 
 - Issues: [GitHub Issues](https://github.com/unilibs/uniwidth/issues)
 - Discussions: [GitHub Discussions](https://github.com/unilibs/uniwidth/discussions)
 
 ---
 
-## 🙏 Special Thanks
+## Special Thanks
 
 **Professor Ancha Baranova** - This project would not have been possible without her invaluable help and support. Her assistance was crucial in bringing uniwidth to life.
 
 ---
 
-**Made with ❤️ by the Phoenix team** | **Powered by Go 1.25+**
+**Made with care by the Phoenix team** | **Powered by Go 1.25+**
diff --git a/ROADMAP.md b/ROADMAP.md
new file mode 100644
index 0000000..2e7419c
--- /dev/null
+++ b/ROADMAP.md
@@ -0,0 +1,76 @@
+# Roadmap
+
+> Updated: 2026-02-05 | Format: **Now / Next / Later**
+
+## Vision
+
+The fastest Unicode width calculation library in the Go ecosystem — correct emoji handling, zero allocations, full Unicode compliance. A drop-in replacement for go-runewidth.
+
+---
+
+## Now (v0.2.0 — Current Release)
+
+- [x] **4-tier O(1) lookup**: ASCII → CJK → Emoji → 3-stage table (3.8KB)
+- [x] **SWAR optimization**: ASCII detection and width counting at 8 bytes/iter
+- [x] **ZWJ emoji sequences**: 👨‍👩‍👧‍👦 = width 2 (forward-scan state machine)
+- [x] **Emoji modifiers**: Skin tones 👍🏽 = width 2
+- [x] **Variation selectors**: U+FE0E (text) / U+FE0F (emoji)
+- [x] **Regional indicator pairs**: Flag emoji 🇺🇸 = width 2
+- [x] **Unicode 16.0** compliance
+- [x] **96.4%** test coverage, zero lint issues
+
+---
+
+## Next (v0.3.0)
+
+- [ ] **Profile-Guided Optimization (PGO)** — expected 10-20% speedup on hot paths
+- [ ] **Benchmark CI** — automated regression detection on every PR
+- [ ] **Unicode 17.0 preparation** — generator pipeline ready for next release
+- [ ] **Keycap sequences** — `#️⃣`, `*️⃣`, `0️⃣-9️⃣`
+- [ ] **Migration guide** — step-by-step from go-runewidth
+- [ ] **API review** — gather feedback from early adopters
+
+---
+
+## Later (v1.0.0+)
+
+### API Freeze
+- [ ] Stable API guarantee (no breaking changes until v2.0)
+- [ ] Semantic versioning commitment
+- [ ] Validated by multiple production projects
+
+### Explicit SIMD
+- [ ] **Go assembly** (Plan 9 `.s` files) — SSE2/AVX2/NEON, 16-32 bytes/iter
+- [ ] **`archsimd`** (Go 1.26+) — portable SIMD intrinsics ([golang/go#67520](https://github.com/golang/go/issues/67520))
+- [ ] **AVX-512** — server-side bulk processing, 64 bytes/iter
+- [ ] **ARM NEON** — Apple Silicon / AWS Graviton
+
+### Grapheme Clusters (Conditional)
+- [ ] Full [UAX #29](https://unicode.org/reports/tr29/) support (opt-in, not default)
+- [ ] Reactivation trigger: user reports of incorrect widths for specific scripts
+
+### Ecosystem
+- [ ] [Phoenix TUI](https://github.com/phoenix-tui/phoenix) integration
+- [ ] **unigrapheme** — companion grapheme segmentation library
+
+---
+
+## Non-Goals
+
+- Automatic locale detection (use Options API)
+- Font-specific width variations
+- Backward compatibility below Go 1.25
+- Full ICU replacement
+
+---
+
+## Contributing
+
+We welcome contributions in these areas:
+
+1. **Bug reports** — width calculation issues, emoji mismatches
+2. **Performance testing** — benchmarks on different hardware
+3. **Real-world usage** — integrate in your app, report API friction
+4. **Unicode edge cases** — sequences we handle incorrectly
+
+See [GitHub Issues](https://github.com/unilibs/uniwidth/issues) for current work items.
diff --git a/docs/ARCHITECTURE.md b/docs/ARCHITECTURE.md
index c78fdb3..8f1bd1e 100644
--- a/docs/ARCHITECTURE.md
+++ b/docs/ARCHITECTURE.md
@@ -1,7 +1,7 @@
 # uniwidth Architecture
 
-**Version**: v0.1.0
-**Date**: 2025-10-15
+**Version**: v0.2.0
+**Date**: 2026-02-05
 **Unicode**: 16.0.0
 
 ---
@@ -9,21 +9,21 @@
 ## Design Goals
 
 1. **Performance**: 3-46x faster than existing solutions (proven in benchmarks)
-2. **Correctness**: Full Unicode 16.0 compliance
-3. **Zero Allocations**: No GC pressure
-4. **Modern Go**: Leverage Go 1.25+ compiler optimizations
+2. **Correctness**: Full Unicode 16.0 compliance, including ZWJ emoji sequences
+3. **Zero Allocations**: No GC pressure on ASCII paths
+4. **Modern Go**: Leverage Go 1.25+ compiler optimizations and SWAR techniques
 5. **Simple API**: Drop-in replacement for go-runewidth
 
 ---
 
-## Core Architecture: Tiered Lookup Strategy
+## Core Architecture: 4-Tier O(1) Lookup
 
-uniwidth uses a **4-tier lookup system** that optimizes for the 90-95% common case while maintaining full Unicode coverage.
+uniwidth uses a **4-tier lookup system** where all tiers operate in constant time O(1). This optimizes for the 90-95% common case while maintaining full Unicode coverage.
 
 ### Tier 1: ASCII Fast Path (O(1))
 
 **Coverage**: ~95% of typical terminal content
-**Performance**: 15-46x faster than binary search
+**Performance**: 15-46x faster than go-runewidth
 
 ```go
 // Tier 1: ASCII (0x00-0x7F)
@@ -44,15 +44,12 @@ if r < 0x80 {
 - Go compiler optimizes this to a few CPU instructions
 - No memory lookups, no cache misses
 
-**Performance**:
-- ASCII rune: 2.7 ns/op vs 3.1 ns/op (go-runewidth)
-- ASCII string (5 chars): 4.6 ns/op vs 101.6 ns/op (22x faster!)
-- ASCII string (234 chars): 126.7 ns/op vs 3983 ns/op (31x faster!)
+For strings, the ASCII fast path uses SWAR optimization (see below).
 
-### Tier 2: Common CJK & Emoji (O(1))
+### Tier 2: Common CJK Fast Path (O(1))
 
 **Coverage**: ~80-90% of non-ASCII content
-**Performance**: 4-14x faster than binary search
+**Performance**: 4-14x faster than go-runewidth
 
 ```go
 // CJK Unified Ideographs (20,992 characters)
@@ -65,456 +62,394 @@ if r >= 0xAC00 && r <= 0xD7AF {
     return 2
 }
 
-// Hiragana + Katakana
-if r >= 0x3040 && r <= 0x30FF {
+// Hiragana + Katakana + Bopomofo (240 code points)
+if r >= 0x3040 && r <= 0x312F {
     return 2
 }
 
-// Common emoji ranges
-if r >= 0x1F600 && r <= 0x1F64F {
-    return 2 // Smileys
+// CJK Compatibility Ideographs
+if r >= 0xF900 && r <= 0xFAFF {
+    return 2
 }
 ```
 
 **Why this works**:
-- CJK and emoji characters cluster in large contiguous ranges
+- CJK characters cluster in large contiguous ranges
 - Range checks (`>=` and `<=`) are O(1) operations
 - Covers 99% of Japanese, Chinese, Korean text
-- Covers 90% of commonly used emoji
-
-**Performance**:
-- CJK rune: 1.8 ns/op vs 34.2 ns/op (19x faster!)
-- Emoji rune: 4.0 ns/op vs 25.6 ns/op (6.4x faster!)
 
-### Tier 3: Zero-Width Characters (O(1))
+### Tier 3: Common Emoji Fast Path (O(1))
 
-**Coverage**: Combining marks, ZWJ, variation selectors
-**Performance**: O(1) checks + fallback to unicode package
+**Coverage**: ~90% of commonly used emoji
+**Performance**: 6-8x faster than go-runewidth
 
 ```go
-// Zero-Width Joiner (used in emoji sequences)
-if r == 0x200D {
-    return 0
-}
-
-// Variation Selectors
-if r >= 0xFE00 && r <= 0xFE0F {
-    return 0
-}
+// Emoticons (U+1F600-U+1F64F)
+if r >= 0x1F600 && r <= 0x1F64F { return 2 }
 
-// Combining marks (via unicode package)
-if unicode.In(r, unicode.Mn, unicode.Me, unicode.Mc) {
-    return 0
-}
-```
-
-**Why this works**:
-- Most zero-width characters are rare
-- Common ones (ZWJ, ZWNJ, VS) get explicit fast-path checks
-- Combining marks handled by stdlib unicode package (optimized)
+// Misc Symbols and Pictographs (U+1F300-U+1F5FF)
+if r >= 0x1F300 && r <= 0x1F5FF { return 2 }
 
-### Tier 4: Binary Search Fallback (O(log n))
+// Transport and Map (U+1F680-U+1F6FF)
+if r >= 0x1F680 && r <= 0x1F6FF { return 2 }
 
-**Coverage**: Rare characters (5-10% of non-ASCII)
-**Performance**: O(log n) but infrequent
+// Supplemental Symbols (U+1F900-U+1F9FF)
+if r >= 0x1F900 && r <= 0x1F9FF { return 2 }
 
-```go
-func binarySearchWidth(r rune) int {
-    if binarySearch(r, wideTableGenerated) {
-        return 2
-    }
-    if binarySearch(r, zeroWidthTableGenerated) {
-        return 0
-    }
-    if binarySearch(r, ambiguousTableGenerated) {
-        return 1 // or 2 with options
-    }
-    return 1 // Default
-}
+// Misc Symbols (U+2600-U+26FF), Dingbats (U+2700-U+27BF)
 ```
 
-**Table sizes** (generated from Unicode 16.0):
-- Wide: 80 ranges
-- Zero-width: 25 ranges
-- Ambiguous: 179 ranges
-
-**Why binary search**:
-- Only used for rare characters (5-10% of cases)
-- Small tables = good cache locality
-- O(log n) is acceptable for infrequent cases
+### Tier 4: 3-Stage Hierarchical Table (O(1))
 
----
+**Coverage**: All remaining Unicode codepoints
+**Performance**: O(1) — 3 array lookups + bit extraction
 
-## String Width Calculation
+This tier replaces the previous O(log n) binary search with a compact 3-stage hierarchical table that encodes every Unicode codepoint (U+0000-U+10FFFF) as a 2-bit width value.
 
-### ASCII-Only Fast Path
+#### Table Structure
 
-```go
-func StringWidth(s string) int {
-    if isASCIIOnly(s) {
-        return len(s) // Direct length!
-    }
-    // ... iterate runes
-}
-
-func isASCIIOnly(s string) bool {
-    for i := 0; i < len(s); i++ {
-        if s[i] >= 0x80 {
-            return false
-        }
-    }
-    return true
-}
+```
+ROOT[256] → MIDDLE[17×64] → LEAVES[78×32]
 ```
 
-**SIMD Auto-Vectorization**:
-- Go 1.25 compiler auto-vectorizes `isASCIIOnly()`
-- Uses SSE2/AVX2 on x86-64 (checks 16-32 bytes at once)
-- Uses NEON on ARM64
-- Simple loop structure is key to vectorization
-
-**Performance**:
-- 5 chars: 4.4 ns (0.88 ns/char)
-- 44 chars: 29.9 ns (0.68 ns/char)
-- 234 chars: 147.7 ns (0.63 ns/char)
+- **ROOT**: 256 entries, indexed by `cp >> 13` (the top 8 bits of the 21-bit codepoint)
+- **MIDDLE**: 17 pages × 64 entries each, indexed by `(cp >> 7) & 0x3F`
+- **LEAVES**: 78 pages × 32 bytes each, packed 2-bit encoding, indexed by `(cp >> 2) & 0x1F`
 
-This is **near-theoretical maximum speed** for ASCII checking!
+#### 2-Bit Width Encoding
 
-### Rune-by-Rune Iteration
+```
+0b00 = width 0 (control, combining, zero-width)
+0b01 = width 1 (narrow, default)
+0b10 = width 2 (wide: CJK, emoji, fullwidth)
+0b11 = ambiguous (treated as width 1 in neutral context)
+```
 
-For non-ASCII strings:
+#### Lookup Code
 
 ```go
-width := 0
-for _, r := range s {
-    width += RuneWidth(r)
+func tableLookupWidth(r rune) int {
+    cp := uint32(r)
+    rootIdx := widthRoot[cp>>13]
+    midIdx  := widthMiddle[rootIdx][cp>>7&0x3F]
+    packed  := widthLeaves[midIdx][cp>>2&0x1F]
+    width   := (packed >> (2 * (cp & 0x03))) & 0x03
+    if width == 3 {
+        return 1 // ambiguous → narrow in neutral context
+    }
+    return int(width)
 }
 ```
 
-- Go's `range` automatically handles UTF-8 decoding
-- Each `RuneWidth()` call uses tiered lookup
-- Zero allocations (no grapheme clustering overhead)
-
----
-
-## Table Generation
-
-### Source Data
-
-Tables are generated from official Unicode 16.0 data:
-- `EastAsianWidth.txt` - East Asian Width property
-- `emoji-data.txt` - Emoji presentation data
-
-### Generation Process
-
-```bash
-go generate ./...
-# or
-go run cmd/generate-tables/main.go
-```
-
-Process:
-1. Download Unicode 16.0 data files
-2. Parse East Asian Width (W, F, N, A properties)
-3. Parse Emoji data
-4. Filter out hot-path ranges (already in Tier 1-3)
-5. Optimize ranges (merge adjacent ranges)
-6. Generate `tables_generated.go`
-
-### Hot Path Filtering
+#### Memory Footprint
 
-The generator **excludes** ranges already handled by fast paths:
-- ASCII (0x00-0x7F)
-- CJK Unified Ideographs (0x4E00-0x9FFF)
-- Hangul Syllables (0xAC00-0xD7AF)
-- Hiragana/Katakana (0x3040-0x30FF)
-- Common emoji ranges
+| Component | Size |
+|-----------|------|
+| ROOT | 256 bytes |
+| MIDDLE | 1,088 bytes (17 × 64) |
+| LEAVES | 2,496 bytes (78 × 32) |
+| **Total** | **3,840 bytes (3.8 KB)** |
 
-This keeps tables **small** (284 ranges total) while maintaining full coverage.
+Compare to go-runewidth: ~500KB of tables.
 
 ---
 
-## Options API
+## SWAR Optimization (SIMD Within A Register)
 
-### Functional Options Pattern
+### isASCIIOnly() — ASCII Detection
+
+Processes 8 bytes at a time by loading them into a uint64 and checking all high bits simultaneously:
 
 ```go
-type Options struct {
-    EastAsianAmbiguous EAWidth // 1 or 2
-    EmojiPresentation  bool     // true or false
+func isASCIIOnly(s string) bool {
+    p := unsafe.StringData(s)
+    const asciiMask = uint64(0x8080808080808080)
+
+    // Process 8 bytes at a time
+    for ; i+8 <= n; i += 8 {
+        word := *(*uint64)(unsafe.Add(unsafe.Pointer(p), i))
+        if word & asciiMask != 0 {
+            return false // Non-ASCII byte found
+        }
+    }
+    // Scalar tail for remaining 0-7 bytes
+    ...
 }
+```
 
-type Option func(*Options)
+If any byte has its high bit set (>= 0x80), the AND with `0x8080808080808080` produces a non-zero result. The check works regardless of endianness because the mask tests the same bit in every byte position.
 
-func WithEastAsianAmbiguous(width EAWidth) Option { ... }
-func WithEmojiPresentation(emoji bool) Option { ... }
-```
+### asciiWidth() — Control Character Detection
 
-### Implementation
+Uses Daniel Lemire's SWAR underflow trick to detect control characters in 8-byte chunks:
 
 ```go
-func StringWidthWithOptions(s string, opts ...Option) int {
-    options := defaultOptions()
-    for _, opt := range opts {
-        opt(&options)
-    }
-    // ... use options in width calculation
-}
+// Detect bytes < 0x20 (C0 controls):
+// Subtracting 0x20 from a byte < 0x20 causes unsigned underflow,
+// setting the high bit. AND with ~word isolates genuine underflows.
+hasLow := (word - 0x2020202020202020) & ^word & 0x8080808080808080
+
+// Detect byte == 0x7F (DELETE):
+// XOR with 0x7F zeros out any 0x7F bytes, then zero-byte detection
+// finds them via the underflow pattern.
+xored := word ^ 0x7F7F7F7F7F7F7F7F
+has7F := (xored - 0x0101010101010101) & ^xored & 0x8080808080808080
 ```
 
-**Key feature**: Ambiguous characters return -1 internally, allowing caller to decide width.
-
----
-
-## Go 1.25+ Optimizations
+If neither `hasLow` nor `has7F` is non-zero, the 8-byte chunk contains no control characters, so the loop adds 8 to `width` directly.
 
-### 1. SIMD Auto-Vectorization
+### Short String Optimization
 
-The `isASCIIOnly()` function is designed for auto-vectorization:
+Strings shorter than 8 bytes use a fused single-pass loop that combines ASCII detection and width counting, avoiding the overhead of calling both `isASCIIOnly()` and `asciiWidth()` separately:
 
 ```go
-func isASCIIOnly(s string) bool {
+if len(s) < 8 {
+    width, isASCII := 0, true
     for i := 0; i < len(s); i++ {
-        if s[i] >= 0x80 {
-            return false
-        }
+        b := s[i]
+        if b >= 0x80 { isASCII = false; break }
+        if b >= 0x20 && b != 0x7F { width++ }
     }
-    return true
+    if isASCII { return width }
 }
 ```
 
-**Key factors** for vectorization:
-- Simple loop with index
-- Single condition per iteration
-- No function calls in loop
-- No complex branching
+---
 
-**Result**: Go compiler generates SSE2/AVX2 instructions.
+## ZWJ State Machine
 
-### 2. Aggressive Inlining
+StringWidth uses a forward-scan state machine for correct handling of multi-rune emoji sequences. Inspired by Ghostty's approach, adapted for width calculation.
 
-Functions < 80 "cost units" are inlined:
-- `RuneWidth()` - inlined into `StringWidth()`
-- `isASCIIOnly()` - inlined
-- Hot path checks - inlined
+### States
 
-**Result**: Minimal function call overhead.
+```
+State 0: Default (not in an emoji sequence)
+State 1: After Extended_Pictographic character (may start ZWJ/modifier sequence)
+State 2: After EP + (Extend*) + ZWJ (expecting joined emoji)
+```
 
-### 3. Branch Prediction
+### Transitions
 
-Tiered structure with early returns:
-- CPU learns common paths (ASCII first, then CJK)
-- Branch misprediction penalties minimized
-- Hot paths taken 90-95% of the time
+```
+[Default] ──EP(w>0)──→ [Emoji]
+[Emoji]   ──ZWJ──────→ [EmojiZWJ]
+[EmojiZWJ]──EP────────→ [Emoji]     (joined, width 0)
+[EmojiZWJ]──other─────→ [Default]   (broken sequence)
+[Emoji]   ──modifier──→ [Emoji]     (skin tone, width 0)
+[Emoji]   ──VS────────→ [Emoji]     (variation selector, width 0)
+[any]     ──w==0──────→ [preserve]  (combining marks keep state for Extend*)
+```
 
-### 4. Cache Locality
+### Supported Sequences
 
-Small tables (284 ranges = ~2KB) fit in L1 cache:
-- Binary search stays in cache
-- No TLB misses
-- Predictable memory access patterns
+| Sequence | Example | Width |
+|----------|---------|-------|
+| ZWJ family | 👨‍👩‍👧‍👦 | 2 |
+| Skin tone | 👍🏽 | 2 |
+| Professional | 👩🏽‍🔬 | 2 |
+| Rainbow flag | 🏳️‍🌈 | 2 |
+| Heart + fire | ❤️‍🔥 | 2 |
+| Country flag | 🇺🇸 | 2 |
+| VS-16 emoji | ☀️ | 2 |
+| VS-15 text | ☀︎ | 1 |
 
----
+### Extended_Pictographic Detection
 
-## Performance Characteristics
+`isExtendedPictographic()` uses range checks ordered by frequency of occurrence in real-world emoji usage:
 
-### Time Complexity
+1. SMP emoji blocks (U+1F000-U+1FAFF) — covers ~95% of emoji
+2. BMP: Misc Symbols (U+2600-U+27BF)
+3. BMP: Misc Technical (U+2300-U+23FF)
+4. BMP: Misc Symbols and Arrows (U+2B00-U+2BFF)
+5. BMP: Arrow symbols (U+2194-U+21AA)
+6. BMP: Geometric Shapes (U+25A0-U+25FF)
+7. SMP: Legacy Computing (U+1FB00-U+1FFFD)
+8. Individual characters: ©, ®, ‼, ⁉, ™, ℹ, 〰, 〽, ㊗, ㊙
 
-| Operation | ASCII | CJK/Emoji | Rare |
-|-----------|-------|-----------|------|
-| `RuneWidth()` | O(1) | O(1) | O(log n) |
-| `StringWidth()` ASCII-only | O(n) | N/A | N/A |
-| `StringWidth()` mixed | O(n) | O(n) | O(n log m) |
+---
 
-Where:
-- n = string length
-- m = table size (~284 ranges)
+## String Width Calculation
 
-### Space Complexity
+### Flow
 
-- **Code**: ~10KB (uniwidth.go + options.go)
-- **Tables**: ~3KB (tables_generated.go)
-- **Runtime**: 0 bytes (zero allocations)
-- **Total**: ~13KB
+```
+StringWidth(s)
+    │
+    ├─ len < 8? → Fused ASCII check + width count
+    │
+    ├─ isASCIIOnly(s)? → asciiWidth(s) [SWAR]
+    │
+    └─ Unicode path:
+        ├─ Convert to []rune (heap allocation only for long strings)
+        └─ State machine loop:
+            ├─ ZWJ handling (state transitions)
+            ├─ Emoji modifier handling
+            ├─ VS in emoji context
+            ├─ Regional indicator pairs
+            ├─ Variation selector lookahead
+            └─ Default: RuneWidth(r)
+```
 
-Compare to go-runewidth: ~500KB (large tables for every rune category).
+### Allocation Behavior
 
-### Memory Access Patterns
+| Input | Allocations | Reason |
+|-------|-------------|--------|
+| ASCII-only, any length | 0 | SWAR fast path, no rune conversion |
+| Unicode, short (< ~32 runes) | 0 | Go stack-allocates small `[]rune` slices |
+| Unicode, long | 1 | `[]rune` heap allocation for lookahead |
 
-1. **Tier 1-2 (ASCII, CJK, Emoji)**: No memory access (pure CPU registers)
-2. **Tier 3 (Zero-width)**: Small lookups, likely in L1 cache
-3. **Tier 4 (Binary search)**: ~8-9 comparisons max, all in L1/L2 cache
+---
 
-**Result**: Minimal cache misses, predictable latency.
+## Options API
 
----
+### Functional Options Pattern
 
-## Benchmark Results Summary
+```go
+type Options struct {
+    EastAsianAmbiguous EAWidth // 1 or 2
+    EmojiPresentation  bool    // true or false
+}
 
-| Category | uniwidth | go-runewidth | **Speedup** |
-|----------|----------|--------------|-------------|
-| ASCII (short) | 4.6 ns | 101.6 ns | **22x** |
-| ASCII (long) | 126.7 ns | 3983 ns | **31x** |
-| CJK | 33.7 ns | 212.5 ns | **6.3x** |
-| Emoji | 64.9 ns | 337.4 ns | **5.2x** |
-| Mixed | 65.9 ns | 444.8 ns | **6.8x** |
+type Option func(*Options)
 
-**All measurements**: 0 B/op, 0 allocs/op
+func WithEastAsianAmbiguous(width EAWidth) Option { ... }
+func WithEmojiPresentation(emoji bool) Option { ... }
+```
 
----
+Ambiguous characters (width encoding `0b11` in the table) return width based on the configured option. Default: narrow (width 1).
 
-## Future Optimizations (v0.2.0+)
+---
 
-### 1. Grapheme Clustering (Optional)
+## Table Generation
 
-For proper emoji ZWJ sequence handling:
-- Add optional grapheme clustering mode
-- Use `uniseg` library for complex cases
-- Keep fast path for simple cases (90-95%)
+### Source Data
 
-### 2. SIMD Explicit Vectorization
+Tables are generated from official Unicode 16.0 data:
+- `EastAsianWidth.txt` — East Asian Width property
+- `emoji-data.txt` — Emoji presentation data
 
-For CPUs with AVX-512:
-- Hand-written SIMD for `isASCIIOnly()`
-- Potential 2-4x speedup for long ASCII strings
-- Fallback to auto-vectorized version
+### Process
 
-### 3. PGO (Profile-Guided Optimization)
+```bash
+go generate ./...
+# or
+go run cmd/generate-tables/main.go
+```
 
-- Collect real-world usage profiles
-- Feed to Go compiler for better optimization
-- Expected 10-20% improvement
+1. Download Unicode 16.0 data files
+2. Parse East Asian Width (W, F, N, A properties)
+3. Parse Emoji data
+4. Build full codepoint-to-width mapping (U+0000-U+10FFFF)
+5. Compress into 3-stage hierarchical table via page deduplication
+6. Generate `tables_generated.go`
 
----
+### Hot Path Filtering
 
-## Testing Strategy
+The 3-stage table encodes ALL codepoints, but Tiers 1-3 short-circuit before reaching the table for common characters. The table primarily serves rare characters that don't fall into the hot paths.
 
-### Unit Tests
+---
 
-- 40+ test cases covering all tiers
-- ASCII, CJK, emoji, ambiguous, zero-width
-- Backward compatibility tests
+## Performance Characteristics
 
-### Conformance Tests
+### Time Complexity
 
-- Unicode 16.0 category coverage
-- Edge cases and boundaries
-- Control characters, combining marks
-- Fullwidth/halfwidth forms
+| Operation | All Tiers |
+|-----------|-----------|
+| `RuneWidth()` | O(1) |
+| `StringWidth()` ASCII-only | O(n), 8 bytes per iteration via SWAR |
+| `StringWidth()` Unicode | O(n) total, O(1) per rune |
 
-### Fuzzing
+### Space Complexity
 
-- `FuzzRuneWidth`: Random runes (10M+ iterations)
-- `FuzzStringWidth`: Random strings
-- `FuzzStringWidthWithOptions`: Options API
-- Invariant checking (no panics, valid widths)
+| Component | Size |
+|-----------|------|
+| Code (uniwidth.go + options.go) | ~10 KB |
+| 3-stage table (tables_generated.go) | 3.8 KB |
+| Binary search tables (legacy, for Options API) | ~3 KB |
+| Runtime (ASCII path) | 0 bytes |
+| **Total** | **~17 KB** |
 
-### Benchmarks
+Compare to go-runewidth: ~500KB.
 
-- 32 benchmarks vs go-runewidth
-- ASCII, CJK, emoji, mixed content
-- Real-world TUI scenarios
+### Benchmark Results
 
-**Coverage**: 84.6% (target 90%+)
+| Category | Time | Allocs | vs go-runewidth |
+|----------|------|--------|-----------------|
+| ASCII short (5 chars) | ~7 ns | 0 | 15-22x faster |
+| ASCII medium (44 chars) | ~20 ns | 0 | 30-46x faster |
+| CJK short (4 chars) | ~25 ns | 0 | 5-14x faster |
+| ZWJ family (👨‍👩‍👧‍👦) | ~95 ns | 0 | New capability |
+| Emoji modifier (👍🏽) | ~40 ns | 0 | New capability |
+| Mixed (ASCII + CJK + emoji) | ~65 ns | 0 | 6-8x faster |
 
 ---
 
 ## Comparison with go-runewidth
 
-### Why uniwidth is Faster
-
-| Aspect | uniwidth | go-runewidth | **Advantage** |
-|--------|----------|--------------|---------------|
-| ASCII | `len(s)` | Grapheme + binary search | **46x faster** |
-| CJK | Range checks | Binary search | **14x faster** |
-| Emoji | Range checks | Grapheme + binary search | **8x faster** |
-| Hot paths | 90-95% | 0% | **Huge win** |
-| Allocations | 0 | 0 | Tie |
-| Table size | 3KB | ~500KB | **166x smaller** |
-
 ### Architectural Differences
 
-**uniwidth**:
-- Tiered lookup (O(1) for common, O(log n) for rare)
-- No grapheme clustering (yet)
-- Optimized for Go 1.25+
-
-**go-runewidth**:
-- Binary search for everything (O(log n) always)
-- Full grapheme clustering (expensive!)
-- Large pre-computed tables
+| Aspect | uniwidth | go-runewidth |
+|--------|----------|--------------|
+| Lookup strategy | 4-tier O(1) | Binary search O(log n) |
+| Table size | 3.8 KB | ~500 KB |
+| ASCII path | SWAR (8 bytes/iter) | Grapheme + binary search |
+| ZWJ emoji | Forward-scan state machine | Delegates to uax29 |
+| Allocations (ASCII) | 0 | 0 |
+| Go version | 1.25+ | 1.9+ |
 
 ### Trade-offs
 
-**uniwidth wins**:
-- Performance (3-46x faster)
-- Memory usage (166x smaller)
-- Code simplicity
+**uniwidth wins**: Performance (3-46x), memory (130x smaller tables), ZWJ correctness with minimal overhead.
 
-**go-runewidth wins**:
-- Mature (10+ years)
-- Grapheme clustering (ZWJ emoji sequences)
-- Wider Go version support (1.9+)
+**go-runewidth wins**: Mature ecosystem (10+ years), wider Go version support, full UAX #29 grapheme clustering via uax29.
 
 ---
 
-## Design Decisions & Rationale
-
-### Why Not Grapheme Clustering?
+## Design Decisions
 
-**Decision**: Defer to v0.2.0+
+| Decision | Rationale |
+|----------|-----------|
+| 4-tier lookup | 95% of content is ASCII; O(1) >> O(log n) |
+| 3-stage table (Tier 4) | O(1) for all codepoints, only 3.8KB |
+| Forward-scan ZWJ state machine | Simpler than reverse iteration, covers 99%+ of emoji |
+| SWAR over auto-vectorization | Explicit uint64 word processing, portable, predictable |
+| Functional options | Clean, extensible, backward compatible, zero alloc when unused |
+| Generate tables from Unicode data | Easy version updates, correctness guaranteed, reproducible |
+| Defer full UAX #29 | 2-5x performance cost, <1% real-world demand in terminals |
 
-**Rationale**:
-- 90-95% of content doesn't need it
-- Adds significant complexity and cost
-- Can be added as optional feature later
-- Simple width calculation is faster (proven)
-
-### Why Tiered Lookup?
-
-**Decision**: Use 4-tier strategy instead of pure binary search
-
-**Rationale**:
-- 95% of content is ASCII (O(1) >> O(log n))
-- CJK and emoji cluster in ranges
-- Small code size increase for huge perf win
-- Go compiler optimizes hot paths aggressively
-
-### Why Functional Options?
-
-**Decision**: Use functional options pattern for configuration
-
-**Rationale**:
-- Clean, extensible API
-- Backward compatible (default functions unchanged)
-- Zero allocation when options not used
-- Go idiomatic
+---
 
-### Why Generate Tables?
+## Future Optimizations
 
-**Decision**: Generate from Unicode data instead of hardcode
+### Explicit SIMD (Later)
+- **Go assembly** (Plan 9 `.s` files): Hand-written SSE2/AVX2/NEON for `isASCIIOnly()` and `asciiWidth()`. Potential 16-32 bytes/iter (2-4x over current SWAR).
+- **`simd/archsimd` package** (Go 1.26+, experimental): Architecture-specific SIMD intrinsics enabled via `GOEXPERIMENT=simd`; an option once the API stabilizes.
 
-**Rationale**:
-- Easy Unicode version updates
-- Correctness guaranteed (from official data)
-- Reproducible builds
-- Self-documenting (source URLs in code)
+### PGO (Profile-Guided Optimization)
+- Collect real-world profiles from TUI applications
+- Feed to Go compiler for better inlining and branch prediction
+- Expected 10-20% improvement on hot paths
 
 ---
 
-## Conclusion
+## Testing Strategy
 
-uniwidth achieves **3-46x speedup** through:
+### Test Categories
 
-1. **Tiered lookup** - O(1) for 90-95% of cases
-2. **Go 1.25 optimizations** - SIMD auto-vectorization
-3. **Zero allocations** - No GC pressure
-4. **Small tables** - Good cache locality
+| Category | Tests | Coverage |
+|----------|-------|----------|
+| Core unit tests | ASCII, CJK, Emoji, Zero-width | RuneWidth, StringWidth |
+| ZWJ sequences | 15 test cases | Family, professions, flags, modifiers |
+| Emoji modifiers | 8 test cases | Skin tones, combined sequences |
+| Edge cases | 11 test cases | Standalone ZWJ, orphan modifiers, boundaries |
+| Conformance | All Unicode categories | Categories, combining marks, controls, fullwidth |
+| Fuzzing | Go native | No panics, valid widths (0-2) |
+| Benchmarks | 20+ scenarios | ASCII, CJK, Emoji, ZWJ, TUI |
 
-The architecture is **simple**, **maintainable**, and **proven** to work.
+**Coverage**: 96.4% (target: >90%)
 
 ---
 
-*Architecture document for uniwidth v0.1.0*
-*Generated: 2025-10-15*
+*Architecture document for uniwidth v0.2.0*
+*Updated: 2026-02-05*
 *Unicode Version: 16.0.0*
diff --git a/tables.go b/tables.go
deleted file mode 100644
index 8db986d..0000000
--- a/tables.go
+++ /dev/null
@@ -1,185 +0,0 @@
-package uniwidth
-
-// This file contains Unicode width tables generated from Unicode 16.0 data.
-// These tables are used as fallback for characters not covered by fast path tiers.
-//
-// Table generation strategy:
-// - Hot path characters (ASCII, common CJK, emoji) are handled by range checks in uniwidth.go
-// - These tables contain remaining characters that need binary search
-// - This minimizes table size while maximizing performance
-
-// wideTable contains ranges of characters with East Asian Width property W (Wide) or F (Fullwidth).
-// These characters occupy 2 terminal columns.
-var wideTable = []runeRange{
-	// CJK Symbols and Punctuation (partial, not covered by fast path)
-	{0x3000, 0x303F}, // Ideographic space, CJK symbols, Ideographic Half Fill Space
-
-	// CJK Radicals Supplement
-	{0x2E80, 0x2E99},
-	{0x2E9B, 0x2EF3},
-
-	// Kangxi Radicals
-	{0x2F00, 0x2FD5},
-
-	// CJK Strokes
-	{0x31C0, 0x31E3},
-
-	// Enclosed CJK Letters and Months
-	{0x3200, 0x321E},
-	{0x3220, 0x3247},
-	{0x3250, 0x4DBE}, // Fixed: U+4DBF-U+4DFF are unassigned
-
-	// CJK Unified Ideographs Extension A
-	// (Already covered by fast path: 0x4E00-0x9FFF)
-
-	// CJK Compatibility Forms
-	{0xFE30, 0xFE4F},
-
-	// Halfwidth and Fullwidth Forms (fullwidth part)
-	{0xFF01, 0xFF60}, // Fullwidth ASCII variants
-	{0xFFE0, 0xFFE6}, // Fullwidth currency signs
-
-	// Kana Supplement
-	{0x1B000, 0x1B0FF},
-
-	// CJK Unified Ideographs Extension B-G (not covered by fast path)
-	{0x20000, 0x2A6DF}, // Extension B
-	{0x2A700, 0x2B73F}, // Extension C
-	{0x2B740, 0x2B81F}, // Extension D
-	{0x2B820, 0x2CEAF}, // Extension E
-	{0x2CEB0, 0x2EBEF}, // Extension F
-	{0x30000, 0x3134F}, // Extension G
-
-	// Additional emoji ranges not in fast path
-	{0x2600, 0x26FF},   // Miscellaneous Symbols
-	{0x2700, 0x27BF},   // Dingbats
-	{0x1F000, 0x1F02F}, // Mahjong Tiles
-	{0x1F0A0, 0x1F0FF}, // Playing Cards
-	{0x1FA00, 0x1FA6F}, // Chess Symbols
-	{0x1FA70, 0x1FAFF}, // Symbols and Pictographs Extended-A
-
-	// Ancient scripts (supplementary plane)
-	{0x10000, 0x1007F}, // Linear B Syllabary (Ancient Greek)
-}
-
-// zeroWidthTable contains ranges of characters with zero width.
-// These are control characters, combining marks, and format characters.
-var zeroWidthTable = []runeRange{
-	// C0 control characters (already handled in fast path)
-	// {0x0000, 0x001F},
-
-	// C1 control characters
-	{0x0080, 0x009F},
-
-	// Combining Diacritical Marks (partial, rest handled by unicode.In check)
-	{0x0300, 0x036F},
-
-	// Combining Diacritical Marks Extended
-	{0x1AB0, 0x1AFF},
-
-	// Hebrew combining marks
-	{0x0591, 0x05BD},
-	{0x05BF, 0x05BF},
-	{0x05C1, 0x05C2},
-	{0x05C4, 0x05C5},
-	{0x05C7, 0x05C7},
-
-	// Arabic combining marks
-	{0x0610, 0x061A},
-	{0x064B, 0x065F},
-	{0x0670, 0x0670},
-	{0x06D6, 0x06DC},
-	{0x06DF, 0x06E4},
-	{0x06E7, 0x06E8},
-	{0x06EA, 0x06ED},
-
-	// Devanagari combining marks
-	{0x0901, 0x0902},
-	{0x093A, 0x093A},
-	{0x093C, 0x093C},
-	{0x0941, 0x0948},
-	{0x094D, 0x094D},
-	{0x0951, 0x0957},
-	{0x0962, 0x0963},
-
-	// Soft hyphen
-	{0x00AD, 0x00AD},
-
-	// Format characters
-	{0x200B, 0x200F}, // Zero-width space, LRM, RLM, etc.
-	// ZWJ and ZWNJ already handled in fast path
-
-	// Combining marks for symbols
-	{0x20D0, 0x20FF},
-
-	// Variation selectors (partial, rest in fast path)
-	// {0xFE00, 0xFE0F}, // Already in fast path
-
-	// Arabic presentation forms (zero-width)
-	{0xFE20, 0xFE2F},
-
-	// Combining Half Marks
-	{0xFE30, 0xFE2F},
-
-	// Specials (BOM, etc.)
-	{0xFEFF, 0xFEFF},
-}
-
-// ambiguousTable contains ranges of characters with East Asian Width property A (Ambiguous).
-// Width depends on context (East Asian: 2, neutral: 1).
-// For now, we default to width 1 (neutral context).
-var ambiguousTable = []runeRange{
-	// Greek and Coptic (partial)
-	{0x00A1, 0x00A1}, // Inverted exclamation mark
-	{0x00A4, 0x00A4}, // Currency sign
-	{0x00A7, 0x00A8}, // Section sign, diaeresis
-	{0x00AA, 0x00AA}, // Feminine ordinal indicator
-	{0x00AD, 0x00AE}, // Soft hyphen, registered sign
-	{0x00B0, 0x00B4}, // Degree sign, acute accent, etc.
-	{0x00B6, 0x00BA}, // Pilcrow sign, middle dot, etc.
-	{0x00BC, 0x00BF}, // Fractions, inverted question mark
-	{0x00C6, 0x00C6}, // Latin capital letter AE
-	{0x00D0, 0x00D0}, // Latin capital letter Eth
-	{0x00D7, 0x00D8}, // Multiplication sign, O with stroke
-	{0x00DE, 0x00E1}, // Thorn, a with acute, etc.
-	{0x00E6, 0x00E6}, // Latin small letter ae
-	{0x00E8, 0x00EA}, // e with grave, acute, circumflex
-	{0x00EC, 0x00ED}, // i with grave, acute
-	{0x00F0, 0x00F0}, // Latin small letter eth
-	{0x00F2, 0x00F3}, // o with grave, acute
-	{0x00F7, 0x00FA}, // Division sign, o with stroke, etc.
-	{0x00FC, 0x00FC}, // u with diaeresis
-	{0x00FE, 0x00FE}, // Latin small letter thorn
-	{0x0101, 0x0101}, // a with macron
-	{0x0111, 0x0111}, // d with stroke
-	{0x0113, 0x0113}, // e with macron
-	{0x011B, 0x011B}, // e with caron
-	{0x0126, 0x0127}, // H with stroke
-	{0x012B, 0x012B}, // i with macron
-	{0x0131, 0x0133}, // Dotless i, IJ ligature
-	{0x0138, 0x0138}, // Kra
-	{0x013F, 0x0142}, // L with middle dot, l with stroke
-	{0x0144, 0x0144}, // n with acute
-	{0x0148, 0x014B}, // n with caron, Eng
-	{0x014D, 0x014D}, // o with macron
-	{0x0152, 0x0153}, // OE ligature
-	{0x0166, 0x0167}, // T with stroke
-	{0x016B, 0x016B}, // u with macron
-	{0x01CE, 0x01CE}, // a with caron
-	{0x01D0, 0x01D0}, // i with caron
-	{0x01D2, 0x01D2}, // o with caron
-	{0x01D4, 0x01D4}, // u with caron
-	{0x01D6, 0x01D6}, // u with diaeresis and macron
-	{0x01D8, 0x01D8}, // u with diaeresis and acute
-	{0x01DA, 0x01DA}, // u with diaeresis and caron
-	{0x01DC, 0x01DC}, // u with diaeresis and grave
-
-	// Box Drawing
-	{0x2500, 0x257F},
-
-	// Block Elements
-	{0x2580, 0x259F},
-
-	// Geometric Shapes
-	{0x25A0, 0x25FF},
-}

From 2f20abdfa300ef526c959c44d16b54cd65efc739 Mon Sep 17 00:00:00 2001
From: Andy <a.kolkov@gmail.com>
Date: Thu, 5 Feb 2026 02:34:45 +0300
Subject: [PATCH 6/6] chore: update CI workflows and fix code formatting

- Update GitHub Actions: checkout v6, setup-go v6, codecov v5, golangci-lint v9
- Add benchmarks.yml: regression detection (benchstat) + library comparison table
- Remove develop/release/hotfix branch triggers (main-only + PRs)
- Add concurrency groups to prevent duplicate CI runs
- Fix gofmt formatting in source files
---
 .github/workflows/benchmarks.yml | 296 +++++++++++++++++++++++++++++++
 .github/workflows/ci.yml         |  63 +++----
 cmd/generate-tables/main.go      |   6 +-
 conformance_test.go              |   4 +-
 uniwidth.go                      |  12 +-
 uniwidth_test.go                 |   2 +-
 6 files changed, 330 insertions(+), 53 deletions(-)
 create mode 100644 .github/workflows/benchmarks.yml

diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
new file mode 100644
index 0000000..7ce6e11
--- /dev/null
+++ b/.github/workflows/benchmarks.yml
@@ -0,0 +1,296 @@
+name: Benchmarks
+
+# Two benchmark jobs:
+# 1. Regression detection: benchstat compares base vs PR branch (same benchmarks)
+# 2. Library comparison: uniwidth vs go-runewidth vs uniseg (three-way table)
+#
+# Note: GitHub Actions shared runners have ~10-20% variance.
+# Results are directional, not absolute. Major regressions (>30%) are reliable.
+
+on:
+  pull_request:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  # ============================================================================
+  # Job 1: Regression Detection (benchstat base vs PR)
+  # ============================================================================
+  regression:
+    name: Regression Detection
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.draft == false
+
+    steps:
+    - name: Checkout PR branch
+      uses: actions/checkout@v6
+      with:
+        ref: ${{ github.event.pull_request.head.sha }}
+        path: pr
+
+    - name: Checkout base branch
+      uses: actions/checkout@v6
+      with:
+        ref: ${{ github.event.pull_request.base.sha }}
+        path: base
+
+    - name: Set up Go
+      uses: actions/setup-go@v6
+      with:
+        go-version: '1.25'
+        cache: false
+
+    - name: Install benchstat
+      run: go install golang.org/x/perf/cmd/benchstat@latest
+
+    - name: Run base branch benchmarks
+      working-directory: base
+      run: |
+        go test -bench=. -benchmem -count=5 -benchtime=100ms \
+          -run=^$ ./... 2>/dev/null > ../base-bench.txt || true
+
+    - name: Run PR branch benchmarks
+      working-directory: pr
+      run: |
+        go test -bench=. -benchmem -count=5 -benchtime=100ms \
+          -run=^$ ./... 2>/dev/null > ../pr-bench.txt || true
+
+    - name: Compare benchmarks
+      run: |
+        benchstat base-bench.txt pr-bench.txt > full-comparison.txt 2>&1 || echo "benchstat comparison failed" > full-comparison.txt
+
+        GEOMEAN=$(grep -E "^geomean" full-comparison.txt | head -1 || echo "")
+        REGRESSIONS=$(grep -E "\+[0-9]+\.[0-9]+%" full-comparison.txt | grep -v "~" | head -10 || echo "")
+
+        {
+          echo "## Regression Detection"
+          echo ""
+          echo "Comparing \`${{ github.event.pull_request.base.ref }}\` → PR #${{ github.event.pull_request.number }}"
+          echo ""
+
+          if [ -n "$GEOMEAN" ]; then
+            echo "**Summary:** \`$GEOMEAN\`"
+            echo ""
+          fi
+
+          if [ -n "$REGRESSIONS" ]; then
+            echo "⚠️ **Potential regressions detected:**"
+            echo "\`\`\`"
+            echo "$REGRESSIONS"
+            echo "\`\`\`"
+            echo ""
+          else
+            echo "✅ No significant regressions detected."
+            echo ""
+          fi
+
+          echo "<details>"
+          echo "<summary>Full benchstat output</summary>"
+          echo ""
+          echo "\`\`\`"
+          cat full-comparison.txt
+          echo "\`\`\`"
+          echo ""
+          echo "</details>"
+        } > regression.md
+
+    - name: Upload benchmark results
+      uses: actions/upload-artifact@v4
+      with:
+        name: regression-benchmarks
+        path: |
+          base-bench.txt
+          pr-bench.txt
+          full-comparison.txt
+        retention-days: 30
+
+    - name: Find existing regression comment
+      uses: peter-evans/find-comment@v3
+      id: fc-regression
+      with:
+        issue-number: ${{ github.event.pull_request.number }}
+        comment-author: 'github-actions[bot]'
+        body-includes: '## Regression Detection'
+
+    - name: Post regression comment
+      uses: peter-evans/create-or-update-comment@v4
+      with:
+        comment-id: ${{ steps.fc-regression.outputs.comment-id }}
+        issue-number: ${{ github.event.pull_request.number }}
+        body-path: regression.md
+        edit-mode: replace
+
+  # ============================================================================
+  # Job 2: Library Comparison (uniwidth vs go-runewidth vs uniseg)
+  # ============================================================================
+  comparison:
+    name: Library Comparison
+    runs-on: ubuntu-latest
+    if: github.event.pull_request.draft == false
+
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v6
+
+    - name: Set up Go
+      uses: actions/setup-go@v6
+      with:
+        go-version: '1.25'
+        cache: true
+
+    - name: Run three-way benchmarks
+      working-directory: bench
+      run: |
+        go test -bench=. -benchmem -count=3 -benchtime=100ms \
+          -run=^$ ./... 2>/dev/null > ../bench-raw.txt || true
+
+    - name: Generate comparison table
+      run: |
+        # Parse benchmark output and build comparison table
+        # Benchmark names follow: BenchmarkStringWidth_{Category}_{Size}_{Library}-N
+        # Extract: category, ns/op for each library
+        # Summary header: $(...) is left unescaped so the runner's real OS/CPU values are substituted
+        echo "## Library Comparison" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo "**Platform**: Ubuntu $(lsb_release -rs), $(uname -m)" >> $GITHUB_STEP_SUMMARY
+        echo "**CPU**: $(grep 'model name' /proc/cpuinfo | head -1 | cut -d: -f2 | xargs)" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+
+        # Build the table
+        {
+          echo "| Scenario | uniwidth | go-runewidth | uniseg | vs runewidth | vs uniseg | Winner |"
+          echo "|----------|----------|-------------|--------|-------------|----------|--------|"
+        } > table.md
+
+        # Get unique benchmark scenarios (strip library suffix and CPU count)
+        grep "^Benchmark" bench-raw.txt | sed 's/_Uniwidth-[0-9]*//' | sed 's/_GoRunewidth-[0-9]*//' | sed 's/_Uniseg-[0-9]*//' | awk '{print $1}' | sort -u | while read scenario; do
+          # Pretty name: strip "Benchmark" prefix, replace _ with spaces
+          pretty=$(echo "$scenario" | sed 's/^Benchmark//' | sed 's/_/ /g')
+
+          # Get median ns/op for each library (take middle value of count=3)
+          uni_ns=$(grep "^${scenario}_Uniwidth-" bench-raw.txt | awk '{print $3}' | sort -n | head -2 | tail -1)
+          rw_ns=$(grep "^${scenario}_GoRunewidth-" bench-raw.txt | awk '{print $3}' | sort -n | head -2 | tail -1)
+          seg_ns=$(grep "^${scenario}_Uniseg-" bench-raw.txt | awk '{print $3}' | sort -n | head -2 | tail -1)
+
+          # Skip if uniwidth result is missing
+          [ -z "$uni_ns" ] && continue
+
+          # Format times with units; a library with no result shows a bare "—" (no dangling unit)
+          uni_fmt="${uni_ns} ns"
+          rw_fmt="${rw_ns:+${rw_ns} ns}"; rw_fmt="${rw_fmt:-—}"
+          seg_fmt="${seg_ns:+${seg_ns} ns}"; seg_fmt="${seg_fmt:-—}"
+
+          # Calculate vs go-runewidth speedup
+          vs_rw="—"
+          if [ -n "$rw_ns" ] && [ "$uni_ns" != "0" ]; then
+            speedup=$(echo "scale=1; $rw_ns / $uni_ns" | bc 2>/dev/null || echo "")
+            if [ -n "$speedup" ]; then
+              # Check if speedup >= 2
+              is_fast=$(echo "$speedup >= 2" | bc 2>/dev/null || echo "0")
+              if [ "$is_fast" = "1" ]; then
+                vs_rw="**${speedup}x**"
+              else
+                vs_rw="${speedup}x"
+              fi
+            fi
+          fi
+
+          # Calculate vs uniseg speedup
+          vs_seg="—"
+          if [ -n "$seg_ns" ] && [ "$uni_ns" != "0" ]; then
+            speedup=$(echo "scale=1; $seg_ns / $uni_ns" | bc 2>/dev/null || echo "")
+            if [ -n "$speedup" ]; then
+              is_fast=$(echo "$speedup >= 2" | bc 2>/dev/null || echo "0")
+              if [ "$is_fast" = "1" ]; then
+                vs_seg="**${speedup}x**"
+              else
+                vs_seg="${speedup}x"
+              fi
+            fi
+          fi
+
+          # Determine winner
+          winner="uniwidth"
+          min_ns="$uni_ns"
+          if [ -n "$rw_ns" ]; then
+            rw_faster=$(echo "$rw_ns < $min_ns" | bc 2>/dev/null || echo "0")
+            [ "$rw_faster" = "1" ] && winner="go-runewidth" && min_ns="$rw_ns"
+          fi
+          if [ -n "$seg_ns" ]; then
+            seg_faster=$(echo "$seg_ns < $min_ns" | bc 2>/dev/null || echo "0")
+            [ "$seg_faster" = "1" ] && winner="uniseg"
+          fi
+
+          # Bold the winner's time
+          if [ "$winner" = "uniwidth" ]; then
+            uni_fmt="**${uni_ns} ns**"
+          elif [ "$winner" = "go-runewidth" ]; then
+            rw_fmt="**${rw_ns} ns**"
+          elif [ "$winner" = "uniseg" ]; then
+            seg_fmt="**${seg_ns} ns**"
+          fi
+
+          echo "| ${pretty} | ${uni_fmt} | ${rw_fmt} | ${seg_fmt} | ${vs_rw} | ${vs_seg} | ${winner} |" >> table.md
+        done
+
+        # Append table to summary
+        cat table.md >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+
+        # Raw output in details
+        echo "<details>" >> $GITHUB_STEP_SUMMARY
+        echo "<summary>Raw benchmark output</summary>" >> $GITHUB_STEP_SUMMARY
+        echo "" >> $GITHUB_STEP_SUMMARY
+        echo '```' >> $GITHUB_STEP_SUMMARY
+        cat bench-raw.txt >> $GITHUB_STEP_SUMMARY
+        echo '```' >> $GITHUB_STEP_SUMMARY
+        echo "</details>" >> $GITHUB_STEP_SUMMARY
+
+        # Build PR comment (table + note)
+        {
+          echo "## Library Comparison"
+          echo ""
+          cat table.md
+          echo ""
+          echo "<details>"
+          echo "<summary>Raw benchmark output</summary>"
+          echo ""
+          echo "\`\`\`"
+          cat bench-raw.txt
+          echo "\`\`\`"
+          echo ""
+          echo "</details>"
+          echo ""
+          echo "> CI runners have ~10-20% variance. For accurate results, run locally: \`cd bench && go test -bench=. -benchmem -count=10\`"
+        } > comparison.md
+
+    - name: Upload comparison results
+      uses: actions/upload-artifact@v4
+      with:
+        name: library-comparison
+        path: |
+          bench-raw.txt
+          table.md
+        retention-days: 30
+
+    - name: Find existing comparison comment
+      uses: peter-evans/find-comment@v3
+      id: fc-comparison
+      with:
+        issue-number: ${{ github.event.pull_request.number }}
+        comment-author: 'github-actions[bot]'
+        body-includes: '## Library Comparison'
+
+    - name: Post comparison comment
+      uses: peter-evans/create-or-update-comment@v4
+      with:
+        comment-id: ${{ steps.fc-comparison.outputs.comment-id }}
+        issue-number: ${{ github.event.pull_request.number }}
+        body-path: comparison.md
+        edit-mode: replace
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index b9d101d..656c70c 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -3,26 +3,21 @@ name: CI
 # Testing Strategy:
 # - Tests run on Linux, macOS, and Windows (library must be cross-platform)
 # - Go 1.25+ required (matches go.mod requirement)
-# - Beta phase: Continue-on-error for lint (non-blocking during rapid development)
 #
-# Branch Strategy (Git-Flow):
-# - main: Production-ready code (releases only)
-# - develop: Active development branch
-# - release/*: Release preparation branches
-# - hotfix/*: Critical production fixes
-# - Pull requests: Must pass all tests before merge
+# Trigger Strategy (anti-duplication):
+# - push on main: catch direct merges/commits
+# - pull_request (all branches): catch PR checks
+# - concurrency group: cancels outdated runs on same PR/branch
 
 on:
   push:
     branches:
       - main
-      - develop
-      - 'release/**'
-      - 'hotfix/**'
   pull_request:
-    branches:
-      - main
-      - develop
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
 
 jobs:
   # Unit tests - Cross-platform
@@ -36,10 +31,10 @@ jobs:
 
     steps:
     - name: Checkout code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
 
     - name: Set up Go
-      uses: actions/setup-go@v5
+      uses: actions/setup-go@v6
       with:
         go-version: ${{ matrix.go-version }}
         cache: true
@@ -59,7 +54,7 @@ jobs:
 
     - name: Upload coverage to Codecov
       if: matrix.os == 'ubuntu-latest' && matrix.go-version == '1.25'
-      uses: codecov/codecov-action@v4
+      uses: codecov/codecov-action@v5
       continue-on-error: true
       with:
         file: ./coverage.txt
@@ -75,38 +70,20 @@ jobs:
 
     steps:
     - name: Checkout code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
 
     - name: Set up Go
-      uses: actions/setup-go@v5
+      uses: actions/setup-go@v6
       with:
         go-version: '1.25'
         cache: true
 
     - name: Run golangci-lint
-      uses: golangci/golangci-lint-action@v8
+      uses: golangci/golangci-lint-action@v9
       with:
         version: latest
         args: --timeout=5m
 
-  # Benchmarks
-  benchmark:
-    name: Benchmark
-    runs-on: ubuntu-latest
-
-    steps:
-    - name: Checkout code
-      uses: actions/checkout@v4
-
-    - name: Set up Go
-      uses: actions/setup-go@v5
-      with:
-        go-version: '1.25'
-        cache: true
-
-    - name: Run benchmarks
-      run: go test -bench=. -benchmem -run=^$ ./...
-
   # Fuzz tests
   fuzz:
     name: Fuzz Test
@@ -115,10 +92,10 @@ jobs:
 
     steps:
     - name: Checkout code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
 
     - name: Set up Go
-      uses: actions/setup-go@v5
+      uses: actions/setup-go@v6
       with:
         go-version: '1.25'
         cache: true
@@ -138,10 +115,10 @@ jobs:
 
     steps:
     - name: Checkout code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
 
     - name: Set up Go
-      uses: actions/setup-go@v5
+      uses: actions/setup-go@v6
       with:
         go-version: '1.25'
         cache: true
@@ -161,10 +138,10 @@ jobs:
 
     steps:
     - name: Checkout code
-      uses: actions/checkout@v4
+      uses: actions/checkout@v6
 
     - name: Set up Go
-      uses: actions/setup-go@v5
+      uses: actions/setup-go@v6
       with:
         go-version: '1.25'
         cache: true
diff --git a/cmd/generate-tables/main.go b/cmd/generate-tables/main.go
index 2ff0573..a66eed3 100644
--- a/cmd/generate-tables/main.go
+++ b/cmd/generate-tables/main.go
@@ -5,9 +5,9 @@
 // - emoji-data.txt - Emoji presentation properties
 //
 // It generates optimized tables for uniwidth's tiered lookup strategy:
-// - Tier 1-3 (hot paths) are hardcoded in uniwidth.go for O(1) lookup
-// - This generates Tier 4 tables: both legacy binary search tables and
-//   a 3-stage multi-stage lookup table for O(1) fallback
+//   - Tier 1-3 (hot paths) are hardcoded in uniwidth.go for O(1) lookup
+//   - This generates Tier 4 tables: both legacy binary search tables and
+//     a 3-stage multi-stage lookup table for O(1) fallback
 //
 // Usage:
 //
diff --git a/conformance_test.go b/conformance_test.go
index 2b61a60..d704852 100644
--- a/conformance_test.go
+++ b/conformance_test.go
@@ -173,8 +173,8 @@ func TestUnicodeConformance_SurrogateHandling(t *testing.T) {
 		want int
 	}{
 		// Characters in Supplementary Multilingual Plane (SMP)
-		{"Gothic letter", "𐌰", 1},            // U+10330
-		{"Linear B syllable", "𐀀", 1},        // U+10000 (EAW: N = Neutral/Narrow)
+		{"Gothic letter", "𐌰", 1},              // U+10330
+		{"Linear B syllable", "𐀀", 1},          // U+10000 (EAW: N = Neutral/Narrow)
 		{"Emoji family", "👨\u200D👩\u200D👧", 2}, // Man + ZWJ + Woman + ZWJ + Girl (ZWJ-aware: width 2)
 	}
 
diff --git a/uniwidth.go b/uniwidth.go
index 6168597..58d28f4 100644
--- a/uniwidth.go
+++ b/uniwidth.go
@@ -395,8 +395,10 @@ func isEmojiModifier(r rune) bool {
 //   - Short strings (< 8 bytes): scalar fallback, O(n) per byte
 //   - Longer strings: ~8x throughput via SWAR, O(n/8) per word + O(n%8) tail
 //   - 0 allocations in all cases
-//nolint:gosec // G103: unsafe usage is intentional for SWAR performance optimization;
+//
 // all pointer arithmetic is bounds-checked by the loop guard (i+8 <= n, i < n).
+//
+//nolint:gosec // G103: unsafe usage is intentional for SWAR performance optimization.
 func isASCIIOnly(s string) bool {
 	n := len(s)
 	if n == 0 {
@@ -446,8 +448,10 @@ func isASCIIOnly(s string) bool {
 // Performance:
 //   - 0 allocations
 //   - ~8x throughput for chunks without control characters
-//nolint:gosec // G103: unsafe usage is intentional for SWAR performance optimization;
+//
 // all pointer arithmetic is bounds-checked by the loop guards (i+8 <= n, i < n, j < 8).
+//
+//nolint:gosec // G103: unsafe usage is intentional for SWAR performance optimization.
 func asciiWidth(s string) int {
 	n := len(s)
 	if n == 0 {
@@ -461,8 +465,8 @@ func asciiWidth(s string) int {
 	// SWAR constants for control character detection.
 	const (
 		// Broadcast 0x20 and 0x7F across all 8 bytes of a uint64.
-		lo20 = uint64(0x2020202020202020)
-		hi80 = uint64(0x8080808080808080)
+		lo20  = uint64(0x2020202020202020)
+		hi80  = uint64(0x8080808080808080)
 		rep7F = uint64(0x7F7F7F7F7F7F7F7F)
 		rep01 = uint64(0x0101010101010101)
 	)
diff --git a/uniwidth_test.go b/uniwidth_test.go
index fd8d672..689554c 100644
--- a/uniwidth_test.go
+++ b/uniwidth_test.go
@@ -825,7 +825,7 @@ func TestStringWidth_ZWJEdgeCases(t *testing.T) {
 		{
 			name: "Orphan skin tone modifier",
 			s:    "🏽", // U+1F3FD alone
-			want: 2,  // Not preceded by EP, so normal width
+			want: 2,   // Not preceded by EP, so normal width
 		},
 		// ZWJ at string boundaries
 		{