Skip to content
This repository was archived by the owner on Feb 14, 2025. It is now read-only.

Commit 829b0c0

Browse files
committed
add runes
1 parent cf7c44e commit 829b0c0

File tree

5 files changed

+369
-0
lines changed

5 files changed

+369
-0
lines changed

gojo/builtins/__init__.mojo

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ from .list import equals
33
from .result import Result, WrappedError
44
from .attributes import cap, copy
55
from .errors import exit, panic
6+
7+
alias Rune = Int32

gojo/unicode/__init__.mojo

Whitespace-only changes.

gojo/unicode/utf8/__init__.mojo

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
"""Almost all of the actual implementation in this module was written by @mzaks (https://github.com/mzaks)!
2+
This would not be possible without his help.
3+
"""

gojo/unicode/utf8/runes.mojo

Lines changed: 354 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
from ...builtins import Rune
2+
from algorithm.functional import vectorize
3+
from memory.unsafe import DTypePointer
4+
from sys.info import simdwidthof
5+
from math.bit import ctlz
6+
7+
8+
# Bounds of a default continuation byte (bit pattern 10xxxxxx).
alias locb = 0b10000000  # lowest continuation byte
alias hicb = 0b10111111  # highest continuation byte

# Characters below RUNE_SELF are represented as themselves in a single byte.
alias RUNE_SELF = 0x80
12+
13+
14+
# AcceptRange gives the range of valid values for the second byte in a UTF-8
15+
# sequence.
16+
@value
struct AcceptRange(CollectionElement):
    """Range of values accepted for the second byte of a UTF-8 sequence."""

    var lo: UInt8
    """Lowest value for second byte."""
    var hi: UInt8
    """Highest value for second byte."""
20+
21+
22+
# ACCEPT_RANGES holds the valid second-byte ranges, selected by the first
# nibble of an entry in `first`. It has 5 entries (indices 0-4) and is not
# padded, so any other index is out of bounds.
alias ACCEPT_RANGES = List[AcceptRange](
    AcceptRange(locb, hicb),  # 0: 0x80-0xBF
    AcceptRange(0xA0, hicb),  # 1: 0xA0-0xBF
    AcceptRange(locb, 0x9F),  # 2: 0x80-0x9F
    AcceptRange(0x90, hicb),  # 3: 0x90-0xBF
    AcceptRange(locb, 0x8F),  # 4: 0x80-0x8F
)
30+
31+
# The names of these constants are chosen to give nice alignment in the
# table below. The first nibble is an index into ACCEPT_RANGES, or F for the
# special one-byte cases. The second nibble is the rune length, or the
# status for the special one-byte cases.
alias xx = 0xF1  # invalid first byte: size 1
alias as1 = 0xF0  # ASCII byte: size 1
alias s1 = 0x02  # ACCEPT_RANGES[0], size 2 (first bytes 0xC2-0xDF)
alias s2 = 0x13  # ACCEPT_RANGES[1], size 3 (first byte 0xE0)
alias s3 = 0x03  # ACCEPT_RANGES[0], size 3 (first bytes 0xE1-0xEC, 0xEE-0xEF)
alias s4 = 0x23  # ACCEPT_RANGES[2], size 3 (first byte 0xED)
alias s5 = 0x34  # ACCEPT_RANGES[3], size 4 (first byte 0xF0)
alias s6 = 0x04  # ACCEPT_RANGES[0], size 4 (first bytes 0xF1-0xF3)
alias s7 = 0x44  # ACCEPT_RANGES[4], size 4 (first byte 0xF4)
44+
45+
46+
# first is information about the first byte in a UTF-8 sequence. The table is
# indexed by the byte value: one row per high nibble, one column per low nibble.
var first = List[UInt8](
    # 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x00-0x0F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x10-0x1F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x20-0x2F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x30-0x3F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x40-0x4F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x50-0x5F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x60-0x6F
    as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1, as1,  # 0x70-0x7F
    # 0    1    2    3    4    5    6    7    8    9    A    B    C    D    E    F
    xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  # 0x80-0x8F
    xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  # 0x90-0x9F
    xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  # 0xA0-0xAF
    xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  # 0xB0-0xBF
    xx,  xx,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  # 0xC0-0xCF
    s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  s1,  # 0xD0-0xDF
    s2,  s3,  s3,  s3,  s3,  s3,  s3,  s3,  s3,  s3,  s3,  s3,  s3,  s4,  s3,  s3,  # 0xE0-0xEF
    s5,  s6,  s6,  s6,  s7,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  xx,  # 0xF0-0xFF
)
307+
308+
309+
# Number of uint8 lanes processed per SIMD step by rune_count_in_string.
alias simd_width_u8 = simdwidthof[DType.uint8]()


fn rune_count_in_string(s: String) -> Int:
    """Count the number of runes in a string.

    The count is the number of bytes whose top two bits are not 0b10, i.e.
    every byte that is not a UTF-8 continuation byte.

    Args:
        s: The string to count runes in.

    Returns:
        The number of runes in the string.
    """
    # Reinterpret the bytes as unsigned so the right shift below is logical.
    var data = s._as_ptr().bitcast[DType.uint8]()
    var total = 0

    @parameter
    fn tally[lanes: Int](start: Int):
        var chunk = data.load[width=lanes](start)
        # True for every lane holding a byte that starts a rune.
        var starts_rune = (chunk >> 6) != 0b10
        total += starts_rune.cast[DType.uint8]().reduce_add().to_int()

    vectorize[tally, simd_width_u8](len(s))
    return total
336+
337+
338+
fn string_iterator(s: String, func: fn (String) -> None):
    """Iterate over the runes in a string and call the given function with each rune.

    Each rune is copied into its own newly allocated buffer and passed to
    `func` as a String. The length of a rune is derived from its first byte
    only; if that length exceeds the number of bytes left in `s` (truncated
    or invalid input), it is clamped so no byte beyond the end of `s` is read.

    Args:
        s: The string to iterate over.
        func: The function to call with each rune.
    """
    var remaining = len(s)
    var p = s._as_ptr().bitcast[DType.uint8]()
    while remaining > 0:
        var lead = p.load()
        # The number of leading one bits in the first byte gives the sequence
        # length. ASCII bytes have none, so they are one byte long.
        var char_length = ctlz(~lead).to_int()
        if char_length == 0:
            char_length = 1
        # Never copy past the end of the string.
        if char_length > remaining:
            char_length = remaining

        # One extra byte for the trailing 0 that is included in the length
        # handed to String below.
        var sp = DTypePointer[DType.int8].alloc(char_length + 1)
        var src = p.bitcast[DType.int8]()
        for i in range(char_length):
            sp[i] = src[i]
        sp[char_length] = 0
        func(String(sp, char_length + 1))

        remaining -= char_length
        p += char_length

test.mojo

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from gojo.builtins import WrappedError
2+
3+
4+
fn dummy() -> (Int, WrappedError):
    """Return the fixed pair of 1 and a WrappedError built from "error"."""
    return (1, WrappedError("error"))
6+
7+
8+
fn main():
    """Call dummy() and read element 1 of the returned tuple as a WrappedError."""
    var pair = dummy()
    pair.get[1, WrappedError]()

0 commit comments

Comments
 (0)