Skip to content

Commit

Permalink
Add naive printable rune width function and update UnicodeString iter…
Browse files Browse the repository at this point in the history
…ation and slicing. (#47)

* update string iter to not copy strings

* add rune width
  • Loading branch information
thatstoasty authored Jul 7, 2024
1 parent 9b2c679 commit bf7e1d0
Show file tree
Hide file tree
Showing 5 changed files with 1,436 additions and 37 deletions.
2 changes: 1 addition & 1 deletion gojo/unicode/__init__.mojo
Original file line number Diff line number Diff line change
@@ -1 +1 @@
from .utf8 import rune_count_in_string, UnicodeString
from .utf8 import rune_count_in_string, UnicodeString, rune_width, string_width, Condition, DEFAULT_CONDITION
1 change: 1 addition & 0 deletions gojo/unicode/utf8/__init__.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@ This would not be possible without his help.
"""
from .runes import rune_count_in_string
from .string import UnicodeString
from .width import string_width, rune_width, Condition, DEFAULT_CONDITION
68 changes: 32 additions & 36 deletions gojo/unicode/utf8/string.mojo
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ alias simd_width_u8 = simdwidthof[DType.uint8]()

@value
struct UnicodeString(Stringable, Sized):
"""A string that supports Unicode characters of printable size 1
(ie not east asian characters and such.).
"""A string that supports Unicode characters.
The algorithms to handle UTF-8 are from @maxim on the Mojo Discord. Thanks!
"""
Expand Down Expand Up @@ -48,30 +47,28 @@ struct UnicodeString(Stringable, Sized):
fn __str__(self) -> String:
# Stringable conformance: hands back the wrapped `inner` value as a String.
return self.inner

# @always_inline
# fn __getitem__(self, slice: Slice) -> String:
# # Copy N bytes + null terminator into new pointer and construct string.
# var copy_src = self.inner
# var copy = DTypePointer[DType.uint8](copy_src.unsafe_uint8_ptr())
# var bytes_left = len(self.inner)

# var result = DTypePointer[DType.uint8].alloc(len(self.inner))
# var total_char_length: Int = 0
# for _ in range(slice.start, slice.end):
# print(total_char_length, bytes_left)
# # Number of bytes of the current character
# var char_length = int((copy.load() >> 7 == 0).cast[DType.uint8]() * 1 + countl_zero(~copy.load()))

# memcpy(result.offset(total_char_length), copy, char_length)

# # Move iterator forward
# bytes_left -= char_length
# copy += char_length
# total_char_length += char_length
# print(total_char_length, char_length, bytes_left)

# result[total_char_length] = 0
# return StringRef(result, total_char_length + 1)
@always_inline
fn __getitem__(self: Reference[Self], slice: Slice) -> StringSlice[self.is_mutable, self.lifetime]:
    """Return a non-copying view over the characters in `[slice.start, slice.end)`.

    Indices count UTF-8 encoded characters, not bytes. The returned slice points
    into the bytes of `self.inner`; no data is copied.

    Args:
        slice: Character range to select. Only `start` and `end` are read.

    Returns:
        A `StringSlice` that begins at the first byte of character `slice.start`
        and covers the characters up to (but excluding) `slice.end`. The walk
        stops at the end of the string, so a range that runs past the last
        character yields a shorter (possibly empty) slice instead of reading
        beyond the buffer.

    TODO: Doesn't handle negative indices.
    """
    var base = self[].inner.unsafe_uint8_ptr()
    var bytes_left = len(self[].inner)

    # Byte offset where character `slice.start` begins.
    var start_offset: Int = 0
    # Byte length of the selected characters.
    var total_char_length: Int = 0

    var index: Int = 0
    while index < slice.end and bytes_left > 0:
        var lead = DTypePointer[DType.uint8](base + start_offset + total_char_length).load()

        # Number of bytes of the current character: 1 when the high bit is clear
        # (ASCII), otherwise the count of leading one bits in the lead byte.
        var char_length = int((lead >> 7 == 0).cast[DType.uint8]() * 1 + countl_zero(~lead))

        # A truncated trailing sequence must not push the view past the buffer.
        if char_length > bytes_left:
            char_length = bytes_left

        if index < slice.start:
            # Still before the requested range: advance the start of the view.
            start_offset += char_length
        else:
            # Inside the requested range: grow the view.
            total_char_length += char_length

        # Move iterator forward
        bytes_left -= char_length
        index += 1

    return StringSlice[self.is_mutable, self.lifetime](
        unsafe_from_utf8_ptr=base + start_offset, len=total_char_length
    )

@always_inline
fn bytecount(self) -> Int:
Expand All @@ -87,26 +84,25 @@ struct UnicodeString(Stringable, Sized):
@value
struct _StringIter[mutability: Bool, lifetime: AnyLifetime[mutability].type]():
var bytes_left: Int
var ptr: DTypePointer[DType.uint8]
var ptr: UnsafePointer[UInt8]

@always_inline
fn __init__(inout self, src: Reference[String, mutability, lifetime]):
self.bytes_left = len(src[])
self.ptr = DTypePointer[DType.uint8](src[]._buffer.data)
self.ptr = src[].unsafe_uint8_ptr()

@always_inline
fn __next__(inout self) -> String:
fn __next__(inout self) -> StringSlice[mutability, lifetime]:
# Number of bytes of the current character
var char_length = int((self.ptr.load() >> 7 == 0).cast[DType.uint8]() * 1 + countl_zero(~self.ptr.load()))

# Copy N bytes + null terminator into new pointer and construct string.
var sp = DTypePointer[DType.uint8].alloc(char_length + 1)
memcpy(sp, self.ptr, char_length)
var char_length = int(
(DTypePointer[DType.uint8](self.ptr).load() >> 7 == 0).cast[DType.uint8]() * 1
+ countl_zero(~DTypePointer[DType.uint8](self.ptr).load())
)

# Move iterator forward
self.bytes_left -= char_length
self.ptr += char_length

return StringSlice[mutability, lifetime](unsafe_from_utf8_strref=StringRef(sp, char_length))
return StringSlice[mutability, lifetime](unsafe_from_utf8_ptr=self.ptr - char_length, len=char_length)

@always_inline
fn __len__(self) -> Int:
Expand Down
Loading

0 comments on commit bf7e1d0

Please sign in to comment.