Skip to content

Commit ae733a0

Browse files
committed
u8string: derive directly from vector
most methods are completely different, esp. the iterator, and esp. struct A is different. add a special u8str_str_push_back for strings rather than chars, analogous to vec_str_push_back. it's a bit hard to write tests without a libstdc++ <u8string> yet. use libunistring as the first possible backend. there are more.
1 parent 7f30291 commit ae733a0

File tree

5 files changed

+729
-39
lines changed

5 files changed

+729
-39
lines changed

ctl/u8string.h

Lines changed: 242 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,29 @@
1+
/* We don't even have <u8string> yet in the libstdc++!
2+
Do normalized NFD comparisons. Check validity.
3+
SPDX-License-Identifier: MIT */
14
#ifndef __CTL_U8STRING_H__
25
#define __CTL_U8STRING_H__
36

47
#ifdef T
5-
#error "Template type T defined for <ctl/u8string.h>"
8+
# error "Template type T defined for <ctl/u8string.h>"
69
#endif
710

811
#ifndef __cpp_lib_char8_t
912
typedef unsigned char char8_t;
1013
#endif
1114

12-
#define HOLD
13-
#define T char8_t
14-
#ifndef vec
15-
# define u8str_char8_t u8str
16-
# define vec u8str
17-
# define A u8str
15+
// check backends
16+
#ifdef _LIBUNISTRING_VERSION
17+
# include <uninorm.h>
1818
#endif
1919

20+
#define POD
21+
#undef A
22+
#define T char8_t
23+
#define A u8str
24+
#define u8str_char8_t u8str
25+
#define vec u8str
26+
2027
enum u8str_norm {
2128
NORM_NONE = 0,
2229
NFD, /* fastest, not composed */
@@ -29,34 +36,221 @@ enum u8str_norm {
2936

3037
/* u8str container type (A is defined as u8str above).
   In the added lines the members vector, size, capacity, free, copy,
   compare and equal come first; normstr and the bit-field flags that track
   validation/normalization state follow them.
   NOTE(review): presumably this leading member order has to match the
   generic struct in ctl/vector.h, because NFD/NFC cast an A* to str* --
   confirm against ctl/string.h. */
typedef struct A
3138
{
32-
str rawstr;
33-
str *normstr;
39+
T* vector;
40+
size_t size;
41+
size_t capacity;
3442
void (*free)(T*);
3543
T (*copy)(T*);
36-
#ifdef COMPARE
3744
int (*compare)(T*, T*);
38-
#endif
45+
int (*equal)(T*, T*);
46+
struct str *normstr;
3947
unsigned valid:1; /* is already validated */
4048
unsigned repair:1; /* do repair */
4149
unsigned normalized:3; /* 6 normalize enums. NONE,NFD,NFC,...*/
42-
unsigned same_norm:1; /* optimization norm_value == value */
50+
unsigned same_norm:1; /* optimization normstr.vector == vector */
4351
} A;
4452

45-
#define compare __COMPARE
46-
#include <ctl/string.h>
47-
#undef compare
53+
#define HOLD
54+
#define u8str_init u8str___INIT
55+
#define u8str_equal u8str___EQUAL
56+
#define u8str_find u8str___FIND
57+
//#define at __AT
58+
#undef A
59+
#include <ctl/vector.h>
60+
#undef A
61+
#define A u8str
62+
#ifdef u8id_char8_t
63+
# define HOLD
64+
#endif
65+
#undef u8str_init
66+
#undef u8str_equal
67+
#undef u8str_find
68+
//#undef at
69+
70+
#include <string.h>
71+
72+
// for simplicity start with a signed char*? or demand char8_t and u8"" literals?
73+
static inline A
74+
JOIN(A, init)(const T* c_str)
75+
{
76+
A self = u8str___INIT();
77+
size_t len = strlen((char*)c_str);
78+
size_t min = 15;
79+
JOIN(A, reserve)(&self, len < min ? min : len);
80+
for(const T* s = c_str; *s; s++)
81+
JOIN(A, push_back)(&self, (T)*s);
82+
return self;
83+
}
84+
85+
// Compare with append, and push_back to add a single char8_t
86+
static inline void
87+
u8str_str_push_back(A* self, A s)
88+
{
89+
if(self->size == self->capacity)
90+
JOIN(A, reserve)(self, self->capacity == 0 ? s.size : (2 * self->capacity) + s.size);
91+
for(size_t i = 0; i < s.size; i++)
92+
self->vector[self->size + i] = s.vector[i];
93+
self->size += s.size;
94+
}
95+
96+
static inline void
97+
JOIN(A, append)(A* self, const T* s)
98+
{
99+
size_t start = self->size;
100+
size_t len = strlen((char*)s);
101+
JOIN(A, resize)(self, self->size + len, '\0');
102+
for(size_t i = 0; i < len; i++)
103+
self->vector[start + i] = s[i];
104+
}
105+
106+
static inline void
107+
JOIN(A, insert_str)(A* self, size_t index, const T* s)
108+
{
109+
size_t start = self->size;
110+
size_t len = strlen((char*)s);
111+
JOIN(A, resize)(self, self->size + len, '\0');
112+
self->size = start;
113+
while(len != 0)
114+
{
115+
len--;
116+
JOIN(A, insert)(self, index, s[len]);
117+
}
118+
}
119+
120+
static inline void
121+
JOIN(A, replace)(A* self, size_t index, size_t size, const T* s)
122+
{
123+
size_t end = index + size;
124+
if(end >= self->size)
125+
end = self->size;
126+
for(size_t i = index; i < end; i++)
127+
JOIN(A, erase)(self, index);
128+
JOIN(A, insert_str)(self, index, s);
129+
}
130+
131+
static inline T*
132+
JOIN(A, c_str)(A* self)
133+
{
134+
return JOIN(A, data)(self);
135+
}
136+
137+
static inline size_t
138+
JOIN(A, find)(A* self, const T* s)
139+
{
140+
T* c_str = self->vector;
141+
char* found = strstr((char*)c_str, (char*)s);
142+
if(found)
143+
return found - (char*)c_str;
144+
return SIZE_MAX;
145+
}
146+
147+
static inline int
148+
JOIN(A, count)(A* self, T c)
149+
{
150+
size_t count = 0;
151+
for(size_t i = 0; i < self->size; i++)
152+
if(self->vector[i] == c)
153+
count++;
154+
return count;
155+
}
156+
157+
static inline size_t
158+
JOIN(A, rfind)(A* self, const T* s)
159+
{
160+
T* c_str = self->vector;
161+
for(size_t i = self->size; i != SIZE_MAX; i--)
162+
{
163+
char* found = strstr((char*)&c_str[i], (char*)s);
164+
if(found)
165+
return found - (char*)c_str;
166+
}
167+
return SIZE_MAX;
168+
}
169+
170+
static inline size_t
171+
JOIN(A, find_first_of)(A* self, const T* s)
172+
{
173+
for(size_t i = 0; i < self->size; i++)
174+
for(const T* p = s; *p; p++)
175+
if(self->vector[i] == *p)
176+
return i;
177+
return SIZE_MAX;
178+
}
179+
180+
static inline size_t
181+
JOIN(A, find_last_of)(A* self, const T* s)
182+
{
183+
for(size_t i = self->size; i != SIZE_MAX; i--)
184+
for(const T* p = s; *p; p++)
185+
if(self->vector[i] == *p)
186+
return i;
187+
return SIZE_MAX;
188+
}
189+
190+
static inline size_t
191+
JOIN(A, find_first_not_of)(A* self, const T* s)
192+
{
193+
for(size_t i = 0; i < self->size; i++)
194+
{
195+
size_t count = 0;
196+
for(const T* p = s; *p; p++)
197+
if(self->vector[i] == *p)
198+
count++;
199+
if(count == 0)
200+
return i;
201+
}
202+
return SIZE_MAX;
203+
}
204+
205+
static inline size_t
206+
JOIN(A, find_last_not_of)(A* self, const T* s)
207+
{
208+
for(size_t i = self->size - 1; i != SIZE_MAX; i--)
209+
{
210+
size_t count = 0;
211+
for(const T* p = s; *p; p++)
212+
if(self->vector[i] == *p)
213+
count++;
214+
if(count == 0)
215+
return i;
216+
}
217+
return SIZE_MAX;
218+
}
219+
220+
static inline A
221+
JOIN(A, substr)(A* self, size_t index, size_t size)
222+
{
223+
A s = JOIN(A, init)((T*)"");
224+
JOIN(A, resize )(&s, size, '\0');
225+
for(size_t i = 0; i < size; i++)
226+
// FIXME
227+
s.vector[i] = self->vector[index + i];
228+
return s;
229+
}
48230

49231
/* decompose only */
50232
static inline str*
51233
JOIN(A, NFD)(A* self)
52234
{
53235
if (self->normalized == NFD)
54-
return self;
236+
return self->same_norm ? (str*)self : self->normstr;
55237
if (!self->normstr)
56238
{
57-
str norm = str_init(self->rawstr.vector);
58-
self->normstr = &norm;
239+
str _norm = str_init("");
240+
str_resize(&_norm, self->capacity * 2, '\0');
241+
self->normstr = &_norm;
59242
}
243+
str *norm = self->normstr;
244+
#ifdef _LIBUNISTRING_VERSION
245+
norm->vector = (char*)u8_normalize(UNINORM_NFD, self->vector, self->size,
246+
norm->vector, &norm->size);
247+
#else
248+
// TODO other backends
249+
strcpy (norm->vector, (char*)self->vector);
250+
#endif
251+
if (strcmp(norm->vector, (char*)self->vector) == 0)
252+
self->same_norm = 1;
253+
// free norm?
60254
return self->normstr;
61255
}
62256

@@ -65,17 +259,21 @@ static inline str*
65259
JOIN(A, NFC)(A* self)
66260
{
67261
if (self->normalized == NFC)
68-
return self;
262+
return self->same_norm ? (str*)self : self->normstr;
69263
if (!self->normstr)
70264
{
71-
str _norm = str_init(self->rawstr.vector);
265+
str _norm = str_init("");
266+
str_resize(&_norm, self->capacity * 2, '\0');
72267
self->normstr = &_norm;
73268
}
74269
str *norm = self->normstr;
75-
norm->capacity = self->rawstr.size * 2;
76-
norm->vector = (T*) malloc (norm->capacity);
77-
// TODO
78-
strcpy ((char*)norm->vector, (char*)self->rawstr.vector);
270+
#ifdef _LIBUNISTRING_VERSION
271+
norm->vector = (char*)u8_normalize(UNINORM_NFC, self->vector, self->size,
272+
norm->vector, &norm->size);
273+
#else
274+
// TODO other backends
275+
strcpy (norm->vector, (char*)self->vector);
276+
#endif
79277
/*
80278
dest = self->norm_value;
81279
dmax = self->norm_capacity;
@@ -96,6 +294,8 @@ JOIN(A, NFC)(A* self)
96294
}
97295
*/
98296
self->normalized = NFC;
297+
if (strcmp(norm->vector, (char*)self->vector) == 0)
298+
self->same_norm = 1;
99299
return self->normstr;
100300
}
101301

@@ -106,7 +306,7 @@ JOIN(A, normalize)(A* self)
106306
return self;
107307
}
108308

109-
/* Assuming s is normalized.
309+
/* Assuming arg `s` is normalized.
110310
W3C recommends not to normalize. We think different.
111311
*/
112312
static inline int
@@ -115,18 +315,33 @@ JOIN(A, compare)(A* self, const T* s)
115315
if (!self->normalized)
116316
{
117317
JOIN(A, normalize)(self);
118-
return strcmp ((char*)self->normstr->vector, (char*)s);
318+
return strcmp (self->normstr->vector, (char*)s);
319+
}
320+
else
321+
return strcmp ((char*)self->vector, (char*)s);
322+
}
323+
324+
/* strcmp-style comparison of two u8str values (negative, zero or positive).
   If `self` has no normalization recorded, normalize() is called first and
   self->normstr is compared against s->vector; otherwise self->vector is
   compared directly. `s` is never normalized here.
   NOTE(review): when `self` is already normalized but same_norm is 0, the
   normalized text presumably lives in self->normstr, yet the else branch
   compares the raw self->vector -- confirm this is intended. */
static inline int
325+
JOIN(A, key_compare)(A* self, A* s)
326+
{
327+
if (!self->normalized)
328+
{
329+
JOIN(A, normalize)(self);
330+
return strcmp (self->normstr->vector, (char*)s->vector);
119331
}
120332
else
121-
return strcmp ((char*)self->rawstr.vector, (char*)s);
333+
return strcmp ((char*)self->vector, (char*)s->vector);
122334
}
123335

124336
#ifdef HOLD /* for u8ident.h */
125337
# undef HOLD
126338
#else
127339
# undef T
128340
# undef A
341+
# undef I
342+
# undef vec
129343
# undef u8str
344+
# undef u8str_char8_t
130345
#endif
131346

132347
#endif

ctl/vector.h

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,13 +27,13 @@
2727
#ifndef u8str_char8_t
2828
/* Generic vector container, only declared when u8str_char8_t is not defined
   (see the enclosing #ifndef). In the added lines the data pointer, size
   and capacity come first, followed by the optional element callbacks. */
typedef struct A
2929
{
30-
T *vector;
31-
void (*free)(T *);
32-
T (*copy)(T *);
33-
int (*compare)(T *, T *); // 2-way operator<
34-
int (*equal)(T *, T *); // optional
30+
T* vector;
3531
size_t size;
3632
size_t capacity;
33+
void (*free)(T*);
34+
T (*copy)(T*);
35+
int (*compare)(T*, T*); // 2-way operator<
36+
int (*equal)(T*, T*); // optional
3737
} A;
3838
#endif
3939

docs/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -364,7 +364,7 @@ And in its grandiosity (esp. not header-only):
364364
array.h: stack/heap allocated
365365
vector.h: realloc
366366
string.h: vector.h
367-
u8string.h: vector.h ++
367+
u8string.h: vector.h ++
368368
deque.h: realloc (paged)
369369
queue.h: deque.h
370370
stack.h: deque.h

0 commit comments

Comments
 (0)