1
+ /* We don't even have <u8string> yet in the libstdc++!
2
+ Do normalized NFD comparisons. Check validity.
3
+ SPDX-License-Identifier: MIT */
1
4
#ifndef __CTL_U8STRING_H__
2
5
#define __CTL_U8STRING_H__
3
6
4
7
#ifdef T
5
- #error "Template type T defined for <ctl/u8string.h>"
8
+ # error "Template type T defined for <ctl/u8string.h>"
6
9
#endif
7
10
8
11
#ifndef __cpp_lib_char8_t
9
12
typedef unsigned char char8_t ;
10
13
#endif
11
14
12
- #define HOLD
13
- #define T char8_t
14
- #ifndef vec
15
- # define u8str_char8_t u8str
16
- # define vec u8str
17
- # define A u8str
15
+ // check backends
16
+ #ifdef _LIBUNISTRING_VERSION
17
+ # include <uninorm.h>
18
18
#endif
19
19
20
+ #define POD
21
+ #undef A
22
+ #define T char8_t
23
+ #define A u8str
24
+ #define u8str_char8_t u8str
25
+ #define vec u8str
26
+
20
27
enum u8str_norm {
21
28
NORM_NONE = 0 ,
22
29
NFD , /* fastest, not composed */
@@ -29,34 +36,221 @@ enum u8str_norm {
29
36
30
37
typedef struct A
31
38
{
32
- str rawstr ;
33
- str * normstr ;
39
+ T * vector ;
40
+ size_t size ;
41
+ size_t capacity ;
34
42
void (* free )(T * );
35
43
T (* copy )(T * );
36
- #ifdef COMPARE
37
44
int (* compare )(T * , T * );
38
- #endif
45
+ int (* equal )(T * , T * );
46
+ struct str * normstr ;
39
47
unsigned valid :1 ; /* is already validated */
40
48
unsigned repair :1 ; /* do repair */
41
49
unsigned normalized :3 ; /* 6 normalize enums. NONE,NFD,NFC,...*/
42
- unsigned same_norm :1 ; /* optimization norm_value == value */
50
+ unsigned same_norm :1 ; /* optimization normstr.vector == vector */
43
51
} A ;
44
52
45
- #define compare __COMPARE
46
- #include <ctl/string.h>
47
- #undef compare
53
+ #define HOLD
54
+ #define u8str_init u8str___INIT
55
+ #define u8str_equal u8str___EQUAL
56
+ #define u8str_find u8str___FIND
57
+ //#define at __AT
58
+ #undef A
59
+ #include <ctl/vector.h>
60
+ #undef A
61
+ #define A u8str
62
+ #ifdef u8id_char8_t
63
+ # define HOLD
64
+ #endif
65
+ #undef u8str_init
66
+ #undef u8str_equal
67
+ #undef u8str_find
68
+ //#undef at
69
+
70
#include <stdlib.h>
#include <string.h>
71
+
72
+ // for simplicity start with a signed char*? or demand char8_t and u8"" literals?
73
+ static inline A
74
+ JOIN (A , init )(const T * c_str )
75
+ {
76
+ A self = u8str___INIT ();
77
+ size_t len = strlen ((char * )c_str );
78
+ size_t min = 15 ;
79
+ JOIN (A , reserve )(& self , len < min ? min : len );
80
+ for (const T * s = c_str ; * s ; s ++ )
81
+ JOIN (A , push_back )(& self , (T )* s );
82
+ return self ;
83
+ }
84
+
85
+ // Compare with append, and push_back to add a single char8_t
86
+ static inline void
87
+ u8str_str_push_back (A * self , A s )
88
+ {
89
+ if (self -> size == self -> capacity )
90
+ JOIN (A , reserve )(self , self -> capacity == 0 ? s .size : (2 * self -> capacity ) + s .size );
91
+ for (size_t i = 0 ; i < s .size ; i ++ )
92
+ self -> vector [self -> size + i ] = s .vector [i ];
93
+ self -> size += s .size ;
94
+ }
95
+
96
+ static inline void
97
+ JOIN (A , append )(A * self , const T * s )
98
+ {
99
+ size_t start = self -> size ;
100
+ size_t len = strlen ((char * )s );
101
+ JOIN (A , resize )(self , self -> size + len , '\0' );
102
+ for (size_t i = 0 ; i < len ; i ++ )
103
+ self -> vector [start + i ] = s [i ];
104
+ }
105
+
106
+ static inline void
107
+ JOIN (A , insert_str )(A * self , size_t index , const T * s )
108
+ {
109
+ size_t start = self -> size ;
110
+ size_t len = strlen ((char * )s );
111
+ JOIN (A , resize )(self , self -> size + len , '\0' );
112
+ self -> size = start ;
113
+ while (len != 0 )
114
+ {
115
+ len -- ;
116
+ JOIN (A , insert )(self , index , s [len ]);
117
+ }
118
+ }
119
+
120
+ static inline void
121
+ JOIN (A , replace )(A * self , size_t index , size_t size , const T * s )
122
+ {
123
+ size_t end = index + size ;
124
+ if (end >= self -> size )
125
+ end = self -> size ;
126
+ for (size_t i = index ; i < end ; i ++ )
127
+ JOIN (A , erase )(self , index );
128
+ JOIN (A , insert_str )(self , index , s );
129
+ }
130
+
131
+ static inline T *
132
+ JOIN (A , c_str )(A * self )
133
+ {
134
+ return JOIN (A , data )(self );
135
+ }
136
+
137
+ static inline size_t
138
+ JOIN (A , find )(A * self , const T * s )
139
+ {
140
+ T * c_str = self -> vector ;
141
+ char * found = strstr ((char * )c_str , (char * )s );
142
+ if (found )
143
+ return found - (char * )c_str ;
144
+ return SIZE_MAX ;
145
+ }
146
+
147
+ static inline int
148
+ JOIN (A , count )(A * self , T c )
149
+ {
150
+ size_t count = 0 ;
151
+ for (size_t i = 0 ; i < self -> size ; i ++ )
152
+ if (self -> vector [i ] == c )
153
+ count ++ ;
154
+ return count ;
155
+ }
156
+
157
+ static inline size_t
158
+ JOIN (A , rfind )(A * self , const T * s )
159
+ {
160
+ T * c_str = self -> vector ;
161
+ for (size_t i = self -> size ; i != SIZE_MAX ; i -- )
162
+ {
163
+ char * found = strstr ((char * )& c_str [i ], (char * )s );
164
+ if (found )
165
+ return found - (char * )c_str ;
166
+ }
167
+ return SIZE_MAX ;
168
+ }
169
+
170
+ static inline size_t
171
+ JOIN (A , find_first_of )(A * self , const T * s )
172
+ {
173
+ for (size_t i = 0 ; i < self -> size ; i ++ )
174
+ for (const T * p = s ; * p ; p ++ )
175
+ if (self -> vector [i ] == * p )
176
+ return i ;
177
+ return SIZE_MAX ;
178
+ }
179
+
180
+ static inline size_t
181
+ JOIN (A , find_last_of )(A * self , const T * s )
182
+ {
183
+ for (size_t i = self -> size ; i != SIZE_MAX ; i -- )
184
+ for (const T * p = s ; * p ; p ++ )
185
+ if (self -> vector [i ] == * p )
186
+ return i ;
187
+ return SIZE_MAX ;
188
+ }
189
+
190
+ static inline size_t
191
+ JOIN (A , find_first_not_of )(A * self , const T * s )
192
+ {
193
+ for (size_t i = 0 ; i < self -> size ; i ++ )
194
+ {
195
+ size_t count = 0 ;
196
+ for (const T * p = s ; * p ; p ++ )
197
+ if (self -> vector [i ] == * p )
198
+ count ++ ;
199
+ if (count == 0 )
200
+ return i ;
201
+ }
202
+ return SIZE_MAX ;
203
+ }
204
+
205
+ static inline size_t
206
+ JOIN (A , find_last_not_of )(A * self , const T * s )
207
+ {
208
+ for (size_t i = self -> size - 1 ; i != SIZE_MAX ; i -- )
209
+ {
210
+ size_t count = 0 ;
211
+ for (const T * p = s ; * p ; p ++ )
212
+ if (self -> vector [i ] == * p )
213
+ count ++ ;
214
+ if (count == 0 )
215
+ return i ;
216
+ }
217
+ return SIZE_MAX ;
218
+ }
219
+
220
+ static inline A
221
+ JOIN (A , substr )(A * self , size_t index , size_t size )
222
+ {
223
+ A s = JOIN (A , init )((T * )"" );
224
+ JOIN (A , resize )(& s , size , '\0' );
225
+ for (size_t i = 0 ; i < size ; i ++ )
226
+ // FIXME
227
+ s .vector [i ] = self -> vector [index + i ];
228
+ return s ;
229
+ }
48
230
49
231
/* decompose only */
50
232
static inline str *
51
233
JOIN (A , NFD )(A * self )
52
234
{
53
235
if (self -> normalized == NFD )
54
- return self ;
236
+ return self -> same_norm ? ( str * ) self : self -> normstr ;
55
237
if (!self -> normstr )
56
238
{
57
- str norm = str_init (self -> rawstr .vector );
58
- self -> normstr = & norm ;
239
+ str _norm = str_init ("" );
240
+ str_resize (& _norm , self -> capacity * 2 , '\0' );
241
+ self -> normstr = & _norm ;
59
242
}
243
+ str * norm = self -> normstr ;
244
+ #ifdef _LIBUNISTRING_VERSION
245
+ norm -> vector = (char * )u8_normalize (UNINORM_NFD , self -> vector , self -> size ,
246
+ norm -> vector , & norm -> size );
247
+ #else
248
+ // TODO other backends
249
+ strcpy (norm -> vector , (char * )self -> vector );
250
+ #endif
251
+ if (strcmp (norm -> vector , (char * )self -> vector ) == 0 )
252
+ self -> same_norm = 1 ;
253
+ // free norm?
60
254
return self -> normstr ;
61
255
}
62
256
@@ -65,17 +259,21 @@ static inline str*
65
259
JOIN (A , NFC )(A * self )
66
260
{
67
261
if (self -> normalized == NFC )
68
- return self ;
262
+ return self -> same_norm ? ( str * ) self : self -> normstr ;
69
263
if (!self -> normstr )
70
264
{
71
- str _norm = str_init (self -> rawstr .vector );
265
+ str _norm = str_init ("" );
266
+ str_resize (& _norm , self -> capacity * 2 , '\0' );
72
267
self -> normstr = & _norm ;
73
268
}
74
269
str * norm = self -> normstr ;
75
- norm -> capacity = self -> rawstr .size * 2 ;
76
- norm -> vector = (T * ) malloc (norm -> capacity );
77
- // TODO
78
- strcpy ((char * )norm -> vector , (char * )self -> rawstr .vector );
270
+ #ifdef _LIBUNISTRING_VERSION
271
+ norm -> vector = (char * )u8_normalize (UNINORM_NFC , self -> vector , self -> size ,
272
+ norm -> vector , & norm -> size );
273
+ #else
274
+ // TODO other backends
275
+ strcpy (norm -> vector , (char * )self -> vector );
276
+ #endif
79
277
/*
80
278
dest = self->norm_value;
81
279
dmax = self->norm_capacity;
@@ -96,6 +294,8 @@ JOIN(A, NFC)(A* self)
96
294
}
97
295
*/
98
296
self -> normalized = NFC ;
297
+ if (strcmp (norm -> vector , (char * )self -> vector ) == 0 )
298
+ self -> same_norm = 1 ;
99
299
return self -> normstr ;
100
300
}
101
301
@@ -106,7 +306,7 @@ JOIN(A, normalize)(A* self)
106
306
return self ;
107
307
}
108
308
109
- /* Assuming s is normalized.
309
+ /* Assuming arg `s` is normalized.
110
310
W3C recommends not to normalize. We think different.
111
311
*/
112
312
static inline int
@@ -115,18 +315,33 @@ JOIN(A, compare)(A* self, const T* s)
115
315
if (!self -> normalized )
116
316
{
117
317
JOIN (A , normalize )(self );
118
- return strcmp ((char * )self -> normstr -> vector , (char * )s );
318
+ return strcmp (self -> normstr -> vector , (char * )s );
319
+ }
320
+ else
321
+ return strcmp ((char * )self -> vector , (char * )s );
322
+ }
323
+
324
+ static inline int
325
+ JOIN (A , key_compare )(A * self , A * s )
326
+ {
327
+ if (!self -> normalized )
328
+ {
329
+ JOIN (A , normalize )(self );
330
+ return strcmp (self -> normstr -> vector , (char * )s -> vector );
119
331
}
120
332
else
121
- return strcmp ((char * )self -> rawstr . vector , (char * )s );
333
+ return strcmp ((char * )self -> vector , (char * )s -> vector );
122
334
}
123
335
124
336
#ifdef HOLD /* for u8ident.h */
125
337
# undef HOLD
126
338
#else
127
339
# undef T
128
340
# undef A
341
+ # undef I
342
+ # undef vec
129
343
# undef u8str
344
+ # undef u8str_char8_t
130
345
#endif
131
346
132
347
#endif
0 commit comments