-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbbre.c
5777 lines (5517 loc) · 229 KB
/
bbre.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#include <assert.h> /* assert() */
#include <limits.h> /* CHAR_BIT */
#include <stdarg.h> /* va_list, va_start(), va_arg(), va_end() */
#include <stdlib.h> /* size_t, realloc(), free() */
#include <string.h> /* memcmp(), memset(), memcpy(), strlen() */
#include "bbre.h"
#ifdef BBRE_CONFIG_HEADER_FILE
#include BBRE_CONFIG_HEADER_FILE
#endif
/* The value 0. NOTE(review): presumably the "no node / end of list" value for
 * handles used throughout this file -- confirm against its users. */
#define BBRE_NIL 0
/* 0x10FFFF, the highest code point defined by Unicode. */
#define BBRE_UTF_MAX 0x10FFFF
/* Maximum repetition count for quantifiers. */
#define BBRE_LIMIT_REPETITION_COUNT 100000
/* Maximum size of the AST. This is the sum of node count and argument count. */
#define BBRE_LIMIT_AST_SIZE 1000000
/* Maximum length (in bytes) of a group name. */
#define BBRE_LIMIT_GROUP_NAME_SIZE 1000000
/* Maximum size of a normalized charclass (max number of ranges): half the
 * number of values in [0, BBRE_UTF_MAX]. */
#define BBRE_LIMIT_CHARCLASS_NORMALIZED_SIZE ((BBRE_UTF_MAX + 1) / 2)
/* Unsigned word used for AST arguments, handles and instruction fields. */
typedef unsigned int bbre_uint;
/* A single byte. */
typedef unsigned char bbre_byte;
/* Macro for declaring a buffer (see the tirade about dynamic arrays later in
 * this file). Serves mostly for readability.
 * It expands to a bare `T *`, so declare one buffer per declaration: in
 * `bbre_buf(T) a, b;` only `a` would be a pointer. */
#define bbre_buf(T) T *
/* Kinds of AST node. Values are assigned consecutively from zero and are
 * listed in the same order as the rows of `bbre_ast_type_infos`. */
typedef enum bbre_ast_type {
  /* Epsilon node: /|/ */
  BBRE_AST_TYPE_EPS = 0,
  /* One character: /a/ */
  BBRE_AST_TYPE_CHR = 1,
  /* Concatenation of two regular expressions: /lr/
   *   Argument 0: left child tree (AST)
   *   Argument 1: right child tree (AST) */
  BBRE_AST_TYPE_CAT = 2,
  /* Alternation of two regular expressions: /l|r/
   *   Argument 0: primary alternation tree (AST)
   *   Argument 1: secondary alternation tree (AST) */
  BBRE_AST_TYPE_ALT = 3,
  /* Repeated regular expression: /a+/
   *   Argument 0: child tree (AST)
   *   Argument 1: lower bound, always <= upper bound (number)
   *   Argument 2: upper bound, might be the constant `BBRE_INFTY` (number) */
  BBRE_AST_TYPE_QUANT = 4,
  /* Same as `QUANT`, but not greedy: /(a*?)/
   *   Argument 0: child tree (AST)
   *   Argument 1: lower bound, always <= upper bound (number)
   *   Argument 2: upper bound, might be the constant `BBRE_INFTY` (number) */
  BBRE_AST_TYPE_UQUANT = 5,
  /* Matching group: /(?i-s:a)/
   *   Argument 0: child tree (AST)
   *   Argument 1: group flags pulled up, bitset of `bbre_group_flag` (number)
   *   Argument 2: group flags pulled down (number)
   *   Argument 3: capture index (number) */
  BBRE_AST_TYPE_GROUP = 6,
  /* Inline group: /(?i-s)a/
   *   Argument 0: child tree (AST)
   *   Argument 1: group flags pulled up, bitset of `bbre_group_flag` (number)
   *   Argument 2: group flags pulled down (number) */
  BBRE_AST_TYPE_IGROUP = 7,
  /* One range inside a character class: /[a-z]/
   *   Argument 0: character range begin (number)
   *   Argument 1: character range end (number) */
  BBRE_AST_TYPE_CC_LEAF = 8,
  /* Builtin character class: /[[:digit:]]/
   *   Argument 0: starting index into the builtin_cc array
   *   Argument 1: number of character ranges to parse */
  BBRE_AST_TYPE_CC_BUILTIN = 9,
  /* Set-inversion of a character class: /[^a]/
   *   Argument 0: child tree (AST) */
  BBRE_AST_TYPE_CC_NOT = 10,
  /* Set-disjunction of a character class: /[az]/
   *   Argument 0: child tree A (AST)
   *   Argument 1: child tree B (AST) */
  BBRE_AST_TYPE_CC_OR = 11,
  /* Any character: /./ */
  BBRE_AST_TYPE_ANYCHAR = 12,
  /* Any byte: /\C/ */
  BBRE_AST_TYPE_ANYBYTE = 13,
  /* Empty assertion: /\b/
   *   Argument 0: assertion flags, bitset of `bbre_assert_flag` (number) */
  BBRE_AST_TYPE_ASSERT = 14
} bbre_ast_type;
/* Per-node-type facts the parser needs. One of these exists for every
 * `bbre_ast_type` value.
 * NOTE(review): `children` is described as the *last* N arguments, yet the
 * argument lists documented on `bbre_ast_type` put child trees first (e.g.
 * QUANT: argument 0 is the child) -- confirm which end is meant. */
typedef struct bbre_ast_type_info {
  bbre_byte size,     /* Number of arguments */
      children,       /* Number of children (last N nodes in arguments) */
      prec;           /* Node precedence in relation to enclosing nodes */
} bbre_ast_type_info;
/* Table of AST type information. Rows appear in the same order as the
 * enumerators of `bbre_ast_type` (see the trailing name on each row); keep
 * the two in sync when adding a node type. */
static const bbre_ast_type_info bbre_ast_type_infos[] = {
/* size, children, prec */
{0, 0, 0}, /* EPS */
{1, 0, 0}, /* CHR */
{2, 2, 0}, /* CAT */
{2, 2, 2}, /* ALT */
{3, 1, 0}, /* QUANT */
{3, 1, 0}, /* UQUANT */
{4, 1, 3}, /* GROUP */
{3, 1, 1}, /* IGROUP */
{2, 0, 0}, /* CC_LEAF */
{2, 0, 0}, /* CC_BUILTIN */
{1, 1, 0}, /* CC_NOT */
{2, 2, 0}, /* CC_OR */
{0, 0, 0}, /* ANYCHAR */
{0, 0, 0}, /* ANYBYTE */
{1, 0, 0}, /* ASSERT */
};
/* Max number of arguments an AST node can contain. This equals the largest
 * `size` in `bbre_ast_type_infos` (the GROUP row). */
#define BBRE_AST_MAX_ARGS 4
/* A range of byte values; both endpoints are part of the range. */
typedef struct bbre_byte_range {
  bbre_byte l, /* min ordinal */
      h;       /* max ordinal */
} bbre_byte_range;
/* A range of runes; both endpoints are part of the range. */
typedef struct bbre_rune_range {
  bbre_uint l, /* min ordinal */
      h;       /* max ordinal */
} bbre_rune_range;
/* Flags that a group can set or clear. Each flag is a distinct bit so that
 * they can be combined into a bitset. Some of these duplicate `bbre_flags`
 * and some do not; the ABI flags are deliberately kept separate from these
 * internal ones. */
typedef enum bbre_group_flag {
  BBRE_GROUP_FLAG_INSENSITIVE = 1 << 0,  /* case-insensitive matching */
  BBRE_GROUP_FLAG_MULTILINE = 1 << 1,    /* ^$ match beginning/end of each line */
  BBRE_GROUP_FLAG_DOTNEWLINE = 1 << 2,   /* . matches \n */
  BBRE_GROUP_FLAG_UNGREEDY = 1 << 3,     /* ungreedy quantifiers */
  BBRE_GROUP_FLAG_NONCAPTURING = 1 << 4, /* non-capturing group (?:...) */
  BBRE_GROUP_FLAG_EXPRESSION = 1 << 5,   /* the entire regexp */
  BBRE_GROUP_FLAG_CC_DENORM = 1 << 6     /* set when compiling charclasses */
} bbre_group_flag;
/* Stack frame for the compiler, used to track a single AST node being
 * compiled. */
/* A single AST node, when compiled, corresponds to a contiguous list of
 * instructions. The first instruction in this list is the single entry point
 * for the node. Using the NFA paradigm, this corresponds to the start state of
 * an automaton.
 * There may be zero or more exits from the list of instructions -- these are
 * instructions that hand off control to the enclosing AST node. Again, using
 * the NFA paradigm, these are transitions from nodes that do not yet have an
 * end state, but will need one later. */
/* Consider the regex /ab/ which is just the concatenation of the literals a and
 * b. The AST for this regex looks like:
 *   CAT
 *   +-CHR A
 *   +-CHR B
 * In terms of an NFA, it looks like this chain of states:
 *   --> Q_0 --A-> Q_1 --B-> Q_2 ---> ...
 * The compiler first considers the CAT node. This node links its two children
 * sequentially, so the compiler must next consider the first CHR node. To match
 * a CHR node, we use a RANGE instruction to check for the presence of the A
 * character, and then hand back control to the instructions of the enclosing
 * node. When being compiled, AST nodes do not know anything about their
 * enclosing environment, so they simply keep track of instructions that
 * transfer control back to the enclosing node. So, the list of instructions for
 * the `CHR A` node looks like (starting at PC 1):
 *   0001 RANGE 'A'-'A' -> OUT
 * The compiler then goes back to the CAT node, which moves to its next child;
 * the `CHR B` node. Since the `CAT` node compiles to a program that runs its
 * first child, then subsequently its second, the `CAT` node will link all exits
 * of the `CHR A` node to the entrypoint of the `CHR B` node.
 *   0001 RANGE 'A'-'A' -> 0002
 *   0002 RANGE 'B'-'B' -> OUT
 * The `CAT` node itself compiles to the above list of instructions, and has a
 * single exit point at PC 2.
 * We keep track of the exit points from the program using a trick I first saw
 * in Russ Cox's series on regexps. A linked list, backed by the actual words
 * inside of the instructions, stores the exit points. This list is tracked by
 * `head` and `tail`. */
/* When the compiler is evaluating a character class (resolving ands/ors/nots)
 * it uses head and tail to refer to offsets in the `bbre.compcc.store` array
 * -- `head` and `tail`, in this case, form the ends of a linked list
 * containing all of the character class components (rune ranges). */
typedef struct bbre_compframe {
  bbre_uint root_hdl;  /* handle to the AST node being compiled */
  bbre_uint child_hdl; /* handle to the child AST node to be compiled next */
  bbre_uint idx;       /* used to keep track of repetition index */
  bbre_uint head;      /* head of the outgoing patch linked list */
  bbre_uint tail;      /* tail of the outgoing patch linked list */
  bbre_uint pc;        /* location of first instruction compiled for this node */
  bbre_uint flags;     /* group flags in effect (INSENSITIVE, etc.) */
  bbre_uint set_idx;   /* index of the current pattern being compiled */
} bbre_compframe;
/* Empty (zero-width) assertions. Every assertion is its own bit, so several
 * can be combined into one bitset. */
typedef enum bbre_assert_flag {
  BBRE_ASSERT_LINE_BEGIN = 1 << 0, /* ^ */
  BBRE_ASSERT_LINE_END = 1 << 1,   /* $ */
  BBRE_ASSERT_TEXT_BEGIN = 1 << 2, /* \A */
  BBRE_ASSERT_TEXT_END = 1 << 3,   /* \z */
  BBRE_ASSERT_WORD = 1 << 4,       /* \b */
  BBRE_ASSERT_NOT_WORD = 1 << 5    /* \B */
} bbre_assert_flag;
/* How many bits inside of the `opcode_next` field we allocate to the opcode
 * itself: currently this is just 2 as we only have exactly 4 distinct opcodes
 * (see `bbre_opcode`), but it could be increased later if we wish to add
 * more. The remaining bits of `opcode_next` hold the next program counter. */
#define BBRE_INST_OPCODE_BITS 2
/* Instruction opcodes. The set was deliberately kept as small as possible so
 * that compiled programs stay easy to reason about by hand. */
typedef enum bbre_opcode {
  BBRE_OPCODE_RANGE = 0, /* matches a range of bytes */
  BBRE_OPCODE_SPLIT = 1, /* forks execution into two paths */
  BBRE_OPCODE_MATCH = 2, /* writes the current string position into a submatch */
  BBRE_OPCODE_ASSERT = 3 /* continue execution if zero-width assertion */
} bbre_opcode;
/* Compiled program instruction. Each instruction is two words: `opcode_next`
 * and an opcode-specific `param`. */
typedef struct bbre_inst {
/* opcode_next is the opcode and the next program counter (primary branch
 * target), and param is opcode-specific data. The bit diagrams below are
 * written most-significant bit first and show 32 bits. */
/* 3 2 2 2 1 1 0 0 0 */
/* 2 8 4 0 6 2 8 4 0 */
bbre_uint opcode_next; /* / nnnnnnnnnnnnnnnnnnnnnnnnnnnnnnoo */
/* \ n = next PC, o = opcode */
bbre_uint param; /* / 0000000000000000hhhhhhhhllllllll (RANGE) */
/* \ h = high byte, l = low byte (RANGE) */
/* / NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN (SPLIT) */
/* \ N = secondary next PC (SPLIT) */
/* / iiiiiiiiiiiiiiiiiiiiiiiiiiiiiiie (MATCH) */
/* \ i = group idx, e = start/end (MATCH) */
/* / 00000000000000000000000000aaaaaa (ASSERT) */
/* \ a = assert_flag (ASSERT) */
} bbre_inst;
/* Extra per-node word used to speed up character class compilation. This is
 * a union: only one of the members is meaningful at any time. */
typedef union bbre_compcc_tree_aux {
  bbre_uint hash;   /* node hash, used for tree reduction */
  bbre_uint pc;     /* compiled location, nonzero if this node was compiled
                       already */
  bbre_uint xposed; /* 1 if the node was transposed, 0 otherwise */
} bbre_compcc_tree_aux;
/* Node of the tree built by the character class compiler. */
typedef struct bbre_compcc_tree {
  bbre_uint range;          /* range of bytes this node matches */
  bbre_uint child_hdl;      /* handle to concatenation node */
  bbre_uint sibling_hdl;    /* handle to alternation node */
  bbre_compcc_tree_aux aux; /* node hash OR cached PC */
} bbre_compcc_tree;
/* Element of a character class, used when compiling charclasses. Elements are
 * chained into a singly linked list through `next_hdl`. */
typedef struct bbre_cc_elem {
bbre_rune_range range; /* the rune range this describes */
size_t next_hdl; /* handle to the next range in this list (0 to denote the end
of the list) */
} bbre_cc_elem;
/* Internal storage used for the character class compiler. It uses enough state
 * that it definitely warrants its own struct.
 * NOTE(review): `tree`, `tree_2` and `hash` carry no description here;
 * presumably they are the working trees and the hash table used for tree
 * reduction (see `bbre_compcc_tree_aux.hash`) -- confirm against their
 * users. */
typedef struct bbre_compcc_data {
bbre_buf(bbre_compcc_tree) tree;
bbre_buf(bbre_compcc_tree) tree_2;
bbre_buf(bbre_uint) hash;
bbre_buf(bbre_cc_elem) store; /* character class storage */
bbre_uint store_empty; /* freelist for `store` */
size_t store_ops; /* number of evaluation operations */
} bbre_compcc_data;
/* Bit flags that select a program entry point; combinations of them index the
 * `entry` array of `bbre_prog`. `BBRE_PROG_ENTRY_MAX` is the number of
 * possible combinations. */
typedef enum bbre_prog_entry {
  BBRE_PROG_ENTRY_REVERSE = 1 << 0, /* reverse execution */
  BBRE_PROG_ENTRY_DOTSTAR = 1 << 1, /* .* before execution (unanchored match) */
  BBRE_PROG_ENTRY_MAX = 1 << 2
} bbre_prog_entry;
/* A builder class for regular expressions. Holds the inputs needed to create
 * a regular expression: the allocator, the expression text and the flags. */
struct bbre_builder {
bbre_alloc alloc; /* allocator function */
const bbre_byte *expr; /* the expression itself */
size_t expr_size; /* the length of the expression in bytes */
bbre_flags flags; /* regex flags used for parsing / the root AST */
};
/* Forward declaration: `bbre_exec` is defined later in this file, but `bbre`
 * and `bbre_set` hold pointers to it. */
typedef struct bbre_exec bbre_exec;
/* Used to hold reportable errors. */
typedef struct bbre_error {
const char *msg; /* error message, if any */
size_t pos; /* position the error was encountered in expr */
} bbre_error;
/* The compiled form of a regular expression: a list of instructions plus the
 * entry points into it. */
typedef struct bbre_prog {
bbre_alloc alloc; /* allocator function */
bbre_buf(bbre_inst) insts; /* The compiled instructions */
bbre_buf(bbre_uint) set_idxs; /* pattern index for each instruction */
bbre_uint entry[BBRE_PROG_ENTRY_MAX]; /* entry points for the program, indexed
by a bitset of `bbre_prog_entry` */
bbre_uint npat; /* number of distinct patterns */
bbre_error *error; /* error info, we don't own this */
} bbre_prog;
/* Internal structure used to store a named group's name. */
typedef struct bbre_group_name {
char *name; /* The actual name (null-terminated) */
size_t name_size; /* The size of the name, not counting the terminator
(allocation is this + 1) */
} bbre_group_name;
/* A compiled regular expression, together with the parser and compiler state
 * used to build it. */
struct bbre {
  bbre_alloc alloc;                      /* allocator function */
  bbre_buf(bbre_uint) ast;               /* AST arena */
  bbre_uint ast_root_hdl;                /* AST root node reference */
  bbre_buf(bbre_group_name) group_names; /* Named group names */
  bbre_buf(bbre_uint) op_stk;            /* operator stack of node handles */
  bbre_buf(bbre_compframe) comp_stk;     /* compiler frame stack */
  bbre_compcc_data compcc; /* data used for the charclass compiler */
  bbre_prog prog;          /* NFA program */
  const bbre_byte *expr;   /* input parser expression (i.e. the regexp) */
  size_t expr_pos;         /* parser's current position in expr */
  size_t expr_size;        /* number of *bytes* in expr */
  bbre_error error;        /* error message and/or pos within expr */
  bbre_exec *exec; /* local execution context, NULL until actually used */
};
/* A builder class for regular expression sets. Collects the patterns that
 * will make up the set. */
struct bbre_set_builder {
bbre_alloc alloc; /* allocator function */
bbre_buf(const bbre *) pats; /* patterns that compose this set */
};
/* A set of compiled regular expressions, held as a single compiled program. */
struct bbre_set {
bbre_alloc alloc; /* allocator function */
bbre_prog prog; /* compiled program */
bbre_exec *exec; /* local execution context, NULL until actually used */
bbre_error error; /* error info */
};
/* Arena-like data structure used for quickly storing nfa state sets.
 * Threads in the NFA need to hold on to their saved match offsets, but these
 * offsets rarely change. Rather than keeping the offsets inside each thread,
 * a thread holds a handle into this arena, and it is the handle that gets
 * copied around within the NFA. When a thread needs to update a match offset,
 * the NFA calls bbre_save_slots_set(), which updates the backing array of
 * this structure accordingly. The structure does reference counting, so it
 * will not allocate more slots until two or more threads need to store
 * different sets of match offsets. */
typedef struct bbre_save_slots {
  size_t *slots;      /* slot storage array */
  size_t slots_size;  /* size in threads of `slots` */
  size_t slots_alloc; /* allocation size in `size_t` of `slots` */
  size_t last_empty;  /* freelist head within `slots` */
  size_t per_thrd;    /* number of slots within `slots` allocated to each
                         thread (the last slot is reserved for the reference
                         count, so this is the number of groups times two plus
                         one) */
} bbre_save_slots;
/* One thread of the NFA's Pike VM: a program counter paired with a handle to
 * the thread's saved match offsets. */
typedef struct bbre_nfa_thrd {
  bbre_uint pc;       /* program counter of the thread */
  bbre_uint slot_hdl; /* slot handle within the NFA's save_slots */
} bbre_nfa_thrd;
/* Sparse-set data structure used to accelerate the Pike VM. It differs
 * slightly from the classical sparse set (https://research.swtch.com/sparse)
 * in that it can run in either sparse-set or sparse-map mode, depending on
 * the context, with different performance levels.
 * In this engine, threads are modeled as a <pc, saved> pair, and those are
 * used as the key and value for this data structure respectively. The DFA
 * does not concern itself with saved match offsets, so it only considers a
 * thread's program counter, and thus uses this structure in sparse-set mode.
 * The NFA, in contrast, needs to consider match offsets, and it uses the
 * slower sparse-map mode. In sparse-set mode the `dense_slot` array remains
 * empty. The data structure is robust, but could use some error checking to
 * ensure that sparse-set and sparse-map functions are not used during the
 * same generation of the structure. */
typedef struct bbre_sset {
  bbre_uint size,      /* reserved maximum size of the arrays (tracks the
                          program size) */
      dense_pc_size,   /* current size of `dense_pc` */
      dense_slot_size; /* current size of `dense_slot` */
  bbre_buf(bbre_uint) sparse;     /* sparse array */
  bbre_buf(bbre_uint) dense_pc;   /* dense key array */
  bbre_buf(bbre_uint) dense_slot; /* dense value array */
} bbre_sset;
/* Pike VM-based NFA executor. This is a pretty run-of-the-mill implementation
 * of the algorithm, with the exception of the `pri_stk` and `pri_bmp_tmp`
 * members which are used to track pattern indices when matching a set of
 * patterns.
 * NOTE(review): the member comments below describe `pri_stk` as saved
 * offsets for leftmost-longest tracking, which does not obviously agree with
 * "pattern indices" above -- confirm which description is current. */
typedef struct bbre_nfa {
/* Thread frontier for epsilon execution */
bbre_buf(bbre_nfa_thrd) thrd_stk;
/* Match offset save slots for each thread */
bbre_save_slots slots;
/* Array of saved offsets for leftmost-longest tracking */
bbre_buf(bbre_uint) pri_stk;
/* Bitmap describing if each position in `pri_stk` is occupied */
bbre_buf(bbre_uint) pri_bmp_tmp;
/* Whether or not the NFA is being run in reverse mode */
int reversed;
} bbre_nfa;
/* Maximum number of states the DFA can cache at once before incurring a cache
 * flush (see the state cache in `bbre_dfa`). */
#define BBRE_DFA_MAX_NUM_STATES 256
/* Options controlling a DFA match. Each option is a distinct bit so they can
 * be combined. */
typedef enum bbre_dfa_match_flags {
  /* Run the DFA in reverse mode */
  BBRE_DFA_MATCH_FLAG_REVERSED = 1 << 0,
  /* Exit early when finding a match (just return boolean match) */
  BBRE_DFA_MATCH_FLAG_EXIT_EARLY = 1 << 1,
  /* Track sets of match pattern indices */
  BBRE_DFA_MATCH_FLAG_MANY = 1 << 2,
  /* Use accurate pattern priority tracking when running epsilon transitions;
   * this is only needed for certain types of matches */
  BBRE_DFA_MATCH_FLAG_PRI = 1 << 3
} bbre_dfa_match_flags;
/* Per-state flags of the DFA; they also tell apart states that otherwise look
 * alike. Each flag is a distinct bit. */
typedef enum bbre_dfa_state_flag {
  /* State was created from the beginning of text. */
  BBRE_DFA_STATE_FLAG_FROM_TEXT_BEGIN = 1 << 0,
  /* State was created from the beginning of a line. */
  BBRE_DFA_STATE_FLAG_FROM_LINE_BEGIN = 1 << 1,
  /* State was created after a word boundary. */
  BBRE_DFA_STATE_FLAG_FROM_WORD = 1 << 2,
  /* State has a previous match. */
  BBRE_DFA_STATE_FLAG_PRI = 1 << 3,
  /* State's memory can be reused. */
  BBRE_DFA_STATE_FLAG_DIRTY = 1 << 4,
  /* MAX is the same as DIRTY because the DIRTY flag is never stored on a
   * state normally. */
  BBRE_DFA_STATE_FLAG_MAX = BBRE_DFA_STATE_FLAG_DIRTY
} bbre_dfa_state_flag;
/* Represents a DFA state. Keeps track of a transition for every possible input
 * byte, plus another transition for the end of text. Also remembers the NFA
 * states (PCs) of threads, and the set indices that the state matches. */
/* Currently, states have a flexible array immediately after them in memory that
 * holds `num_state` program counters immediately followed by `num_set` pattern
 * indices. The `alloc` member tracks the total size (sizeof(bbre_dfa_state) +
 * sizeof(bbre_uint) * (num_state + num_set)) */
typedef struct bbre_dfa_state {
  /* Transitions: These always point to other states within the same DFA
   * cache. The 257'th transition is the end of text transition. */
  struct bbre_dfa_state *ptrs[256 + 1];
  bbre_uint alloc, /* Allocation size of this state. */
      flags,       /* Bitset of `bbre_dfa_state_flag` */
      num_state,   /* Number of NFA threads tracked by this state */
      num_set;     /* Number of pattern indices tracked by this state */
} bbre_dfa_state;
/* Lazily-generated DFA execution context. Like the NFA, this is a fairly simple
 * and common implementation, with the somewhat less common ability to perform
 * multipattern matching. It uses a cache to keep track of common states. */
typedef struct bbre_dfa {
  /* Thread frontier for epsilon execution (similar to `bbre_nfa.thrd_stk`) */
  bbre_buf(bbre_uint) thrd_stk;
  /* The state cache */
  bbre_dfa_state **states;
  size_t states_size,    /* Number of slots in the cache */
      num_active_states; /* Cache utilization */
  /* Start states for every combination of state flag and entry point */
  bbre_dfa_state *entry[BBRE_PROG_ENTRY_MAX][BBRE_DFA_STATE_FLAG_MAX];
  /* Keeps track of which patterns have been matched throughout the text */
  bbre_buf(bbre_uint) set_bmp;
  /* Keeps track of which patterns have been matched while constructing a new
   * state */
  bbre_buf(bbre_uint) set_buf;
} bbre_dfa;
/* Execution context that is shared between the NFA and DFA, and embedded in
 * both `bbre` and `bbre_set` structs. */
struct bbre_exec {
  bbre_sset src, /* Source thread set; the threads that resulted from the
                    previous character. Also used to keep track of which
                    threads were found when exploring epsilon transitions */
      dst;       /* Destination thread set; the threads that resulted from
                    exploring all epsilon transitions in `src` */
  bbre_alloc alloc;      /* Allocator callback */
  const bbre_prog *prog; /* NFA program */
  bbre_nfa nfa;          /* NFA executor */
  bbre_dfa dfa;          /* DFA executor */
};
/* Helper macro for assertions: logical implication. Evaluates to false only
 * when `subject` is true and `predicate` is false. `predicate` is not
 * evaluated when `subject` is false. */
#define BBRE_IMPLIES(subject, predicate) (!(subject) || (predicate))
/* Record a generic error: store `msg` in `err` and reset the reported
 * position to 0. The message pointer is stored as-is, not copied. */
void bbre_error_set(bbre_error *err, const char *msg)
{
  err->pos = 0;
  err->msg = msg;
}
/* Put `err` into the "no error" state: no message, position 0. */
void bbre_error_init(bbre_error *err)
{
  const char *no_message = NULL;
  bbre_error_set(err, no_message);
}
#ifndef BBRE_DEFAULT_ALLOC
/* Default allocation function, built on the stdlib allocator.
 *   user:   ignored
 *   in_ptr: block to resize or free, or NULL for a fresh allocation
 *   prev:   previous size of `in_ptr`; only consulted by the assertion
 *   next:   requested size; 0 means "free `in_ptr`"
 * Returns the block from realloc() when `next` is nonzero, NULL otherwise. */
static void *
bbre_default_alloc(void *user, void *in_ptr, size_t prev, size_t next)
{
  (void)user;
  (void)prev;
  if (!next) {
    /* free(NULL) is a no-op, so no separate NULL check is needed */
    free(in_ptr);
    return NULL;
  }
  /* a zero previous size must come with a NULL previous pointer */
  assert(BBRE_IMPLIES(!prev, !in_ptr));
  return realloc(in_ptr, next);
}
#define BBRE_DEFAULT_ALLOC bbre_default_alloc
#endif
/* Invoke the callback of a `bbre_alloc` object, passing along its `user`
 * pointer, to allocate, resize or free memory. Returns whatever the callback
 * returns. */
static void *
bbre_alloci(bbre_alloc *alloc, void *old_ptr, size_t old_size, size_t new_size)
{
  void *result = alloc->cb(alloc->user, old_ptr, old_size, new_size);
  return result;
}
/* For a library like this, you really need a convenient way to represent
* dynamically-sized arrays of many different types. There's a million ways to
* do this in C, but they usually boil down to capturing the size of each
* element, and then plugging that size into an array allocation routine.
* Originally, this library used a non-generic dynamic array only capable of
* holding u32 (machine words), and simply represented all relevant types in
* terms of u32. This actually worked very well for the AST and parser, but the
* more complex structures used to execute regular expressions really benefit
* from having a properly typed dynamic array implementation. */
/* I avoided implementing a solid dynamic array in this library for a while,
* because I didn't feel like spending the brainpower on finding a good and safe
* solution. I've implemented dynamic arrays in C before, and I've never been
* fully satisfied with them. I think that the main problems with these data
* structures result from (1) type unsafety, (2) macro overuse, and (3)
* ergonomics, in order of importance. */
/* Any generic dynamic array implementation in C worth its salt *must* have a
* measure of type safety. When the language itself provides next to nothing in
* terms of safety checks, you have to take everything you can get.
* Many dynamic array implementations rely on the user carrying the type
* information around with them. Consider these two ways of defining push:
* [a] dynamic_array_T_push(arr, elem)
* [b] dynamic_array_push(arr, T, elem)
* Option (a) requires the function dynamic_array_T_push to be predefined,
* usually through a lengthy macro. This increases macro use, and decreases
* ergonomics, since you end up wasting lines on declaring these functions in
* what is essentially manual template instantiation:
* DYNAMIC_ARRAY_INIT_DECL(T);
* DYNAMIC_ARRAY_PUSH_DECL(T);
* DYNAMIC_ARRAY_POP_DECL(T);
* ...
* Option (b) does not require this manual template instantiation, but suffers
* from a worse problem: it's easy to accidentally use the wrong T, which is
* very hard to check for, especially at compile-time. */
/* In essence, we want a dynamic array implementation that does not require us
* to carry around a T for each call:
* dynamic_array_push(arr, elem)
* This means that the dynamic_array_push macro must determine the generic type
* of arr purely through properties of arr. But this presents another problem. A
* dynamic array needs to remember its size and allocated reserve, so it will
* look something like this:
* struct dynamic_array_struct_T {
* T* ptr;
* size_t size, alloc;
* };
* ...which means that the `arr` in dynamic_array_push(arr, elem) must be such a
* generic struct. We now have a familiar problem: for each T we use in our
* program, we must declare some `struct dynamic_array_struct_T` to be able to
* use the dynamic array. */
/* So now we have another constraint on our implementation: we must not be
* required to declare a new dynamic array type for each distinct T used in our
* program. The only way to do this, to my knowledge, is to just represent the
* dynamic array as a bare T*, and use the ages-old trick of storing metadata in
* a header *before the pointer.*
* We get type safety and ergonomics (array accesses can simply use p[i]!) and
* the macro side can be made relatively simple. This proved to be a good fit
* for this library. */
/* One caveat to this approach is that it introduces weird alignment concerns.
* You have to be sure that you correctly offset the size of the header from the
* returned pointer; specifically you must ensure that the alignment of the
* returned pointer is identical to that of malloc() (the largest alignment of
* any built-in type, typically long double or size_t, but left
* implementation-defined in C89)
* This is one of the cases where this program delves into pseudo-undefined
* behavior. The pointers used for each bbre_buf type are aligned to size_t,
* which happens to be the largest-width type that this library will ever store
* in them, *assuming that size_t has the same alignment as (or a stricter
* alignment than) a pointer*. On all major ABIs, this is not an issue. */
/* Dynamic array header, stored before the data pointer in memory. */
typedef struct bbre_buf_hdr {
  size_t size;  /* container size */
  size_t alloc; /* reserved capacity */
} bbre_buf_hdr;
/* Since we store the dynamic array as a raw T*, a natural implementation might
 * represent an empty array as NULL. However, this complicates things -- size
 * checks must always have a branch to check for NULL, the grow routine has
 * more scary codepaths, etc. To make code simpler, there exists a special
 * sentinel value that contains the empty array. */
static const bbre_buf_hdr bbre_buf_sentinel = {0, 0};
/* Given a dynamic array, get its header: the header sits immediately before
 * the first element. */
static bbre_buf_hdr *bbre_buf_get_hdr(void *buf)
{
  bbre_buf_hdr *past_hdr = buf;
  return past_hdr - 1;
}
/* Given a dynamic array, get its size as recorded in its header. */
static size_t bbre_buf_size_t(void *buf)
{
  const bbre_buf_hdr *hdr = bbre_buf_get_hdr(buf);
  return hdr->size;
}
/* Reserve enough memory to set the array's size to `size`. Note that this is
 * different from C++'s std::vector::reserve() in that it actually sets the used
 * size of the dynamic array. The caller must initialize the newly available
 * elements.
 *   a:    allocator used if the storage has to grow
 *   buf:  address of the array pointer; rewritten if the storage moves
 *   size: requested used size, in the same units as the header's `alloc`
 *         (NOTE(review): these look like bytes, since `alloc` is added
 *         directly to sizeof(bbre_buf_hdr) -- confirm against the macros)
 * Returns 0 on success. Returns BBRE_ERR_MEM if the request cannot be
 * represented in a size_t or the allocator returns NULL; in that case neither
 * `*buf` nor the header is modified. */
static int bbre_buf_resize_t(bbre_alloc *a, void **buf, size_t size)
{
  /* largest payload for which header + payload still fits in a size_t */
  const size_t max_payload = (size_t)-1 - sizeof(bbre_buf_hdr);
  bbre_buf_hdr *hdr = NULL;
  size_t next_alloc;
  void *next_ptr;
  int err = 0;
  assert(buf && *buf);
  hdr = bbre_buf_get_hdr(*buf);
  if (size <= hdr->alloc) {
    /* Fits in the current reservation. A zero `alloc` means this is the
     * shared read-only sentinel: `size` is then necessarily 0, which the
     * sentinel already records, and writing to it must be avoided. */
    if (hdr->alloc)
      hdr->size = size;
    goto error;
  }
  if (size > max_payload) {
    /* header + payload would wrap around */
    err = BBRE_ERR_MEM;
    goto error;
  }
  next_alloc = hdr->alloc ? hdr->alloc : /* sentinel */ 1;
#ifdef BBRE_COV
  /* For code coverage, be much lazier about our allocation policy -- this
   * ensures that almost every call to this function will allocate memory. For
   * testing, this is really valuable, and is the only case in the main
   * implementation of this library where we explicitly fuse off a part of the
   * code when we are checking coverage. */
  next_alloc = size < 128 ? size : next_alloc;
#endif
  /* grow geometrically, but never let the doubling wrap around */
  while (next_alloc < size) {
    if (next_alloc > max_payload / 2) {
      next_alloc = size; /* cannot double any further: take the exact size */
      break;
    }
    next_alloc *= 2;
  }
  next_ptr = bbre_alloci(
      a, hdr->alloc ? hdr : /* sentinel */ NULL,
      hdr->alloc ? sizeof(bbre_buf_hdr) + hdr->alloc : /* sentinel */ 0,
      sizeof(bbre_buf_hdr) + next_alloc);
  if (!next_ptr) {
    err = BBRE_ERR_MEM;
    goto error;
  }
  hdr = next_ptr;
  hdr->alloc = next_alloc;
  hdr->size = size;
  *buf = hdr + 1;
error:
  return err;
}
/* Initialize an empty dynamic array. */
static void bbre_buf_init_t(void **b)
{
/* discard const qualifier: this is actually a good thing, because
* bbre_buf_sentinel resides in rodata, and shouldn't be written to. This
* cast helps us catch bugs in the buf implementation earlier */
*b = ((bbre_buf_hdr *)&bbre_buf_sentinel) + 1;
assert(bbre_buf_get_hdr(*b)->size == 0 && bbre_buf_get_hdr(*b)->alloc == 0);
}
/* Release the backing memory of the dynamic array `*buf` through `a`. An
 * array whose header reports zero capacity (the sentinel) owns no memory and
 * is left alone. `*buf` itself is not reset. */
static void bbre_buf_destroy_t(bbre_alloc *a, void **buf)
{
  bbre_buf_hdr *hdr;
  assert(buf && *buf);
  hdr = bbre_buf_get_hdr(*buf);
  if (!hdr->alloc)
    return; /* nothing was ever allocated */
  /* A new size of 0 asks the allocator to free header and payload together. */
  bbre_alloci(a, hdr, sizeof(*hdr) + hdr->alloc, 0);
}
/* Extend the used size of `*buf` by `incr` storage units. Returns whatever
 * bbre_buf_resize_t() returns for the enlarged size. */
static int bbre_buf_grow_t(bbre_alloc *a, void **buf, size_t incr)
{
  size_t grown_size;
  assert(buf);
  grown_size = bbre_buf_size_t(*buf) + incr;
  return bbre_buf_resize_t(a, buf, grown_size);
}
/* Get the last element index of the dynamic array. */
static size_t bbre_buf_tail_t(void *buf, size_t decr)
{
return bbre_buf_get_hdr(buf)->size - decr;
}
/* Shrink the used size of the array by `decr` storage units and return the
 * new size, which is also the offset (in storage units) of the storage that
 * was just popped. */
static size_t bbre_buf_pop_t(void *buf, size_t decr)
{
  bbre_buf_hdr *hdr;
  size_t popped_off;
  assert(buf);
  hdr = bbre_buf_get_hdr(buf);
  popped_off = hdr->size - decr;
  /* Popping more than the array holds would wrap the size around. */
  assert(hdr->size >= decr);
  hdr->size = popped_off;
  return popped_off;
}
/* Clear the buffer, without freeing its backing memory */
static void bbre_buf_clear(void *buf)
{
void *sbuf;
assert(buf);
sbuf = *(void **)buf;
assert(sbuf);
if (bbre_buf_get_hdr(sbuf) != &bbre_buf_sentinel)
bbre_buf_get_hdr(sbuf)->size = 0;
}
/* Type-generic front end for the dynamic array helpers. Unless noted
 * otherwise, `b` is the address of the array variable (a T **) and `r` is the
 * allocator pointer handed through to the underlying helper. The helpers
 * track sizes in storage units; these macros convert to and from element
 * counts with sizeof. Every macro argument is fully parenthesized in the
 * expansion, but `b` may be evaluated more than once, so it must be free of
 * side effects. */
/* Initialize a dynamic array. */
#define bbre_buf_init(b) (bbre_buf_init_t((void **)(b)))
/* Get the element size of a dynamic array. */
#define bbre_buf_esz(b) (sizeof(**(b)))
/* Push an element. Evaluates to BBRE_ERR_MEM if the array could not be grown,
 * and to 0 once `e` has been stored in the new last slot. */
#define bbre_buf_push(r, b, e)                                                 \
  (bbre_buf_grow_t((r), (void **)(b), bbre_buf_esz(b))                         \
       ? BBRE_ERR_MEM                                                          \
       : (((*(b))[bbre_buf_tail_t((void *)(*(b)), bbre_buf_esz(b)) /           \
                  bbre_buf_esz(b)]) = (e),                                     \
          0))
/* Set the size to `n` elements. Evaluates to the result of
 * bbre_buf_resize_t(). */
#define bbre_buf_reserve(r, b, n)                                              \
  (bbre_buf_resize_t((r), (void **)(b), bbre_buf_esz(b) * (n)))
/* Pop an element; evaluates to the element that was removed. */
#define bbre_buf_pop(b)                                                        \
  ((*(b))[bbre_buf_pop_t((void *)(*(b)), bbre_buf_esz(b)) / bbre_buf_esz(b)])
/* Get a pointer to the element `n` places before the last one, so `n == 0`
 * yields the last element. */
#define bbre_buf_peek(b, n)                                                    \
  ((*(b)) +                                                                    \
   bbre_buf_tail_t((void *)(*(b)), bbre_buf_esz(b)) / bbre_buf_esz(b) - (n))
/* Get the size in elements. Unlike the other macros, `b` here is the array
 * itself (a T *), not its address. */
#define bbre_buf_size(b) (bbre_buf_size_t((void *)(b)) / sizeof(*(b)))
/* Destroy a dynamic array. */
#define bbre_buf_destroy(r, b) (bbre_buf_destroy_t((r), (void **)(b)))
/* Produce the allocator to use: a copy of `*input` when the caller supplied
 * one, otherwise one whose callback is bbre_default_alloc and whose user
 * pointer is NULL. */
static bbre_alloc bbre_alloc_make(const bbre_alloc *input)
{
  bbre_alloc fallback;
  if (input)
    return *input;
  fallback.cb = bbre_default_alloc;
  fallback.user = NULL;
  return fallback;
}
/* Build a byte range with low bound `l` and high bound `h`. A helper is more
 * convenient than struct initialization in C89. */
static bbre_byte_range bbre_byte_range_make(bbre_byte l, bbre_byte h)
{
  bbre_byte_range br;
  br.l = l;
  br.h = h;
  return br;
}
/* Pack a byte range into a uint: the low bound occupies bits 0-7 and the high
 * bound bits 8-15. */
static bbre_uint bbre_byte_range_to_u32(bbre_byte_range br)
{
  bbre_uint lo = (bbre_uint)br.l;
  bbre_uint hi = (bbre_uint)br.h;
  return lo | (hi << 8);
}
/* Unpack a byte range from a uint: bits 0-7 hold the low bound and bits 8-15
 * the high bound. Any higher bits are ignored. */
static bbre_byte_range bbre_uint_to_byte_range(bbre_uint u)
{
  bbre_byte lo = (bbre_byte)(u & 0xFF);
  bbre_byte hi = (bbre_byte)((u >> 8) & 0xFF);
  return bbre_byte_range_make(lo, hi);
}
/* Check if two byte ranges are adjacent, i.e. `right` begins at the byte
 * directly after the one `left` ends on. The sum is done in bbre_uint so that
 * a `left.h` of 0xFF does not wrap around. */
static int
bbre_byte_range_is_adjacent(bbre_byte_range left, bbre_byte_range right)
{
  bbre_uint after_left = ((bbre_uint)left.h) + 1;
  bbre_uint right_start = (bbre_uint)right.l;
  return after_left == right_start;
}
/* Build a rune range with low bound `l` and high bound `h`. */
static bbre_rune_range bbre_rune_range_make(bbre_uint l, bbre_uint h)
{
  bbre_rune_range rr;
  rr.l = l;
  rr.h = h;
  return rr;
}
/* General purpose integer hashing function: two xorshift-and-multiply rounds
 * followed by a final xorshift. This should probably be changed to something
 * a bit better, but works very well for this library.
 * Found by the intrepid Chris Wellons:
 * https://nullprogram.com/blog/2018/07/31/ */
static bbre_uint bbre_hash(bbre_uint x)
{
  x = (x ^ (x >> 16)) * 0x7feb352dU;
  x = (x ^ (x >> 15)) * 0x846ca68bU;
  return x ^ (x >> 16);
}
/* Report a parsing error: hand `msg` to bbre_error_set() for `r->error`, then
 * stamp the error with the parser's current position in the expression.
 * Always returns `BBRE_ERR_PARSE`, so callers can write
 * `err = bbre_err_parse(...)`. */
static int bbre_err_parse(bbre *r, const char *msg)
{
  const int code = BBRE_ERR_PARSE;
  bbre_error_set(&r->error, msg);
  r->error.pos = r->expr_pos;
  return code;
}
/* Tell whether unread input remains in the regex string: returns 0 once the
 * parse position has reached the expression size, and 1 otherwise. */
static int bbre_parse_has_more(bbre *r)
{
  if (r->expr_pos == r->expr_size)
    return 0;
  return 1;
}
/* These functions are defined near the automatically-generated parts of the
 * file (the end) for readability purposes. */
/* Feeds one input byte to the UTF-8 decoder, carrying decoder state in
 * `*state` and the codepoint in `*codep`.
 * NOTE(review): as used by bbre_parse_next(), a return value of 0 marks a
 * completed codepoint and nonzero means more bytes are needed; this is
 * inferred from that call site -- confirm against the definition. */
static bbre_uint
bbre_utf8_decode(bbre_uint *state, bbre_uint *codep, bbre_uint byte);
/* NOTE(review): judging by the name, checks that the regex text held by `r`
 * is well-formed UTF-8; the definition is not visible here -- confirm. */
static int bbre_parse_check_well_formed_utf8(bbre *r);
/* Decode and consume the next codepoint of the regex string, advancing the
 * parse position past it. This function assumes that a complete, valid
 * codepoint is left in the input, so it will abort the program (via assert)
 * if there is none. */
static bbre_uint bbre_parse_next(bbre *r)
{
  bbre_uint state = 0, codep;
  assert(bbre_parse_has_more(r));
  for (;;) {
    /* Feed bytes one at a time until the decoder reports completion (0). */
    if (bbre_utf8_decode(&state, &codep, r->expr[r->expr_pos++]) == 0)
      break;
    /* Mid-sequence: there must be another byte left to read. */
    assert(r->expr_pos < r->expr_size);
  }
  assert(state == 0);
  return codep;
}
/* Consume the next input codepoint into `*codep` and return 0. If the input
 * is exhausted, raise a parse error carrying `else_msg` instead and return
 * its error code; `*codep` is left untouched in that case. */
static int bbre_parse_next_or(bbre *r, bbre_uint *codep, const char *else_msg)
{
  assert(else_msg);
  if (!bbre_parse_has_more(r))
    return bbre_err_parse(r, else_msg);
  *codep = bbre_parse_next(r);
  return 0;
}
/* Look at the next codepoint of input without consuming it: decode it as
 * usual, then rewind the parse position to where it was. */
static bbre_uint bbre_peek_next(bbre *r)
{
  const size_t saved_pos = r->expr_pos;
  const bbre_uint codep = bbre_parse_next(r);
  r->expr_pos = saved_pos;
  return codep;
}
/* Sentinel value to represent an infinite repetition. It is defined as one
 * more than BBRE_LIMIT_REPETITION_COUNT -- presumably the largest finite
 * repetition count that is accepted, so the sentinel cannot collide with a
 * real count (confirm against the limit's definition). */
#define BBRE_INFTY (BBRE_LIMIT_REPETITION_COUNT + 1)
/* Append a new AST node of the given `type` to `r->ast` and store its handle
 * in `*out_node_hdl`. The variadic arguments are the node's parameters, one
 * bbre_uint for each of the `bbre_ast_type_infos[type].size` slots; this
 * function is variadic for convenience, as the parser creates nodes
 * frequently.
 *
 * Returns 0 on success, BBRE_ERR_LIMIT (with an error message set) if the AST
 * would exceed BBRE_LIMIT_AST_SIZE, or the error from bbre_buf_push(). On
 * failure some of the node's words may already have been appended. */
static int
bbre_ast_make(bbre *r, bbre_uint *out_node_hdl, bbre_ast_type type, ...)
{
  va_list ap;
  bbre_uint words[6], nwords = 0, k;
  int err = 0;
  va_start(ap, type);
  /* The very first node is preceded by a single 0 word, so its handle is 1.
   * NOTE(review): presumably this keeps handle 0 from naming a real node --
   * confirm against BBRE_NIL. */
  if (!bbre_buf_size(r->ast))
    words[nwords++] = 0; /* sentinel */
  *out_node_hdl = bbre_buf_size(r->ast) + nwords;
  /* A node is laid out as its type word followed by its parameters. */
  words[nwords++] = type;
  for (k = 0; k < bbre_ast_type_infos[type].size; k++)
    words[nwords++] = va_arg(ap, bbre_uint);
  assert(k == bbre_ast_type_infos[type].size);
  for (k = 0; k < nwords; k++) {
    if (bbre_buf_size(r->ast) == BBRE_LIMIT_AST_SIZE) {
      bbre_error_set(&r->error, "regular expression is too complex");
      err = BBRE_ERR_LIMIT;
      goto error;
    }
    err = bbre_buf_push(&r->alloc, &r->ast, words[k]);
    if (err)
      goto error;
  }
error:
  va_end(ap);
  return err;
}
/* Copy the parameters of the AST node `node_hdl` into `out_args`. The number
 * of words copied is the `size` that bbre_ast_type_infos records for the
 * node's type, so `out_args` must have room for that many. */
static void bbre_ast_decompose(bbre *r, bbre_uint node_hdl, bbre_uint *out_args)
{
  const bbre_uint *node = r->ast + node_hdl;
  bbre_uint k = 0;
  /* node[0] is the type word; the parameters follow it. */
  while (k < bbre_ast_type_infos[node[0]].size) {
    out_args[k] = node[k + 1];
    k++;
  }
}
/* Get a pointer to the type word of the given AST node, which is the first
 * word of the node. `node_hdl` must not be BBRE_NIL. */
static bbre_uint *bbre_ast_type_ptr(bbre *r, bbre_uint node_hdl)
{
  assert(node_hdl != BBRE_NIL);
  return &r->ast[node_hdl];
}
/* Get a pointer to parameter number `n` (counting from 0) of the given AST
 * node. `n` must be below the parameter count recorded for the node's type. */
static bbre_uint *bbre_ast_param_ptr(bbre *r, bbre_uint node_hdl, bbre_uint n)
{
  bbre_uint *node = r->ast + node_hdl;
  assert(bbre_ast_type_infos[*bbre_ast_type_ptr(r, node_hdl)].size > n);
  /* Skip the type word, then index into the parameters. */
  return node + 1 + n;
}
/* Returns 1 if the given AST type is one of the character class node types
 * (leaf, builtin, not, or), and 0 otherwise. */
static int bbre_ast_type_is_cc(bbre_ast_type ast_type)
{
  if (ast_type == BBRE_AST_TYPE_CC_LEAF)
    return 1;
  if (ast_type == BBRE_AST_TYPE_CC_BUILTIN)
    return 1;
  if (ast_type == BBRE_AST_TYPE_CC_NOT)
    return 1;
  return ast_type == BBRE_AST_TYPE_CC_OR;
}
/* Pop nodes off the operator stack based on precedence. The top node is
 * always popped; after that, nodes keep being popped for as long as the node
 * on top has a lower precedence than `pop_type`, i.e. until a node of equal or
 * greater precedence is on top or the stack is empty. Returns the handle of
 * the last node popped. The operator stack must not be empty. */
static bbre_uint bbre_ast_pop_prec(bbre *r, bbre_ast_type pop_type)
{
  bbre_uint last_popped;
  assert(bbre_buf_size(r->op_stk));
  /* The top node is the cat node, it should always be popped. */
  last_popped = bbre_buf_pop(&r->op_stk);
  while (bbre_buf_size(r->op_stk)) {
    bbre_uint top_hdl = *bbre_buf_peek(&r->op_stk, 0);
    bbre_ast_type top_type = *bbre_ast_type_ptr(r, top_hdl);
    bbre_uint top_prec = bbre_ast_type_infos[top_type].prec;
    bbre_uint pop_prec = bbre_ast_type_infos[pop_type].prec;
    if (top_prec >= pop_prec)
      break;
    last_popped = bbre_buf_pop(&r->op_stk);
  }
  return last_popped;
}
/* Link the node on top of the operator stack to the node beneath it. If the
 * top node is the only one on the stack it becomes the AST root; otherwise
 * its handle is written into the parent's last child slot, which is the
 * parameter at index `children - 1` for the parent's type. The stack must not
 * be empty. */
static void bbre_ast_fix(bbre *r)
{
  bbre_uint top_hdl, parent_hdl, last_child;
  bbre_ast_type parent_type;
  assert(bbre_buf_size(r->op_stk) > 0);
  top_hdl = *bbre_buf_peek(&r->op_stk, 0);
  if (bbre_buf_size(r->op_stk) == 1) {
    r->ast_root_hdl = top_hdl;
    return;
  }
  parent_hdl = *bbre_buf_peek(&r->op_stk, 1);
  parent_type = *bbre_ast_type_ptr(r, parent_hdl);
  /* A parent without child slots has nowhere to record the link. */
  assert(bbre_ast_type_infos[parent_type].children > 0);
  last_child = bbre_ast_type_infos[parent_type].children - 1;
  *bbre_ast_param_ptr(r, parent_hdl, last_child) = top_hdl;
}
/* Push the AST node `node_hdl` onto the operator stack and, if that worked,
 * fix up the furthest-right child pointer of its parent (or the AST root) via
 * bbre_ast_fix(). Returns 0 on success or the error from bbre_buf_push(). */
static int bbre_ast_push(bbre *r, bbre_uint node_hdl)
{
  int err = bbre_buf_push(&r->alloc, &r->op_stk, node_hdl);
  if (!err)
    bbre_ast_fix(r);
  return err;
}
/* Create a CAT node on the top of the stack. */