-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpcre.go
754 lines (638 loc) · 19.3 KB
/
pcre.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
// Package pcre is a library that provides pcre2 regular expressions
// in pure Go, allowing for features such as cross-compiling.
//
// The lib directory contains source code automatically translated from
// pcre2's C source code for each supported architecture and/or OS.
// This package wraps the automatically-translated source to provide a
// safe interface as close to Go's regexp library as possible.
package pcre
import (
"math"
"os"
"runtime"
"strconv"
"sync"
"unsafe"
"go.elara.ws/pcre/lib"
"modernc.org/libc"
)
const Unset = math.MaxUint
// Version returns the version of pcre2 embedded in this library.
func Version() string { return lib.DPACKAGE_VERSION }
// Regexp represents a pcre2 regular expression
type Regexp struct {
mtx *sync.Mutex
expr string
re uintptr
mctx uintptr
tls *libc.TLS
calloutMtx *sync.Mutex
callout *func(tls *libc.TLS, cbptr, data uintptr) int32
}
// Compile runs CompileOpts with no options.
//
// Close() should be called on the returned expression
// once it is no longer needed.
func Compile(pattern string) (*Regexp, error) {
return CompileOpts(pattern, 0)
}
// CompileOpts compiles the provided pattern using the given options.
//
// Close() should be called on the returned expression
// once it is no longer needed.
func CompileOpts(pattern string, options CompileOption) (*Regexp, error) {
tls := libc.NewTLS()
// Get C string of pattern
cPattern, err := libc.CString(pattern)
if err != nil {
return nil, err
}
// Free the string when done
defer libc.Xfree(tls, cPattern)
// Allocate new error
cErr := allocError(tls)
// Free error when done
defer libc.Xfree(tls, cErr)
// Get error offsets
errPtr := addErrCodeOffset(cErr)
errOffsetPtr := addErrOffsetOffset(cErr)
// Convert pattern length to size_t type
cPatLen := lib.Tsize_t(len(pattern))
// Compile expression
r := lib.Xpcre2_compile_8(tls, cPattern, cPatLen, uint32(options), errPtr, errOffsetPtr, 0)
if r == 0 {
return nil, ptrToError(tls, cErr)
}
// Create regexp instance
regex := Regexp{
expr: pattern,
mtx: &sync.Mutex{},
re: r,
mctx: lib.Xpcre2_match_context_create_8(tls, 0),
tls: tls,
calloutMtx: &sync.Mutex{},
}
// Make sure resources are freed if GC collects the
// regular expression.
runtime.SetFinalizer(®ex, func(r *Regexp) error {
return r.Close()
})
return ®ex, nil
}
// MustCompile compiles the given pattern and panics
// if there was an error
//
// Close() should be called on the returned expression
// once it is no longer needed.
func MustCompile(pattern string) *Regexp {
rgx, err := Compile(pattern)
if err != nil {
panic(err)
}
return rgx
}
// MustCompileOpts compiles the given pattern with the given
// options and panics if there was an error.
//
// Close() should be called on the returned expression
// once it is no longer needed.
func MustCompileOpts(pattern string, options CompileOption) *Regexp {
rgx, err := CompileOpts(pattern, options)
if err != nil {
panic(err)
}
return rgx
}
// Find returns the leftmost match of the regular expression.
// A return value of nil indicates no match.
func (r *Regexp) Find(b []byte) []byte {
matches, err := r.match(b, 0, false)
if err != nil {
panic(err)
}
if len(matches) == 0 {
return nil
}
match := matches[0]
return b[match[0]:match[1]]
}
// FindIndex returns a two-element slice of integers
// representing the location of the leftmost match of the
// regular expression.
func (r *Regexp) FindIndex(b []byte) []int {
matches, err := r.match(b, 0, false)
if err != nil {
panic(err)
}
if len(matches) == 0 {
return nil
}
match := matches[0]
return []int{int(match[0]), int(match[1])}
}
// FindAll returns all matches of the regular expression.
// A return value of nil indicates no match.
func (r *Regexp) FindAll(b []byte, n int) [][]byte {
matches, err := r.match(b, 0, true)
if err != nil {
panic(err)
}
if len(matches) == 0 || n == 0 {
return nil
}
if n > 0 && len(matches) > n {
matches = matches[:n]
}
out := make([][]byte, len(matches))
for index, match := range matches {
out[index] = b[match[0]:match[1]]
}
return out
}
// FindAll returns indices of all matches of the
// regular expression. A return value of nil indicates
// no match.
func (r *Regexp) FindAllIndex(b []byte, n int) [][]int {
matches, err := r.match(b, 0, true)
if err != nil {
panic(err)
}
if len(matches) == 0 || n == 0 {
return nil
}
if n > 0 && len(matches) > n {
matches = matches[:n]
}
out := make([][]int, len(matches))
for index, match := range matches {
out[index] = []int{int(match[0]), int(match[1])}
}
return out
}
// FindSubmatch returns a slice containing the match as the
// first element, and the submatches as the subsequent elements.
func (r *Regexp) FindSubmatch(b []byte) [][]byte {
matches, err := r.match(b, 0, false)
if err != nil {
panic(err)
}
if len(matches) == 0 {
return nil
}
match := matches[0]
out := make([][]byte, 0, len(match)/2)
for i := 0; i < len(match); i += 2 {
if match[i] == Unset {
out = append(out, nil)
} else {
out = append(out, b[match[i]:match[i+1]])
}
}
return out
}
// FindSubmatchIndex returns a slice of index pairs representing
// the match and submatches, if any.
func (r *Regexp) FindSubmatchIndex(b []byte) []int {
matches, err := r.match(b, 0, false)
if err != nil {
panic(err)
}
if len(matches) == 0 {
return nil
}
match := matches[0]
out := make([]int, len(match))
for index, offset := range match {
out[index] = int(offset)
}
return out
}
// FindAllSubmatch returns a slice of all matches and submatches
// of the regular expression. It will return no more than n matches.
// If n < 0, it will return all matches.
func (r *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
matches, err := r.match(b, 0, true)
if err != nil {
panic(err)
}
if len(matches) == 0 || n == 0 {
return nil
}
if n > 0 && len(matches) > n {
matches = matches[:n]
}
out := make([][][]byte, len(matches))
for index, match := range matches {
outMatch := make([][]byte, 0, len(match)/2)
for i := 0; i < len(match); i += 2 {
if match[i] == Unset {
outMatch = append(outMatch, nil)
} else {
outMatch = append(outMatch, b[match[i]:match[i+1]])
}
}
out[index] = outMatch
}
return out
}
// FindAllSubmatch returns a slice of all indeces representing the
// locations of matches and submatches, if any, of the regular expression.
// It will return no more than n matches. If n < 0, it will return all matches.
func (r *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
matches, err := r.match(b, 0, true)
if err != nil {
panic(err)
}
if len(matches) == 0 || n == 0 {
return nil
}
if n > 0 && len(matches) > n {
matches = matches[:n]
}
out := make([][]int, len(matches))
for index, match := range matches {
offsets := make([]int, len(match))
for index, offset := range match {
offsets[index] = int(offset)
}
out[index] = offsets
}
return out
}
// FindString is the String version of Find
func (r *Regexp) FindString(s string) string {
return string(r.Find([]byte(s)))
}
// FindStringIndex is the String version of FindIndex
func (r *Regexp) FindStringIndex(s string) []int {
return r.FindIndex([]byte(s))
}
// FinAllString is the String version of FindAll
func (r *Regexp) FindAllString(s string, n int) []string {
matches := r.FindAll([]byte(s), n)
out := make([]string, len(matches))
for index, match := range matches {
out[index] = string(match)
}
return out
}
// FindAllStringIndex is the String version of FindIndex
func (r *Regexp) FindAllStringIndex(s string, n int) [][]int {
return r.FindAllIndex([]byte(s), n)
}
// FindStringSubmatch is the string version of FindSubmatch
func (r *Regexp) FindStringSubmatch(s string) []string {
matches := r.FindSubmatch([]byte(s))
out := make([]string, len(matches))
for index, match := range matches {
out[index] = string(match)
}
return out
}
// FindStringSubmatchIndex is the String version of FindSubmatchIndex
func (r *Regexp) FindStringSubmatchIndex(s string) []int {
return r.FindSubmatchIndex([]byte(s))
}
// FindAllStringSubmatch is the String version of FindAllSubmatch
func (r *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
matches := r.FindAllSubmatch([]byte(s), n)
out := make([][]string, len(matches))
for index, match := range matches {
outMatch := make([]string, len(match))
for index, byteMatch := range match {
outMatch[index] = string(byteMatch)
}
out[index] = outMatch
}
return out
}
// FindAllStringSubmatchIndex is the String version of FindAllSubmatchIndex
func (r *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
return r.FindAllSubmatchIndex([]byte(s), n)
}
// Match reports whether b contains a match of the regular expression
func (r *Regexp) Match(b []byte) bool {
return r.Find(b) != nil
}
// MatchString is the String version of Match
func (r *Regexp) MatchString(s string) bool {
return r.Find([]byte(s)) != nil
}
// NumSubexp returns the number of parenthesized subexpressions
// in the regular expression.
func (r *Regexp) NumSubexp() int {
return int(r.patternInfo(lib.DPCRE2_INFO_CAPTURECOUNT))
}
// ReplaceAll returns a copy of src, replacing matches of the
// regular expression with the replacement text repl.
// Inside repl, $ signs are interpreted as in Expand,
// so for instance $1 represents the text of the first
// submatch and $name would represent the text of the
// subexpression called "name".
func (r *Regexp) ReplaceAll(src, repl []byte) []byte {
matches, err := r.match(src, 0, true)
if err != nil {
panic(err)
}
if len(matches) == 0 {
return src
}
out := make([]byte, len(src))
copy(out, src)
var diff int64
for _, match := range matches {
replStr := os.Expand(string(repl), func(s string) string {
i, err := strconv.Atoi(s)
if err != nil {
i = r.SubexpIndex(s)
if i == -1 {
return ""
}
}
// If there given match does not exist, return empty string
if i == 0 || len(match) < (2*i)+1 {
return ""
}
// Return match
return string(src[match[2*i]:match[(2*i)+1]])
})
// Replace replacement string with expanded string
repl := []byte(replStr)
// Replace bytes with new replacement string
diff, out = replaceBytes(out, repl, match[0], match[1], diff)
}
return out
}
// ReplaceAllFunc returns a copy of src in which all matches of the
// regular expression have been replaced by the return value of function
// repl applied to the matched byte slice. The replacement returned by
// repl is substituted directly, without using Expand.
func (r *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
matches, err := r.match(src, 0, true)
if err != nil {
panic(err)
}
if len(matches) == 0 {
return src
}
out := make([]byte, len(src))
copy(out, src)
var diff int64
for _, match := range matches {
replBytes := repl(src[match[0]:match[1]])
diff, out = replaceBytes(out, replBytes, match[0], match[1], diff)
}
return out
}
// ReplaceAllLiteral returns a copy of src, replacing matches of
// the regular expression with the replacement bytes repl.
// The replacement is substituted directly, without using Expand.
func (r *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
matches, err := r.match(src, 0, true)
if err != nil {
panic(err)
}
if len(matches) == 0 {
return src
}
out := make([]byte, len(src))
copy(out, src)
var diff int64
for _, match := range matches {
diff, out = replaceBytes(out, repl, match[0], match[1], diff)
}
return out
}
// ReplaceAllString is the String version of ReplaceAll
func (r *Regexp) ReplaceAllString(src, repl string) string {
return string(r.ReplaceAll([]byte(src), []byte(repl)))
}
// ReplaceAllStringFunc is the String version of ReplaceAllFunc
func (r *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
return string(r.ReplaceAllFunc([]byte(src), func(b []byte) []byte {
return []byte(repl(string(b)))
}))
}
// ReplaceAllLiteralString is the String version of ReplaceAllLiteral
func (r *Regexp) ReplaceAllLiteralString(src, repl string) string {
return string(r.ReplaceAllLiteral([]byte(src), []byte(repl)))
}
// Split slices s into substrings separated by the
// expression and returns a slice of the substrings
// between those expression matches.
//
// Example:
//
// s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
// // s: ["", "b", "b", "c", "cadaaae"]
//
// The count determines the number of substrings to return:
//
// n > 0: at most n substrings; the last substring will be the unsplit remainder.
// n == 0: the result is nil (zero substrings)
// n < 0: all substrings
func (r *Regexp) Split(s string, n int) []string {
if n == 0 {
return nil
}
if len(r.expr) > 0 && len(s) == 0 {
return []string{""}
}
matches := r.FindAllStringIndex(s, n)
strings := make([]string, 0, len(matches))
beg := 0
end := 0
for _, match := range matches {
if n > 0 && len(strings) >= n-1 {
break
}
end = match[0]
if match[1] != 0 {
strings = append(strings, s[beg:end])
}
beg = match[1]
}
if end != len(s) {
strings = append(strings, s[beg:])
}
return strings
}
// String returns the text of the regular expression
// used for compilation.
func (r *Regexp) String() string {
return r.expr
}
// SubexpIndex returns the index of the subexpression
// with the given name, or -1 if there is no subexpression
// with that name.
func (r *Regexp) SubexpIndex(name string) int {
r.mtx.Lock()
defer r.mtx.Unlock()
// Get C string of name
cName, err := libc.CString(name)
if err != nil {
panic(err)
}
// Get substring index from name
ret := lib.Xpcre2_substring_number_from_name_8(r.tls, r.re, cName)
// If no substring error returned, return -1.
// If a different error is returned, panic.
if ret == lib.DPCRE2_ERROR_NOSUBSTRING {
return -1
} else if ret < 0 {
panic(codeToError(r.tls, ret))
}
// Return the index of the subexpression
return int(ret)
}
// SetCallout sets a callout function that will be called at specified points in the matching operation.
// fn should return zero if it ran successfully or a non-zero integer to force an error.
// See https://www.pcre.org/current/doc/html/pcre2callout.html for more information.
func (r *Regexp) SetCallout(fn func(cb *CalloutBlock) int32) error {
cfn := func(tls *libc.TLS, cbptr, data uintptr) int32 {
ccb := (*lib.Tpcre2_callout_block_8)(unsafe.Pointer(cbptr))
cb := &CalloutBlock{
Version: ccb.Fversion,
CalloutNumber: ccb.Fcallout_number,
CaptureTop: ccb.Fcapture_top,
CaptureLast: ccb.Fcapture_last,
Mark: libc.GoString(ccb.Fmark),
StartMatch: uint(ccb.Fstart_match),
CurrentPosition: uint(ccb.Fcurrent_position),
PatternPosition: uint(ccb.Fpattern_position),
NextItemLength: uint(ccb.Fnext_item_length),
CalloutStringOffset: uint(ccb.Fcallout_string_offset),
CalloutFlags: CalloutFlags(ccb.Fcallout_flags),
}
subjectBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fsubject)), ccb.Fsubject_length)
cb.Subject = string(subjectBytes)
calloutStrBytes := unsafe.Slice((*byte)(unsafe.Pointer(ccb.Fcallout_string)), ccb.Fcallout_string_length)
cb.CalloutString = string(calloutStrBytes)
ovecSlice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ccb.Foffset_vector)), (ccb.Fcapture_top*2)-1)
if len(ovecSlice) > 2 {
ovecSlice = ovecSlice[2:]
for i := 0; i < len(ovecSlice); i += 2 {
if i+1 >= len(ovecSlice) {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:])
} else {
cb.Substrings = append(cb.Substrings, cb.Subject[ovecSlice[i]:ovecSlice[i+1]])
}
}
}
return fn(cb)
}
r.calloutMtx.Lock()
defer r.calloutMtx.Unlock()
// Prevent callout function from being GC'd
r.callout = &cfn
ret := lib.Xpcre2_set_callout_8(r.tls, r.mctx, *(*uintptr)(unsafe.Pointer(&cfn)), 0)
if ret < 0 {
return codeToError(r.tls, ret)
}
return nil
}
// replaceBytes replaces the bytes at a given location, and returns a new
// offset, based on how much bigger or smaller the slice got after replacement
func replaceBytes(src, repl []byte, sOff, eOff lib.Tsize_t, diff int64) (int64, []byte) {
var out []byte
out = append(
src[:int64(sOff)+diff],
append(
repl,
src[int64(eOff)+diff:]...,
)...,
)
return diff + int64(len(out)-len(src)), out
}
// match calls the underlying pcre match functions. It re-runs the functions
// until no matches are found if multi is set to true.
func (r *Regexp) match(b []byte, options uint32, multi bool) ([][]lib.Tsize_t, error) {
if len(b) == 0 {
return nil, nil
}
r.mtx.Lock()
defer r.mtx.Unlock()
// Create a C pointer to the subject
sp := unsafe.Pointer(&b[0])
cSubject := uintptr(sp)
// Convert the size of the subject to a C size_t type
cSubjectLen := lib.Tsize_t(len(b))
// Create match data using the pattern to figure out the buffer size
md := lib.Xpcre2_match_data_create_from_pattern_8(r.tls, r.re, 0)
if md == 0 {
panic("error creating match data")
}
// Free the match data at the end of the function
defer lib.Xpcre2_match_data_free_8(r.tls, md)
var offset lib.Tsize_t
var out [][]lib.Tsize_t
// While the offset is less than the length of the subject
for offset < cSubjectLen {
// Execute expression on subject
ret := lib.Xpcre2_match_8(r.tls, r.re, cSubject, cSubjectLen, offset, options, md, r.mctx)
if ret < 0 {
// If no match found, break
if ret == lib.DPCRE2_ERROR_NOMATCH {
break
}
return nil, codeToError(r.tls, ret)
} else {
// Get amount of pairs in output vector
pairAmt := lib.Xpcre2_get_ovector_count_8(r.tls, md)
// Get pointer to output vector
ovec := lib.Xpcre2_get_ovector_pointer_8(r.tls, md)
// Create a Go slice using the output vector as the underlying array
slice := unsafe.Slice((*lib.Tsize_t)(unsafe.Pointer(ovec)), pairAmt*2)
// Create a new slice and copy the elements from the slice
// This is required because the match data will be freed in
// a defer, and that would cause a panic every time the slice
// is used later.
matches := make([]lib.Tsize_t, len(slice))
copy(matches, slice)
// If the two indices are the same (empty string), and the match is not
// immediately after another match, add it to the output and increment the
// offset. Otherwise, increment the offset and ignore the match.
if slice[0] == slice[1] && len(out) > 0 && slice[0] != out[len(out)-1][1] {
out = append(out, matches)
offset = slice[1] + 1
continue
} else if slice[0] == slice[1] {
offset = slice[1] + 1
continue
}
// Add the match to the output
out = append(out, matches)
// Set the next offset to the end index of the match
offset = matches[1]
}
// If multiple matches disabled, break
if !multi {
break
}
}
return out, nil
}
// patternInfo calls the underlying pcre pattern info function
// and returns information about the compiled regular expression
func (r *Regexp) patternInfo(what uint32) (out uint32) {
// Create a C pointer to the output integer
cOut := uintptr(unsafe.Pointer(&out))
// Get information about the compiled pattern
lib.Xpcre2_pattern_info_8(r.tls, r.re, what, cOut)
return
}
// Close frees resources used by the regular expression.
func (r *Regexp) Close() error {
if r == nil {
return nil
}
// Close thread-local storage
defer r.tls.Close()
// Free the compiled code
lib.Xpcre2_code_free_8(r.tls, r.re)
// Free the match context
lib.Xpcre2_match_context_free_8(r.tls, r.mctx)
// Set regular expression to null
r.re = 0
return nil
}