1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package regexp implements regular expression search.
6//
7// The syntax of the regular expressions accepted is the same
8// general syntax used by Perl, Python, and other languages.
9// More precisely, it is the syntax accepted by RE2 and described at
10// https://golang.org/s/re2syntax, except for \C.
11// For an overview of the syntax, see the [regexp/syntax] package.
12//
13// The regexp implementation provided by this package is
14// guaranteed to run in time linear in the size of the input.
15// (This is a property not guaranteed by most open source
16// implementations of regular expressions.) For more information
17// about this property, see https://swtch.com/~rsc/regexp/regexp1.html
18// or any book about automata theory.
19//
20// All characters are UTF-8-encoded code points.
21// Following [utf8.DecodeRune], each byte of an invalid UTF-8 sequence
22// is treated as if it encoded utf8.RuneError (U+FFFD).
23//
24// There are 16 methods of [Regexp] that match a regular expression and identify
25// the matched text. Their names are matched by this regular expression:
26//
27//	Find(All)?(String)?(Submatch)?(Index)?
28//
29// If 'All' is present, the routine matches successive non-overlapping
30// matches of the entire expression. Empty matches abutting a preceding
31// match are ignored. The return value is a slice containing the successive
32// return values of the corresponding non-'All' routine. These routines take
33// an extra integer argument, n. If n >= 0, the function returns at most n
34// matches/submatches; otherwise, it returns all of them.
35//
36// If 'String' is present, the argument is a string; otherwise it is a slice
37// of bytes; return values are adjusted as appropriate.
38//
39// If 'Submatch' is present, the return value is a slice identifying the
40// successive submatches of the expression. Submatches are matches of
41// parenthesized subexpressions (also known as capturing groups) within the
42// regular expression, numbered from left to right in order of opening
43// parenthesis. Submatch 0 is the match of the entire expression, submatch 1 is
44// the match of the first parenthesized subexpression, and so on.
45//
46// If 'Index' is present, matches and submatches are identified by byte index
47// pairs within the input string: result[2*n:2*n+2] identifies the indexes of
48// the nth submatch. The pair for n==0 identifies the match of the entire
49// expression. If 'Index' is not present, the match is identified by the text
50// of the match/submatch. If an index is negative or text is nil, it means that
51// subexpression did not match any string in the input. For 'String' versions
52// an empty string means either no match or an empty match.
53//
54// There is also a subset of the methods that can be applied to text read from
55// an [io.RuneReader]: [Regexp.MatchReader], [Regexp.FindReaderIndex],
56// [Regexp.FindReaderSubmatchIndex].
57//
58// This set may grow. Note that regular expression matches may need to
59// examine text beyond the text returned by a match, so the methods that
60// match text from an [io.RuneReader] may read arbitrarily far into the input
61// before returning.
62//
63// (There are a few other methods that do not match this pattern.)
64package regexp
65
66import (
67	"bytes"
68	"io"
69	"regexp/syntax"
70	"strconv"
71	"strings"
72	"sync"
73	"unicode"
74	"unicode/utf8"
75)
76
77// Regexp is the representation of a compiled regular expression.
78// A Regexp is safe for concurrent use by multiple goroutines,
79// except for configuration methods, such as [Regexp.Longest].
80type Regexp struct {
81	expr           string       // as passed to Compile
82	prog           *syntax.Prog // compiled program
83	onepass        *onePassProg // onepass program or nil
84	numSubexp      int
85	maxBitStateLen int
86	subexpNames    []string
87	prefix         string         // required prefix in unanchored matches
88	prefixBytes    []byte         // prefix, as a []byte
89	prefixRune     rune           // first rune in prefix
90	prefixEnd      uint32         // pc for last rune in prefix
91	mpool          int            // pool for machines
92	matchcap       int            // size of recorded match lengths
93	prefixComplete bool           // prefix is the entire regexp
94	cond           syntax.EmptyOp // empty-width conditions required at start of match
95	minInputLen    int            // minimum length of the input in bytes
96
97	// This field can be modified by the Longest method,
98	// but it is otherwise read-only.
99	longest bool // whether regexp prefers leftmost-longest match
100}
101
102// String returns the source text used to compile the regular expression.
103func (re *Regexp) String() string {
104	return re.expr
105}
106
107// Copy returns a new [Regexp] object copied from re.
108// Calling [Regexp.Longest] on one copy does not affect another.
109//
110// Deprecated: In earlier releases, when using a [Regexp] in multiple goroutines,
111// giving each goroutine its own copy helped to avoid lock contention.
112// As of Go 1.12, using Copy is no longer necessary to avoid lock contention.
113// Copy may still be appropriate if the reason for its use is to make
114// two copies with different [Regexp.Longest] settings.
115func (re *Regexp) Copy() *Regexp {
116	re2 := *re
117	return &re2
118}
119
120// Compile parses a regular expression and returns, if successful,
121// a [Regexp] object that can be used to match against text.
122//
123// When matching against text, the regexp returns a match that
124// begins as early as possible in the input (leftmost), and among those
125// it chooses the one that a backtracking search would have found first.
126// This so-called leftmost-first matching is the same semantics
127// that Perl, Python, and other implementations use, although this
128// package implements it without the expense of backtracking.
129// For POSIX leftmost-longest matching, see [CompilePOSIX].
130func Compile(expr string) (*Regexp, error) {
131	return compile(expr, syntax.Perl, false)
132}
133
134// CompilePOSIX is like [Compile] but restricts the regular expression
135// to POSIX ERE (egrep) syntax and changes the match semantics to
136// leftmost-longest.
137//
138// That is, when matching against text, the regexp returns a match that
139// begins as early as possible in the input (leftmost), and among those
140// it chooses a match that is as long as possible.
141// This so-called leftmost-longest matching is the same semantics
142// that early regular expression implementations used and that POSIX
143// specifies.
144//
145// However, there can be multiple leftmost-longest matches, with different
146// submatch choices, and here this package diverges from POSIX.
147// Among the possible leftmost-longest matches, this package chooses
148// the one that a backtracking search would have found first, while POSIX
149// specifies that the match be chosen to maximize the length of the first
150// subexpression, then the second, and so on from left to right.
151// The POSIX rule is computationally prohibitive and not even well-defined.
152// See https://swtch.com/~rsc/regexp/regexp2.html#posix for details.
153func CompilePOSIX(expr string) (*Regexp, error) {
154	return compile(expr, syntax.POSIX, true)
155}
156
157// Longest makes future searches prefer the leftmost-longest match.
158// That is, when matching against text, the regexp returns a match that
159// begins as early as possible in the input (leftmost), and among those
160// it chooses a match that is as long as possible.
161// This method modifies the [Regexp] and may not be called concurrently
162// with any other methods.
163func (re *Regexp) Longest() {
164	re.longest = true
165}
166
167func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
168	re, err := syntax.Parse(expr, mode)
169	if err != nil {
170		return nil, err
171	}
172	maxCap := re.MaxCap()
173	capNames := re.CapNames()
174
175	re = re.Simplify()
176	prog, err := syntax.Compile(re)
177	if err != nil {
178		return nil, err
179	}
180	matchcap := prog.NumCap
181	if matchcap < 2 {
182		matchcap = 2
183	}
184	regexp := &Regexp{
185		expr:        expr,
186		prog:        prog,
187		onepass:     compileOnePass(prog),
188		numSubexp:   maxCap,
189		subexpNames: capNames,
190		cond:        prog.StartCond(),
191		longest:     longest,
192		matchcap:    matchcap,
193		minInputLen: minInputLen(re),
194	}
195	if regexp.onepass == nil {
196		regexp.prefix, regexp.prefixComplete = prog.Prefix()
197		regexp.maxBitStateLen = maxBitStateLen(prog)
198	} else {
199		regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
200	}
201	if regexp.prefix != "" {
202		// TODO(rsc): Remove this allocation by adding
203		// IndexString to package bytes.
204		regexp.prefixBytes = []byte(regexp.prefix)
205		regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
206	}
207
208	n := len(prog.Inst)
209	i := 0
210	for matchSize[i] != 0 && matchSize[i] < n {
211		i++
212	}
213	regexp.mpool = i
214
215	return regexp, nil
216}
217
218// Pools of *machine for use during (*Regexp).doExecute,
219// split up by the size of the execution queues.
220// matchPool[i] machines have queue size matchSize[i].
221// On a 64-bit system each queue entry is 16 bytes,
222// so matchPool[0] has 16*2*128 = 4kB queues, etc.
223// The final matchPool is a catch-all for very large queues.
224var (
225	matchSize = [...]int{128, 512, 2048, 16384, 0}
226	matchPool [len(matchSize)]sync.Pool
227)
228
229// get returns a machine to use for matching re.
230// It uses the re's machine cache if possible, to avoid
231// unnecessary allocation.
232func (re *Regexp) get() *machine {
233	m, ok := matchPool[re.mpool].Get().(*machine)
234	if !ok {
235		m = new(machine)
236	}
237	m.re = re
238	m.p = re.prog
239	if cap(m.matchcap) < re.matchcap {
240		m.matchcap = make([]int, re.matchcap)
241		for _, t := range m.pool {
242			t.cap = make([]int, re.matchcap)
243		}
244	}
245
246	// Allocate queues if needed.
247	// Or reallocate, for "large" match pool.
248	n := matchSize[re.mpool]
249	if n == 0 { // large pool
250		n = len(re.prog.Inst)
251	}
252	if len(m.q0.sparse) < n {
253		m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
254		m.q1 = queue{make([]uint32, n), make([]entry, 0, n)}
255	}
256	return m
257}
258
259// put returns a machine to the correct machine pool.
260func (re *Regexp) put(m *machine) {
261	m.re = nil
262	m.p = nil
263	m.inputs.clear()
264	matchPool[re.mpool].Put(m)
265}
266
267// minInputLen walks the regexp to find the minimum length of any matchable input.
268func minInputLen(re *syntax.Regexp) int {
269	switch re.Op {
270	default:
271		return 0
272	case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass:
273		return 1
274	case syntax.OpLiteral:
275		l := 0
276		for _, r := range re.Rune {
277			if r == utf8.RuneError {
278				l++
279			} else {
280				l += utf8.RuneLen(r)
281			}
282		}
283		return l
284	case syntax.OpCapture, syntax.OpPlus:
285		return minInputLen(re.Sub[0])
286	case syntax.OpRepeat:
287		return re.Min * minInputLen(re.Sub[0])
288	case syntax.OpConcat:
289		l := 0
290		for _, sub := range re.Sub {
291			l += minInputLen(sub)
292		}
293		return l
294	case syntax.OpAlternate:
295		l := minInputLen(re.Sub[0])
296		var lnext int
297		for _, sub := range re.Sub[1:] {
298			lnext = minInputLen(sub)
299			if lnext < l {
300				l = lnext
301			}
302		}
303		return l
304	}
305}
306
307// MustCompile is like [Compile] but panics if the expression cannot be parsed.
308// It simplifies safe initialization of global variables holding compiled regular
309// expressions.
310func MustCompile(str string) *Regexp {
311	regexp, err := Compile(str)
312	if err != nil {
313		panic(`regexp: Compile(` + quote(str) + `): ` + err.Error())
314	}
315	return regexp
316}
317
318// MustCompilePOSIX is like [CompilePOSIX] but panics if the expression cannot be parsed.
319// It simplifies safe initialization of global variables holding compiled regular
320// expressions.
321func MustCompilePOSIX(str string) *Regexp {
322	regexp, err := CompilePOSIX(str)
323	if err != nil {
324		panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error())
325	}
326	return regexp
327}
328
329func quote(s string) string {
330	if strconv.CanBackquote(s) {
331		return "`" + s + "`"
332	}
333	return strconv.Quote(s)
334}
335
336// NumSubexp returns the number of parenthesized subexpressions in this [Regexp].
337func (re *Regexp) NumSubexp() int {
338	return re.numSubexp
339}
340
341// SubexpNames returns the names of the parenthesized subexpressions
342// in this [Regexp]. The name for the first sub-expression is names[1],
343// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
344// Since the Regexp as a whole cannot be named, names[0] is always
345// the empty string. The slice should not be modified.
346func (re *Regexp) SubexpNames() []string {
347	return re.subexpNames
348}
349
350// SubexpIndex returns the index of the first subexpression with the given name,
351// or -1 if there is no subexpression with that name.
352//
353// Note that multiple subexpressions can be written using the same name, as in
354// (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob".
355// In this case, SubexpIndex returns the index of the leftmost such subexpression
356// in the regular expression.
357func (re *Regexp) SubexpIndex(name string) int {
358	if name != "" {
359		for i, s := range re.subexpNames {
360			if name == s {
361				return i
362			}
363		}
364	}
365	return -1
366}
367
368const endOfText rune = -1
369
370// input abstracts different representations of the input text. It provides
371// one-character lookahead.
372type input interface {
373	step(pos int) (r rune, width int) // advance one rune
374	canCheckPrefix() bool             // can we look ahead without losing info?
375	hasPrefix(re *Regexp) bool
376	index(re *Regexp, pos int) int
377	context(pos int) lazyFlag
378}
379
380// inputString scans a string.
381type inputString struct {
382	str string
383}
384
385func (i *inputString) step(pos int) (rune, int) {
386	if pos < len(i.str) {
387		c := i.str[pos]
388		if c < utf8.RuneSelf {
389			return rune(c), 1
390		}
391		return utf8.DecodeRuneInString(i.str[pos:])
392	}
393	return endOfText, 0
394}
395
396func (i *inputString) canCheckPrefix() bool {
397	return true
398}
399
400func (i *inputString) hasPrefix(re *Regexp) bool {
401	return strings.HasPrefix(i.str, re.prefix)
402}
403
404func (i *inputString) index(re *Regexp, pos int) int {
405	return strings.Index(i.str[pos:], re.prefix)
406}
407
408func (i *inputString) context(pos int) lazyFlag {
409	r1, r2 := endOfText, endOfText
410	// 0 < pos && pos <= len(i.str)
411	if uint(pos-1) < uint(len(i.str)) {
412		r1 = rune(i.str[pos-1])
413		if r1 >= utf8.RuneSelf {
414			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
415		}
416	}
417	// 0 <= pos && pos < len(i.str)
418	if uint(pos) < uint(len(i.str)) {
419		r2 = rune(i.str[pos])
420		if r2 >= utf8.RuneSelf {
421			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
422		}
423	}
424	return newLazyFlag(r1, r2)
425}
426
427// inputBytes scans a byte slice.
428type inputBytes struct {
429	str []byte
430}
431
432func (i *inputBytes) step(pos int) (rune, int) {
433	if pos < len(i.str) {
434		c := i.str[pos]
435		if c < utf8.RuneSelf {
436			return rune(c), 1
437		}
438		return utf8.DecodeRune(i.str[pos:])
439	}
440	return endOfText, 0
441}
442
443func (i *inputBytes) canCheckPrefix() bool {
444	return true
445}
446
447func (i *inputBytes) hasPrefix(re *Regexp) bool {
448	return bytes.HasPrefix(i.str, re.prefixBytes)
449}
450
451func (i *inputBytes) index(re *Regexp, pos int) int {
452	return bytes.Index(i.str[pos:], re.prefixBytes)
453}
454
455func (i *inputBytes) context(pos int) lazyFlag {
456	r1, r2 := endOfText, endOfText
457	// 0 < pos && pos <= len(i.str)
458	if uint(pos-1) < uint(len(i.str)) {
459		r1 = rune(i.str[pos-1])
460		if r1 >= utf8.RuneSelf {
461			r1, _ = utf8.DecodeLastRune(i.str[:pos])
462		}
463	}
464	// 0 <= pos && pos < len(i.str)
465	if uint(pos) < uint(len(i.str)) {
466		r2 = rune(i.str[pos])
467		if r2 >= utf8.RuneSelf {
468			r2, _ = utf8.DecodeRune(i.str[pos:])
469		}
470	}
471	return newLazyFlag(r1, r2)
472}
473
474// inputReader scans a RuneReader.
475type inputReader struct {
476	r     io.RuneReader
477	atEOT bool
478	pos   int
479}
480
481func (i *inputReader) step(pos int) (rune, int) {
482	if !i.atEOT && pos != i.pos {
483		return endOfText, 0
484
485	}
486	r, w, err := i.r.ReadRune()
487	if err != nil {
488		i.atEOT = true
489		return endOfText, 0
490	}
491	i.pos += w
492	return r, w
493}
494
495func (i *inputReader) canCheckPrefix() bool {
496	return false
497}
498
499func (i *inputReader) hasPrefix(re *Regexp) bool {
500	return false
501}
502
503func (i *inputReader) index(re *Regexp, pos int) int {
504	return -1
505}
506
507func (i *inputReader) context(pos int) lazyFlag {
508	return 0 // not used
509}
510
511// LiteralPrefix returns a literal string that must begin any match
512// of the regular expression re. It returns the boolean true if the
513// literal string comprises the entire regular expression.
514func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
515	return re.prefix, re.prefixComplete
516}
517
518// MatchReader reports whether the text returned by the [io.RuneReader]
519// contains any match of the regular expression re.
520func (re *Regexp) MatchReader(r io.RuneReader) bool {
521	return re.doMatch(r, nil, "")
522}
523
524// MatchString reports whether the string s
525// contains any match of the regular expression re.
526func (re *Regexp) MatchString(s string) bool {
527	return re.doMatch(nil, nil, s)
528}
529
530// Match reports whether the byte slice b
531// contains any match of the regular expression re.
532func (re *Regexp) Match(b []byte) bool {
533	return re.doMatch(nil, b, "")
534}
535
536// MatchReader reports whether the text returned by the [io.RuneReader]
537// contains any match of the regular expression pattern.
538// More complicated queries need to use [Compile] and the full [Regexp] interface.
539func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
540	re, err := Compile(pattern)
541	if err != nil {
542		return false, err
543	}
544	return re.MatchReader(r), nil
545}
546
547// MatchString reports whether the string s
548// contains any match of the regular expression pattern.
549// More complicated queries need to use [Compile] and the full [Regexp] interface.
550func MatchString(pattern string, s string) (matched bool, err error) {
551	re, err := Compile(pattern)
552	if err != nil {
553		return false, err
554	}
555	return re.MatchString(s), nil
556}
557
558// Match reports whether the byte slice b
559// contains any match of the regular expression pattern.
560// More complicated queries need to use [Compile] and the full [Regexp] interface.
561func Match(pattern string, b []byte) (matched bool, err error) {
562	re, err := Compile(pattern)
563	if err != nil {
564		return false, err
565	}
566	return re.Match(b), nil
567}
568
569// ReplaceAllString returns a copy of src, replacing matches of the [Regexp]
570// with the replacement string repl.
571// Inside repl, $ signs are interpreted as in [Regexp.Expand].
572func (re *Regexp) ReplaceAllString(src, repl string) string {
573	n := 2
574	if strings.Contains(repl, "$") {
575		n = 2 * (re.numSubexp + 1)
576	}
577	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
578		return re.expand(dst, repl, nil, src, match)
579	})
580	return string(b)
581}
582
583// ReplaceAllLiteralString returns a copy of src, replacing matches of the [Regexp]
584// with the replacement string repl. The replacement repl is substituted directly,
585// without using [Regexp.Expand].
586func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
587	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
588		return append(dst, repl...)
589	}))
590}
591
592// ReplaceAllStringFunc returns a copy of src in which all matches of the
593// [Regexp] have been replaced by the return value of function repl applied
594// to the matched substring. The replacement returned by repl is substituted
595// directly, without using [Regexp.Expand].
596func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
597	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
598		return append(dst, repl(src[match[0]:match[1]])...)
599	})
600	return string(b)
601}
602
603func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
604	lastMatchEnd := 0 // end position of the most recent match
605	searchPos := 0    // position where we next look for a match
606	var buf []byte
607	var endPos int
608	if bsrc != nil {
609		endPos = len(bsrc)
610	} else {
611		endPos = len(src)
612	}
613	if nmatch > re.prog.NumCap {
614		nmatch = re.prog.NumCap
615	}
616
617	var dstCap [2]int
618	for searchPos <= endPos {
619		a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
620		if len(a) == 0 {
621			break // no more matches
622		}
623
624		// Copy the unmatched characters before this match.
625		if bsrc != nil {
626			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
627		} else {
628			buf = append(buf, src[lastMatchEnd:a[0]]...)
629		}
630
631		// Now insert a copy of the replacement string, but not for a
632		// match of the empty string immediately after another match.
633		// (Otherwise, we get double replacement for patterns that
634		// match both empty and nonempty strings.)
635		if a[1] > lastMatchEnd || a[0] == 0 {
636			buf = repl(buf, a)
637		}
638		lastMatchEnd = a[1]
639
640		// Advance past this match; always advance at least one character.
641		var width int
642		if bsrc != nil {
643			_, width = utf8.DecodeRune(bsrc[searchPos:])
644		} else {
645			_, width = utf8.DecodeRuneInString(src[searchPos:])
646		}
647		if searchPos+width > a[1] {
648			searchPos += width
649		} else if searchPos+1 > a[1] {
650			// This clause is only needed at the end of the input
651			// string. In that case, DecodeRuneInString returns width=0.
652			searchPos++
653		} else {
654			searchPos = a[1]
655		}
656	}
657
658	// Copy the unmatched characters after the last match.
659	if bsrc != nil {
660		buf = append(buf, bsrc[lastMatchEnd:]...)
661	} else {
662		buf = append(buf, src[lastMatchEnd:]...)
663	}
664
665	return buf
666}
667
668// ReplaceAll returns a copy of src, replacing matches of the [Regexp]
669// with the replacement text repl.
670// Inside repl, $ signs are interpreted as in [Regexp.Expand].
671func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
672	n := 2
673	if bytes.IndexByte(repl, '$') >= 0 {
674		n = 2 * (re.numSubexp + 1)
675	}
676	srepl := ""
677	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
678		if len(srepl) != len(repl) {
679			srepl = string(repl)
680		}
681		return re.expand(dst, srepl, src, "", match)
682	})
683	return b
684}
685
686// ReplaceAllLiteral returns a copy of src, replacing matches of the [Regexp]
687// with the replacement bytes repl. The replacement repl is substituted directly,
688// without using [Regexp.Expand].
689func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
690	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
691		return append(dst, repl...)
692	})
693}
694
695// ReplaceAllFunc returns a copy of src in which all matches of the
696// [Regexp] have been replaced by the return value of function repl applied
697// to the matched byte slice. The replacement returned by repl is substituted
698// directly, without using [Regexp.Expand].
699func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
700	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
701		return append(dst, repl(src[match[0]:match[1]])...)
702	})
703}
704
705// Bitmap used by func special to check whether a character needs to be escaped.
706var specialBytes [16]byte
707
708// special reports whether byte b needs to be escaped by QuoteMeta.
709func special(b byte) bool {
710	return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0
711}
712
713func init() {
714	for _, b := range []byte(`\.+*?()|[]{}^$`) {
715		specialBytes[b%16] |= 1 << (b / 16)
716	}
717}
718
719// QuoteMeta returns a string that escapes all regular expression metacharacters
720// inside the argument text; the returned string is a regular expression matching
721// the literal text.
722func QuoteMeta(s string) string {
723	// A byte loop is correct because all metacharacters are ASCII.
724	var i int
725	for i = 0; i < len(s); i++ {
726		if special(s[i]) {
727			break
728		}
729	}
730	// No meta characters found, so return original string.
731	if i >= len(s) {
732		return s
733	}
734
735	b := make([]byte, 2*len(s)-i)
736	copy(b, s[:i])
737	j := i
738	for ; i < len(s); i++ {
739		if special(s[i]) {
740			b[j] = '\\'
741			j++
742		}
743		b[j] = s[i]
744		j++
745	}
746	return string(b[:j])
747}
748
749// The number of capture values in the program may correspond
750// to fewer capturing expressions than are in the regexp.
751// For example, "(a){0}" turns into an empty program, so the
752// maximum capture in the program is 0 but we need to return
753// an expression for \1.  Pad appends -1s to the slice a as needed.
754func (re *Regexp) pad(a []int) []int {
755	if a == nil {
756		// No match.
757		return nil
758	}
759	n := (1 + re.numSubexp) * 2
760	for len(a) < n {
761		a = append(a, -1)
762	}
763	return a
764}
765
766// allMatches calls deliver at most n times
767// with the location of successive matches in the input text.
768// The input text is b if non-nil, otherwise s.
769func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
770	var end int
771	if b == nil {
772		end = len(s)
773	} else {
774		end = len(b)
775	}
776
777	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
778		matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil)
779		if len(matches) == 0 {
780			break
781		}
782
783		accept := true
784		if matches[1] == pos {
785			// We've found an empty match.
786			if matches[0] == prevMatchEnd {
787				// We don't allow an empty match right
788				// after a previous match, so ignore it.
789				accept = false
790			}
791			var width int
792			if b == nil {
793				is := inputString{str: s}
794				_, width = is.step(pos)
795			} else {
796				ib := inputBytes{str: b}
797				_, width = ib.step(pos)
798			}
799			if width > 0 {
800				pos += width
801			} else {
802				pos = end + 1
803			}
804		} else {
805			pos = matches[1]
806		}
807		prevMatchEnd = matches[1]
808
809		if accept {
810			deliver(re.pad(matches))
811			i++
812		}
813	}
814}
815
816// Find returns a slice holding the text of the leftmost match in b of the regular expression.
817// A return value of nil indicates no match.
818func (re *Regexp) Find(b []byte) []byte {
819	var dstCap [2]int
820	a := re.doExecute(nil, b, "", 0, 2, dstCap[:0])
821	if a == nil {
822		return nil
823	}
824	return b[a[0]:a[1]:a[1]]
825}
826
827// FindIndex returns a two-element slice of integers defining the location of
828// the leftmost match in b of the regular expression. The match itself is at
829// b[loc[0]:loc[1]].
830// A return value of nil indicates no match.
831func (re *Regexp) FindIndex(b []byte) (loc []int) {
832	a := re.doExecute(nil, b, "", 0, 2, nil)
833	if a == nil {
834		return nil
835	}
836	return a[0:2]
837}
838
839// FindString returns a string holding the text of the leftmost match in s of the regular
840// expression. If there is no match, the return value is an empty string,
841// but it will also be empty if the regular expression successfully matches
842// an empty string. Use [Regexp.FindStringIndex] or [Regexp.FindStringSubmatch] if it is
843// necessary to distinguish these cases.
844func (re *Regexp) FindString(s string) string {
845	var dstCap [2]int
846	a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0])
847	if a == nil {
848		return ""
849	}
850	return s[a[0]:a[1]]
851}
852
853// FindStringIndex returns a two-element slice of integers defining the
854// location of the leftmost match in s of the regular expression. The match
855// itself is at s[loc[0]:loc[1]].
856// A return value of nil indicates no match.
857func (re *Regexp) FindStringIndex(s string) (loc []int) {
858	a := re.doExecute(nil, nil, s, 0, 2, nil)
859	if a == nil {
860		return nil
861	}
862	return a[0:2]
863}
864
865// FindReaderIndex returns a two-element slice of integers defining the
866// location of the leftmost match of the regular expression in text read from
867// the [io.RuneReader]. The match text was found in the input stream at
868// byte offset loc[0] through loc[1]-1.
869// A return value of nil indicates no match.
870func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
871	a := re.doExecute(r, nil, "", 0, 2, nil)
872	if a == nil {
873		return nil
874	}
875	return a[0:2]
876}
877
878// FindSubmatch returns a slice of slices holding the text of the leftmost
879// match of the regular expression in b and the matches, if any, of its
880// subexpressions, as defined by the 'Submatch' descriptions in the package
881// comment.
882// A return value of nil indicates no match.
883func (re *Regexp) FindSubmatch(b []byte) [][]byte {
884	var dstCap [4]int
885	a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0])
886	if a == nil {
887		return nil
888	}
889	ret := make([][]byte, 1+re.numSubexp)
890	for i := range ret {
891		if 2*i < len(a) && a[2*i] >= 0 {
892			ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]]
893		}
894	}
895	return ret
896}
897
898// Expand appends template to dst and returns the result; during the
899// append, Expand replaces variables in the template with corresponding
900// matches drawn from src. The match slice should have been returned by
901// [Regexp.FindSubmatchIndex].
902//
903// In the template, a variable is denoted by a substring of the form
904// $name or ${name}, where name is a non-empty sequence of letters,
905// digits, and underscores. A purely numeric name like $1 refers to
906// the submatch with the corresponding index; other names refer to
907// capturing parentheses named with the (?P<name>...) syntax. A
908// reference to an out of range or unmatched index or a name that is not
909// present in the regular expression is replaced with an empty slice.
910//
911// In the $name form, name is taken to be as long as possible: $1x is
912// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
913//
914// To insert a literal $ in the output, use $$ in the template.
915func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
916	return re.expand(dst, string(template), src, "", match)
917}
918
919// ExpandString is like [Regexp.Expand] but the template and source are strings.
920// It appends to and returns a byte slice in order to give the calling
921// code control over allocation.
922func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
923	return re.expand(dst, template, nil, src, match)
924}
925
926func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
927	for len(template) > 0 {
928		before, after, ok := strings.Cut(template, "$")
929		if !ok {
930			break
931		}
932		dst = append(dst, before...)
933		template = after
934		if template != "" && template[0] == '$' {
935			// Treat $$ as $.
936			dst = append(dst, '$')
937			template = template[1:]
938			continue
939		}
940		name, num, rest, ok := extract(template)
941		if !ok {
942			// Malformed; treat $ as raw text.
943			dst = append(dst, '$')
944			continue
945		}
946		template = rest
947		if num >= 0 {
948			if 2*num+1 < len(match) && match[2*num] >= 0 {
949				if bsrc != nil {
950					dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
951				} else {
952					dst = append(dst, src[match[2*num]:match[2*num+1]]...)
953				}
954			}
955		} else {
956			for i, namei := range re.subexpNames {
957				if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
958					if bsrc != nil {
959						dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
960					} else {
961						dst = append(dst, src[match[2*i]:match[2*i+1]]...)
962					}
963					break
964				}
965			}
966		}
967	}
968	dst = append(dst, template...)
969	return dst
970}
971
972// extract returns the name from a leading "name" or "{name}" in str.
973// (The $ has already been removed by the caller.)
974// If it is a number, extract returns num set to that number; otherwise num = -1.
975func extract(str string) (name string, num int, rest string, ok bool) {
976	if str == "" {
977		return
978	}
979	brace := false
980	if str[0] == '{' {
981		brace = true
982		str = str[1:]
983	}
984	i := 0
985	for i < len(str) {
986		rune, size := utf8.DecodeRuneInString(str[i:])
987		if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
988			break
989		}
990		i += size
991	}
992	if i == 0 {
993		// empty name is not okay
994		return
995	}
996	name = str[:i]
997	if brace {
998		if i >= len(str) || str[i] != '}' {
999			// missing closing brace
1000			return
1001		}
1002		i++
1003	}
1004
1005	// Parse number.
1006	num = 0
1007	for i := 0; i < len(name); i++ {
1008		if name[i] < '0' || '9' < name[i] || num >= 1e8 {
1009			num = -1
1010			break
1011		}
1012		num = num*10 + int(name[i]) - '0'
1013	}
1014	// Disallow leading zeros.
1015	if name[0] == '0' && len(name) > 1 {
1016		num = -1
1017	}
1018
1019	rest = str[i:]
1020	ok = true
1021	return
1022}
1023
1024// FindSubmatchIndex returns a slice holding the index pairs identifying the
1025// leftmost match of the regular expression in b and the matches, if any, of
1026// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
1027// in the package comment.
1028// A return value of nil indicates no match.
1029func (re *Regexp) FindSubmatchIndex(b []byte) []int {
1030	return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil))
1031}
1032
1033// FindStringSubmatch returns a slice of strings holding the text of the
1034// leftmost match of the regular expression in s and the matches, if any, of
1035// its subexpressions, as defined by the 'Submatch' description in the
1036// package comment.
1037// A return value of nil indicates no match.
1038func (re *Regexp) FindStringSubmatch(s string) []string {
1039	var dstCap [4]int
1040	a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0])
1041	if a == nil {
1042		return nil
1043	}
1044	ret := make([]string, 1+re.numSubexp)
1045	for i := range ret {
1046		if 2*i < len(a) && a[2*i] >= 0 {
1047			ret[i] = s[a[2*i]:a[2*i+1]]
1048		}
1049	}
1050	return ret
1051}
1052
1053// FindStringSubmatchIndex returns a slice holding the index pairs
1054// identifying the leftmost match of the regular expression in s and the
1055// matches, if any, of its subexpressions, as defined by the 'Submatch' and
1056// 'Index' descriptions in the package comment.
1057// A return value of nil indicates no match.
1058func (re *Regexp) FindStringSubmatchIndex(s string) []int {
1059	return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil))
1060}
1061
1062// FindReaderSubmatchIndex returns a slice holding the index pairs
1063// identifying the leftmost match of the regular expression of text read by
1064// the [io.RuneReader], and the matches, if any, of its subexpressions, as defined
1065// by the 'Submatch' and 'Index' descriptions in the package comment. A
1066// return value of nil indicates no match.
1067func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
1068	return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil))
1069}
1070
1071const startSize = 10 // The size at which to start a slice in the 'All' routines.
1072
1073// FindAll is the 'All' version of [Regexp.Find]; it returns a slice of all successive
1074// matches of the expression, as defined by the 'All' description in the
1075// package comment.
1076// A return value of nil indicates no match.
1077func (re *Regexp) FindAll(b []byte, n int) [][]byte {
1078	if n < 0 {
1079		n = len(b) + 1
1080	}
1081	var result [][]byte
1082	re.allMatches("", b, n, func(match []int) {
1083		if result == nil {
1084			result = make([][]byte, 0, startSize)
1085		}
1086		result = append(result, b[match[0]:match[1]:match[1]])
1087	})
1088	return result
1089}
1090
1091// FindAllIndex is the 'All' version of [Regexp.FindIndex]; it returns a slice of all
1092// successive matches of the expression, as defined by the 'All' description
1093// in the package comment.
1094// A return value of nil indicates no match.
1095func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
1096	if n < 0 {
1097		n = len(b) + 1
1098	}
1099	var result [][]int
1100	re.allMatches("", b, n, func(match []int) {
1101		if result == nil {
1102			result = make([][]int, 0, startSize)
1103		}
1104		result = append(result, match[0:2])
1105	})
1106	return result
1107}
1108
1109// FindAllString is the 'All' version of [Regexp.FindString]; it returns a slice of all
1110// successive matches of the expression, as defined by the 'All' description
1111// in the package comment.
1112// A return value of nil indicates no match.
1113func (re *Regexp) FindAllString(s string, n int) []string {
1114	if n < 0 {
1115		n = len(s) + 1
1116	}
1117	var result []string
1118	re.allMatches(s, nil, n, func(match []int) {
1119		if result == nil {
1120			result = make([]string, 0, startSize)
1121		}
1122		result = append(result, s[match[0]:match[1]])
1123	})
1124	return result
1125}
1126
1127// FindAllStringIndex is the 'All' version of [Regexp.FindStringIndex]; it returns a
1128// slice of all successive matches of the expression, as defined by the 'All'
1129// description in the package comment.
1130// A return value of nil indicates no match.
1131func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
1132	if n < 0 {
1133		n = len(s) + 1
1134	}
1135	var result [][]int
1136	re.allMatches(s, nil, n, func(match []int) {
1137		if result == nil {
1138			result = make([][]int, 0, startSize)
1139		}
1140		result = append(result, match[0:2])
1141	})
1142	return result
1143}
1144
1145// FindAllSubmatch is the 'All' version of [Regexp.FindSubmatch]; it returns a slice
1146// of all successive matches of the expression, as defined by the 'All'
1147// description in the package comment.
1148// A return value of nil indicates no match.
1149func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
1150	if n < 0 {
1151		n = len(b) + 1
1152	}
1153	var result [][][]byte
1154	re.allMatches("", b, n, func(match []int) {
1155		if result == nil {
1156			result = make([][][]byte, 0, startSize)
1157		}
1158		slice := make([][]byte, len(match)/2)
1159		for j := range slice {
1160			if match[2*j] >= 0 {
1161				slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]]
1162			}
1163		}
1164		result = append(result, slice)
1165	})
1166	return result
1167}
1168
1169// FindAllSubmatchIndex is the 'All' version of [Regexp.FindSubmatchIndex]; it returns
1170// a slice of all successive matches of the expression, as defined by the
1171// 'All' description in the package comment.
1172// A return value of nil indicates no match.
1173func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1174	if n < 0 {
1175		n = len(b) + 1
1176	}
1177	var result [][]int
1178	re.allMatches("", b, n, func(match []int) {
1179		if result == nil {
1180			result = make([][]int, 0, startSize)
1181		}
1182		result = append(result, match)
1183	})
1184	return result
1185}
1186
1187// FindAllStringSubmatch is the 'All' version of [Regexp.FindStringSubmatch]; it
1188// returns a slice of all successive matches of the expression, as defined by
1189// the 'All' description in the package comment.
1190// A return value of nil indicates no match.
1191func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1192	if n < 0 {
1193		n = len(s) + 1
1194	}
1195	var result [][]string
1196	re.allMatches(s, nil, n, func(match []int) {
1197		if result == nil {
1198			result = make([][]string, 0, startSize)
1199		}
1200		slice := make([]string, len(match)/2)
1201		for j := range slice {
1202			if match[2*j] >= 0 {
1203				slice[j] = s[match[2*j]:match[2*j+1]]
1204			}
1205		}
1206		result = append(result, slice)
1207	})
1208	return result
1209}
1210
1211// FindAllStringSubmatchIndex is the 'All' version of
1212// [Regexp.FindStringSubmatchIndex]; it returns a slice of all successive matches of
1213// the expression, as defined by the 'All' description in the package
1214// comment.
1215// A return value of nil indicates no match.
1216func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1217	if n < 0 {
1218		n = len(s) + 1
1219	}
1220	var result [][]int
1221	re.allMatches(s, nil, n, func(match []int) {
1222		if result == nil {
1223			result = make([][]int, 0, startSize)
1224		}
1225		result = append(result, match)
1226	})
1227	return result
1228}
1229
1230// Split slices s into substrings separated by the expression and returns a slice of
1231// the substrings between those expression matches.
1232//
1233// The slice returned by this method consists of all the substrings of s
1234// not contained in the slice returned by [Regexp.FindAllString]. When called on an expression
1235// that contains no metacharacters, it is equivalent to [strings.SplitN].
1236//
1237// Example:
1238//
1239//	s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
1240//	// s: ["", "b", "b", "c", "cadaaae"]
1241//
1242// The count determines the number of substrings to return:
1243//   - n > 0: at most n substrings; the last substring will be the unsplit remainder;
1244//   - n == 0: the result is nil (zero substrings);
1245//   - n < 0: all substrings.
1246func (re *Regexp) Split(s string, n int) []string {
1247
1248	if n == 0 {
1249		return nil
1250	}
1251
1252	if len(re.expr) > 0 && len(s) == 0 {
1253		return []string{""}
1254	}
1255
1256	matches := re.FindAllStringIndex(s, n)
1257	strings := make([]string, 0, len(matches))
1258
1259	beg := 0
1260	end := 0
1261	for _, match := range matches {
1262		if n > 0 && len(strings) >= n-1 {
1263			break
1264		}
1265
1266		end = match[0]
1267		if match[1] != 0 {
1268			strings = append(strings, s[beg:end])
1269		}
1270		beg = match[1]
1271	}
1272
1273	if end != len(s) {
1274		strings = append(strings, s[beg:])
1275	}
1276
1277	return strings
1278}
1279
1280// MarshalText implements [encoding.TextMarshaler]. The output
1281// matches that of calling the [Regexp.String] method.
1282//
1283// Note that the output is lossy in some cases: This method does not indicate
1284// POSIX regular expressions (i.e. those compiled by calling [CompilePOSIX]), or
1285// those for which the [Regexp.Longest] method has been called.
1286func (re *Regexp) MarshalText() ([]byte, error) {
1287	return []byte(re.String()), nil
1288}
1289
1290// UnmarshalText implements [encoding.TextUnmarshaler] by calling
1291// [Compile] on the encoded value.
1292func (re *Regexp) UnmarshalText(text []byte) error {
1293	newRE, err := Compile(string(text))
1294	if err != nil {
1295		return err
1296	}
1297	*re = *newRE
1298	return nil
1299}
1300