1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package regexp
6
7import (
8	"regexp/syntax"
9	"slices"
10	"strings"
11	"unicode"
12	"unicode/utf8"
13)
14
15// "One-pass" regexp execution.
16// Some regexps can be analyzed to determine that they never need
17// backtracking: they are guaranteed to run in one pass over the string
18// without bothering to save all the usual NFA state.
19// Detect those and execute them more quickly.
20
21// A onePassProg is a compiled one-pass regular expression program.
22// It is the same as syntax.Prog except for the use of onePassInst.
23type onePassProg struct {
24	Inst   []onePassInst
25	Start  int // index of start instruction
26	NumCap int // number of InstCapture insts in re
27}
28
29// A onePassInst is a single instruction in a one-pass regular expression program.
30// It is the same as syntax.Inst except for the new 'Next' field.
31type onePassInst struct {
32	syntax.Inst
33	Next []uint32
34}
35
36// onePassPrefix returns a literal string that all matches for the
37// regexp must start with. Complete is true if the prefix
38// is the entire match. Pc is the index of the last rune instruction
39// in the string. The onePassPrefix skips over the mandatory
40// EmptyBeginText.
41func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
42	i := &p.Inst[p.Start]
43	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
44		return "", i.Op == syntax.InstMatch, uint32(p.Start)
45	}
46	pc = i.Out
47	i = &p.Inst[pc]
48	for i.Op == syntax.InstNop {
49		pc = i.Out
50		i = &p.Inst[pc]
51	}
52	// Avoid allocation of buffer if prefix is empty.
53	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
54		return "", i.Op == syntax.InstMatch, uint32(p.Start)
55	}
56
57	// Have prefix; gather characters.
58	var buf strings.Builder
59	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError {
60		buf.WriteRune(i.Rune[0])
61		pc, i = i.Out, &p.Inst[i.Out]
62	}
63	if i.Op == syntax.InstEmptyWidth &&
64		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
65		p.Inst[i.Out].Op == syntax.InstMatch {
66		complete = true
67	}
68	return buf.String(), complete, pc
69}
70
71// onePassNext selects the next actionable state of the prog, based on the input character.
72// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
73// One of the alternates may ultimately lead without input to end of line. If the instruction
74// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
75func onePassNext(i *onePassInst, r rune) uint32 {
76	next := i.MatchRunePos(r)
77	if next >= 0 {
78		return i.Next[next]
79	}
80	if i.Op == syntax.InstAltMatch {
81		return i.Out
82	}
83	return 0
84}
85
86func iop(i *syntax.Inst) syntax.InstOp {
87	op := i.Op
88	switch op {
89	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
90		op = syntax.InstRune
91	}
92	return op
93}
94
95// Sparse Array implementation is used as a queueOnePass.
96type queueOnePass struct {
97	sparse          []uint32
98	dense           []uint32
99	size, nextIndex uint32
100}
101
102func (q *queueOnePass) empty() bool {
103	return q.nextIndex >= q.size
104}
105
106func (q *queueOnePass) next() (n uint32) {
107	n = q.dense[q.nextIndex]
108	q.nextIndex++
109	return
110}
111
112func (q *queueOnePass) clear() {
113	q.size = 0
114	q.nextIndex = 0
115}
116
117func (q *queueOnePass) contains(u uint32) bool {
118	if u >= uint32(len(q.sparse)) {
119		return false
120	}
121	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
122}
123
124func (q *queueOnePass) insert(u uint32) {
125	if !q.contains(u) {
126		q.insertNew(u)
127	}
128}
129
130func (q *queueOnePass) insertNew(u uint32) {
131	if u >= uint32(len(q.sparse)) {
132		return
133	}
134	q.sparse[u] = q.size
135	q.dense[q.size] = u
136	q.size++
137}
138
139func newQueue(size int) (q *queueOnePass) {
140	return &queueOnePass{
141		sparse: make([]uint32, size),
142		dense:  make([]uint32, size),
143	}
144}
145
146// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
147// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
148// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
149// NextIp array with the single element mergeFailed is returned.
150// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
151const mergeFailed = uint32(0xffffffff)
152
153var (
154	noRune = []rune{}
155	noNext = []uint32{mergeFailed}
156)
157
158func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
159	leftLen := len(*leftRunes)
160	rightLen := len(*rightRunes)
161	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
162		panic("mergeRuneSets odd length []rune")
163	}
164	var (
165		lx, rx int
166	)
167	merged := make([]rune, 0)
168	next := make([]uint32, 0)
169	ok := true
170	defer func() {
171		if !ok {
172			merged = nil
173			next = nil
174		}
175	}()
176
177	ix := -1
178	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
179		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
180			return false
181		}
182		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
183		*newLow += 2
184		ix += 2
185		next = append(next, pc)
186		return true
187	}
188
189	for lx < leftLen || rx < rightLen {
190		switch {
191		case rx >= rightLen:
192			ok = extend(&lx, leftRunes, leftPC)
193		case lx >= leftLen:
194			ok = extend(&rx, rightRunes, rightPC)
195		case (*rightRunes)[rx] < (*leftRunes)[lx]:
196			ok = extend(&rx, rightRunes, rightPC)
197		default:
198			ok = extend(&lx, leftRunes, leftPC)
199		}
200		if !ok {
201			return noRune, noNext
202		}
203	}
204	return merged, next
205}
206
207// cleanupOnePass drops working memory, and restores certain shortcut instructions.
208func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
209	for ix, instOriginal := range original.Inst {
210		switch instOriginal.Op {
211		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
212		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
213			prog.Inst[ix].Next = nil
214		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
215			prog.Inst[ix].Next = nil
216			prog.Inst[ix] = onePassInst{Inst: instOriginal}
217		}
218	}
219}
220
221// onePassCopy creates a copy of the original Prog, as we'll be modifying it.
222func onePassCopy(prog *syntax.Prog) *onePassProg {
223	p := &onePassProg{
224		Start:  prog.Start,
225		NumCap: prog.NumCap,
226		Inst:   make([]onePassInst, len(prog.Inst)),
227	}
228	for i, inst := range prog.Inst {
229		p.Inst[i] = onePassInst{Inst: inst}
230	}
231
232	// rewrites one or more common Prog constructs that enable some otherwise
233	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
234	// ip A, that points to ips B & C.
235	// A:BC + B:DA => A:BC + B:CD
236	// A:BC + B:DC => A:DC + B:DC
237	for pc := range p.Inst {
238		switch p.Inst[pc].Op {
239		default:
240			continue
241		case syntax.InstAlt, syntax.InstAltMatch:
242			// A:Bx + B:Ay
243			p_A_Other := &p.Inst[pc].Out
244			p_A_Alt := &p.Inst[pc].Arg
245			// make sure a target is another Alt
246			instAlt := p.Inst[*p_A_Alt]
247			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
248				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
249				instAlt = p.Inst[*p_A_Alt]
250				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
251					continue
252				}
253			}
254			instOther := p.Inst[*p_A_Other]
255			// Analyzing both legs pointing to Alts is for another day
256			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
257				// too complicated
258				continue
259			}
260			// simple empty transition loop
261			// A:BC + B:DA => A:BC + B:DC
262			p_B_Alt := &p.Inst[*p_A_Alt].Out
263			p_B_Other := &p.Inst[*p_A_Alt].Arg
264			patch := false
265			if instAlt.Out == uint32(pc) {
266				patch = true
267			} else if instAlt.Arg == uint32(pc) {
268				patch = true
269				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
270			}
271			if patch {
272				*p_B_Alt = *p_A_Other
273			}
274
275			// empty transition to common target
276			// A:BC + B:DC => A:DC + B:DC
277			if *p_A_Other == *p_B_Alt {
278				*p_A_Alt = *p_B_Other
279			}
280		}
281	}
282	return p
283}
284
285var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
286var anyRune = []rune{0, unicode.MaxRune}
287
288// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
289// the match engine can always tell which branch to take. The routine may modify
290// p if it is turned into a onepass Prog. If it isn't possible for this to be a
291// onepass Prog, the Prog nil is returned. makeOnePass is recursive
292// to the size of the Prog.
293func makeOnePass(p *onePassProg) *onePassProg {
294	// If the machine is very long, it's not worth the time to check if we can use one pass.
295	if len(p.Inst) >= 1000 {
296		return nil
297	}
298
299	var (
300		instQueue    = newQueue(len(p.Inst))
301		visitQueue   = newQueue(len(p.Inst))
302		check        func(uint32, []bool) bool
303		onePassRunes = make([][]rune, len(p.Inst))
304	)
305
306	// check that paths from Alt instructions are unambiguous, and rebuild the new
307	// program as a onepass program
308	check = func(pc uint32, m []bool) (ok bool) {
309		ok = true
310		inst := &p.Inst[pc]
311		if visitQueue.contains(pc) {
312			return
313		}
314		visitQueue.insert(pc)
315		switch inst.Op {
316		case syntax.InstAlt, syntax.InstAltMatch:
317			ok = check(inst.Out, m) && check(inst.Arg, m)
318			// check no-input paths to InstMatch
319			matchOut := m[inst.Out]
320			matchArg := m[inst.Arg]
321			if matchOut && matchArg {
322				ok = false
323				break
324			}
325			// Match on empty goes in inst.Out
326			if matchArg {
327				inst.Out, inst.Arg = inst.Arg, inst.Out
328				matchOut, matchArg = matchArg, matchOut
329			}
330			if matchOut {
331				m[pc] = true
332				inst.Op = syntax.InstAltMatch
333			}
334
335			// build a dispatch operator from the two legs of the alt.
336			onePassRunes[pc], inst.Next = mergeRuneSets(
337				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
338			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
339				ok = false
340				break
341			}
342		case syntax.InstCapture, syntax.InstNop:
343			ok = check(inst.Out, m)
344			m[pc] = m[inst.Out]
345			// pass matching runes back through these no-ops.
346			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
347			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
348			for i := range inst.Next {
349				inst.Next[i] = inst.Out
350			}
351		case syntax.InstEmptyWidth:
352			ok = check(inst.Out, m)
353			m[pc] = m[inst.Out]
354			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
355			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
356			for i := range inst.Next {
357				inst.Next[i] = inst.Out
358			}
359		case syntax.InstMatch, syntax.InstFail:
360			m[pc] = inst.Op == syntax.InstMatch
361		case syntax.InstRune:
362			m[pc] = false
363			if len(inst.Next) > 0 {
364				break
365			}
366			instQueue.insert(inst.Out)
367			if len(inst.Rune) == 0 {
368				onePassRunes[pc] = []rune{}
369				inst.Next = []uint32{inst.Out}
370				break
371			}
372			runes := make([]rune, 0)
373			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
374				r0 := inst.Rune[0]
375				runes = append(runes, r0, r0)
376				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
377					runes = append(runes, r1, r1)
378				}
379				slices.Sort(runes)
380			} else {
381				runes = append(runes, inst.Rune...)
382			}
383			onePassRunes[pc] = runes
384			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
385			for i := range inst.Next {
386				inst.Next[i] = inst.Out
387			}
388			inst.Op = syntax.InstRune
389		case syntax.InstRune1:
390			m[pc] = false
391			if len(inst.Next) > 0 {
392				break
393			}
394			instQueue.insert(inst.Out)
395			runes := []rune{}
396			// expand case-folded runes
397			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
398				r0 := inst.Rune[0]
399				runes = append(runes, r0, r0)
400				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
401					runes = append(runes, r1, r1)
402				}
403				slices.Sort(runes)
404			} else {
405				runes = append(runes, inst.Rune[0], inst.Rune[0])
406			}
407			onePassRunes[pc] = runes
408			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
409			for i := range inst.Next {
410				inst.Next[i] = inst.Out
411			}
412			inst.Op = syntax.InstRune
413		case syntax.InstRuneAny:
414			m[pc] = false
415			if len(inst.Next) > 0 {
416				break
417			}
418			instQueue.insert(inst.Out)
419			onePassRunes[pc] = append([]rune{}, anyRune...)
420			inst.Next = []uint32{inst.Out}
421		case syntax.InstRuneAnyNotNL:
422			m[pc] = false
423			if len(inst.Next) > 0 {
424				break
425			}
426			instQueue.insert(inst.Out)
427			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
428			inst.Next = make([]uint32, len(onePassRunes[pc])/2+1)
429			for i := range inst.Next {
430				inst.Next[i] = inst.Out
431			}
432		}
433		return
434	}
435
436	instQueue.clear()
437	instQueue.insert(uint32(p.Start))
438	m := make([]bool, len(p.Inst))
439	for !instQueue.empty() {
440		visitQueue.clear()
441		pc := instQueue.next()
442		if !check(pc, m) {
443			p = nil
444			break
445		}
446	}
447	if p != nil {
448		for i := range p.Inst {
449			p.Inst[i].Rune = onePassRunes[i]
450		}
451	}
452	return p
453}
454
455// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
456// can be recharacterized as a one-pass regexp program, or syntax.nil if the
457// Prog cannot be converted. For a one pass prog, the fundamental condition that must
458// be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
459func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
460	if prog.Start == 0 {
461		return nil
462	}
463	// onepass regexp is anchored
464	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
465		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
466		return nil
467	}
468	// every instruction leading to InstMatch must be EmptyEndText
469	for _, inst := range prog.Inst {
470		opOut := prog.Inst[inst.Out].Op
471		switch inst.Op {
472		default:
473			if opOut == syntax.InstMatch {
474				return nil
475			}
476		case syntax.InstAlt, syntax.InstAltMatch:
477			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
478				return nil
479			}
480		case syntax.InstEmptyWidth:
481			if opOut == syntax.InstMatch {
482				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
483					continue
484				}
485				return nil
486			}
487		}
488	}
489	// Creates a slightly optimized copy of the original Prog
490	// that cleans up some Prog idioms that block valid onepass programs
491	p = onePassCopy(prog)
492
493	// checkAmbiguity on InstAlts, build onepass Prog if possible
494	p = makeOnePass(p)
495
496	if p != nil {
497		cleanupOnePass(p, prog)
498	}
499	return p
500}
501