1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
7import (
8	"strconv"
9	"strings"
10	"unicode"
11	"unicode/utf8"
12)
13
14// Compiled program.
15// May not belong in this package, but convenient for now.
16
17// A Prog is a compiled regular expression program.
18type Prog struct {
19	Inst   []Inst
20	Start  int // index of start instruction
21	NumCap int // number of InstCapture insts in re
22}
23
24// An InstOp is an instruction opcode.
25type InstOp uint8
26
27const (
28	InstAlt InstOp = iota
29	InstAltMatch
30	InstCapture
31	InstEmptyWidth
32	InstMatch
33	InstFail
34	InstNop
35	InstRune
36	InstRune1
37	InstRuneAny
38	InstRuneAnyNotNL
39)
40
41var instOpNames = []string{
42	"InstAlt",
43	"InstAltMatch",
44	"InstCapture",
45	"InstEmptyWidth",
46	"InstMatch",
47	"InstFail",
48	"InstNop",
49	"InstRune",
50	"InstRune1",
51	"InstRuneAny",
52	"InstRuneAnyNotNL",
53}
54
55func (i InstOp) String() string {
56	if uint(i) >= uint(len(instOpNames)) {
57		return ""
58	}
59	return instOpNames[i]
60}
61
62// An EmptyOp specifies a kind or mixture of zero-width assertions.
63type EmptyOp uint8
64
65const (
66	EmptyBeginLine EmptyOp = 1 << iota
67	EmptyEndLine
68	EmptyBeginText
69	EmptyEndText
70	EmptyWordBoundary
71	EmptyNoWordBoundary
72)
73
74// EmptyOpContext returns the zero-width assertions
75// satisfied at the position between the runes r1 and r2.
76// Passing r1 == -1 indicates that the position is
77// at the beginning of the text.
78// Passing r2 == -1 indicates that the position is
79// at the end of the text.
80func EmptyOpContext(r1, r2 rune) EmptyOp {
81	var op EmptyOp = EmptyNoWordBoundary
82	var boundary byte
83	switch {
84	case IsWordChar(r1):
85		boundary = 1
86	case r1 == '\n':
87		op |= EmptyBeginLine
88	case r1 < 0:
89		op |= EmptyBeginText | EmptyBeginLine
90	}
91	switch {
92	case IsWordChar(r2):
93		boundary ^= 1
94	case r2 == '\n':
95		op |= EmptyEndLine
96	case r2 < 0:
97		op |= EmptyEndText | EmptyEndLine
98	}
99	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
100		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
101	}
102	return op
103}
104
105// IsWordChar reports whether r is considered a “word character”
106// during the evaluation of the \b and \B zero-width assertions.
107// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
108func IsWordChar(r rune) bool {
109	// Test for lowercase letters first, as these occur more
110	// frequently than uppercase letters in common cases.
111	return 'a' <= r && r <= 'z' || 'A' <= r && r <= 'Z' || '0' <= r && r <= '9' || r == '_'
112}
113
114// An Inst is a single instruction in a regular expression program.
115type Inst struct {
116	Op   InstOp
117	Out  uint32 // all but InstMatch, InstFail
118	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
119	Rune []rune
120}
121
122func (p *Prog) String() string {
123	var b strings.Builder
124	dumpProg(&b, p)
125	return b.String()
126}
127
128// skipNop follows any no-op or capturing instructions.
129func (p *Prog) skipNop(pc uint32) *Inst {
130	i := &p.Inst[pc]
131	for i.Op == InstNop || i.Op == InstCapture {
132		i = &p.Inst[i.Out]
133	}
134	return i
135}
136
137// op returns i.Op but merges all the Rune special cases into InstRune
138func (i *Inst) op() InstOp {
139	op := i.Op
140	switch op {
141	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
142		op = InstRune
143	}
144	return op
145}
146
147// Prefix returns a literal string that all matches for the
148// regexp must start with. Complete is true if the prefix
149// is the entire match.
150func (p *Prog) Prefix() (prefix string, complete bool) {
151	i := p.skipNop(uint32(p.Start))
152
153	// Avoid allocation of buffer if prefix is empty.
154	if i.op() != InstRune || len(i.Rune) != 1 {
155		return "", i.Op == InstMatch
156	}
157
158	// Have prefix; gather characters.
159	var buf strings.Builder
160	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 && i.Rune[0] != utf8.RuneError {
161		buf.WriteRune(i.Rune[0])
162		i = p.skipNop(i.Out)
163	}
164	return buf.String(), i.Op == InstMatch
165}
166
167// StartCond returns the leading empty-width conditions that must
168// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
169func (p *Prog) StartCond() EmptyOp {
170	var flag EmptyOp
171	pc := uint32(p.Start)
172	i := &p.Inst[pc]
173Loop:
174	for {
175		switch i.Op {
176		case InstEmptyWidth:
177			flag |= EmptyOp(i.Arg)
178		case InstFail:
179			return ^EmptyOp(0)
180		case InstCapture, InstNop:
181			// skip
182		default:
183			break Loop
184		}
185		pc = i.Out
186		i = &p.Inst[pc]
187	}
188	return flag
189}
190
191const noMatch = -1
192
193// MatchRune reports whether the instruction matches (and consumes) r.
194// It should only be called when i.Op == [InstRune].
195func (i *Inst) MatchRune(r rune) bool {
196	return i.MatchRunePos(r) != noMatch
197}
198
199// MatchRunePos checks whether the instruction matches (and consumes) r.
200// If so, MatchRunePos returns the index of the matching rune pair
201// (or, when len(i.Rune) == 1, rune singleton).
202// If not, MatchRunePos returns -1.
203// MatchRunePos should only be called when i.Op == [InstRune].
204func (i *Inst) MatchRunePos(r rune) int {
205	rune := i.Rune
206
207	switch len(rune) {
208	case 0:
209		return noMatch
210
211	case 1:
212		// Special case: single-rune slice is from literal string, not char class.
213		r0 := rune[0]
214		if r == r0 {
215			return 0
216		}
217		if Flags(i.Arg)&FoldCase != 0 {
218			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
219				if r == r1 {
220					return 0
221				}
222			}
223		}
224		return noMatch
225
226	case 2:
227		if r >= rune[0] && r <= rune[1] {
228			return 0
229		}
230		return noMatch
231
232	case 4, 6, 8:
233		// Linear search for a few pairs.
234		// Should handle ASCII well.
235		for j := 0; j < len(rune); j += 2 {
236			if r < rune[j] {
237				return noMatch
238			}
239			if r <= rune[j+1] {
240				return j / 2
241			}
242		}
243		return noMatch
244	}
245
246	// Otherwise binary search.
247	lo := 0
248	hi := len(rune) / 2
249	for lo < hi {
250		m := int(uint(lo+hi) >> 1)
251		if c := rune[2*m]; c <= r {
252			if r <= rune[2*m+1] {
253				return m
254			}
255			lo = m + 1
256		} else {
257			hi = m
258		}
259	}
260	return noMatch
261}
262
263// MatchEmptyWidth reports whether the instruction matches
264// an empty string between the runes before and after.
265// It should only be called when i.Op == [InstEmptyWidth].
266func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
267	switch EmptyOp(i.Arg) {
268	case EmptyBeginLine:
269		return before == '\n' || before == -1
270	case EmptyEndLine:
271		return after == '\n' || after == -1
272	case EmptyBeginText:
273		return before == -1
274	case EmptyEndText:
275		return after == -1
276	case EmptyWordBoundary:
277		return IsWordChar(before) != IsWordChar(after)
278	case EmptyNoWordBoundary:
279		return IsWordChar(before) == IsWordChar(after)
280	}
281	panic("unknown empty width arg")
282}
283
284func (i *Inst) String() string {
285	var b strings.Builder
286	dumpInst(&b, i)
287	return b.String()
288}
289
290func bw(b *strings.Builder, args ...string) {
291	for _, s := range args {
292		b.WriteString(s)
293	}
294}
295
296func dumpProg(b *strings.Builder, p *Prog) {
297	for j := range p.Inst {
298		i := &p.Inst[j]
299		pc := strconv.Itoa(j)
300		if len(pc) < 3 {
301			b.WriteString("   "[len(pc):])
302		}
303		if j == p.Start {
304			pc += "*"
305		}
306		bw(b, pc, "\t")
307		dumpInst(b, i)
308		bw(b, "\n")
309	}
310}
311
312func u32(i uint32) string {
313	return strconv.FormatUint(uint64(i), 10)
314}
315
316func dumpInst(b *strings.Builder, i *Inst) {
317	switch i.Op {
318	case InstAlt:
319		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
320	case InstAltMatch:
321		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
322	case InstCapture:
323		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
324	case InstEmptyWidth:
325		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
326	case InstMatch:
327		bw(b, "match")
328	case InstFail:
329		bw(b, "fail")
330	case InstNop:
331		bw(b, "nop -> ", u32(i.Out))
332	case InstRune:
333		if i.Rune == nil {
334			// shouldn't happen
335			bw(b, "rune <nil>")
336		}
337		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
338		if Flags(i.Arg)&FoldCase != 0 {
339			bw(b, "/i")
340		}
341		bw(b, " -> ", u32(i.Out))
342	case InstRune1:
343		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
344	case InstRuneAny:
345		bw(b, "any -> ", u32(i.Out))
346	case InstRuneAnyNotNL:
347		bw(b, "anynotnl -> ", u32(i.Out))
348	}
349}
350