1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
7// Note to implementers:
8// In this package, re is always a *Regexp and r is always a rune.
9
10import (
11	"slices"
12	"strconv"
13	"strings"
14	"unicode"
15)
16
17// A Regexp is a node in a regular expression syntax tree.
18type Regexp struct {
19	Op       Op // operator
20	Flags    Flags
21	Sub      []*Regexp  // subexpressions, if any
22	Sub0     [1]*Regexp // storage for short Sub
23	Rune     []rune     // matched runes, for OpLiteral, OpCharClass
24	Rune0    [2]rune    // storage for short Rune
25	Min, Max int        // min, max for OpRepeat
26	Cap      int        // capturing index, for OpCapture
27	Name     string     // capturing name, for OpCapture
28}
29
30//go:generate stringer -type Op -trimprefix Op
31
32// An Op is a single regular expression operator.
33type Op uint8
34
35// Operators are listed in precedence order, tightest binding to weakest.
36// Character class operators are listed simplest to most complex
37// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar).
38
39const (
40	OpNoMatch        Op = 1 + iota // matches no strings
41	OpEmptyMatch                   // matches empty string
42	OpLiteral                      // matches Runes sequence
43	OpCharClass                    // matches Runes interpreted as range pair list
44	OpAnyCharNotNL                 // matches any character except newline
45	OpAnyChar                      // matches any character
46	OpBeginLine                    // matches empty string at beginning of line
47	OpEndLine                      // matches empty string at end of line
48	OpBeginText                    // matches empty string at beginning of text
49	OpEndText                      // matches empty string at end of text
50	OpWordBoundary                 // matches word boundary `\b`
51	OpNoWordBoundary               // matches word non-boundary `\B`
52	OpCapture                      // capturing subexpression with index Cap, optional name Name
53	OpStar                         // matches Sub[0] zero or more times
54	OpPlus                         // matches Sub[0] one or more times
55	OpQuest                        // matches Sub[0] zero or one times
56	OpRepeat                       // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
57	OpConcat                       // matches concatenation of Subs
58	OpAlternate                    // matches alternation of Subs
59)
60
61const opPseudo Op = 128 // where pseudo-ops start
62
63// Equal reports whether x and y have identical structure.
64func (x *Regexp) Equal(y *Regexp) bool {
65	if x == nil || y == nil {
66		return x == y
67	}
68	if x.Op != y.Op {
69		return false
70	}
71	switch x.Op {
72	case OpEndText:
73		// The parse flags remember whether this is \z or \Z.
74		if x.Flags&WasDollar != y.Flags&WasDollar {
75			return false
76		}
77
78	case OpLiteral, OpCharClass:
79		return slices.Equal(x.Rune, y.Rune)
80
81	case OpAlternate, OpConcat:
82		return slices.EqualFunc(x.Sub, y.Sub, (*Regexp).Equal)
83
84	case OpStar, OpPlus, OpQuest:
85		if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
86			return false
87		}
88
89	case OpRepeat:
90		if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
91			return false
92		}
93
94	case OpCapture:
95		if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
96			return false
97		}
98	}
99	return true
100}
101
102// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp.
103type printFlags uint8
104
105const (
106	flagI    printFlags = 1 << iota // (?i:
107	flagM                           // (?m:
108	flagS                           // (?s:
109	flagOff                         // )
110	flagPrec                        // (?: )
111	negShift = 5                    // flagI<<negShift is (?-i:
112)
113
114// addSpan enables the flags f around start..last,
115// by setting flags[start] = f and flags[last] = flagOff.
116func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) {
117	if *flags == nil {
118		*flags = make(map[*Regexp]printFlags)
119	}
120	(*flags)[start] = f
121	(*flags)[last] |= flagOff // maybe start==last
122}
123
124// calcFlags calculates the flags to print around each subexpression in re,
125// storing that information in (*flags)[sub] for each affected subexpression.
126// The first time an entry needs to be written to *flags, calcFlags allocates the map.
127// calcFlags also calculates the flags that must be active or can't be active
128// around re and returns those flags.
129func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) {
130	switch re.Op {
131	default:
132		return 0, 0
133
134	case OpLiteral:
135		// If literal is fold-sensitive, return (flagI, 0) or (0, flagI)
136		// according to whether (?i) is active.
137		// If literal is not fold-sensitive, return 0, 0.
138		for _, r := range re.Rune {
139			if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r {
140				if re.Flags&FoldCase != 0 {
141					return flagI, 0
142				} else {
143					return 0, flagI
144				}
145			}
146		}
147		return 0, 0
148
149	case OpCharClass:
150		// If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out.
151		// If literal is not fold-sensitive, return 0, 0.
152		for i := 0; i < len(re.Rune); i += 2 {
153			lo := max(minFold, re.Rune[i])
154			hi := min(maxFold, re.Rune[i+1])
155			for r := lo; r <= hi; r++ {
156				for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) {
157					if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) {
158						return 0, flagI
159					}
160				}
161			}
162		}
163		return 0, 0
164
165	case OpAnyCharNotNL: // (?-s).
166		return 0, flagS
167
168	case OpAnyChar: // (?s).
169		return flagS, 0
170
171	case OpBeginLine, OpEndLine: // (?m)^ (?m)$
172		return flagM, 0
173
174	case OpEndText:
175		if re.Flags&WasDollar != 0 { // (?-m)$
176			return 0, flagM
177		}
178		return 0, 0
179
180	case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat:
181		return calcFlags(re.Sub[0], flags)
182
183	case OpConcat, OpAlternate:
184		// Gather the must and cant for each subexpression.
185		// When we find a conflicting subexpression, insert the necessary
186		// flags around the previously identified span and start over.
187		var must, cant, allCant printFlags
188		start := 0
189		last := 0
190		did := false
191		for i, sub := range re.Sub {
192			subMust, subCant := calcFlags(sub, flags)
193			if must&subCant != 0 || subMust&cant != 0 {
194				if must != 0 {
195					addSpan(re.Sub[start], re.Sub[last], must, flags)
196				}
197				must = 0
198				cant = 0
199				start = i
200				did = true
201			}
202			must |= subMust
203			cant |= subCant
204			allCant |= subCant
205			if subMust != 0 {
206				last = i
207			}
208			if must == 0 && start == i {
209				start++
210			}
211		}
212		if !did {
213			// No conflicts: pass the accumulated must and cant upward.
214			return must, cant
215		}
216		if must != 0 {
217			// Conflicts found; need to finish final span.
218			addSpan(re.Sub[start], re.Sub[last], must, flags)
219		}
220		return 0, allCant
221	}
222}
223
224// writeRegexp writes the Perl syntax for the regular expression re to b.
225func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) {
226	f |= flags[re]
227	if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 {
228		// flagPrec is redundant with other flags being added and terminated
229		f &^= flagPrec
230	}
231	if f&^(flagOff|flagPrec) != 0 {
232		b.WriteString(`(?`)
233		if f&flagI != 0 {
234			b.WriteString(`i`)
235		}
236		if f&flagM != 0 {
237			b.WriteString(`m`)
238		}
239		if f&flagS != 0 {
240			b.WriteString(`s`)
241		}
242		if f&((flagM|flagS)<<negShift) != 0 {
243			b.WriteString(`-`)
244			if f&(flagM<<negShift) != 0 {
245				b.WriteString(`m`)
246			}
247			if f&(flagS<<negShift) != 0 {
248				b.WriteString(`s`)
249			}
250		}
251		b.WriteString(`:`)
252	}
253	if f&flagOff != 0 {
254		defer b.WriteString(`)`)
255	}
256	if f&flagPrec != 0 {
257		b.WriteString(`(?:`)
258		defer b.WriteString(`)`)
259	}
260
261	switch re.Op {
262	default:
263		b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
264	case OpNoMatch:
265		b.WriteString(`[^\x00-\x{10FFFF}]`)
266	case OpEmptyMatch:
267		b.WriteString(`(?:)`)
268	case OpLiteral:
269		for _, r := range re.Rune {
270			escape(b, r, false)
271		}
272	case OpCharClass:
273		if len(re.Rune)%2 != 0 {
274			b.WriteString(`[invalid char class]`)
275			break
276		}
277		b.WriteRune('[')
278		if len(re.Rune) == 0 {
279			b.WriteString(`^\x00-\x{10FFFF}`)
280		} else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 {
281			// Contains 0 and MaxRune. Probably a negated class.
282			// Print the gaps.
283			b.WriteRune('^')
284			for i := 1; i < len(re.Rune)-1; i += 2 {
285				lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
286				escape(b, lo, lo == '-')
287				if lo != hi {
288					if hi != lo+1 {
289						b.WriteRune('-')
290					}
291					escape(b, hi, hi == '-')
292				}
293			}
294		} else {
295			for i := 0; i < len(re.Rune); i += 2 {
296				lo, hi := re.Rune[i], re.Rune[i+1]
297				escape(b, lo, lo == '-')
298				if lo != hi {
299					if hi != lo+1 {
300						b.WriteRune('-')
301					}
302					escape(b, hi, hi == '-')
303				}
304			}
305		}
306		b.WriteRune(']')
307	case OpAnyCharNotNL, OpAnyChar:
308		b.WriteString(`.`)
309	case OpBeginLine:
310		b.WriteString(`^`)
311	case OpEndLine:
312		b.WriteString(`$`)
313	case OpBeginText:
314		b.WriteString(`\A`)
315	case OpEndText:
316		if re.Flags&WasDollar != 0 {
317			b.WriteString(`$`)
318		} else {
319			b.WriteString(`\z`)
320		}
321	case OpWordBoundary:
322		b.WriteString(`\b`)
323	case OpNoWordBoundary:
324		b.WriteString(`\B`)
325	case OpCapture:
326		if re.Name != "" {
327			b.WriteString(`(?P<`)
328			b.WriteString(re.Name)
329			b.WriteRune('>')
330		} else {
331			b.WriteRune('(')
332		}
333		if re.Sub[0].Op != OpEmptyMatch {
334			writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags)
335		}
336		b.WriteRune(')')
337	case OpStar, OpPlus, OpQuest, OpRepeat:
338		p := printFlags(0)
339		sub := re.Sub[0]
340		if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 {
341			p = flagPrec
342		}
343		writeRegexp(b, sub, p, flags)
344
345		switch re.Op {
346		case OpStar:
347			b.WriteRune('*')
348		case OpPlus:
349			b.WriteRune('+')
350		case OpQuest:
351			b.WriteRune('?')
352		case OpRepeat:
353			b.WriteRune('{')
354			b.WriteString(strconv.Itoa(re.Min))
355			if re.Max != re.Min {
356				b.WriteRune(',')
357				if re.Max >= 0 {
358					b.WriteString(strconv.Itoa(re.Max))
359				}
360			}
361			b.WriteRune('}')
362		}
363		if re.Flags&NonGreedy != 0 {
364			b.WriteRune('?')
365		}
366	case OpConcat:
367		for _, sub := range re.Sub {
368			p := printFlags(0)
369			if sub.Op == OpAlternate {
370				p = flagPrec
371			}
372			writeRegexp(b, sub, p, flags)
373		}
374	case OpAlternate:
375		for i, sub := range re.Sub {
376			if i > 0 {
377				b.WriteRune('|')
378			}
379			writeRegexp(b, sub, 0, flags)
380		}
381	}
382}
383
384func (re *Regexp) String() string {
385	var b strings.Builder
386	var flags map[*Regexp]printFlags
387	must, cant := calcFlags(re, &flags)
388	must |= (cant &^ flagI) << negShift
389	if must != 0 {
390		must |= flagOff
391	}
392	writeRegexp(&b, re, must, flags)
393	return b.String()
394}
395
396const meta = `\.+*?()|[]{}^$`
397
398func escape(b *strings.Builder, r rune, force bool) {
399	if unicode.IsPrint(r) {
400		if strings.ContainsRune(meta, r) || force {
401			b.WriteRune('\\')
402		}
403		b.WriteRune(r)
404		return
405	}
406
407	switch r {
408	case '\a':
409		b.WriteString(`\a`)
410	case '\f':
411		b.WriteString(`\f`)
412	case '\n':
413		b.WriteString(`\n`)
414	case '\r':
415		b.WriteString(`\r`)
416	case '\t':
417		b.WriteString(`\t`)
418	case '\v':
419		b.WriteString(`\v`)
420	default:
421		if r < 0x100 {
422			b.WriteString(`\x`)
423			s := strconv.FormatInt(int64(r), 16)
424			if len(s) == 1 {
425				b.WriteRune('0')
426			}
427			b.WriteString(s)
428			break
429		}
430		b.WriteString(`\x{`)
431		b.WriteString(strconv.FormatInt(int64(r), 16))
432		b.WriteString(`}`)
433	}
434}
435
436// MaxCap walks the regexp to find the maximum capture index.
437func (re *Regexp) MaxCap() int {
438	m := 0
439	if re.Op == OpCapture {
440		m = re.Cap
441	}
442	for _, sub := range re.Sub {
443		if n := sub.MaxCap(); m < n {
444			m = n
445		}
446	}
447	return m
448}
449
450// CapNames walks the regexp to find the names of capturing groups.
451func (re *Regexp) CapNames() []string {
452	names := make([]string, re.MaxCap()+1)
453	re.capNames(names)
454	return names
455}
456
457func (re *Regexp) capNames(names []string) {
458	if re.Op == OpCapture {
459		names[re.Cap] = re.Name
460	}
461	for _, sub := range re.Sub {
462		sub.capNames(names)
463	}
464}
465