1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package syntax 6 7// Note to implementers: 8// In this package, re is always a *Regexp and r is always a rune. 9 10import ( 11 "slices" 12 "strconv" 13 "strings" 14 "unicode" 15) 16 17// A Regexp is a node in a regular expression syntax tree. 18type Regexp struct { 19 Op Op // operator 20 Flags Flags 21 Sub []*Regexp // subexpressions, if any 22 Sub0 [1]*Regexp // storage for short Sub 23 Rune []rune // matched runes, for OpLiteral, OpCharClass 24 Rune0 [2]rune // storage for short Rune 25 Min, Max int // min, max for OpRepeat 26 Cap int // capturing index, for OpCapture 27 Name string // capturing name, for OpCapture 28} 29 30//go:generate stringer -type Op -trimprefix Op 31 32// An Op is a single regular expression operator. 33type Op uint8 34 35// Operators are listed in precedence order, tightest binding to weakest. 36// Character class operators are listed simplest to most complex 37// (OpLiteral, OpCharClass, OpAnyCharNotNL, OpAnyChar). 38 39const ( 40 OpNoMatch Op = 1 + iota // matches no strings 41 OpEmptyMatch // matches empty string 42 OpLiteral // matches Runes sequence 43 OpCharClass // matches Runes interpreted as range pair list 44 OpAnyCharNotNL // matches any character except newline 45 OpAnyChar // matches any character 46 OpBeginLine // matches empty string at beginning of line 47 OpEndLine // matches empty string at end of line 48 OpBeginText // matches empty string at beginning of text 49 OpEndText // matches empty string at end of text 50 OpWordBoundary // matches word boundary `\b` 51 OpNoWordBoundary // matches word non-boundary `\B` 52 OpCapture // capturing subexpression with index Cap, optional name Name 53 OpStar // matches Sub[0] zero or more times 54 OpPlus // matches Sub[0] one or more times 55 OpQuest // matches Sub[0] zero or one times 56 OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) 57 OpConcat // matches concatenation of Subs 58 OpAlternate // matches alternation of Subs 59) 60 61const opPseudo Op = 128 // where pseudo-ops start 62 63// Equal reports whether x and y have identical structure. 64func (x *Regexp) Equal(y *Regexp) bool { 65 if x == nil || y == nil { 66 return x == y 67 } 68 if x.Op != y.Op { 69 return false 70 } 71 switch x.Op { 72 case OpEndText: 73 // The parse flags remember whether this is \z or \Z. 74 if x.Flags&WasDollar != y.Flags&WasDollar { 75 return false 76 } 77 78 case OpLiteral, OpCharClass: 79 return slices.Equal(x.Rune, y.Rune) 80 81 case OpAlternate, OpConcat: 82 return slices.EqualFunc(x.Sub, y.Sub, (*Regexp).Equal) 83 84 case OpStar, OpPlus, OpQuest: 85 if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) { 86 return false 87 } 88 89 case OpRepeat: 90 if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) { 91 return false 92 } 93 94 case OpCapture: 95 if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) { 96 return false 97 } 98 } 99 return true 100} 101 102// printFlags is a bit set indicating which flags (including non-capturing parens) to print around a regexp. 103type printFlags uint8 104 105const ( 106 flagI printFlags = 1 << iota // (?i: 107 flagM // (?m: 108 flagS // (?s: 109 flagOff // ) 110 flagPrec // (?: ) 111 negShift = 5 // flagI<<negShift is (?-i: 112) 113 114// addSpan enables the flags f around start..last, 115// by setting flags[start] = f and flags[last] = flagOff. 116func addSpan(start, last *Regexp, f printFlags, flags *map[*Regexp]printFlags) { 117 if *flags == nil { 118 *flags = make(map[*Regexp]printFlags) 119 } 120 (*flags)[start] = f 121 (*flags)[last] |= flagOff // maybe start==last 122} 123 124// calcFlags calculates the flags to print around each subexpression in re, 125// storing that information in (*flags)[sub] for each affected subexpression. 126// The first time an entry needs to be written to *flags, calcFlags allocates the map. 127// calcFlags also calculates the flags that must be active or can't be active 128// around re and returns those flags. 129func calcFlags(re *Regexp, flags *map[*Regexp]printFlags) (must, cant printFlags) { 130 switch re.Op { 131 default: 132 return 0, 0 133 134 case OpLiteral: 135 // If literal is fold-sensitive, return (flagI, 0) or (0, flagI) 136 // according to whether (?i) is active. 137 // If literal is not fold-sensitive, return 0, 0. 138 for _, r := range re.Rune { 139 if minFold <= r && r <= maxFold && unicode.SimpleFold(r) != r { 140 if re.Flags&FoldCase != 0 { 141 return flagI, 0 142 } else { 143 return 0, flagI 144 } 145 } 146 } 147 return 0, 0 148 149 case OpCharClass: 150 // If literal is fold-sensitive, return 0, flagI - (?i) has been compiled out. 151 // If literal is not fold-sensitive, return 0, 0. 152 for i := 0; i < len(re.Rune); i += 2 { 153 lo := max(minFold, re.Rune[i]) 154 hi := min(maxFold, re.Rune[i+1]) 155 for r := lo; r <= hi; r++ { 156 for f := unicode.SimpleFold(r); f != r; f = unicode.SimpleFold(f) { 157 if !(lo <= f && f <= hi) && !inCharClass(f, re.Rune) { 158 return 0, flagI 159 } 160 } 161 } 162 } 163 return 0, 0 164 165 case OpAnyCharNotNL: // (?-s). 166 return 0, flagS 167 168 case OpAnyChar: // (?s). 169 return flagS, 0 170 171 case OpBeginLine, OpEndLine: // (?m)^ (?m)$ 172 return flagM, 0 173 174 case OpEndText: 175 if re.Flags&WasDollar != 0 { // (?-m)$ 176 return 0, flagM 177 } 178 return 0, 0 179 180 case OpCapture, OpStar, OpPlus, OpQuest, OpRepeat: 181 return calcFlags(re.Sub[0], flags) 182 183 case OpConcat, OpAlternate: 184 // Gather the must and cant for each subexpression. 185 // When we find a conflicting subexpression, insert the necessary 186 // flags around the previously identified span and start over. 187 var must, cant, allCant printFlags 188 start := 0 189 last := 0 190 did := false 191 for i, sub := range re.Sub { 192 subMust, subCant := calcFlags(sub, flags) 193 if must&subCant != 0 || subMust&cant != 0 { 194 if must != 0 { 195 addSpan(re.Sub[start], re.Sub[last], must, flags) 196 } 197 must = 0 198 cant = 0 199 start = i 200 did = true 201 } 202 must |= subMust 203 cant |= subCant 204 allCant |= subCant 205 if subMust != 0 { 206 last = i 207 } 208 if must == 0 && start == i { 209 start++ 210 } 211 } 212 if !did { 213 // No conflicts: pass the accumulated must and cant upward. 214 return must, cant 215 } 216 if must != 0 { 217 // Conflicts found; need to finish final span. 218 addSpan(re.Sub[start], re.Sub[last], must, flags) 219 } 220 return 0, allCant 221 } 222} 223 224// writeRegexp writes the Perl syntax for the regular expression re to b. 225func writeRegexp(b *strings.Builder, re *Regexp, f printFlags, flags map[*Regexp]printFlags) { 226 f |= flags[re] 227 if f&flagPrec != 0 && f&^(flagOff|flagPrec) != 0 && f&flagOff != 0 { 228 // flagPrec is redundant with other flags being added and terminated 229 f &^= flagPrec 230 } 231 if f&^(flagOff|flagPrec) != 0 { 232 b.WriteString(`(?`) 233 if f&flagI != 0 { 234 b.WriteString(`i`) 235 } 236 if f&flagM != 0 { 237 b.WriteString(`m`) 238 } 239 if f&flagS != 0 { 240 b.WriteString(`s`) 241 } 242 if f&((flagM|flagS)<<negShift) != 0 { 243 b.WriteString(`-`) 244 if f&(flagM<<negShift) != 0 { 245 b.WriteString(`m`) 246 } 247 if f&(flagS<<negShift) != 0 { 248 b.WriteString(`s`) 249 } 250 } 251 b.WriteString(`:`) 252 } 253 if f&flagOff != 0 { 254 defer b.WriteString(`)`) 255 } 256 if f&flagPrec != 0 { 257 b.WriteString(`(?:`) 258 defer b.WriteString(`)`) 259 } 260 261 switch re.Op { 262 default: 263 b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">") 264 case OpNoMatch: 265 b.WriteString(`[^\x00-\x{10FFFF}]`) 266 case OpEmptyMatch: 267 b.WriteString(`(?:)`) 268 case OpLiteral: 269 for _, r := range re.Rune { 270 escape(b, r, false) 271 } 272 case OpCharClass: 273 if len(re.Rune)%2 != 0 { 274 b.WriteString(`[invalid char class]`) 275 break 276 } 277 b.WriteRune('[') 278 if len(re.Rune) == 0 { 279 b.WriteString(`^\x00-\x{10FFFF}`) 280 } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune && len(re.Rune) > 2 { 281 // Contains 0 and MaxRune. Probably a negated class. 282 // Print the gaps. 283 b.WriteRune('^') 284 for i := 1; i < len(re.Rune)-1; i += 2 { 285 lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 286 escape(b, lo, lo == '-') 287 if lo != hi { 288 if hi != lo+1 { 289 b.WriteRune('-') 290 } 291 escape(b, hi, hi == '-') 292 } 293 } 294 } else { 295 for i := 0; i < len(re.Rune); i += 2 { 296 lo, hi := re.Rune[i], re.Rune[i+1] 297 escape(b, lo, lo == '-') 298 if lo != hi { 299 if hi != lo+1 { 300 b.WriteRune('-') 301 } 302 escape(b, hi, hi == '-') 303 } 304 } 305 } 306 b.WriteRune(']') 307 case OpAnyCharNotNL, OpAnyChar: 308 b.WriteString(`.`) 309 case OpBeginLine: 310 b.WriteString(`^`) 311 case OpEndLine: 312 b.WriteString(`$`) 313 case OpBeginText: 314 b.WriteString(`\A`) 315 case OpEndText: 316 if re.Flags&WasDollar != 0 { 317 b.WriteString(`$`) 318 } else { 319 b.WriteString(`\z`) 320 } 321 case OpWordBoundary: 322 b.WriteString(`\b`) 323 case OpNoWordBoundary: 324 b.WriteString(`\B`) 325 case OpCapture: 326 if re.Name != "" { 327 b.WriteString(`(?P<`) 328 b.WriteString(re.Name) 329 b.WriteRune('>') 330 } else { 331 b.WriteRune('(') 332 } 333 if re.Sub[0].Op != OpEmptyMatch { 334 writeRegexp(b, re.Sub[0], flags[re.Sub[0]], flags) 335 } 336 b.WriteRune(')') 337 case OpStar, OpPlus, OpQuest, OpRepeat: 338 p := printFlags(0) 339 sub := re.Sub[0] 340 if sub.Op > OpCapture || sub.Op == OpLiteral && len(sub.Rune) > 1 { 341 p = flagPrec 342 } 343 writeRegexp(b, sub, p, flags) 344 345 switch re.Op { 346 case OpStar: 347 b.WriteRune('*') 348 case OpPlus: 349 b.WriteRune('+') 350 case OpQuest: 351 b.WriteRune('?') 352 case OpRepeat: 353 b.WriteRune('{') 354 b.WriteString(strconv.Itoa(re.Min)) 355 if re.Max != re.Min { 356 b.WriteRune(',') 357 if re.Max >= 0 { 358 b.WriteString(strconv.Itoa(re.Max)) 359 } 360 } 361 b.WriteRune('}') 362 } 363 if re.Flags&NonGreedy != 0 { 364 b.WriteRune('?') 365 } 366 case OpConcat: 367 for _, sub := range re.Sub { 368 p := printFlags(0) 369 if sub.Op == OpAlternate { 370 p = flagPrec 371 } 372 writeRegexp(b, sub, p, flags) 373 } 374 case OpAlternate: 375 for i, sub := range re.Sub { 376 if i > 0 { 377 b.WriteRune('|') 378 } 379 writeRegexp(b, sub, 0, flags) 380 } 381 } 382} 383 384func (re *Regexp) String() string { 385 var b strings.Builder 386 var flags map[*Regexp]printFlags 387 must, cant := calcFlags(re, &flags) 388 must |= (cant &^ flagI) << negShift 389 if must != 0 { 390 must |= flagOff 391 } 392 writeRegexp(&b, re, must, flags) 393 return b.String() 394} 395 396const meta = `\.+*?()|[]{}^$` 397 398func escape(b *strings.Builder, r rune, force bool) { 399 if unicode.IsPrint(r) { 400 if strings.ContainsRune(meta, r) || force { 401 b.WriteRune('\\') 402 } 403 b.WriteRune(r) 404 return 405 } 406 407 switch r { 408 case '\a': 409 b.WriteString(`\a`) 410 case '\f': 411 b.WriteString(`\f`) 412 case '\n': 413 b.WriteString(`\n`) 414 case '\r': 415 b.WriteString(`\r`) 416 case '\t': 417 b.WriteString(`\t`) 418 case '\v': 419 b.WriteString(`\v`) 420 default: 421 if r < 0x100 { 422 b.WriteString(`\x`) 423 s := strconv.FormatInt(int64(r), 16) 424 if len(s) == 1 { 425 b.WriteRune('0') 426 } 427 b.WriteString(s) 428 break 429 } 430 b.WriteString(`\x{`) 431 b.WriteString(strconv.FormatInt(int64(r), 16)) 432 b.WriteString(`}`) 433 } 434} 435 436// MaxCap walks the regexp to find the maximum capture index. 437func (re *Regexp) MaxCap() int { 438 m := 0 439 if re.Op == OpCapture { 440 m = re.Cap 441 } 442 for _, sub := range re.Sub { 443 if n := sub.MaxCap(); m < n { 444 m = n 445 } 446 } 447 return m 448} 449 450// CapNames walks the regexp to find the names of capturing groups. 451func (re *Regexp) CapNames() []string { 452 names := make([]string, re.MaxCap()+1) 453 re.capNames(names) 454 return names 455} 456 457func (re *Regexp) capNames(names []string) { 458 if re.Op == OpCapture { 459 names[re.Cap] = re.Name 460 } 461 for _, sub := range re.Sub { 462 sub.capNames(names) 463 } 464} 465