1// Copyright 2014 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package regexp 6 7import ( 8 "regexp/syntax" 9 "slices" 10 "strings" 11 "unicode" 12 "unicode/utf8" 13) 14 15// "One-pass" regexp execution. 16// Some regexps can be analyzed to determine that they never need 17// backtracking: they are guaranteed to run in one pass over the string 18// without bothering to save all the usual NFA state. 19// Detect those and execute them more quickly. 20 21// A onePassProg is a compiled one-pass regular expression program. 22// It is the same as syntax.Prog except for the use of onePassInst. 23type onePassProg struct { 24 Inst []onePassInst 25 Start int // index of start instruction 26 NumCap int // number of InstCapture insts in re 27} 28 29// A onePassInst is a single instruction in a one-pass regular expression program. 30// It is the same as syntax.Inst except for the new 'Next' field. 31type onePassInst struct { 32 syntax.Inst 33 Next []uint32 34} 35 36// onePassPrefix returns a literal string that all matches for the 37// regexp must start with. Complete is true if the prefix 38// is the entire match. Pc is the index of the last rune instruction 39// in the string. The onePassPrefix skips over the mandatory 40// EmptyBeginText. 41func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { 42 i := &p.Inst[p.Start] 43 if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { 44 return "", i.Op == syntax.InstMatch, uint32(p.Start) 45 } 46 pc = i.Out 47 i = &p.Inst[pc] 48 for i.Op == syntax.InstNop { 49 pc = i.Out 50 i = &p.Inst[pc] 51 } 52 // Avoid allocation of buffer if prefix is empty. 53 if iop(i) != syntax.InstRune || len(i.Rune) != 1 { 54 return "", i.Op == syntax.InstMatch, uint32(p.Start) 55 } 56 57 // Have prefix; gather characters. 58 var buf strings.Builder 59 for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 && i.Rune[0] != utf8.RuneError { 60 buf.WriteRune(i.Rune[0]) 61 pc, i = i.Out, &p.Inst[i.Out] 62 } 63 if i.Op == syntax.InstEmptyWidth && 64 syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 && 65 p.Inst[i.Out].Op == syntax.InstMatch { 66 complete = true 67 } 68 return buf.String(), complete, pc 69} 70 71// onePassNext selects the next actionable state of the prog, based on the input character. 72// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. 73// One of the alternates may ultimately lead without input to end of line. If the instruction 74// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. 75func onePassNext(i *onePassInst, r rune) uint32 { 76 next := i.MatchRunePos(r) 77 if next >= 0 { 78 return i.Next[next] 79 } 80 if i.Op == syntax.InstAltMatch { 81 return i.Out 82 } 83 return 0 84} 85 86func iop(i *syntax.Inst) syntax.InstOp { 87 op := i.Op 88 switch op { 89 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 90 op = syntax.InstRune 91 } 92 return op 93} 94 95// Sparse Array implementation is used as a queueOnePass. 96type queueOnePass struct { 97 sparse []uint32 98 dense []uint32 99 size, nextIndex uint32 100} 101 102func (q *queueOnePass) empty() bool { 103 return q.nextIndex >= q.size 104} 105 106func (q *queueOnePass) next() (n uint32) { 107 n = q.dense[q.nextIndex] 108 q.nextIndex++ 109 return 110} 111 112func (q *queueOnePass) clear() { 113 q.size = 0 114 q.nextIndex = 0 115} 116 117func (q *queueOnePass) contains(u uint32) bool { 118 if u >= uint32(len(q.sparse)) { 119 return false 120 } 121 return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u 122} 123 124func (q *queueOnePass) insert(u uint32) { 125 if !q.contains(u) { 126 q.insertNew(u) 127 } 128} 129 130func (q *queueOnePass) insertNew(u uint32) { 131 if u >= uint32(len(q.sparse)) { 132 return 133 } 134 q.sparse[u] = q.size 135 q.dense[q.size] = u 136 q.size++ 137} 138 139func newQueue(size int) (q *queueOnePass) { 140 return &queueOnePass{ 141 sparse: make([]uint32, size), 142 dense: make([]uint32, size), 143 } 144} 145 146// mergeRuneSets merges two non-intersecting runesets, and returns the merged result, 147// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index 148// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a 149// NextIp array with the single element mergeFailed is returned. 150// The code assumes that both inputs contain ordered and non-intersecting rune pairs. 151const mergeFailed = uint32(0xffffffff) 152 153var ( 154 noRune = []rune{} 155 noNext = []uint32{mergeFailed} 156) 157 158func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) { 159 leftLen := len(*leftRunes) 160 rightLen := len(*rightRunes) 161 if leftLen&0x1 != 0 || rightLen&0x1 != 0 { 162 panic("mergeRuneSets odd length []rune") 163 } 164 var ( 165 lx, rx int 166 ) 167 merged := make([]rune, 0) 168 next := make([]uint32, 0) 169 ok := true 170 defer func() { 171 if !ok { 172 merged = nil 173 next = nil 174 } 175 }() 176 177 ix := -1 178 extend := func(newLow *int, newArray *[]rune, pc uint32) bool { 179 if ix > 0 && (*newArray)[*newLow] <= merged[ix] { 180 return false 181 } 182 merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1]) 183 *newLow += 2 184 ix += 2 185 next = append(next, pc) 186 return true 187 } 188 189 for lx < leftLen || rx < rightLen { 190 switch { 191 case rx >= rightLen: 192 ok = extend(&lx, leftRunes, leftPC) 193 case lx >= leftLen: 194 ok = extend(&rx, rightRunes, rightPC) 195 case (*rightRunes)[rx] < (*leftRunes)[lx]: 196 ok = extend(&rx, rightRunes, rightPC) 197 default: 198 ok = extend(&lx, leftRunes, leftPC) 199 } 200 if !ok { 201 return noRune, noNext 202 } 203 } 204 return merged, next 205} 206 207// cleanupOnePass drops working memory, and restores certain shortcut instructions. 208func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { 209 for ix, instOriginal := range original.Inst { 210 switch instOriginal.Op { 211 case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: 212 case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: 213 prog.Inst[ix].Next = nil 214 case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 215 prog.Inst[ix].Next = nil 216 prog.Inst[ix] = onePassInst{Inst: instOriginal} 217 } 218 } 219} 220 221// onePassCopy creates a copy of the original Prog, as we'll be modifying it. 222func onePassCopy(prog *syntax.Prog) *onePassProg { 223 p := &onePassProg{ 224 Start: prog.Start, 225 NumCap: prog.NumCap, 226 Inst: make([]onePassInst, len(prog.Inst)), 227 } 228 for i, inst := range prog.Inst { 229 p.Inst[i] = onePassInst{Inst: inst} 230 } 231 232 // rewrites one or more common Prog constructs that enable some otherwise 233 // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at 234 // ip A, that points to ips B & C. 235 // A:BC + B:DA => A:BC + B:CD 236 // A:BC + B:DC => A:DC + B:DC 237 for pc := range p.Inst { 238 switch p.Inst[pc].Op { 239 default: 240 continue 241 case syntax.InstAlt, syntax.InstAltMatch: 242 // A:Bx + B:Ay 243 p_A_Other := &p.Inst[pc].Out 244 p_A_Alt := &p.Inst[pc].Arg 245 // make sure a target is another Alt 246 instAlt := p.Inst[*p_A_Alt] 247 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 248 p_A_Alt, p_A_Other = p_A_Other, p_A_Alt 249 instAlt = p.Inst[*p_A_Alt] 250 if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { 251 continue 252 } 253 } 254 instOther := p.Inst[*p_A_Other] 255 // Analyzing both legs pointing to Alts is for another day 256 if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { 257 // too complicated 258 continue 259 } 260 // simple empty transition loop 261 // A:BC + B:DA => A:BC + B:DC 262 p_B_Alt := &p.Inst[*p_A_Alt].Out 263 p_B_Other := &p.Inst[*p_A_Alt].Arg 264 patch := false 265 if instAlt.Out == uint32(pc) { 266 patch = true 267 } else if instAlt.Arg == uint32(pc) { 268 patch = true 269 p_B_Alt, p_B_Other = p_B_Other, p_B_Alt 270 } 271 if patch { 272 *p_B_Alt = *p_A_Other 273 } 274 275 // empty transition to common target 276 // A:BC + B:DC => A:DC + B:DC 277 if *p_A_Other == *p_B_Alt { 278 *p_A_Alt = *p_B_Other 279 } 280 } 281 } 282 return p 283} 284 285var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} 286var anyRune = []rune{0, unicode.MaxRune} 287 288// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, 289// the match engine can always tell which branch to take. The routine may modify 290// p if it is turned into a onepass Prog. If it isn't possible for this to be a 291// onepass Prog, the Prog nil is returned. makeOnePass is recursive 292// to the size of the Prog. 293func makeOnePass(p *onePassProg) *onePassProg { 294 // If the machine is very long, it's not worth the time to check if we can use one pass. 295 if len(p.Inst) >= 1000 { 296 return nil 297 } 298 299 var ( 300 instQueue = newQueue(len(p.Inst)) 301 visitQueue = newQueue(len(p.Inst)) 302 check func(uint32, []bool) bool 303 onePassRunes = make([][]rune, len(p.Inst)) 304 ) 305 306 // check that paths from Alt instructions are unambiguous, and rebuild the new 307 // program as a onepass program 308 check = func(pc uint32, m []bool) (ok bool) { 309 ok = true 310 inst := &p.Inst[pc] 311 if visitQueue.contains(pc) { 312 return 313 } 314 visitQueue.insert(pc) 315 switch inst.Op { 316 case syntax.InstAlt, syntax.InstAltMatch: 317 ok = check(inst.Out, m) && check(inst.Arg, m) 318 // check no-input paths to InstMatch 319 matchOut := m[inst.Out] 320 matchArg := m[inst.Arg] 321 if matchOut && matchArg { 322 ok = false 323 break 324 } 325 // Match on empty goes in inst.Out 326 if matchArg { 327 inst.Out, inst.Arg = inst.Arg, inst.Out 328 matchOut, matchArg = matchArg, matchOut 329 } 330 if matchOut { 331 m[pc] = true 332 inst.Op = syntax.InstAltMatch 333 } 334 335 // build a dispatch operator from the two legs of the alt. 336 onePassRunes[pc], inst.Next = mergeRuneSets( 337 &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) 338 if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { 339 ok = false 340 break 341 } 342 case syntax.InstCapture, syntax.InstNop: 343 ok = check(inst.Out, m) 344 m[pc] = m[inst.Out] 345 // pass matching runes back through these no-ops. 346 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 347 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 348 for i := range inst.Next { 349 inst.Next[i] = inst.Out 350 } 351 case syntax.InstEmptyWidth: 352 ok = check(inst.Out, m) 353 m[pc] = m[inst.Out] 354 onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) 355 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 356 for i := range inst.Next { 357 inst.Next[i] = inst.Out 358 } 359 case syntax.InstMatch, syntax.InstFail: 360 m[pc] = inst.Op == syntax.InstMatch 361 case syntax.InstRune: 362 m[pc] = false 363 if len(inst.Next) > 0 { 364 break 365 } 366 instQueue.insert(inst.Out) 367 if len(inst.Rune) == 0 { 368 onePassRunes[pc] = []rune{} 369 inst.Next = []uint32{inst.Out} 370 break 371 } 372 runes := make([]rune, 0) 373 if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 374 r0 := inst.Rune[0] 375 runes = append(runes, r0, r0) 376 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 377 runes = append(runes, r1, r1) 378 } 379 slices.Sort(runes) 380 } else { 381 runes = append(runes, inst.Rune...) 382 } 383 onePassRunes[pc] = runes 384 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 385 for i := range inst.Next { 386 inst.Next[i] = inst.Out 387 } 388 inst.Op = syntax.InstRune 389 case syntax.InstRune1: 390 m[pc] = false 391 if len(inst.Next) > 0 { 392 break 393 } 394 instQueue.insert(inst.Out) 395 runes := []rune{} 396 // expand case-folded runes 397 if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { 398 r0 := inst.Rune[0] 399 runes = append(runes, r0, r0) 400 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 401 runes = append(runes, r1, r1) 402 } 403 slices.Sort(runes) 404 } else { 405 runes = append(runes, inst.Rune[0], inst.Rune[0]) 406 } 407 onePassRunes[pc] = runes 408 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 409 for i := range inst.Next { 410 inst.Next[i] = inst.Out 411 } 412 inst.Op = syntax.InstRune 413 case syntax.InstRuneAny: 414 m[pc] = false 415 if len(inst.Next) > 0 { 416 break 417 } 418 instQueue.insert(inst.Out) 419 onePassRunes[pc] = append([]rune{}, anyRune...) 420 inst.Next = []uint32{inst.Out} 421 case syntax.InstRuneAnyNotNL: 422 m[pc] = false 423 if len(inst.Next) > 0 { 424 break 425 } 426 instQueue.insert(inst.Out) 427 onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) 428 inst.Next = make([]uint32, len(onePassRunes[pc])/2+1) 429 for i := range inst.Next { 430 inst.Next[i] = inst.Out 431 } 432 } 433 return 434 } 435 436 instQueue.clear() 437 instQueue.insert(uint32(p.Start)) 438 m := make([]bool, len(p.Inst)) 439 for !instQueue.empty() { 440 visitQueue.clear() 441 pc := instQueue.next() 442 if !check(pc, m) { 443 p = nil 444 break 445 } 446 } 447 if p != nil { 448 for i := range p.Inst { 449 p.Inst[i].Rune = onePassRunes[i] 450 } 451 } 452 return p 453} 454 455// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog 456// can be recharacterized as a one-pass regexp program, or syntax.nil if the 457// Prog cannot be converted. For a one pass prog, the fundamental condition that must 458// be true is: at any InstAlt, there must be no ambiguity about what branch to take. 459func compileOnePass(prog *syntax.Prog) (p *onePassProg) { 460 if prog.Start == 0 { 461 return nil 462 } 463 // onepass regexp is anchored 464 if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || 465 syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { 466 return nil 467 } 468 // every instruction leading to InstMatch must be EmptyEndText 469 for _, inst := range prog.Inst { 470 opOut := prog.Inst[inst.Out].Op 471 switch inst.Op { 472 default: 473 if opOut == syntax.InstMatch { 474 return nil 475 } 476 case syntax.InstAlt, syntax.InstAltMatch: 477 if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { 478 return nil 479 } 480 case syntax.InstEmptyWidth: 481 if opOut == syntax.InstMatch { 482 if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { 483 continue 484 } 485 return nil 486 } 487 } 488 } 489 // Creates a slightly optimized copy of the original Prog 490 // that cleans up some Prog idioms that block valid onepass programs 491 p = onePassCopy(prog) 492 493 // checkAmbiguity on InstAlts, build onepass Prog if possible 494 p = makeOnePass(p) 495 496 if p != nil { 497 cleanupOnePass(p, prog) 498 } 499 return p 500} 501