1// Copyright 2015 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// backtrack is a regular expression search with submatch 6// tracking for small regular expressions and texts. It allocates 7// a bit vector with (length of input) * (length of prog) bits, 8// to make sure it never explores the same (character position, instruction) 9// state multiple times. This limits the search to run in time linear in 10// the length of the test. 11// 12// backtrack is a fast replacement for the NFA code on small 13// regexps when onepass cannot be used. 14 15package regexp 16 17import ( 18 "regexp/syntax" 19 "sync" 20) 21 22// A job is an entry on the backtracker's job stack. It holds 23// the instruction pc and the position in the input. 24type job struct { 25 pc uint32 26 arg bool 27 pos int 28} 29 30const ( 31 visitedBits = 32 32 maxBacktrackProg = 500 // len(prog.Inst) <= max 33 maxBacktrackVector = 256 * 1024 // bit vector size <= max (bits) 34) 35 36// bitState holds state for the backtracker. 37type bitState struct { 38 end int 39 cap []int 40 matchcap []int 41 jobs []job 42 visited []uint32 43 44 inputs inputs 45} 46 47var bitStatePool sync.Pool 48 49func newBitState() *bitState { 50 b, ok := bitStatePool.Get().(*bitState) 51 if !ok { 52 b = new(bitState) 53 } 54 return b 55} 56 57func freeBitState(b *bitState) { 58 b.inputs.clear() 59 bitStatePool.Put(b) 60} 61 62// maxBitStateLen returns the maximum length of a string to search with 63// the backtracker using prog. 64func maxBitStateLen(prog *syntax.Prog) int { 65 if !shouldBacktrack(prog) { 66 return 0 67 } 68 return maxBacktrackVector / len(prog.Inst) 69} 70 71// shouldBacktrack reports whether the program is too 72// long for the backtracker to run. 73func shouldBacktrack(prog *syntax.Prog) bool { 74 return len(prog.Inst) <= maxBacktrackProg 75} 76 77// reset resets the state of the backtracker. 78// end is the end position in the input. 79// ncap is the number of captures. 80func (b *bitState) reset(prog *syntax.Prog, end int, ncap int) { 81 b.end = end 82 83 if cap(b.jobs) == 0 { 84 b.jobs = make([]job, 0, 256) 85 } else { 86 b.jobs = b.jobs[:0] 87 } 88 89 visitedSize := (len(prog.Inst)*(end+1) + visitedBits - 1) / visitedBits 90 if cap(b.visited) < visitedSize { 91 b.visited = make([]uint32, visitedSize, maxBacktrackVector/visitedBits) 92 } else { 93 b.visited = b.visited[:visitedSize] 94 clear(b.visited) // set to 0 95 } 96 97 if cap(b.cap) < ncap { 98 b.cap = make([]int, ncap) 99 } else { 100 b.cap = b.cap[:ncap] 101 } 102 for i := range b.cap { 103 b.cap[i] = -1 104 } 105 106 if cap(b.matchcap) < ncap { 107 b.matchcap = make([]int, ncap) 108 } else { 109 b.matchcap = b.matchcap[:ncap] 110 } 111 for i := range b.matchcap { 112 b.matchcap[i] = -1 113 } 114} 115 116// shouldVisit reports whether the combination of (pc, pos) has not 117// been visited yet. 118func (b *bitState) shouldVisit(pc uint32, pos int) bool { 119 n := uint(int(pc)*(b.end+1) + pos) 120 if b.visited[n/visitedBits]&(1<<(n&(visitedBits-1))) != 0 { 121 return false 122 } 123 b.visited[n/visitedBits] |= 1 << (n & (visitedBits - 1)) 124 return true 125} 126 127// push pushes (pc, pos, arg) onto the job stack if it should be 128// visited. 129func (b *bitState) push(re *Regexp, pc uint32, pos int, arg bool) { 130 // Only check shouldVisit when arg is false. 131 // When arg is true, we are continuing a previous visit. 132 if re.prog.Inst[pc].Op != syntax.InstFail && (arg || b.shouldVisit(pc, pos)) { 133 b.jobs = append(b.jobs, job{pc: pc, arg: arg, pos: pos}) 134 } 135} 136 137// tryBacktrack runs a backtracking search starting at pos. 138func (re *Regexp) tryBacktrack(b *bitState, i input, pc uint32, pos int) bool { 139 longest := re.longest 140 141 b.push(re, pc, pos, false) 142 for len(b.jobs) > 0 { 143 l := len(b.jobs) - 1 144 // Pop job off the stack. 145 pc := b.jobs[l].pc 146 pos := b.jobs[l].pos 147 arg := b.jobs[l].arg 148 b.jobs = b.jobs[:l] 149 150 // Optimization: rather than push and pop, 151 // code that is going to Push and continue 152 // the loop simply updates ip, p, and arg 153 // and jumps to CheckAndLoop. We have to 154 // do the ShouldVisit check that Push 155 // would have, but we avoid the stack 156 // manipulation. 157 goto Skip 158 CheckAndLoop: 159 if !b.shouldVisit(pc, pos) { 160 continue 161 } 162 Skip: 163 164 inst := &re.prog.Inst[pc] 165 166 switch inst.Op { 167 default: 168 panic("bad inst") 169 case syntax.InstFail: 170 panic("unexpected InstFail") 171 case syntax.InstAlt: 172 // Cannot just 173 // b.push(inst.Out, pos, false) 174 // b.push(inst.Arg, pos, false) 175 // If during the processing of inst.Out, we encounter 176 // inst.Arg via another path, we want to process it then. 177 // Pushing it here will inhibit that. Instead, re-push 178 // inst with arg==true as a reminder to push inst.Arg out 179 // later. 180 if arg { 181 // Finished inst.Out; try inst.Arg. 182 arg = false 183 pc = inst.Arg 184 goto CheckAndLoop 185 } else { 186 b.push(re, pc, pos, true) 187 pc = inst.Out 188 goto CheckAndLoop 189 } 190 191 case syntax.InstAltMatch: 192 // One opcode consumes runes; the other leads to match. 193 switch re.prog.Inst[inst.Out].Op { 194 case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 195 // inst.Arg is the match. 196 b.push(re, inst.Arg, pos, false) 197 pc = inst.Arg 198 pos = b.end 199 goto CheckAndLoop 200 } 201 // inst.Out is the match - non-greedy 202 b.push(re, inst.Out, b.end, false) 203 pc = inst.Out 204 goto CheckAndLoop 205 206 case syntax.InstRune: 207 r, width := i.step(pos) 208 if !inst.MatchRune(r) { 209 continue 210 } 211 pos += width 212 pc = inst.Out 213 goto CheckAndLoop 214 215 case syntax.InstRune1: 216 r, width := i.step(pos) 217 if r != inst.Rune[0] { 218 continue 219 } 220 pos += width 221 pc = inst.Out 222 goto CheckAndLoop 223 224 case syntax.InstRuneAnyNotNL: 225 r, width := i.step(pos) 226 if r == '\n' || r == endOfText { 227 continue 228 } 229 pos += width 230 pc = inst.Out 231 goto CheckAndLoop 232 233 case syntax.InstRuneAny: 234 r, width := i.step(pos) 235 if r == endOfText { 236 continue 237 } 238 pos += width 239 pc = inst.Out 240 goto CheckAndLoop 241 242 case syntax.InstCapture: 243 if arg { 244 // Finished inst.Out; restore the old value. 245 b.cap[inst.Arg] = pos 246 continue 247 } else { 248 if inst.Arg < uint32(len(b.cap)) { 249 // Capture pos to register, but save old value. 250 b.push(re, pc, b.cap[inst.Arg], true) // come back when we're done. 251 b.cap[inst.Arg] = pos 252 } 253 pc = inst.Out 254 goto CheckAndLoop 255 } 256 257 case syntax.InstEmptyWidth: 258 flag := i.context(pos) 259 if !flag.match(syntax.EmptyOp(inst.Arg)) { 260 continue 261 } 262 pc = inst.Out 263 goto CheckAndLoop 264 265 case syntax.InstNop: 266 pc = inst.Out 267 goto CheckAndLoop 268 269 case syntax.InstMatch: 270 // We found a match. If the caller doesn't care 271 // where the match is, no point going further. 272 if len(b.cap) == 0 { 273 return true 274 } 275 276 // Record best match so far. 277 // Only need to check end point, because this entire 278 // call is only considering one start position. 279 if len(b.cap) > 1 { 280 b.cap[1] = pos 281 } 282 if old := b.matchcap[1]; old == -1 || (longest && pos > 0 && pos > old) { 283 copy(b.matchcap, b.cap) 284 } 285 286 // If going for first match, we're done. 287 if !longest { 288 return true 289 } 290 291 // If we used the entire text, no longer match is possible. 292 if pos == b.end { 293 return true 294 } 295 296 // Otherwise, continue on in hope of a longer match. 297 continue 298 } 299 } 300 301 return longest && len(b.matchcap) > 1 && b.matchcap[1] >= 0 302} 303 304// backtrack runs a backtracking search of prog on the input starting at pos. 305func (re *Regexp) backtrack(ib []byte, is string, pos int, ncap int, dstCap []int) []int { 306 startCond := re.cond 307 if startCond == ^syntax.EmptyOp(0) { // impossible 308 return nil 309 } 310 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 311 // Anchored match, past beginning of text. 312 return nil 313 } 314 315 b := newBitState() 316 i, end := b.inputs.init(nil, ib, is) 317 b.reset(re.prog, end, ncap) 318 319 // Anchored search must start at the beginning of the input 320 if startCond&syntax.EmptyBeginText != 0 { 321 if len(b.cap) > 0 { 322 b.cap[0] = pos 323 } 324 if !re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { 325 freeBitState(b) 326 return nil 327 } 328 } else { 329 330 // Unanchored search, starting from each possible text position. 331 // Notice that we have to try the empty string at the end of 332 // the text, so the loop condition is pos <= end, not pos < end. 333 // This looks like it's quadratic in the size of the text, 334 // but we are not clearing visited between calls to TrySearch, 335 // so no work is duplicated and it ends up still being linear. 336 width := -1 337 for ; pos <= end && width != 0; pos += width { 338 if len(re.prefix) > 0 { 339 // Match requires literal prefix; fast search for it. 340 advance := i.index(re, pos) 341 if advance < 0 { 342 freeBitState(b) 343 return nil 344 } 345 pos += advance 346 } 347 348 if len(b.cap) > 0 { 349 b.cap[0] = pos 350 } 351 if re.tryBacktrack(b, i, uint32(re.prog.Start), pos) { 352 // Match must be leftmost; done. 353 goto Match 354 } 355 _, width = i.step(pos) 356 } 357 freeBitState(b) 358 return nil 359 } 360 361Match: 362 dstCap = append(dstCap, b.matchcap...) 363 freeBitState(b) 364 return dstCap 365} 366