1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package regexp implements regular expression search. 6// 7// The syntax of the regular expressions accepted is the same 8// general syntax used by Perl, Python, and other languages. 9// More precisely, it is the syntax accepted by RE2 and described at 10// https://golang.org/s/re2syntax, except for \C. 11// For an overview of the syntax, see the [regexp/syntax] package. 12// 13// The regexp implementation provided by this package is 14// guaranteed to run in time linear in the size of the input. 15// (This is a property not guaranteed by most open source 16// implementations of regular expressions.) For more information 17// about this property, see https://swtch.com/~rsc/regexp/regexp1.html 18// or any book about automata theory. 19// 20// All characters are UTF-8-encoded code points. 21// Following [utf8.DecodeRune], each byte of an invalid UTF-8 sequence 22// is treated as if it encoded utf8.RuneError (U+FFFD). 23// 24// There are 16 methods of [Regexp] that match a regular expression and identify 25// the matched text. Their names are matched by this regular expression: 26// 27// Find(All)?(String)?(Submatch)?(Index)? 28// 29// If 'All' is present, the routine matches successive non-overlapping 30// matches of the entire expression. Empty matches abutting a preceding 31// match are ignored. The return value is a slice containing the successive 32// return values of the corresponding non-'All' routine. These routines take 33// an extra integer argument, n. If n >= 0, the function returns at most n 34// matches/submatches; otherwise, it returns all of them. 35// 36// If 'String' is present, the argument is a string; otherwise it is a slice 37// of bytes; return values are adjusted as appropriate. 38// 39// If 'Submatch' is present, the return value is a slice identifying the 40// successive submatches of the expression. Submatches are matches of 41// parenthesized subexpressions (also known as capturing groups) within the 42// regular expression, numbered from left to right in order of opening 43// parenthesis. Submatch 0 is the match of the entire expression, submatch 1 is 44// the match of the first parenthesized subexpression, and so on. 45// 46// If 'Index' is present, matches and submatches are identified by byte index 47// pairs within the input string: result[2*n:2*n+2] identifies the indexes of 48// the nth submatch. The pair for n==0 identifies the match of the entire 49// expression. If 'Index' is not present, the match is identified by the text 50// of the match/submatch. If an index is negative or text is nil, it means that 51// subexpression did not match any string in the input. For 'String' versions 52// an empty string means either no match or an empty match. 53// 54// There is also a subset of the methods that can be applied to text read from 55// an [io.RuneReader]: [Regexp.MatchReader], [Regexp.FindReaderIndex], 56// [Regexp.FindReaderSubmatchIndex]. 57// 58// This set may grow. Note that regular expression matches may need to 59// examine text beyond the text returned by a match, so the methods that 60// match text from an [io.RuneReader] may read arbitrarily far into the input 61// before returning. 62// 63// (There are a few other methods that do not match this pattern.) 64package regexp 65 66import ( 67 "bytes" 68 "io" 69 "regexp/syntax" 70 "strconv" 71 "strings" 72 "sync" 73 "unicode" 74 "unicode/utf8" 75) 76 77// Regexp is the representation of a compiled regular expression. 78// A Regexp is safe for concurrent use by multiple goroutines, 79// except for configuration methods, such as [Regexp.Longest]. 80type Regexp struct { 81 expr string // as passed to Compile 82 prog *syntax.Prog // compiled program 83 onepass *onePassProg // onepass program or nil 84 numSubexp int 85 maxBitStateLen int 86 subexpNames []string 87 prefix string // required prefix in unanchored matches 88 prefixBytes []byte // prefix, as a []byte 89 prefixRune rune // first rune in prefix 90 prefixEnd uint32 // pc for last rune in prefix 91 mpool int // pool for machines 92 matchcap int // size of recorded match lengths 93 prefixComplete bool // prefix is the entire regexp 94 cond syntax.EmptyOp // empty-width conditions required at start of match 95 minInputLen int // minimum length of the input in bytes 96 97 // This field can be modified by the Longest method, 98 // but it is otherwise read-only. 99 longest bool // whether regexp prefers leftmost-longest match 100} 101 102// String returns the source text used to compile the regular expression. 103func (re *Regexp) String() string { 104 return re.expr 105} 106 107// Copy returns a new [Regexp] object copied from re. 108// Calling [Regexp.Longest] on one copy does not affect another. 109// 110// Deprecated: In earlier releases, when using a [Regexp] in multiple goroutines, 111// giving each goroutine its own copy helped to avoid lock contention. 112// As of Go 1.12, using Copy is no longer necessary to avoid lock contention. 113// Copy may still be appropriate if the reason for its use is to make 114// two copies with different [Regexp.Longest] settings. 115func (re *Regexp) Copy() *Regexp { 116 re2 := *re 117 return &re2 118} 119 120// Compile parses a regular expression and returns, if successful, 121// a [Regexp] object that can be used to match against text. 122// 123// When matching against text, the regexp returns a match that 124// begins as early as possible in the input (leftmost), and among those 125// it chooses the one that a backtracking search would have found first. 126// This so-called leftmost-first matching is the same semantics 127// that Perl, Python, and other implementations use, although this 128// package implements it without the expense of backtracking. 129// For POSIX leftmost-longest matching, see [CompilePOSIX]. 130func Compile(expr string) (*Regexp, error) { 131 return compile(expr, syntax.Perl, false) 132} 133 134// CompilePOSIX is like [Compile] but restricts the regular expression 135// to POSIX ERE (egrep) syntax and changes the match semantics to 136// leftmost-longest. 137// 138// That is, when matching against text, the regexp returns a match that 139// begins as early as possible in the input (leftmost), and among those 140// it chooses a match that is as long as possible. 141// This so-called leftmost-longest matching is the same semantics 142// that early regular expression implementations used and that POSIX 143// specifies. 144// 145// However, there can be multiple leftmost-longest matches, with different 146// submatch choices, and here this package diverges from POSIX. 147// Among the possible leftmost-longest matches, this package chooses 148// the one that a backtracking search would have found first, while POSIX 149// specifies that the match be chosen to maximize the length of the first 150// subexpression, then the second, and so on from left to right. 151// The POSIX rule is computationally prohibitive and not even well-defined. 152// See https://swtch.com/~rsc/regexp/regexp2.html#posix for details. 153func CompilePOSIX(expr string) (*Regexp, error) { 154 return compile(expr, syntax.POSIX, true) 155} 156 157// Longest makes future searches prefer the leftmost-longest match. 158// That is, when matching against text, the regexp returns a match that 159// begins as early as possible in the input (leftmost), and among those 160// it chooses a match that is as long as possible. 161// This method modifies the [Regexp] and may not be called concurrently 162// with any other methods. 163func (re *Regexp) Longest() { 164 re.longest = true 165} 166 167func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 168 re, err := syntax.Parse(expr, mode) 169 if err != nil { 170 return nil, err 171 } 172 maxCap := re.MaxCap() 173 capNames := re.CapNames() 174 175 re = re.Simplify() 176 prog, err := syntax.Compile(re) 177 if err != nil { 178 return nil, err 179 } 180 matchcap := prog.NumCap 181 if matchcap < 2 { 182 matchcap = 2 183 } 184 regexp := &Regexp{ 185 expr: expr, 186 prog: prog, 187 onepass: compileOnePass(prog), 188 numSubexp: maxCap, 189 subexpNames: capNames, 190 cond: prog.StartCond(), 191 longest: longest, 192 matchcap: matchcap, 193 minInputLen: minInputLen(re), 194 } 195 if regexp.onepass == nil { 196 regexp.prefix, regexp.prefixComplete = prog.Prefix() 197 regexp.maxBitStateLen = maxBitStateLen(prog) 198 } else { 199 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 200 } 201 if regexp.prefix != "" { 202 // TODO(rsc): Remove this allocation by adding 203 // IndexString to package bytes. 204 regexp.prefixBytes = []byte(regexp.prefix) 205 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 206 } 207 208 n := len(prog.Inst) 209 i := 0 210 for matchSize[i] != 0 && matchSize[i] < n { 211 i++ 212 } 213 regexp.mpool = i 214 215 return regexp, nil 216} 217 218// Pools of *machine for use during (*Regexp).doExecute, 219// split up by the size of the execution queues. 220// matchPool[i] machines have queue size matchSize[i]. 221// On a 64-bit system each queue entry is 16 bytes, 222// so matchPool[0] has 16*2*128 = 4kB queues, etc. 223// The final matchPool is a catch-all for very large queues. 224var ( 225 matchSize = [...]int{128, 512, 2048, 16384, 0} 226 matchPool [len(matchSize)]sync.Pool 227) 228 229// get returns a machine to use for matching re. 230// It uses the re's machine cache if possible, to avoid 231// unnecessary allocation. 232func (re *Regexp) get() *machine { 233 m, ok := matchPool[re.mpool].Get().(*machine) 234 if !ok { 235 m = new(machine) 236 } 237 m.re = re 238 m.p = re.prog 239 if cap(m.matchcap) < re.matchcap { 240 m.matchcap = make([]int, re.matchcap) 241 for _, t := range m.pool { 242 t.cap = make([]int, re.matchcap) 243 } 244 } 245 246 // Allocate queues if needed. 247 // Or reallocate, for "large" match pool. 248 n := matchSize[re.mpool] 249 if n == 0 { // large pool 250 n = len(re.prog.Inst) 251 } 252 if len(m.q0.sparse) < n { 253 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 254 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 255 } 256 return m 257} 258 259// put returns a machine to the correct machine pool. 260func (re *Regexp) put(m *machine) { 261 m.re = nil 262 m.p = nil 263 m.inputs.clear() 264 matchPool[re.mpool].Put(m) 265} 266 267// minInputLen walks the regexp to find the minimum length of any matchable input. 268func minInputLen(re *syntax.Regexp) int { 269 switch re.Op { 270 default: 271 return 0 272 case syntax.OpAnyChar, syntax.OpAnyCharNotNL, syntax.OpCharClass: 273 return 1 274 case syntax.OpLiteral: 275 l := 0 276 for _, r := range re.Rune { 277 if r == utf8.RuneError { 278 l++ 279 } else { 280 l += utf8.RuneLen(r) 281 } 282 } 283 return l 284 case syntax.OpCapture, syntax.OpPlus: 285 return minInputLen(re.Sub[0]) 286 case syntax.OpRepeat: 287 return re.Min * minInputLen(re.Sub[0]) 288 case syntax.OpConcat: 289 l := 0 290 for _, sub := range re.Sub { 291 l += minInputLen(sub) 292 } 293 return l 294 case syntax.OpAlternate: 295 l := minInputLen(re.Sub[0]) 296 var lnext int 297 for _, sub := range re.Sub[1:] { 298 lnext = minInputLen(sub) 299 if lnext < l { 300 l = lnext 301 } 302 } 303 return l 304 } 305} 306 307// MustCompile is like [Compile] but panics if the expression cannot be parsed. 308// It simplifies safe initialization of global variables holding compiled regular 309// expressions. 310func MustCompile(str string) *Regexp { 311 regexp, err := Compile(str) 312 if err != nil { 313 panic(`regexp: Compile(` + quote(str) + `): ` + err.Error()) 314 } 315 return regexp 316} 317 318// MustCompilePOSIX is like [CompilePOSIX] but panics if the expression cannot be parsed. 319// It simplifies safe initialization of global variables holding compiled regular 320// expressions. 321func MustCompilePOSIX(str string) *Regexp { 322 regexp, err := CompilePOSIX(str) 323 if err != nil { 324 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + err.Error()) 325 } 326 return regexp 327} 328 329func quote(s string) string { 330 if strconv.CanBackquote(s) { 331 return "`" + s + "`" 332 } 333 return strconv.Quote(s) 334} 335 336// NumSubexp returns the number of parenthesized subexpressions in this [Regexp]. 337func (re *Regexp) NumSubexp() int { 338 return re.numSubexp 339} 340 341// SubexpNames returns the names of the parenthesized subexpressions 342// in this [Regexp]. The name for the first sub-expression is names[1], 343// so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 344// Since the Regexp as a whole cannot be named, names[0] is always 345// the empty string. The slice should not be modified. 346func (re *Regexp) SubexpNames() []string { 347 return re.subexpNames 348} 349 350// SubexpIndex returns the index of the first subexpression with the given name, 351// or -1 if there is no subexpression with that name. 352// 353// Note that multiple subexpressions can be written using the same name, as in 354// (?P<bob>a+)(?P<bob>b+), which declares two subexpressions named "bob". 355// In this case, SubexpIndex returns the index of the leftmost such subexpression 356// in the regular expression. 357func (re *Regexp) SubexpIndex(name string) int { 358 if name != "" { 359 for i, s := range re.subexpNames { 360 if name == s { 361 return i 362 } 363 } 364 } 365 return -1 366} 367 368const endOfText rune = -1 369 370// input abstracts different representations of the input text. It provides 371// one-character lookahead. 372type input interface { 373 step(pos int) (r rune, width int) // advance one rune 374 canCheckPrefix() bool // can we look ahead without losing info? 375 hasPrefix(re *Regexp) bool 376 index(re *Regexp, pos int) int 377 context(pos int) lazyFlag 378} 379 380// inputString scans a string. 381type inputString struct { 382 str string 383} 384 385func (i *inputString) step(pos int) (rune, int) { 386 if pos < len(i.str) { 387 c := i.str[pos] 388 if c < utf8.RuneSelf { 389 return rune(c), 1 390 } 391 return utf8.DecodeRuneInString(i.str[pos:]) 392 } 393 return endOfText, 0 394} 395 396func (i *inputString) canCheckPrefix() bool { 397 return true 398} 399 400func (i *inputString) hasPrefix(re *Regexp) bool { 401 return strings.HasPrefix(i.str, re.prefix) 402} 403 404func (i *inputString) index(re *Regexp, pos int) int { 405 return strings.Index(i.str[pos:], re.prefix) 406} 407 408func (i *inputString) context(pos int) lazyFlag { 409 r1, r2 := endOfText, endOfText 410 // 0 < pos && pos <= len(i.str) 411 if uint(pos-1) < uint(len(i.str)) { 412 r1 = rune(i.str[pos-1]) 413 if r1 >= utf8.RuneSelf { 414 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 415 } 416 } 417 // 0 <= pos && pos < len(i.str) 418 if uint(pos) < uint(len(i.str)) { 419 r2 = rune(i.str[pos]) 420 if r2 >= utf8.RuneSelf { 421 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 422 } 423 } 424 return newLazyFlag(r1, r2) 425} 426 427// inputBytes scans a byte slice. 428type inputBytes struct { 429 str []byte 430} 431 432func (i *inputBytes) step(pos int) (rune, int) { 433 if pos < len(i.str) { 434 c := i.str[pos] 435 if c < utf8.RuneSelf { 436 return rune(c), 1 437 } 438 return utf8.DecodeRune(i.str[pos:]) 439 } 440 return endOfText, 0 441} 442 443func (i *inputBytes) canCheckPrefix() bool { 444 return true 445} 446 447func (i *inputBytes) hasPrefix(re *Regexp) bool { 448 return bytes.HasPrefix(i.str, re.prefixBytes) 449} 450 451func (i *inputBytes) index(re *Regexp, pos int) int { 452 return bytes.Index(i.str[pos:], re.prefixBytes) 453} 454 455func (i *inputBytes) context(pos int) lazyFlag { 456 r1, r2 := endOfText, endOfText 457 // 0 < pos && pos <= len(i.str) 458 if uint(pos-1) < uint(len(i.str)) { 459 r1 = rune(i.str[pos-1]) 460 if r1 >= utf8.RuneSelf { 461 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 462 } 463 } 464 // 0 <= pos && pos < len(i.str) 465 if uint(pos) < uint(len(i.str)) { 466 r2 = rune(i.str[pos]) 467 if r2 >= utf8.RuneSelf { 468 r2, _ = utf8.DecodeRune(i.str[pos:]) 469 } 470 } 471 return newLazyFlag(r1, r2) 472} 473 474// inputReader scans a RuneReader. 475type inputReader struct { 476 r io.RuneReader 477 atEOT bool 478 pos int 479} 480 481func (i *inputReader) step(pos int) (rune, int) { 482 if !i.atEOT && pos != i.pos { 483 return endOfText, 0 484 485 } 486 r, w, err := i.r.ReadRune() 487 if err != nil { 488 i.atEOT = true 489 return endOfText, 0 490 } 491 i.pos += w 492 return r, w 493} 494 495func (i *inputReader) canCheckPrefix() bool { 496 return false 497} 498 499func (i *inputReader) hasPrefix(re *Regexp) bool { 500 return false 501} 502 503func (i *inputReader) index(re *Regexp, pos int) int { 504 return -1 505} 506 507func (i *inputReader) context(pos int) lazyFlag { 508 return 0 // not used 509} 510 511// LiteralPrefix returns a literal string that must begin any match 512// of the regular expression re. It returns the boolean true if the 513// literal string comprises the entire regular expression. 514func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 515 return re.prefix, re.prefixComplete 516} 517 518// MatchReader reports whether the text returned by the [io.RuneReader] 519// contains any match of the regular expression re. 520func (re *Regexp) MatchReader(r io.RuneReader) bool { 521 return re.doMatch(r, nil, "") 522} 523 524// MatchString reports whether the string s 525// contains any match of the regular expression re. 526func (re *Regexp) MatchString(s string) bool { 527 return re.doMatch(nil, nil, s) 528} 529 530// Match reports whether the byte slice b 531// contains any match of the regular expression re. 532func (re *Regexp) Match(b []byte) bool { 533 return re.doMatch(nil, b, "") 534} 535 536// MatchReader reports whether the text returned by the [io.RuneReader] 537// contains any match of the regular expression pattern. 538// More complicated queries need to use [Compile] and the full [Regexp] interface. 539func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 540 re, err := Compile(pattern) 541 if err != nil { 542 return false, err 543 } 544 return re.MatchReader(r), nil 545} 546 547// MatchString reports whether the string s 548// contains any match of the regular expression pattern. 549// More complicated queries need to use [Compile] and the full [Regexp] interface. 550func MatchString(pattern string, s string) (matched bool, err error) { 551 re, err := Compile(pattern) 552 if err != nil { 553 return false, err 554 } 555 return re.MatchString(s), nil 556} 557 558// Match reports whether the byte slice b 559// contains any match of the regular expression pattern. 560// More complicated queries need to use [Compile] and the full [Regexp] interface. 561func Match(pattern string, b []byte) (matched bool, err error) { 562 re, err := Compile(pattern) 563 if err != nil { 564 return false, err 565 } 566 return re.Match(b), nil 567} 568 569// ReplaceAllString returns a copy of src, replacing matches of the [Regexp] 570// with the replacement string repl. 571// Inside repl, $ signs are interpreted as in [Regexp.Expand]. 572func (re *Regexp) ReplaceAllString(src, repl string) string { 573 n := 2 574 if strings.Contains(repl, "$") { 575 n = 2 * (re.numSubexp + 1) 576 } 577 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 578 return re.expand(dst, repl, nil, src, match) 579 }) 580 return string(b) 581} 582 583// ReplaceAllLiteralString returns a copy of src, replacing matches of the [Regexp] 584// with the replacement string repl. The replacement repl is substituted directly, 585// without using [Regexp.Expand]. 586func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 587 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 588 return append(dst, repl...) 589 })) 590} 591 592// ReplaceAllStringFunc returns a copy of src in which all matches of the 593// [Regexp] have been replaced by the return value of function repl applied 594// to the matched substring. The replacement returned by repl is substituted 595// directly, without using [Regexp.Expand]. 596func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 597 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 598 return append(dst, repl(src[match[0]:match[1]])...) 599 }) 600 return string(b) 601} 602 603func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 604 lastMatchEnd := 0 // end position of the most recent match 605 searchPos := 0 // position where we next look for a match 606 var buf []byte 607 var endPos int 608 if bsrc != nil { 609 endPos = len(bsrc) 610 } else { 611 endPos = len(src) 612 } 613 if nmatch > re.prog.NumCap { 614 nmatch = re.prog.NumCap 615 } 616 617 var dstCap [2]int 618 for searchPos <= endPos { 619 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 620 if len(a) == 0 { 621 break // no more matches 622 } 623 624 // Copy the unmatched characters before this match. 625 if bsrc != nil { 626 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 627 } else { 628 buf = append(buf, src[lastMatchEnd:a[0]]...) 629 } 630 631 // Now insert a copy of the replacement string, but not for a 632 // match of the empty string immediately after another match. 633 // (Otherwise, we get double replacement for patterns that 634 // match both empty and nonempty strings.) 635 if a[1] > lastMatchEnd || a[0] == 0 { 636 buf = repl(buf, a) 637 } 638 lastMatchEnd = a[1] 639 640 // Advance past this match; always advance at least one character. 641 var width int 642 if bsrc != nil { 643 _, width = utf8.DecodeRune(bsrc[searchPos:]) 644 } else { 645 _, width = utf8.DecodeRuneInString(src[searchPos:]) 646 } 647 if searchPos+width > a[1] { 648 searchPos += width 649 } else if searchPos+1 > a[1] { 650 // This clause is only needed at the end of the input 651 // string. In that case, DecodeRuneInString returns width=0. 652 searchPos++ 653 } else { 654 searchPos = a[1] 655 } 656 } 657 658 // Copy the unmatched characters after the last match. 659 if bsrc != nil { 660 buf = append(buf, bsrc[lastMatchEnd:]...) 661 } else { 662 buf = append(buf, src[lastMatchEnd:]...) 663 } 664 665 return buf 666} 667 668// ReplaceAll returns a copy of src, replacing matches of the [Regexp] 669// with the replacement text repl. 670// Inside repl, $ signs are interpreted as in [Regexp.Expand]. 671func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 672 n := 2 673 if bytes.IndexByte(repl, '$') >= 0 { 674 n = 2 * (re.numSubexp + 1) 675 } 676 srepl := "" 677 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 678 if len(srepl) != len(repl) { 679 srepl = string(repl) 680 } 681 return re.expand(dst, srepl, src, "", match) 682 }) 683 return b 684} 685 686// ReplaceAllLiteral returns a copy of src, replacing matches of the [Regexp] 687// with the replacement bytes repl. The replacement repl is substituted directly, 688// without using [Regexp.Expand]. 689func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 690 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 691 return append(dst, repl...) 692 }) 693} 694 695// ReplaceAllFunc returns a copy of src in which all matches of the 696// [Regexp] have been replaced by the return value of function repl applied 697// to the matched byte slice. The replacement returned by repl is substituted 698// directly, without using [Regexp.Expand]. 699func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 700 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 701 return append(dst, repl(src[match[0]:match[1]])...) 702 }) 703} 704 705// Bitmap used by func special to check whether a character needs to be escaped. 706var specialBytes [16]byte 707 708// special reports whether byte b needs to be escaped by QuoteMeta. 709func special(b byte) bool { 710 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 711} 712 713func init() { 714 for _, b := range []byte(`\.+*?()|[]{}^$`) { 715 specialBytes[b%16] |= 1 << (b / 16) 716 } 717} 718 719// QuoteMeta returns a string that escapes all regular expression metacharacters 720// inside the argument text; the returned string is a regular expression matching 721// the literal text. 722func QuoteMeta(s string) string { 723 // A byte loop is correct because all metacharacters are ASCII. 724 var i int 725 for i = 0; i < len(s); i++ { 726 if special(s[i]) { 727 break 728 } 729 } 730 // No meta characters found, so return original string. 731 if i >= len(s) { 732 return s 733 } 734 735 b := make([]byte, 2*len(s)-i) 736 copy(b, s[:i]) 737 j := i 738 for ; i < len(s); i++ { 739 if special(s[i]) { 740 b[j] = '\\' 741 j++ 742 } 743 b[j] = s[i] 744 j++ 745 } 746 return string(b[:j]) 747} 748 749// The number of capture values in the program may correspond 750// to fewer capturing expressions than are in the regexp. 751// For example, "(a){0}" turns into an empty program, so the 752// maximum capture in the program is 0 but we need to return 753// an expression for \1. Pad appends -1s to the slice a as needed. 754func (re *Regexp) pad(a []int) []int { 755 if a == nil { 756 // No match. 757 return nil 758 } 759 n := (1 + re.numSubexp) * 2 760 for len(a) < n { 761 a = append(a, -1) 762 } 763 return a 764} 765 766// allMatches calls deliver at most n times 767// with the location of successive matches in the input text. 768// The input text is b if non-nil, otherwise s. 769func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 770 var end int 771 if b == nil { 772 end = len(s) 773 } else { 774 end = len(b) 775 } 776 777 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 778 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 779 if len(matches) == 0 { 780 break 781 } 782 783 accept := true 784 if matches[1] == pos { 785 // We've found an empty match. 786 if matches[0] == prevMatchEnd { 787 // We don't allow an empty match right 788 // after a previous match, so ignore it. 789 accept = false 790 } 791 var width int 792 if b == nil { 793 is := inputString{str: s} 794 _, width = is.step(pos) 795 } else { 796 ib := inputBytes{str: b} 797 _, width = ib.step(pos) 798 } 799 if width > 0 { 800 pos += width 801 } else { 802 pos = end + 1 803 } 804 } else { 805 pos = matches[1] 806 } 807 prevMatchEnd = matches[1] 808 809 if accept { 810 deliver(re.pad(matches)) 811 i++ 812 } 813 } 814} 815 816// Find returns a slice holding the text of the leftmost match in b of the regular expression. 817// A return value of nil indicates no match. 818func (re *Regexp) Find(b []byte) []byte { 819 var dstCap [2]int 820 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 821 if a == nil { 822 return nil 823 } 824 return b[a[0]:a[1]:a[1]] 825} 826 827// FindIndex returns a two-element slice of integers defining the location of 828// the leftmost match in b of the regular expression. The match itself is at 829// b[loc[0]:loc[1]]. 830// A return value of nil indicates no match. 831func (re *Regexp) FindIndex(b []byte) (loc []int) { 832 a := re.doExecute(nil, b, "", 0, 2, nil) 833 if a == nil { 834 return nil 835 } 836 return a[0:2] 837} 838 839// FindString returns a string holding the text of the leftmost match in s of the regular 840// expression. If there is no match, the return value is an empty string, 841// but it will also be empty if the regular expression successfully matches 842// an empty string. Use [Regexp.FindStringIndex] or [Regexp.FindStringSubmatch] if it is 843// necessary to distinguish these cases. 844func (re *Regexp) FindString(s string) string { 845 var dstCap [2]int 846 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 847 if a == nil { 848 return "" 849 } 850 return s[a[0]:a[1]] 851} 852 853// FindStringIndex returns a two-element slice of integers defining the 854// location of the leftmost match in s of the regular expression. The match 855// itself is at s[loc[0]:loc[1]]. 856// A return value of nil indicates no match. 857func (re *Regexp) FindStringIndex(s string) (loc []int) { 858 a := re.doExecute(nil, nil, s, 0, 2, nil) 859 if a == nil { 860 return nil 861 } 862 return a[0:2] 863} 864 865// FindReaderIndex returns a two-element slice of integers defining the 866// location of the leftmost match of the regular expression in text read from 867// the [io.RuneReader]. The match text was found in the input stream at 868// byte offset loc[0] through loc[1]-1. 869// A return value of nil indicates no match. 870func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 871 a := re.doExecute(r, nil, "", 0, 2, nil) 872 if a == nil { 873 return nil 874 } 875 return a[0:2] 876} 877 878// FindSubmatch returns a slice of slices holding the text of the leftmost 879// match of the regular expression in b and the matches, if any, of its 880// subexpressions, as defined by the 'Submatch' descriptions in the package 881// comment. 882// A return value of nil indicates no match. 883func (re *Regexp) FindSubmatch(b []byte) [][]byte { 884 var dstCap [4]int 885 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 886 if a == nil { 887 return nil 888 } 889 ret := make([][]byte, 1+re.numSubexp) 890 for i := range ret { 891 if 2*i < len(a) && a[2*i] >= 0 { 892 ret[i] = b[a[2*i]:a[2*i+1]:a[2*i+1]] 893 } 894 } 895 return ret 896} 897 898// Expand appends template to dst and returns the result; during the 899// append, Expand replaces variables in the template with corresponding 900// matches drawn from src. The match slice should have been returned by 901// [Regexp.FindSubmatchIndex]. 902// 903// In the template, a variable is denoted by a substring of the form 904// $name or ${name}, where name is a non-empty sequence of letters, 905// digits, and underscores. A purely numeric name like $1 refers to 906// the submatch with the corresponding index; other names refer to 907// capturing parentheses named with the (?P<name>...) syntax. A 908// reference to an out of range or unmatched index or a name that is not 909// present in the regular expression is replaced with an empty slice. 910// 911// In the $name form, name is taken to be as long as possible: $1x is 912// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 913// 914// To insert a literal $ in the output, use $$ in the template. 915func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 916 return re.expand(dst, string(template), src, "", match) 917} 918 919// ExpandString is like [Regexp.Expand] but the template and source are strings. 920// It appends to and returns a byte slice in order to give the calling 921// code control over allocation. 922func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 923 return re.expand(dst, template, nil, src, match) 924} 925 926func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 927 for len(template) > 0 { 928 before, after, ok := strings.Cut(template, "$") 929 if !ok { 930 break 931 } 932 dst = append(dst, before...) 933 template = after 934 if template != "" && template[0] == '$' { 935 // Treat $$ as $. 936 dst = append(dst, '$') 937 template = template[1:] 938 continue 939 } 940 name, num, rest, ok := extract(template) 941 if !ok { 942 // Malformed; treat $ as raw text. 943 dst = append(dst, '$') 944 continue 945 } 946 template = rest 947 if num >= 0 { 948 if 2*num+1 < len(match) && match[2*num] >= 0 { 949 if bsrc != nil { 950 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 951 } else { 952 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 953 } 954 } 955 } else { 956 for i, namei := range re.subexpNames { 957 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 958 if bsrc != nil { 959 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 960 } else { 961 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 962 } 963 break 964 } 965 } 966 } 967 } 968 dst = append(dst, template...) 969 return dst 970} 971 972// extract returns the name from a leading "name" or "{name}" in str. 973// (The $ has already been removed by the caller.) 974// If it is a number, extract returns num set to that number; otherwise num = -1. 975func extract(str string) (name string, num int, rest string, ok bool) { 976 if str == "" { 977 return 978 } 979 brace := false 980 if str[0] == '{' { 981 brace = true 982 str = str[1:] 983 } 984 i := 0 985 for i < len(str) { 986 rune, size := utf8.DecodeRuneInString(str[i:]) 987 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 988 break 989 } 990 i += size 991 } 992 if i == 0 { 993 // empty name is not okay 994 return 995 } 996 name = str[:i] 997 if brace { 998 if i >= len(str) || str[i] != '}' { 999 // missing closing brace 1000 return 1001 } 1002 i++ 1003 } 1004 1005 // Parse number. 1006 num = 0 1007 for i := 0; i < len(name); i++ { 1008 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 1009 num = -1 1010 break 1011 } 1012 num = num*10 + int(name[i]) - '0' 1013 } 1014 // Disallow leading zeros. 1015 if name[0] == '0' && len(name) > 1 { 1016 num = -1 1017 } 1018 1019 rest = str[i:] 1020 ok = true 1021 return 1022} 1023 1024// FindSubmatchIndex returns a slice holding the index pairs identifying the 1025// leftmost match of the regular expression in b and the matches, if any, of 1026// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 1027// in the package comment. 1028// A return value of nil indicates no match. 1029func (re *Regexp) FindSubmatchIndex(b []byte) []int { 1030 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 1031} 1032 1033// FindStringSubmatch returns a slice of strings holding the text of the 1034// leftmost match of the regular expression in s and the matches, if any, of 1035// its subexpressions, as defined by the 'Submatch' description in the 1036// package comment. 1037// A return value of nil indicates no match. 1038func (re *Regexp) FindStringSubmatch(s string) []string { 1039 var dstCap [4]int 1040 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 1041 if a == nil { 1042 return nil 1043 } 1044 ret := make([]string, 1+re.numSubexp) 1045 for i := range ret { 1046 if 2*i < len(a) && a[2*i] >= 0 { 1047 ret[i] = s[a[2*i]:a[2*i+1]] 1048 } 1049 } 1050 return ret 1051} 1052 1053// FindStringSubmatchIndex returns a slice holding the index pairs 1054// identifying the leftmost match of the regular expression in s and the 1055// matches, if any, of its subexpressions, as defined by the 'Submatch' and 1056// 'Index' descriptions in the package comment. 1057// A return value of nil indicates no match. 1058func (re *Regexp) FindStringSubmatchIndex(s string) []int { 1059 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 1060} 1061 1062// FindReaderSubmatchIndex returns a slice holding the index pairs 1063// identifying the leftmost match of the regular expression of text read by 1064// the [io.RuneReader], and the matches, if any, of its subexpressions, as defined 1065// by the 'Submatch' and 'Index' descriptions in the package comment. A 1066// return value of nil indicates no match. 1067func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 1068 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 1069} 1070 1071const startSize = 10 // The size at which to start a slice in the 'All' routines. 1072 1073// FindAll is the 'All' version of [Regexp.Find]; it returns a slice of all successive 1074// matches of the expression, as defined by the 'All' description in the 1075// package comment. 1076// A return value of nil indicates no match. 1077func (re *Regexp) FindAll(b []byte, n int) [][]byte { 1078 if n < 0 { 1079 n = len(b) + 1 1080 } 1081 var result [][]byte 1082 re.allMatches("", b, n, func(match []int) { 1083 if result == nil { 1084 result = make([][]byte, 0, startSize) 1085 } 1086 result = append(result, b[match[0]:match[1]:match[1]]) 1087 }) 1088 return result 1089} 1090 1091// FindAllIndex is the 'All' version of [Regexp.FindIndex]; it returns a slice of all 1092// successive matches of the expression, as defined by the 'All' description 1093// in the package comment. 1094// A return value of nil indicates no match. 1095func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1096 if n < 0 { 1097 n = len(b) + 1 1098 } 1099 var result [][]int 1100 re.allMatches("", b, n, func(match []int) { 1101 if result == nil { 1102 result = make([][]int, 0, startSize) 1103 } 1104 result = append(result, match[0:2]) 1105 }) 1106 return result 1107} 1108 1109// FindAllString is the 'All' version of [Regexp.FindString]; it returns a slice of all 1110// successive matches of the expression, as defined by the 'All' description 1111// in the package comment. 1112// A return value of nil indicates no match. 1113func (re *Regexp) FindAllString(s string, n int) []string { 1114 if n < 0 { 1115 n = len(s) + 1 1116 } 1117 var result []string 1118 re.allMatches(s, nil, n, func(match []int) { 1119 if result == nil { 1120 result = make([]string, 0, startSize) 1121 } 1122 result = append(result, s[match[0]:match[1]]) 1123 }) 1124 return result 1125} 1126 1127// FindAllStringIndex is the 'All' version of [Regexp.FindStringIndex]; it returns a 1128// slice of all successive matches of the expression, as defined by the 'All' 1129// description in the package comment. 1130// A return value of nil indicates no match. 1131func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1132 if n < 0 { 1133 n = len(s) + 1 1134 } 1135 var result [][]int 1136 re.allMatches(s, nil, n, func(match []int) { 1137 if result == nil { 1138 result = make([][]int, 0, startSize) 1139 } 1140 result = append(result, match[0:2]) 1141 }) 1142 return result 1143} 1144 1145// FindAllSubmatch is the 'All' version of [Regexp.FindSubmatch]; it returns a slice 1146// of all successive matches of the expression, as defined by the 'All' 1147// description in the package comment. 1148// A return value of nil indicates no match. 1149func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1150 if n < 0 { 1151 n = len(b) + 1 1152 } 1153 var result [][][]byte 1154 re.allMatches("", b, n, func(match []int) { 1155 if result == nil { 1156 result = make([][][]byte, 0, startSize) 1157 } 1158 slice := make([][]byte, len(match)/2) 1159 for j := range slice { 1160 if match[2*j] >= 0 { 1161 slice[j] = b[match[2*j]:match[2*j+1]:match[2*j+1]] 1162 } 1163 } 1164 result = append(result, slice) 1165 }) 1166 return result 1167} 1168 1169// FindAllSubmatchIndex is the 'All' version of [Regexp.FindSubmatchIndex]; it returns 1170// a slice of all successive matches of the expression, as defined by the 1171// 'All' description in the package comment. 1172// A return value of nil indicates no match. 1173func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1174 if n < 0 { 1175 n = len(b) + 1 1176 } 1177 var result [][]int 1178 re.allMatches("", b, n, func(match []int) { 1179 if result == nil { 1180 result = make([][]int, 0, startSize) 1181 } 1182 result = append(result, match) 1183 }) 1184 return result 1185} 1186 1187// FindAllStringSubmatch is the 'All' version of [Regexp.FindStringSubmatch]; it 1188// returns a slice of all successive matches of the expression, as defined by 1189// the 'All' description in the package comment. 1190// A return value of nil indicates no match. 1191func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1192 if n < 0 { 1193 n = len(s) + 1 1194 } 1195 var result [][]string 1196 re.allMatches(s, nil, n, func(match []int) { 1197 if result == nil { 1198 result = make([][]string, 0, startSize) 1199 } 1200 slice := make([]string, len(match)/2) 1201 for j := range slice { 1202 if match[2*j] >= 0 { 1203 slice[j] = s[match[2*j]:match[2*j+1]] 1204 } 1205 } 1206 result = append(result, slice) 1207 }) 1208 return result 1209} 1210 1211// FindAllStringSubmatchIndex is the 'All' version of 1212// [Regexp.FindStringSubmatchIndex]; it returns a slice of all successive matches of 1213// the expression, as defined by the 'All' description in the package 1214// comment. 1215// A return value of nil indicates no match. 1216func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1217 if n < 0 { 1218 n = len(s) + 1 1219 } 1220 var result [][]int 1221 re.allMatches(s, nil, n, func(match []int) { 1222 if result == nil { 1223 result = make([][]int, 0, startSize) 1224 } 1225 result = append(result, match) 1226 }) 1227 return result 1228} 1229 1230// Split slices s into substrings separated by the expression and returns a slice of 1231// the substrings between those expression matches. 1232// 1233// The slice returned by this method consists of all the substrings of s 1234// not contained in the slice returned by [Regexp.FindAllString]. When called on an expression 1235// that contains no metacharacters, it is equivalent to [strings.SplitN]. 1236// 1237// Example: 1238// 1239// s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1240// // s: ["", "b", "b", "c", "cadaaae"] 1241// 1242// The count determines the number of substrings to return: 1243// - n > 0: at most n substrings; the last substring will be the unsplit remainder; 1244// - n == 0: the result is nil (zero substrings); 1245// - n < 0: all substrings. 1246func (re *Regexp) Split(s string, n int) []string { 1247 1248 if n == 0 { 1249 return nil 1250 } 1251 1252 if len(re.expr) > 0 && len(s) == 0 { 1253 return []string{""} 1254 } 1255 1256 matches := re.FindAllStringIndex(s, n) 1257 strings := make([]string, 0, len(matches)) 1258 1259 beg := 0 1260 end := 0 1261 for _, match := range matches { 1262 if n > 0 && len(strings) >= n-1 { 1263 break 1264 } 1265 1266 end = match[0] 1267 if match[1] != 0 { 1268 strings = append(strings, s[beg:end]) 1269 } 1270 beg = match[1] 1271 } 1272 1273 if end != len(s) { 1274 strings = append(strings, s[beg:]) 1275 } 1276 1277 return strings 1278} 1279 1280// MarshalText implements [encoding.TextMarshaler]. The output 1281// matches that of calling the [Regexp.String] method. 1282// 1283// Note that the output is lossy in some cases: This method does not indicate 1284// POSIX regular expressions (i.e. those compiled by calling [CompilePOSIX]), or 1285// those for which the [Regexp.Longest] method has been called. 1286func (re *Regexp) MarshalText() ([]byte, error) { 1287 return []byte(re.String()), nil 1288} 1289 1290// UnmarshalText implements [encoding.TextUnmarshaler] by calling 1291// [Compile] on the encoded value. 1292func (re *Regexp) UnmarshalText(text []byte) error { 1293 newRE, err := Compile(string(text)) 1294 if err != nil { 1295 return err 1296 } 1297 *re = *newRE 1298 return nil 1299} 1300