// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Package bytes implements functions for the manipulation of byte slices.
// It is analogous to the facilities of the [strings] package.
package bytes

import (
	"internal/bytealg"
	"unicode"
	"unicode/utf8"
	_ "unsafe" // for linkname
)

// Equal reports whether a and b
// are the same length and contain the same bytes.
// A nil argument is equivalent to an empty slice.
func Equal(a, b []byte) bool {
	// Neither cmd/compile nor gccgo allocates for these string conversions.
	return string(a) == string(b)
}

// Compare returns an integer comparing two byte slices lexicographically.
// The result will be 0 if a == b, -1 if a < b, and +1 if a > b.
// A nil argument is equivalent to an empty slice.
func Compare(a, b []byte) int {
	return bytealg.Compare(a, b)
}

// explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
// up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
func explode(s []byte, n int) [][]byte {
	if n <= 0 || n > len(s) {
		n = len(s)
	}
	a := make([][]byte, n)
	var size int
	na := 0
	for len(s) > 0 {
		if na+1 >= n {
			a[na] = s
			na++
			break
		}
		_, size = utf8.DecodeRune(s)
		a[na] = s[0:size:size]
		s = s[size:]
		na++
	}
	return a[0:na]
}

// Count counts the number of non-overlapping instances of sep in s.
// If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
func Count(s, sep []byte) int {
	// special case
	if len(sep) == 0 {
		return utf8.RuneCount(s) + 1
	}
	if len(sep) == 1 {
		return bytealg.Count(s, sep[0])
	}
	n := 0
	for {
		i := Index(s, sep)
		if i == -1 {
			return n
		}
		n++
		s = s[i+len(sep):]
	}
}

// Contains reports whether subslice is within b.
func Contains(b, subslice []byte) bool {
	return Index(b, subslice) != -1
}

// ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
func ContainsAny(b []byte, chars string) bool {
	return IndexAny(b, chars) >= 0
}

// ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
func ContainsRune(b []byte, r rune) bool {
	return IndexRune(b, r) >= 0
}

// ContainsFunc reports whether any of the UTF-8-encoded code points r within b satisfy f(r).
func ContainsFunc(b []byte, f func(rune) bool) bool {
	return IndexFunc(b, f) >= 0
}

// IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
func IndexByte(b []byte, c byte) int {
	return bytealg.IndexByte(b, c)
}

func indexBytePortable(s []byte, c byte) int {
	for i, b := range s {
		if b == c {
			return i
		}
	}
	return -1
}

// LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
func LastIndex(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return len(s)
	case n == 1:
		return bytealg.LastIndexByte(s, sep[0])
	case n == len(s):
		if Equal(s, sep) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	}
	return bytealg.LastIndexRabinKarp(s, sep)
}
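
// A brief illustrative sketch of the comparison and counting helpers above;
// the literal arguments are sample inputs, and the results follow the
// documented behavior:
//
//	Equal([]byte("Go"), []byte("Go"))    // true
//	Compare([]byte("a"), []byte("b"))    // -1
//	Count([]byte("cheese"), []byte("e")) // 3
//	Count([]byte("five"), nil)           // 5, i.e. 1 + the number of runes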

// LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
func LastIndexByte(s []byte, c byte) int {
	return bytealg.LastIndexByte(s, c)
}

// IndexRune interprets s as a sequence of UTF-8-encoded code points.
// It returns the byte index of the first occurrence in s of the given rune.
// It returns -1 if rune is not present in s.
// If r is [utf8.RuneError], it returns the first instance of any
// invalid UTF-8 byte sequence.
func IndexRune(s []byte, r rune) int {
	switch {
	case 0 <= r && r < utf8.RuneSelf:
		return IndexByte(s, byte(r))
	case r == utf8.RuneError:
		for i := 0; i < len(s); {
			r1, n := utf8.DecodeRune(s[i:])
			if r1 == utf8.RuneError {
				return i
			}
			i += n
		}
		return -1
	case !utf8.ValidRune(r):
		return -1
	default:
		var b [utf8.UTFMax]byte
		n := utf8.EncodeRune(b[:], r)
		return Index(s, b[:n])
	}
}

// IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
// It returns the byte index of the first occurrence in s of any of the Unicode
// code points in chars. It returns -1 if chars is empty or if there is no code
// point in common.
func IndexAny(s []byte, chars string) int {
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) == 1 {
		r := rune(s[0])
		if r >= utf8.RuneSelf {
			// search utf8.RuneError.
			for _, r = range chars {
				if r == utf8.RuneError {
					return 0
				}
			}
			return -1
		}
		if bytealg.IndexByteString(chars, s[0]) >= 0 {
			return 0
		}
		return -1
	}
	if len(chars) == 1 {
		r := rune(chars[0])
		if r >= utf8.RuneSelf {
			r = utf8.RuneError
		}
		return IndexRune(s, r)
	}
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i, c := range s {
				if as.contains(c) {
					return i
				}
			}
			return -1
		}
	}
	var width int
	for i := 0; i < len(s); i += width {
		r := rune(s[i])
		if r < utf8.RuneSelf {
			if bytealg.IndexByteString(chars, s[i]) >= 0 {
				return i
			}
			width = 1
			continue
		}
		r, width = utf8.DecodeRune(s[i:])
		if r != utf8.RuneError {
			// r is 2 to 4 bytes
			if len(chars) == width {
				if chars == string(r) {
					return i
				}
				continue
			}
			// Use bytealg.IndexString for performance if available.
			if bytealg.MaxLen >= width {
				if bytealg.IndexString(chars, string(r)) >= 0 {
					return i
				}
				continue
			}
		}
		for _, ch := range chars {
			if r == ch {
				return i
			}
		}
	}
	return -1
}
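
// A short illustrative sketch of the rune-oriented index helpers on sample
// inputs, following the documented behavior:
//
//	IndexRune([]byte("chicken"), 'k')       // 4
//	IndexAny([]byte("golang"), "aeiouy")    // 1, the first byte matching a code point in chars
//	IndexAny([]byte("golang"), "")          // -1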

// LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
// points. It returns the byte index of the last occurrence in s of any of
// the Unicode code points in chars. It returns -1 if chars is empty or if
// there is no code point in common.
func LastIndexAny(s []byte, chars string) int {
	if chars == "" {
		// Avoid scanning all of s.
		return -1
	}
	if len(s) > 8 {
		if as, isASCII := makeASCIISet(chars); isASCII {
			for i := len(s) - 1; i >= 0; i-- {
				if as.contains(s[i]) {
					return i
				}
			}
			return -1
		}
	}
	if len(s) == 1 {
		r := rune(s[0])
		if r >= utf8.RuneSelf {
			for _, r = range chars {
				if r == utf8.RuneError {
					return 0
				}
			}
			return -1
		}
		if bytealg.IndexByteString(chars, s[0]) >= 0 {
			return 0
		}
		return -1
	}
	if len(chars) == 1 {
		cr := rune(chars[0])
		if cr >= utf8.RuneSelf {
			cr = utf8.RuneError
		}
		for i := len(s); i > 0; {
			r, size := utf8.DecodeLastRune(s[:i])
			i -= size
			if r == cr {
				return i
			}
		}
		return -1
	}
	for i := len(s); i > 0; {
		r := rune(s[i-1])
		if r < utf8.RuneSelf {
			if bytealg.IndexByteString(chars, s[i-1]) >= 0 {
				return i - 1
			}
			i--
			continue
		}
		r, size := utf8.DecodeLastRune(s[:i])
		i -= size
		if r != utf8.RuneError {
			// r is 2 to 4 bytes
			if len(chars) == size {
				if chars == string(r) {
					return i
				}
				continue
			}
			// Use bytealg.IndexString for performance if available.
			if bytealg.MaxLen >= size {
				if bytealg.IndexString(chars, string(r)) >= 0 {
					return i
				}
				continue
			}
		}
		for _, ch := range chars {
			if r == ch {
				return i
			}
		}
	}
	return -1
}

// Generic split: splits after each instance of sep,
// including sepSave bytes of sep in the subslices.
func genSplit(s, sep []byte, sepSave, n int) [][]byte {
	if n == 0 {
		return nil
	}
	if len(sep) == 0 {
		return explode(s, n)
	}
	if n < 0 {
		n = Count(s, sep) + 1
	}
	if n > len(s)+1 {
		n = len(s) + 1
	}

	a := make([][]byte, n)
	n--
	i := 0
	for i < n {
		m := Index(s, sep)
		if m < 0 {
			break
		}
		a[i] = s[: m+sepSave : m+sepSave]
		s = s[m+len(sep):]
		i++
	}
	a[i] = s
	return a[:i+1]
}

// SplitN slices s into subslices separated by sep and returns a slice of
// the subslices between those separators.
// If sep is empty, SplitN splits after each UTF-8 sequence.
// The count determines the number of subslices to return:
//   - n > 0: at most n subslices; the last subslice will be the unsplit remainder;
//   - n == 0: the result is nil (zero subslices);
//   - n < 0: all subslices.
//
// To split around the first instance of a separator, see [Cut].
func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }

// SplitAfterN slices s into subslices after each instance of sep and
// returns a slice of those subslices.
// If sep is empty, SplitAfterN splits after each UTF-8 sequence.
// The count determines the number of subslices to return:
//   - n > 0: at most n subslices; the last subslice will be the unsplit remainder;
//   - n == 0: the result is nil (zero subslices);
//   - n < 0: all subslices.
func SplitAfterN(s, sep []byte, n int) [][]byte {
	return genSplit(s, sep, len(sep), n)
}
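
// A small sketch of how the count n behaves, using SplitN on a sample input:
//
//	SplitN([]byte("a,b,c"), []byte(","), 2)  // ["a" "b,c"]
//	SplitN([]byte("a,b,c"), []byte(","), 0)  // nil
//	SplitN([]byte("a,b,c"), []byte(","), -1) // ["a" "b" "c"]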

// Split slices s into all subslices separated by sep and returns a slice of
// the subslices between those separators.
// If sep is empty, Split splits after each UTF-8 sequence.
// It is equivalent to SplitN with a count of -1.
//
// To split around the first instance of a separator, see [Cut].
func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }

// SplitAfter slices s into all subslices after each instance of sep and
// returns a slice of those subslices.
// If sep is empty, SplitAfter splits after each UTF-8 sequence.
// It is equivalent to SplitAfterN with a count of -1.
func SplitAfter(s, sep []byte) [][]byte {
	return genSplit(s, sep, len(sep), -1)
}

var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}

// Fields interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s around each instance of one or more consecutive white space
// characters, as defined by [unicode.IsSpace], returning a slice of subslices of s or an
// empty slice if s contains only white space.
func Fields(s []byte) [][]byte {
	// First count the fields.
	// This is an exact count if s is ASCII, otherwise it is an approximation.
	n := 0
	wasSpace := 1
	// setBits is used to track which bits are set in the bytes of s.
	setBits := uint8(0)
	for i := 0; i < len(s); i++ {
		r := s[i]
		setBits |= r
		isSpace := int(asciiSpace[r])
		n += wasSpace & ^isSpace
		wasSpace = isSpace
	}

	if setBits >= utf8.RuneSelf {
		// Some runes in the input slice are not ASCII.
		return FieldsFunc(s, unicode.IsSpace)
	}

	// ASCII fast path
	a := make([][]byte, n)
	na := 0
	fieldStart := 0
	i := 0
	// Skip spaces in the front of the input.
	for i < len(s) && asciiSpace[s[i]] != 0 {
		i++
	}
	fieldStart = i
	for i < len(s) {
		if asciiSpace[s[i]] == 0 {
			i++
			continue
		}
		a[na] = s[fieldStart:i:i]
		na++
		i++
		// Skip spaces in between fields.
		for i < len(s) && asciiSpace[s[i]] != 0 {
			i++
		}
		fieldStart = i
	}
	if fieldStart < len(s) { // Last field might end at EOF.
		a[na] = s[fieldStart:len(s):len(s)]
	}
	return a
}
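
// Fields on a sample input, for illustration:
//
//	Fields([]byte("  foo bar  baz   "))
//	// [][]byte{[]byte("foo"), []byte("bar"), []byte("baz")}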

// FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
// It splits the slice s at each run of code points c satisfying f(c) and
// returns a slice of subslices of s. If all code points in s satisfy f(c), or
// len(s) == 0, an empty slice is returned.
//
// FieldsFunc makes no guarantees about the order in which it calls f(c)
// and assumes that f always returns the same value for a given c.
func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
	// A span is used to record a slice of s of the form s[start:end].
	// The start index is inclusive and the end index is exclusive.
	type span struct {
		start int
		end   int
	}
	spans := make([]span, 0, 32)

	// Find the field start and end indices.
	// Doing this in a separate pass (rather than slicing the string s
	// and collecting the result substrings right away) is significantly
	// more efficient, possibly due to cache effects.
	start := -1 // valid span start if >= 0
	for i := 0; i < len(s); {
		size := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeRune(s[i:])
		}
		if f(r) {
			if start >= 0 {
				spans = append(spans, span{start, i})
				start = -1
			}
		} else {
			if start < 0 {
				start = i
			}
		}
		i += size
	}

	// Last field might end at EOF.
	if start >= 0 {
		spans = append(spans, span{start, len(s)})
	}

	// Create subslices from recorded field indices.
	a := make([][]byte, len(spans))
	for i, span := range spans {
		a[i] = s[span.start:span.end:span.end]
	}

	return a
}

// Join concatenates the elements of s to create a new byte slice. The separator
// sep is placed between elements in the resulting slice.
func Join(s [][]byte, sep []byte) []byte {
	if len(s) == 0 {
		return []byte{}
	}
	if len(s) == 1 {
		// Just return a copy.
		return append([]byte(nil), s[0]...)
	}

	var n int
	if len(sep) > 0 {
		if len(sep) >= maxInt/(len(s)-1) {
			panic("bytes: Join output length overflow")
		}
		n += len(sep) * (len(s) - 1)
	}
	for _, v := range s {
		if len(v) > maxInt-n {
			panic("bytes: Join output length overflow")
		}
		n += len(v)
	}

	b := bytealg.MakeNoZero(n)[:n:n]
	bp := copy(b, s[0])
	for _, v := range s[1:] {
		bp += copy(b[bp:], sep)
		bp += copy(b[bp:], v)
	}
	return b
}

// HasPrefix reports whether the byte slice s begins with prefix.
func HasPrefix(s, prefix []byte) bool {
	return len(s) >= len(prefix) && Equal(s[0:len(prefix)], prefix)
}

// HasSuffix reports whether the byte slice s ends with suffix.
func HasSuffix(s, suffix []byte) bool {
	return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
}

// Map returns a copy of the byte slice s with all its characters modified
// according to the mapping function. If mapping returns a negative value, the character is
// dropped from the byte slice with no replacement. The characters in s and the
// output are interpreted as UTF-8-encoded code points.
func Map(mapping func(r rune) rune, s []byte) []byte {
	// In the worst case, the slice can grow when mapped, making
	// things unpleasant. But it's so rare we barge in assuming it's
	// fine. It could also shrink but that falls out naturally.
	b := make([]byte, 0, len(s))
	for i := 0; i < len(s); {
		wid := 1
		r := rune(s[i])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[i:])
		}
		r = mapping(r)
		if r >= 0 {
			b = utf8.AppendRune(b, r)
		}
		i += wid
	}
	return b
}
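
// Join and Map on sample inputs, for illustration; the closure passed to Map
// drops spaces by returning a negative value, as described above:
//
//	Join([][]byte{[]byte("a"), []byte("b")}, []byte(", ")) // "a, b"
//	Map(func(r rune) rune {
//		if r == ' ' {
//			return -1 // drop spaces
//		}
//		return r
//	}, []byte("a b c")) // "abc"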

// Despite being an exported symbol,
// Repeat is linknamed by widely used packages.
// Notable members of the hall of shame include:
//   - gitee.com/quant1x/num
//
// Do not remove or change the type signature.
// See go.dev/issue/67401.
//
// Note that this comment is not part of the doc comment.
//
//go:linkname Repeat

// Repeat returns a new byte slice consisting of count copies of b.
//
// It panics if count is negative or if the result of (len(b) * count)
// overflows.
func Repeat(b []byte, count int) []byte {
	if count == 0 {
		return []byte{}
	}

	// Since we cannot return an error on overflow,
	// we should panic if the repeat will generate an overflow.
	// See golang.org/issue/16237.
	if count < 0 {
		panic("bytes: negative Repeat count")
	}
	if len(b) > maxInt/count {
		panic("bytes: Repeat output length overflow")
	}
	n := len(b) * count

	if len(b) == 0 {
		return []byte{}
	}

	// Past a certain chunk size it is counterproductive to use
	// larger chunks as the source of the write, as when the source
	// is too large we are basically just thrashing the CPU D-cache.
	// So if the result length is larger than an empirically-found
	// limit (8KB), we stop growing the source string once the limit
	// is reached and keep reusing the same source string - that
	// should therefore be always resident in the L1 cache - until we
	// have completed the construction of the result.
	// This yields significant speedups (up to +100%) in cases where
	// the result length is large (roughly, over L2 cache size).
	const chunkLimit = 8 * 1024
	chunkMax := n
	if chunkMax > chunkLimit {
		chunkMax = chunkLimit / len(b) * len(b)
		if chunkMax == 0 {
			chunkMax = len(b)
		}
	}
	nb := bytealg.MakeNoZero(n)[:n:n]
	bp := copy(nb, b)
	for bp < n {
		chunk := bp
		if chunk > chunkMax {
			chunk = chunkMax
		}
		bp += copy(nb[bp:], nb[:chunk])
	}
	return nb
}

// ToUpper returns a copy of the byte slice s with all Unicode letters mapped to
// their upper case.
func ToUpper(s []byte) []byte {
	isASCII, hasLower := true, false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			isASCII = false
			break
		}
		hasLower = hasLower || ('a' <= c && c <= 'z')
	}

	if isASCII { // optimize for ASCII-only byte slices.
		if !hasLower {
			// Just return a copy.
			return append([]byte(""), s...)
		}
		b := bytealg.MakeNoZero(len(s))[:len(s):len(s)]
		for i := 0; i < len(s); i++ {
			c := s[i]
			if 'a' <= c && c <= 'z' {
				c -= 'a' - 'A'
			}
			b[i] = c
		}
		return b
	}
	return Map(unicode.ToUpper, s)
}

// ToLower returns a copy of the byte slice s with all Unicode letters mapped to
// their lower case.
func ToLower(s []byte) []byte {
	isASCII, hasUpper := true, false
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c >= utf8.RuneSelf {
			isASCII = false
			break
		}
		hasUpper = hasUpper || ('A' <= c && c <= 'Z')
	}

	if isASCII { // optimize for ASCII-only byte slices.
		if !hasUpper {
			return append([]byte(""), s...)
		}
		b := bytealg.MakeNoZero(len(s))[:len(s):len(s)]
		for i := 0; i < len(s); i++ {
			c := s[i]
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			}
			b[i] = c
		}
		return b
	}
	return Map(unicode.ToLower, s)
}

// ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }

// ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
// upper case, giving priority to the special casing rules.
func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(c.ToUpper, s)
}

// ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
// lower case, giving priority to the special casing rules.
func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(c.ToLower, s)
}

// ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
// title case, giving priority to the special casing rules.
func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
	return Map(c.ToTitle, s)
}
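
// Repeat and the case-mapping helpers on sample inputs, for illustration:
//
//	Repeat([]byte("na"), 2)   // "nana"
//	ToUpper([]byte("Gopher")) // "GOPHER"
//	ToLower([]byte("GoLang")) // "golang"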

// ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
// representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
func ToValidUTF8(s, replacement []byte) []byte {
	b := make([]byte, 0, len(s)+len(replacement))
	invalid := false // previous byte was from an invalid UTF-8 sequence
	for i := 0; i < len(s); {
		c := s[i]
		if c < utf8.RuneSelf {
			i++
			invalid = false
			b = append(b, c)
			continue
		}
		_, wid := utf8.DecodeRune(s[i:])
		if wid == 1 {
			i++
			if !invalid {
				invalid = true
				b = append(b, replacement...)
			}
			continue
		}
		invalid = false
		b = append(b, s[i:i+wid]...)
		i += wid
	}
	return b
}

// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {
	// ASCII alphanumerics and underscore are not separators
	if r <= 0x7F {
		switch {
		case '0' <= r && r <= '9':
			return false
		case 'a' <= r && r <= 'z':
			return false
		case 'A' <= r && r <= 'Z':
			return false
		case r == '_':
			return false
		}
		return true
	}
	// Letters and digits are not separators
	if unicode.IsLetter(r) || unicode.IsDigit(r) {
		return false
	}
	// Otherwise, all we can do for now is treat spaces as separators.
	return unicode.IsSpace(r)
}

// Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
// words mapped to their title case.
//
// Deprecated: The rule Title uses for word boundaries does not handle Unicode
// punctuation properly. Use golang.org/x/text/cases instead.
func Title(s []byte) []byte {
	// Use a closure here to remember state.
	// Hackish but effective. Depends on Map scanning in order and calling
	// the closure once per rune.
	prev := ' '
	return Map(
		func(r rune) rune {
			if isSeparator(prev) {
				prev = r
				return unicode.ToTitle(r)
			}
			prev = r
			return r
		},
		s)
}

// TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
// all leading UTF-8-encoded code points c that satisfy f(c).
func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
	i := indexFunc(s, f, false)
	if i == -1 {
		return nil
	}
	return s[i:]
}

// TrimRightFunc returns a subslice of s by slicing off all trailing
// UTF-8-encoded code points c that satisfy f(c).
func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
	i := lastIndexFunc(s, f, false)
	if i >= 0 && s[i] >= utf8.RuneSelf {
		_, wid := utf8.DecodeRune(s[i:])
		i += wid
	} else {
		i++
	}
	return s[0:i]
}

// TrimFunc returns a subslice of s by slicing off all leading and trailing
// UTF-8-encoded code points c that satisfy f(c).
func TrimFunc(s []byte, f func(r rune) bool) []byte {
	return TrimRightFunc(TrimLeftFunc(s, f), f)
}

// TrimPrefix returns s without the provided leading prefix string.
// If s doesn't start with prefix, s is returned unchanged.
func TrimPrefix(s, prefix []byte) []byte {
	if HasPrefix(s, prefix) {
		return s[len(prefix):]
	}
	return s
}

// TrimSuffix returns s without the provided trailing suffix string.
// If s doesn't end with suffix, s is returned unchanged.
func TrimSuffix(s, suffix []byte) []byte {
	if HasSuffix(s, suffix) {
		return s[:len(s)-len(suffix)]
	}
	return s
}
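
// The prefix/suffix and func-based trimmers on sample inputs, for illustration:
//
//	TrimPrefix([]byte("v1.2.3"), []byte("v"))                              // "1.2.3"
//	TrimSuffix([]byte("file.go"), []byte(".go"))                           // "file"
//	TrimFunc([]byte("***Go***"), func(r rune) bool { return r == '*' })    // "Go"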

// IndexFunc interprets s as a sequence of UTF-8-encoded code points.
// It returns the byte index in s of the first Unicode
// code point satisfying f(c), or -1 if none do.
func IndexFunc(s []byte, f func(r rune) bool) int {
	return indexFunc(s, f, true)
}

// LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
// It returns the byte index in s of the last Unicode
// code point satisfying f(c), or -1 if none do.
func LastIndexFunc(s []byte, f func(r rune) bool) int {
	return lastIndexFunc(s, f, true)
}

// indexFunc is the same as IndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
	start := 0
	for start < len(s) {
		wid := 1
		r := rune(s[start])
		if r >= utf8.RuneSelf {
			r, wid = utf8.DecodeRune(s[start:])
		}
		if f(r) == truth {
			return start
		}
		start += wid
	}
	return -1
}

// lastIndexFunc is the same as LastIndexFunc except that if
// truth==false, the sense of the predicate function is
// inverted.
func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
	for i := len(s); i > 0; {
		r, size := rune(s[i-1]), 1
		if r >= utf8.RuneSelf {
			r, size = utf8.DecodeLastRune(s[0:i])
		}
		i -= size
		if f(r) == truth {
			return i
		}
	}
	return -1
}

// asciiSet is a 32-byte value, where each bit represents the presence of a
// given ASCII character in the set. The 128-bits of the lower 16 bytes,
// starting with the least-significant bit of the lowest word to the
// most-significant bit of the highest word, map to the full range of all
// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
// ensuring that any non-ASCII character will be reported as not in the set.
// This allocates a total of 32 bytes even though the upper half
// is unused to avoid bounds checks in asciiSet.contains.
type asciiSet [8]uint32

// makeASCIISet creates a set of ASCII characters and reports whether all
// characters in chars are ASCII.
func makeASCIISet(chars string) (as asciiSet, ok bool) {
	for i := 0; i < len(chars); i++ {
		c := chars[i]
		if c >= utf8.RuneSelf {
			return as, false
		}
		as[c/32] |= 1 << (c % 32)
	}
	return as, true
}

// contains reports whether c is inside the set.
func (as *asciiSet) contains(c byte) bool {
	return (as[c/32] & (1 << (c % 32))) != 0
}

// containsRune is a simplified version of strings.ContainsRune
// to avoid importing the strings package.
// We avoid bytes.ContainsRune to avoid allocating a temporary copy of s.
func containsRune(s string, r rune) bool {
	for _, c := range s {
		if c == r {
			return true
		}
	}
	return false
}

// Trim returns a subslice of s by slicing off all leading and
// trailing UTF-8-encoded code points contained in cutset.
func Trim(s []byte, cutset string) []byte {
	if len(s) == 0 {
		// This is what we've historically done.
		return nil
	}
	if cutset == "" {
		return s
	}
	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
		return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0])
	}
	if as, ok := makeASCIISet(cutset); ok {
		return trimLeftASCII(trimRightASCII(s, &as), &as)
	}
	return trimLeftUnicode(trimRightUnicode(s, cutset), cutset)
}
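
// IndexFunc and Trim on sample inputs, for illustration:
//
//	IndexFunc([]byte("go123"), unicode.IsDigit) // 2
//	Trim([]byte("###Go###"), "#")               // "Go"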

// TrimLeft returns a subslice of s by slicing off all leading
// UTF-8-encoded code points contained in cutset.
func TrimLeft(s []byte, cutset string) []byte {
	if len(s) == 0 {
		// This is what we've historically done.
		return nil
	}
	if cutset == "" {
		return s
	}
	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
		return trimLeftByte(s, cutset[0])
	}
	if as, ok := makeASCIISet(cutset); ok {
		return trimLeftASCII(s, &as)
	}
	return trimLeftUnicode(s, cutset)
}

func trimLeftByte(s []byte, c byte) []byte {
	for len(s) > 0 && s[0] == c {
		s = s[1:]
	}
	if len(s) == 0 {
		// This is what we've historically done.
		return nil
	}
	return s
}

func trimLeftASCII(s []byte, as *asciiSet) []byte {
	for len(s) > 0 {
		if !as.contains(s[0]) {
			break
		}
		s = s[1:]
	}
	if len(s) == 0 {
		// This is what we've historically done.
		return nil
	}
	return s
}

func trimLeftUnicode(s []byte, cutset string) []byte {
	for len(s) > 0 {
		r, n := rune(s[0]), 1
		if r >= utf8.RuneSelf {
			r, n = utf8.DecodeRune(s)
		}
		if !containsRune(cutset, r) {
			break
		}
		s = s[n:]
	}
	if len(s) == 0 {
		// This is what we've historically done.
		return nil
	}
	return s
}

// TrimRight returns a subslice of s by slicing off all trailing
// UTF-8-encoded code points that are contained in cutset.
func TrimRight(s []byte, cutset string) []byte {
	if len(s) == 0 || cutset == "" {
		return s
	}
	if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
		return trimRightByte(s, cutset[0])
	}
	if as, ok := makeASCIISet(cutset); ok {
		return trimRightASCII(s, &as)
	}
	return trimRightUnicode(s, cutset)
}

func trimRightByte(s []byte, c byte) []byte {
	for len(s) > 0 && s[len(s)-1] == c {
		s = s[:len(s)-1]
	}
	return s
}

func trimRightASCII(s []byte, as *asciiSet) []byte {
	for len(s) > 0 {
		if !as.contains(s[len(s)-1]) {
			break
		}
		s = s[:len(s)-1]
	}
	return s
}

func trimRightUnicode(s []byte, cutset string) []byte {
	for len(s) > 0 {
		r, n := rune(s[len(s)-1]), 1
		if r >= utf8.RuneSelf {
			r, n = utf8.DecodeLastRune(s)
		}
		if !containsRune(cutset, r) {
			break
		}
		s = s[:len(s)-n]
	}
	return s
}
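
// TrimLeft and TrimRight on sample inputs, for illustration:
//
//	TrimLeft([]byte("00042"), "0")                    // "42"
//	TrimRight([]byte("453gopher8257"), "0123456789")  // "453gopher"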

// TrimSpace returns a subslice of s by slicing off all leading and
// trailing white space, as defined by Unicode.
func TrimSpace(s []byte) []byte {
	// Fast path for ASCII: look for the first ASCII non-space byte
	start := 0
	for ; start < len(s); start++ {
		c := s[start]
		if c >= utf8.RuneSelf {
			// If we run into a non-ASCII byte, fall back to the
			// slower unicode-aware method on the remaining bytes
			return TrimFunc(s[start:], unicode.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
	}

	// Now look for the first ASCII non-space byte from the end
	stop := len(s)
	for ; stop > start; stop-- {
		c := s[stop-1]
		if c >= utf8.RuneSelf {
			return TrimFunc(s[start:stop], unicode.IsSpace)
		}
		if asciiSpace[c] == 0 {
			break
		}
	}

	// At this point s[start:stop] starts and ends with ASCII
	// non-space bytes, so we're done. Non-ASCII cases have already
	// been handled above.
	if start == stop {
		// Special case to preserve previous TrimLeftFunc behavior,
		// returning nil instead of empty slice if all spaces.
		return nil
	}
	return s[start:stop]
}

// Runes interprets s as a sequence of UTF-8-encoded code points.
// It returns a slice of runes (Unicode code points) equivalent to s.
func Runes(s []byte) []rune {
	t := make([]rune, utf8.RuneCount(s))
	i := 0
	for len(s) > 0 {
		r, l := utf8.DecodeRune(s)
		t[i] = r
		i++
		s = s[l:]
	}
	return t
}

// Replace returns a copy of the slice s with the first n
// non-overlapping instances of old replaced by new.
// If old is empty, it matches at the beginning of the slice
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune slice.
// If n < 0, there is no limit on the number of replacements.
func Replace(s, old, new []byte, n int) []byte {
	m := 0
	if n != 0 {
		// Compute number of replacements.
		m = Count(s, old)
	}
	if m == 0 {
		// Just return a copy.
		return append([]byte(nil), s...)
	}
	if n < 0 || m < n {
		n = m
	}

	// Apply replacements to buffer.
	t := make([]byte, len(s)+n*(len(new)-len(old)))
	w := 0
	start := 0
	for i := 0; i < n; i++ {
		j := start
		if len(old) == 0 {
			if i > 0 {
				_, wid := utf8.DecodeRune(s[start:])
				j += wid
			}
		} else {
			j += Index(s[start:], old)
		}
		w += copy(t[w:], s[start:j])
		w += copy(t[w:], new)
		start = j + len(old)
	}
	w += copy(t[w:], s[start:])
	return t[0:w]
}

// ReplaceAll returns a copy of the slice s with all
// non-overlapping instances of old replaced by new.
// If old is empty, it matches at the beginning of the slice
// and after each UTF-8 sequence, yielding up to k+1 replacements
// for a k-rune slice.
func ReplaceAll(s, old, new []byte) []byte {
	return Replace(s, old, new, -1)
}
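
// Replace and ReplaceAll on sample inputs, for illustration:
//
//	Replace([]byte("oink oink oink"), []byte("k"), []byte("ky"), 2)      // "oinky oinky oink"
//	ReplaceAll([]byte("oink oink oink"), []byte("oink"), []byte("moo"))  // "moo moo moo"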

// EqualFold reports whether s and t, interpreted as UTF-8 strings,
// are equal under simple Unicode case-folding, which is a more general
// form of case-insensitivity.
func EqualFold(s, t []byte) bool {
	// ASCII fast path
	i := 0
	for ; i < len(s) && i < len(t); i++ {
		sr := s[i]
		tr := t[i]
		if sr|tr >= utf8.RuneSelf {
			goto hasUnicode
		}

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// ASCII only, sr/tr must be upper/lower case
		if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
			continue
		}
		return false
	}
	// Check if we've exhausted both strings.
	return len(s) == len(t)

hasUnicode:
	s = s[i:]
	t = t[i:]
	for len(s) != 0 && len(t) != 0 {
		// Extract first rune from each.
		var sr, tr rune
		if s[0] < utf8.RuneSelf {
			sr, s = rune(s[0]), s[1:]
		} else {
			r, size := utf8.DecodeRune(s)
			sr, s = r, s[size:]
		}
		if t[0] < utf8.RuneSelf {
			tr, t = rune(t[0]), t[1:]
		} else {
			r, size := utf8.DecodeRune(t)
			tr, t = r, t[size:]
		}

		// If they match, keep going; if not, return false.

		// Easy case.
		if tr == sr {
			continue
		}

		// Make sr < tr to simplify what follows.
		if tr < sr {
			tr, sr = sr, tr
		}
		// Fast check for ASCII.
		if tr < utf8.RuneSelf {
			// ASCII only, sr/tr must be upper/lower case
			if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
				continue
			}
			return false
		}

		// General case. SimpleFold(x) returns the next equivalent rune > x
		// or wraps around to smaller values.
		r := unicode.SimpleFold(sr)
		for r != sr && r < tr {
			r = unicode.SimpleFold(r)
		}
		if r == tr {
			continue
		}
		return false
	}

	// One string is empty. Are both?
	return len(s) == len(t)
}

// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
func Index(s, sep []byte) int {
	n := len(sep)
	switch {
	case n == 0:
		return 0
	case n == 1:
		return IndexByte(s, sep[0])
	case n == len(s):
		if Equal(sep, s) {
			return 0
		}
		return -1
	case n > len(s):
		return -1
	case n <= bytealg.MaxLen:
		// Use brute force when s and sep both are small
		if len(s) <= bytealg.MaxBruteForce {
			return bytealg.Index(s, sep)
		}
		c0 := sep[0]
		c1 := sep[1]
		i := 0
		t := len(s) - n + 1
		fails := 0
		for i < t {
			if s[i] != c0 {
				// IndexByte is faster than bytealg.Index, so use it as long as
				// we're not getting lots of false positives.
				o := IndexByte(s[i+1:t], c0)
				if o < 0 {
					return -1
				}
				i += o + 1
			}
			if s[i+1] == c1 && Equal(s[i:i+n], sep) {
				return i
			}
			fails++
			i++
			// Switch to bytealg.Index when IndexByte produces too many false positives.
			if fails > bytealg.Cutover(i) {
				r := bytealg.Index(s[i:], sep)
				if r >= 0 {
					return r + i
				}
				return -1
			}
		}
		return -1
	}
	c0 := sep[0]
	c1 := sep[1]
	i := 0
	fails := 0
	t := len(s) - n + 1
	for i < t {
		if s[i] != c0 {
			o := IndexByte(s[i+1:t], c0)
			if o < 0 {
				break
			}
			i += o + 1
		}
		if s[i+1] == c1 && Equal(s[i:i+n], sep) {
			return i
		}
		i++
		fails++
		if fails >= 4+i>>4 && i < t {
			// Give up on IndexByte, it isn't skipping ahead
			// far enough to be better than Rabin-Karp.
			// Experiments (using IndexPeriodic) suggest
			// the cutover is about 16 byte skips.
			// TODO: if large prefixes of sep are matching
			// we should cutover at even larger average skips,
			// because Equal becomes that much more expensive.
			// This code does not take that effect into account.
			j := bytealg.IndexRabinKarp(s[i:], sep)
			if j < 0 {
				return -1
			}
			return i + j
		}
	}
	return -1
}
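
// EqualFold and Index on sample inputs, for illustration:
//
//	EqualFold([]byte("Go"), []byte("GO"))    // true
//	Index([]byte("chicken"), []byte("ken"))  // 4
//	Index([]byte("chicken"), []byte("dmr"))  // -1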

// Cut slices s around the first instance of sep,
// returning the text before and after sep.
// The found result reports whether sep appears in s.
// If sep does not appear in s, Cut returns s, nil, false.
//
// Cut returns slices of the original slice s, not copies.
func Cut(s, sep []byte) (before, after []byte, found bool) {
	if i := Index(s, sep); i >= 0 {
		return s[:i], s[i+len(sep):], true
	}
	return s, nil, false
}

// Clone returns a copy of b[:len(b)].
// The result may have additional unused capacity.
// Clone(nil) returns nil.
func Clone(b []byte) []byte {
	if b == nil {
		return nil
	}
	return append([]byte{}, b...)
}

// CutPrefix returns s without the provided leading prefix byte slice
// and reports whether it found the prefix.
// If s doesn't start with prefix, CutPrefix returns s, false.
// If prefix is the empty byte slice, CutPrefix returns s, true.
//
// CutPrefix returns slices of the original slice s, not copies.
func CutPrefix(s, prefix []byte) (after []byte, found bool) {
	if !HasPrefix(s, prefix) {
		return s, false
	}
	return s[len(prefix):], true
}

// CutSuffix returns s without the provided ending suffix byte slice
// and reports whether it found the suffix.
// If s doesn't end with suffix, CutSuffix returns s, false.
// If suffix is the empty byte slice, CutSuffix returns s, true.
//
// CutSuffix returns slices of the original slice s, not copies.
func CutSuffix(s, suffix []byte) (before []byte, found bool) {
	if !HasSuffix(s, suffix) {
		return s, false
	}
	return s[:len(s)-len(suffix)], true
}
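
// Cut and CutSuffix on sample inputs, for illustration:
//
//	before, after, found := Cut([]byte("key=value"), []byte("="))
//	// before == "key", after == "value", found == true
//
//	trimmed, ok := CutSuffix([]byte("hello.go"), []byte(".go"))
//	// trimmed == "hello", ok == true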