1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:generate go run makeisprint.go -output isprint.go 6 7package strconv 8 9import ( 10 "unicode/utf8" 11) 12 13const ( 14 lowerhex = "0123456789abcdef" 15 upperhex = "0123456789ABCDEF" 16) 17 18// contains reports whether the string contains the byte c. 19func contains(s string, c byte) bool { 20 return index(s, c) != -1 21} 22 23func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string { 24 return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly)) 25} 26 27func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string { 28 return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly)) 29} 30 31func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte { 32 // Often called with big strings, so preallocate. If there's quoting, 33 // this is conservative but still helps a lot. 34 if cap(buf)-len(buf) < len(s) { 35 nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1) 36 copy(nBuf, buf) 37 buf = nBuf 38 } 39 buf = append(buf, quote) 40 for width := 0; len(s) > 0; s = s[width:] { 41 r := rune(s[0]) 42 width = 1 43 if r >= utf8.RuneSelf { 44 r, width = utf8.DecodeRuneInString(s) 45 } 46 if width == 1 && r == utf8.RuneError { 47 buf = append(buf, `\x`...) 48 buf = append(buf, lowerhex[s[0]>>4]) 49 buf = append(buf, lowerhex[s[0]&0xF]) 50 continue 51 } 52 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 53 } 54 buf = append(buf, quote) 55 return buf 56} 57 58func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 59 buf = append(buf, quote) 60 if !utf8.ValidRune(r) { 61 r = utf8.RuneError 62 } 63 buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly) 64 buf = append(buf, quote) 65 return buf 66} 67 68func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte { 69 if r == rune(quote) || r == '\\' { // always backslashed 70 buf = append(buf, '\\') 71 buf = append(buf, byte(r)) 72 return buf 73 } 74 if ASCIIonly { 75 if r < utf8.RuneSelf && IsPrint(r) { 76 buf = append(buf, byte(r)) 77 return buf 78 } 79 } else if IsPrint(r) || graphicOnly && isInGraphicList(r) { 80 return utf8.AppendRune(buf, r) 81 } 82 switch r { 83 case '\a': 84 buf = append(buf, `\a`...) 85 case '\b': 86 buf = append(buf, `\b`...) 87 case '\f': 88 buf = append(buf, `\f`...) 89 case '\n': 90 buf = append(buf, `\n`...) 91 case '\r': 92 buf = append(buf, `\r`...) 93 case '\t': 94 buf = append(buf, `\t`...) 95 case '\v': 96 buf = append(buf, `\v`...) 97 default: 98 switch { 99 case r < ' ' || r == 0x7f: 100 buf = append(buf, `\x`...) 101 buf = append(buf, lowerhex[byte(r)>>4]) 102 buf = append(buf, lowerhex[byte(r)&0xF]) 103 case !utf8.ValidRune(r): 104 r = 0xFFFD 105 fallthrough 106 case r < 0x10000: 107 buf = append(buf, `\u`...) 108 for s := 12; s >= 0; s -= 4 { 109 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 110 } 111 default: 112 buf = append(buf, `\U`...) 113 for s := 28; s >= 0; s -= 4 { 114 buf = append(buf, lowerhex[r>>uint(s)&0xF]) 115 } 116 } 117 } 118 return buf 119} 120 121// Quote returns a double-quoted Go string literal representing s. The 122// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 123// control characters and non-printable characters as defined by 124// [IsPrint]. 125func Quote(s string) string { 126 return quoteWith(s, '"', false, false) 127} 128 129// AppendQuote appends a double-quoted Go string literal representing s, 130// as generated by [Quote], to dst and returns the extended buffer. 131func AppendQuote(dst []byte, s string) []byte { 132 return appendQuotedWith(dst, s, '"', false, false) 133} 134 135// QuoteToASCII returns a double-quoted Go string literal representing s. 136// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for 137// non-ASCII characters and non-printable characters as defined by [IsPrint]. 138func QuoteToASCII(s string) string { 139 return quoteWith(s, '"', true, false) 140} 141 142// AppendQuoteToASCII appends a double-quoted Go string literal representing s, 143// as generated by [QuoteToASCII], to dst and returns the extended buffer. 144func AppendQuoteToASCII(dst []byte, s string) []byte { 145 return appendQuotedWith(dst, s, '"', true, false) 146} 147 148// QuoteToGraphic returns a double-quoted Go string literal representing s. 149// The returned string leaves Unicode graphic characters, as defined by 150// [IsGraphic], unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100) 151// for non-graphic characters. 152func QuoteToGraphic(s string) string { 153 return quoteWith(s, '"', false, true) 154} 155 156// AppendQuoteToGraphic appends a double-quoted Go string literal representing s, 157// as generated by [QuoteToGraphic], to dst and returns the extended buffer. 158func AppendQuoteToGraphic(dst []byte, s string) []byte { 159 return appendQuotedWith(dst, s, '"', false, true) 160} 161 162// QuoteRune returns a single-quoted Go character literal representing the 163// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) 164// for control characters and non-printable characters as defined by [IsPrint]. 165// If r is not a valid Unicode code point, it is interpreted as the Unicode 166// replacement character U+FFFD. 167func QuoteRune(r rune) string { 168 return quoteRuneWith(r, '\'', false, false) 169} 170 171// AppendQuoteRune appends a single-quoted Go character literal representing the rune, 172// as generated by [QuoteRune], to dst and returns the extended buffer. 173func AppendQuoteRune(dst []byte, r rune) []byte { 174 return appendQuotedRuneWith(dst, r, '\'', false, false) 175} 176 177// QuoteRuneToASCII returns a single-quoted Go character literal representing 178// the rune. The returned string uses Go escape sequences (\t, \n, \xFF, 179// \u0100) for non-ASCII characters and non-printable characters as defined 180// by [IsPrint]. 181// If r is not a valid Unicode code point, it is interpreted as the Unicode 182// replacement character U+FFFD. 183func QuoteRuneToASCII(r rune) string { 184 return quoteRuneWith(r, '\'', true, false) 185} 186 187// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune, 188// as generated by [QuoteRuneToASCII], to dst and returns the extended buffer. 189func AppendQuoteRuneToASCII(dst []byte, r rune) []byte { 190 return appendQuotedRuneWith(dst, r, '\'', true, false) 191} 192 193// QuoteRuneToGraphic returns a single-quoted Go character literal representing 194// the rune. If the rune is not a Unicode graphic character, 195// as defined by [IsGraphic], the returned string will use a Go escape sequence 196// (\t, \n, \xFF, \u0100). 197// If r is not a valid Unicode code point, it is interpreted as the Unicode 198// replacement character U+FFFD. 199func QuoteRuneToGraphic(r rune) string { 200 return quoteRuneWith(r, '\'', false, true) 201} 202 203// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune, 204// as generated by [QuoteRuneToGraphic], to dst and returns the extended buffer. 205func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte { 206 return appendQuotedRuneWith(dst, r, '\'', false, true) 207} 208 209// CanBackquote reports whether the string s can be represented 210// unchanged as a single-line backquoted string without control 211// characters other than tab. 212func CanBackquote(s string) bool { 213 for len(s) > 0 { 214 r, wid := utf8.DecodeRuneInString(s) 215 s = s[wid:] 216 if wid > 1 { 217 if r == '\ufeff' { 218 return false // BOMs are invisible and should not be quoted. 219 } 220 continue // All other multibyte runes are correctly encoded and assumed printable. 221 } 222 if r == utf8.RuneError { 223 return false 224 } 225 if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' { 226 return false 227 } 228 } 229 return true 230} 231 232func unhex(b byte) (v rune, ok bool) { 233 c := rune(b) 234 switch { 235 case '0' <= c && c <= '9': 236 return c - '0', true 237 case 'a' <= c && c <= 'f': 238 return c - 'a' + 10, true 239 case 'A' <= c && c <= 'F': 240 return c - 'A' + 10, true 241 } 242 return 243} 244 245// UnquoteChar decodes the first character or byte in the escaped string 246// or character literal represented by the string s. 247// It returns four values: 248// 249// 1. value, the decoded Unicode code point or byte value; 250// 2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation; 251// 3. tail, the remainder of the string after the character; and 252// 4. an error that will be nil if the character is syntactically valid. 253// 254// The second argument, quote, specifies the type of literal being parsed 255// and therefore which escaped quote character is permitted. 256// If set to a single quote, it permits the sequence \' and disallows unescaped '. 257// If set to a double quote, it permits \" and disallows unescaped ". 258// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped. 259func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) { 260 // easy cases 261 if len(s) == 0 { 262 err = ErrSyntax 263 return 264 } 265 switch c := s[0]; { 266 case c == quote && (quote == '\'' || quote == '"'): 267 err = ErrSyntax 268 return 269 case c >= utf8.RuneSelf: 270 r, size := utf8.DecodeRuneInString(s) 271 return r, true, s[size:], nil 272 case c != '\\': 273 return rune(s[0]), false, s[1:], nil 274 } 275 276 // hard case: c is backslash 277 if len(s) <= 1 { 278 err = ErrSyntax 279 return 280 } 281 c := s[1] 282 s = s[2:] 283 284 switch c { 285 case 'a': 286 value = '\a' 287 case 'b': 288 value = '\b' 289 case 'f': 290 value = '\f' 291 case 'n': 292 value = '\n' 293 case 'r': 294 value = '\r' 295 case 't': 296 value = '\t' 297 case 'v': 298 value = '\v' 299 case 'x', 'u', 'U': 300 n := 0 301 switch c { 302 case 'x': 303 n = 2 304 case 'u': 305 n = 4 306 case 'U': 307 n = 8 308 } 309 var v rune 310 if len(s) < n { 311 err = ErrSyntax 312 return 313 } 314 for j := 0; j < n; j++ { 315 x, ok := unhex(s[j]) 316 if !ok { 317 err = ErrSyntax 318 return 319 } 320 v = v<<4 | x 321 } 322 s = s[n:] 323 if c == 'x' { 324 // single-byte string, possibly not UTF-8 325 value = v 326 break 327 } 328 if !utf8.ValidRune(v) { 329 err = ErrSyntax 330 return 331 } 332 value = v 333 multibyte = true 334 case '0', '1', '2', '3', '4', '5', '6', '7': 335 v := rune(c) - '0' 336 if len(s) < 2 { 337 err = ErrSyntax 338 return 339 } 340 for j := 0; j < 2; j++ { // one digit already; two more 341 x := rune(s[j]) - '0' 342 if x < 0 || x > 7 { 343 err = ErrSyntax 344 return 345 } 346 v = (v << 3) | x 347 } 348 s = s[2:] 349 if v > 255 { 350 err = ErrSyntax 351 return 352 } 353 value = v 354 case '\\': 355 value = '\\' 356 case '\'', '"': 357 if c != quote { 358 err = ErrSyntax 359 return 360 } 361 value = rune(c) 362 default: 363 err = ErrSyntax 364 return 365 } 366 tail = s 367 return 368} 369 370// QuotedPrefix returns the quoted string (as understood by [Unquote]) at the prefix of s. 371// If s does not start with a valid quoted string, QuotedPrefix returns an error. 372func QuotedPrefix(s string) (string, error) { 373 out, _, err := unquote(s, false) 374 return out, err 375} 376 377// Unquote interprets s as a single-quoted, double-quoted, 378// or backquoted Go string literal, returning the string value 379// that s quotes. (If s is single-quoted, it would be a Go 380// character literal; Unquote returns the corresponding 381// one-character string.) 382func Unquote(s string) (string, error) { 383 out, rem, err := unquote(s, true) 384 if len(rem) > 0 { 385 return "", ErrSyntax 386 } 387 return out, err 388} 389 390// unquote parses a quoted string at the start of the input, 391// returning the parsed prefix, the remaining suffix, and any parse errors. 392// If unescape is true, the parsed prefix is unescaped, 393// otherwise the input prefix is provided verbatim. 394func unquote(in string, unescape bool) (out, rem string, err error) { 395 // Determine the quote form and optimistically find the terminating quote. 396 if len(in) < 2 { 397 return "", in, ErrSyntax 398 } 399 quote := in[0] 400 end := index(in[1:], quote) 401 if end < 0 { 402 return "", in, ErrSyntax 403 } 404 end += 2 // position after terminating quote; may be wrong if escape sequences are present 405 406 switch quote { 407 case '`': 408 switch { 409 case !unescape: 410 out = in[:end] // include quotes 411 case !contains(in[:end], '\r'): 412 out = in[len("`") : end-len("`")] // exclude quotes 413 default: 414 // Carriage return characters ('\r') inside raw string literals 415 // are discarded from the raw string value. 416 buf := make([]byte, 0, end-len("`")-len("\r")-len("`")) 417 for i := len("`"); i < end-len("`"); i++ { 418 if in[i] != '\r' { 419 buf = append(buf, in[i]) 420 } 421 } 422 out = string(buf) 423 } 424 // NOTE: Prior implementations did not verify that raw strings consist 425 // of valid UTF-8 characters and we continue to not verify it as such. 426 // The Go specification does not explicitly require valid UTF-8, 427 // but only mention that it is implicitly valid for Go source code 428 // (which must be valid UTF-8). 429 return out, in[end:], nil 430 case '"', '\'': 431 // Handle quoted strings without any escape sequences. 432 if !contains(in[:end], '\\') && !contains(in[:end], '\n') { 433 var valid bool 434 switch quote { 435 case '"': 436 valid = utf8.ValidString(in[len(`"`) : end-len(`"`)]) 437 case '\'': 438 r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")]) 439 valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1) 440 } 441 if valid { 442 out = in[:end] 443 if unescape { 444 out = out[1 : end-1] // exclude quotes 445 } 446 return out, in[end:], nil 447 } 448 } 449 450 // Handle quoted strings with escape sequences. 451 var buf []byte 452 in0 := in 453 in = in[1:] // skip starting quote 454 if unescape { 455 buf = make([]byte, 0, 3*end/2) // try to avoid more allocations 456 } 457 for len(in) > 0 && in[0] != quote { 458 // Process the next character, 459 // rejecting any unescaped newline characters which are invalid. 460 r, multibyte, rem, err := UnquoteChar(in, quote) 461 if in[0] == '\n' || err != nil { 462 return "", in0, ErrSyntax 463 } 464 in = rem 465 466 // Append the character if unescaping the input. 467 if unescape { 468 if r < utf8.RuneSelf || !multibyte { 469 buf = append(buf, byte(r)) 470 } else { 471 buf = utf8.AppendRune(buf, r) 472 } 473 } 474 475 // Single quoted strings must be a single character. 476 if quote == '\'' { 477 break 478 } 479 } 480 481 // Verify that the string ends with a terminating quote. 482 if !(len(in) > 0 && in[0] == quote) { 483 return "", in0, ErrSyntax 484 } 485 in = in[1:] // skip terminating quote 486 487 if unescape { 488 return string(buf), in, nil 489 } 490 return in0[:len(in0)-len(in)], in, nil 491 default: 492 return "", in, ErrSyntax 493 } 494} 495 496// bsearch is semantically the same as [slices.BinarySearch] (without NaN checks) 497// We copied this function because we can not import "slices" here. 498func bsearch[S ~[]E, E ~uint16 | ~uint32](s S, v E) (int, bool) { 499 n := len(s) 500 i, j := 0, n 501 for i < j { 502 h := i + (j-i)>>1 503 if s[h] < v { 504 i = h + 1 505 } else { 506 j = h 507 } 508 } 509 return i, i < n && s[i] == v 510} 511 512// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests 513// to give the same answer. It allows this package not to depend on unicode, 514// and therefore not pull in all the Unicode tables. If the linker were better 515// at tossing unused tables, we could get rid of this implementation. 516// That would be nice. 517 518// IsPrint reports whether the rune is defined as printable by Go, with 519// the same definition as [unicode.IsPrint]: letters, numbers, punctuation, 520// symbols and ASCII space. 521func IsPrint(r rune) bool { 522 // Fast check for Latin-1 523 if r <= 0xFF { 524 if 0x20 <= r && r <= 0x7E { 525 // All the ASCII is printable from space through DEL-1. 526 return true 527 } 528 if 0xA1 <= r && r <= 0xFF { 529 // Similarly for ¡ through ÿ... 530 return r != 0xAD // ...except for the bizarre soft hyphen. 531 } 532 return false 533 } 534 535 // Same algorithm, either on uint16 or uint32 value. 536 // First, find first i such that isPrint[i] >= x. 537 // This is the index of either the start or end of a pair that might span x. 538 // The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]). 539 // If we find x in a range, make sure x is not in isNotPrint list. 540 541 if 0 <= r && r < 1<<16 { 542 rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16 543 i, _ := bsearch(isPrint, rr) 544 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 545 return false 546 } 547 _, found := bsearch(isNotPrint, rr) 548 return !found 549 } 550 551 rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32 552 i, _ := bsearch(isPrint, rr) 553 if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr { 554 return false 555 } 556 if r >= 0x20000 { 557 return true 558 } 559 r -= 0x10000 560 _, found := bsearch(isNotPrint, uint16(r)) 561 return !found 562} 563 564// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such 565// characters include letters, marks, numbers, punctuation, symbols, and 566// spaces, from categories L, M, N, P, S, and Zs. 567func IsGraphic(r rune) bool { 568 if IsPrint(r) { 569 return true 570 } 571 return isInGraphicList(r) 572} 573 574// isInGraphicList reports whether the rune is in the isGraphic list. This separation 575// from IsGraphic allows quoteWith to avoid two calls to IsPrint. 576// Should be called only if IsPrint fails. 577func isInGraphicList(r rune) bool { 578 // We know r must fit in 16 bits - see makeisprint.go. 579 if r > 0xFFFF { 580 return false 581 } 582 _, found := bsearch(isGraphic, uint16(r)) 583 return found 584} 585