1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:generate go run makeisprint.go -output isprint.go
6
7package strconv
8
9import (
10	"unicode/utf8"
11)
12
// lowerhex and upperhex list the sixteen hexadecimal digits in lower and
// upper case. Indexing either string with a 4-bit value yields the digit
// for that value.
const (
	lowerhex = "0123456789abcdef"
	upperhex = "0123456789ABCDEF"
)
17
18// contains reports whether the string contains the byte c.
19func contains(s string, c byte) bool {
20	return index(s, c) != -1
21}
22
23func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
24	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
25}
26
27func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
28	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
29}
30
31func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
32	// Often called with big strings, so preallocate. If there's quoting,
33	// this is conservative but still helps a lot.
34	if cap(buf)-len(buf) < len(s) {
35		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
36		copy(nBuf, buf)
37		buf = nBuf
38	}
39	buf = append(buf, quote)
40	for width := 0; len(s) > 0; s = s[width:] {
41		r := rune(s[0])
42		width = 1
43		if r >= utf8.RuneSelf {
44			r, width = utf8.DecodeRuneInString(s)
45		}
46		if width == 1 && r == utf8.RuneError {
47			buf = append(buf, `\x`...)
48			buf = append(buf, lowerhex[s[0]>>4])
49			buf = append(buf, lowerhex[s[0]&0xF])
50			continue
51		}
52		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
53	}
54	buf = append(buf, quote)
55	return buf
56}
57
58func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
59	buf = append(buf, quote)
60	if !utf8.ValidRune(r) {
61		r = utf8.RuneError
62	}
63	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
64	buf = append(buf, quote)
65	return buf
66}
67
68func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
69	if r == rune(quote) || r == '\\' { // always backslashed
70		buf = append(buf, '\\')
71		buf = append(buf, byte(r))
72		return buf
73	}
74	if ASCIIonly {
75		if r < utf8.RuneSelf && IsPrint(r) {
76			buf = append(buf, byte(r))
77			return buf
78		}
79	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
80		return utf8.AppendRune(buf, r)
81	}
82	switch r {
83	case '\a':
84		buf = append(buf, `\a`...)
85	case '\b':
86		buf = append(buf, `\b`...)
87	case '\f':
88		buf = append(buf, `\f`...)
89	case '\n':
90		buf = append(buf, `\n`...)
91	case '\r':
92		buf = append(buf, `\r`...)
93	case '\t':
94		buf = append(buf, `\t`...)
95	case '\v':
96		buf = append(buf, `\v`...)
97	default:
98		switch {
99		case r < ' ' || r == 0x7f:
100			buf = append(buf, `\x`...)
101			buf = append(buf, lowerhex[byte(r)>>4])
102			buf = append(buf, lowerhex[byte(r)&0xF])
103		case !utf8.ValidRune(r):
104			r = 0xFFFD
105			fallthrough
106		case r < 0x10000:
107			buf = append(buf, `\u`...)
108			for s := 12; s >= 0; s -= 4 {
109				buf = append(buf, lowerhex[r>>uint(s)&0xF])
110			}
111		default:
112			buf = append(buf, `\U`...)
113			for s := 28; s >= 0; s -= 4 {
114				buf = append(buf, lowerhex[r>>uint(s)&0xF])
115			}
116		}
117	}
118	return buf
119}
120
121// Quote returns a double-quoted Go string literal representing s. The
122// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
123// control characters and non-printable characters as defined by
124// [IsPrint].
125func Quote(s string) string {
126	return quoteWith(s, '"', false, false)
127}
128
129// AppendQuote appends a double-quoted Go string literal representing s,
130// as generated by [Quote], to dst and returns the extended buffer.
131func AppendQuote(dst []byte, s string) []byte {
132	return appendQuotedWith(dst, s, '"', false, false)
133}
134
135// QuoteToASCII returns a double-quoted Go string literal representing s.
136// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
137// non-ASCII characters and non-printable characters as defined by [IsPrint].
138func QuoteToASCII(s string) string {
139	return quoteWith(s, '"', true, false)
140}
141
142// AppendQuoteToASCII appends a double-quoted Go string literal representing s,
143// as generated by [QuoteToASCII], to dst and returns the extended buffer.
144func AppendQuoteToASCII(dst []byte, s string) []byte {
145	return appendQuotedWith(dst, s, '"', true, false)
146}
147
148// QuoteToGraphic returns a double-quoted Go string literal representing s.
149// The returned string leaves Unicode graphic characters, as defined by
150// [IsGraphic], unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
151// for non-graphic characters.
152func QuoteToGraphic(s string) string {
153	return quoteWith(s, '"', false, true)
154}
155
156// AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
157// as generated by [QuoteToGraphic], to dst and returns the extended buffer.
158func AppendQuoteToGraphic(dst []byte, s string) []byte {
159	return appendQuotedWith(dst, s, '"', false, true)
160}
161
162// QuoteRune returns a single-quoted Go character literal representing the
163// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
164// for control characters and non-printable characters as defined by [IsPrint].
165// If r is not a valid Unicode code point, it is interpreted as the Unicode
166// replacement character U+FFFD.
167func QuoteRune(r rune) string {
168	return quoteRuneWith(r, '\'', false, false)
169}
170
171// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
172// as generated by [QuoteRune], to dst and returns the extended buffer.
173func AppendQuoteRune(dst []byte, r rune) []byte {
174	return appendQuotedRuneWith(dst, r, '\'', false, false)
175}
176
177// QuoteRuneToASCII returns a single-quoted Go character literal representing
178// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
179// \u0100) for non-ASCII characters and non-printable characters as defined
180// by [IsPrint].
181// If r is not a valid Unicode code point, it is interpreted as the Unicode
182// replacement character U+FFFD.
183func QuoteRuneToASCII(r rune) string {
184	return quoteRuneWith(r, '\'', true, false)
185}
186
187// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
188// as generated by [QuoteRuneToASCII], to dst and returns the extended buffer.
189func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
190	return appendQuotedRuneWith(dst, r, '\'', true, false)
191}
192
193// QuoteRuneToGraphic returns a single-quoted Go character literal representing
194// the rune. If the rune is not a Unicode graphic character,
195// as defined by [IsGraphic], the returned string will use a Go escape sequence
196// (\t, \n, \xFF, \u0100).
197// If r is not a valid Unicode code point, it is interpreted as the Unicode
198// replacement character U+FFFD.
199func QuoteRuneToGraphic(r rune) string {
200	return quoteRuneWith(r, '\'', false, true)
201}
202
203// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
204// as generated by [QuoteRuneToGraphic], to dst and returns the extended buffer.
205func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
206	return appendQuotedRuneWith(dst, r, '\'', false, true)
207}
208
// CanBackquote reports whether s can be written unchanged as a single-line
// backquoted string that contains no control characters other than tab.
func CanBackquote(s string) bool {
	for i := 0; i < len(s); {
		r, size := utf8.DecodeRuneInString(s[i:])
		i += size
		switch {
		case size > 1:
			// Correctly encoded multibyte runes are assumed printable,
			// except for the BOM, which is invisible and should not be quoted.
			if r == '\ufeff' {
				return false
			}
		case r == utf8.RuneError:
			// A width of one with RuneError means an undecodable byte.
			return false
		case r == '`', r == '\u007F', r < ' ' && r != '\t':
			return false
		}
	}
	return true
}
231
// unhex converts the hexadecimal digit b (either case) to its numeric value.
// ok is false, and v is zero, when b is not a hexadecimal digit.
func unhex(b byte) (v rune, ok bool) {
	digit := rune(b)
	if digit >= '0' && digit <= '9' {
		return digit - '0', true
	}
	if digit >= 'a' && digit <= 'f' {
		return digit - 'a' + 10, true
	}
	if digit >= 'A' && digit <= 'F' {
		return digit - 'A' + 10, true
	}
	return 0, false
}
244
245// UnquoteChar decodes the first character or byte in the escaped string
246// or character literal represented by the string s.
247// It returns four values:
248//
249//  1. value, the decoded Unicode code point or byte value;
250//  2. multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
251//  3. tail, the remainder of the string after the character; and
252//  4. an error that will be nil if the character is syntactically valid.
253//
254// The second argument, quote, specifies the type of literal being parsed
255// and therefore which escaped quote character is permitted.
256// If set to a single quote, it permits the sequence \' and disallows unescaped '.
257// If set to a double quote, it permits \" and disallows unescaped ".
258// If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
259func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
260	// easy cases
261	if len(s) == 0 {
262		err = ErrSyntax
263		return
264	}
265	switch c := s[0]; {
266	case c == quote && (quote == '\'' || quote == '"'):
267		err = ErrSyntax
268		return
269	case c >= utf8.RuneSelf:
270		r, size := utf8.DecodeRuneInString(s)
271		return r, true, s[size:], nil
272	case c != '\\':
273		return rune(s[0]), false, s[1:], nil
274	}
275
276	// hard case: c is backslash
277	if len(s) <= 1 {
278		err = ErrSyntax
279		return
280	}
281	c := s[1]
282	s = s[2:]
283
284	switch c {
285	case 'a':
286		value = '\a'
287	case 'b':
288		value = '\b'
289	case 'f':
290		value = '\f'
291	case 'n':
292		value = '\n'
293	case 'r':
294		value = '\r'
295	case 't':
296		value = '\t'
297	case 'v':
298		value = '\v'
299	case 'x', 'u', 'U':
300		n := 0
301		switch c {
302		case 'x':
303			n = 2
304		case 'u':
305			n = 4
306		case 'U':
307			n = 8
308		}
309		var v rune
310		if len(s) < n {
311			err = ErrSyntax
312			return
313		}
314		for j := 0; j < n; j++ {
315			x, ok := unhex(s[j])
316			if !ok {
317				err = ErrSyntax
318				return
319			}
320			v = v<<4 | x
321		}
322		s = s[n:]
323		if c == 'x' {
324			// single-byte string, possibly not UTF-8
325			value = v
326			break
327		}
328		if !utf8.ValidRune(v) {
329			err = ErrSyntax
330			return
331		}
332		value = v
333		multibyte = true
334	case '0', '1', '2', '3', '4', '5', '6', '7':
335		v := rune(c) - '0'
336		if len(s) < 2 {
337			err = ErrSyntax
338			return
339		}
340		for j := 0; j < 2; j++ { // one digit already; two more
341			x := rune(s[j]) - '0'
342			if x < 0 || x > 7 {
343				err = ErrSyntax
344				return
345			}
346			v = (v << 3) | x
347		}
348		s = s[2:]
349		if v > 255 {
350			err = ErrSyntax
351			return
352		}
353		value = v
354	case '\\':
355		value = '\\'
356	case '\'', '"':
357		if c != quote {
358			err = ErrSyntax
359			return
360		}
361		value = rune(c)
362	default:
363		err = ErrSyntax
364		return
365	}
366	tail = s
367	return
368}
369
370// QuotedPrefix returns the quoted string (as understood by [Unquote]) at the prefix of s.
371// If s does not start with a valid quoted string, QuotedPrefix returns an error.
372func QuotedPrefix(s string) (string, error) {
373	out, _, err := unquote(s, false)
374	return out, err
375}
376
377// Unquote interprets s as a single-quoted, double-quoted,
378// or backquoted Go string literal, returning the string value
379// that s quotes.  (If s is single-quoted, it would be a Go
380// character literal; Unquote returns the corresponding
381// one-character string.)
382func Unquote(s string) (string, error) {
383	out, rem, err := unquote(s, true)
384	if len(rem) > 0 {
385		return "", ErrSyntax
386	}
387	return out, err
388}
389
// unquote parses a quoted string at the start of the input,
// returning the parsed prefix, the remaining suffix, and any parse errors.
// If unescape is true, the parsed prefix is unescaped,
// otherwise the input prefix is provided verbatim.
//
// The quote form is taken from the first byte of in and must be a backquote,
// a double quote or a single quote. On failure the results are "", the
// original input as rem, and ErrSyntax.
func unquote(in string, unescape bool) (out, rem string, err error) {
	// Determine the quote form and optimistically find the terminating quote.
	// The shortest possible literal is two bytes: an opening and a closing quote.
	if len(in) < 2 {
		return "", in, ErrSyntax
	}
	quote := in[0]
	end := index(in[1:], quote)
	if end < 0 {
		return "", in, ErrSyntax
	}
	end += 2 // position after terminating quote; may be wrong if escape sequences are present

	switch quote {
	case '`':
		// Raw strings are not passed through UnquoteChar, so the first
		// backquote found above is the real terminator.
		switch {
		case !unescape:
			out = in[:end] // include quotes
		case !contains(in[:end], '\r'):
			out = in[len("`") : end-len("`")] // exclude quotes
		default:
			// Carriage return characters ('\r') inside raw string literals
			// are discarded from the raw string value.
			// The capacity allows for the body minus the one '\r' known to be present.
			buf := make([]byte, 0, end-len("`")-len("\r")-len("`"))
			for i := len("`"); i < end-len("`"); i++ {
				if in[i] != '\r' {
					buf = append(buf, in[i])
				}
			}
			out = string(buf)
		}
		// NOTE: Prior implementations did not verify that raw strings consist
		// of valid UTF-8 characters and we continue to not verify it as such.
		// The Go specification does not explicitly require valid UTF-8,
		// but only mentions that it is implicitly valid for Go source code
		// (which must be valid UTF-8).
		return out, in[end:], nil
	case '"', '\'':
		// Handle quoted strings without any escape sequences.
		// With no backslash before the first matching quote, that quote is
		// the real terminator and the body can be returned as a substring.
		if !contains(in[:end], '\\') && !contains(in[:end], '\n') {
			var valid bool
			switch quote {
			case '"':
				valid = utf8.ValidString(in[len(`"`) : end-len(`"`)])
			case '\'':
				// A character literal must hold exactly one rune, and that
				// rune must not be the result of an undecodable byte.
				r, n := utf8.DecodeRuneInString(in[len("'") : end-len("'")])
				valid = len("'")+n+len("'") == end && (r != utf8.RuneError || n != 1)
			}
			if valid {
				out = in[:end]
				if unescape {
					out = out[1 : end-1] // exclude quotes
				}
				return out, in[end:], nil
			}
			// Not valid on the fast path: fall through to the general path below.
		}

		// Handle quoted strings with escape sequences.
		var buf []byte
		in0 := in   // original input, kept for error returns and the verbatim prefix
		in = in[1:] // skip starting quote
		if unescape {
			buf = make([]byte, 0, 3*end/2) // try to avoid more allocations
		}
		for len(in) > 0 && in[0] != quote {
			// Process the next character,
			// rejecting any unescaped newline characters which are invalid.
			// Note that rem and err declared here shadow the named results.
			r, multibyte, rem, err := UnquoteChar(in, quote)
			if in[0] == '\n' || err != nil {
				return "", in0, ErrSyntax
			}
			in = rem

			// Append the character if unescaping the input.
			if unescape {
				// Values not flagged multibyte (and all ASCII) are
				// appended as a single raw byte; the rest are UTF-8 encoded.
				if r < utf8.RuneSelf || !multibyte {
					buf = append(buf, byte(r))
				} else {
					buf = utf8.AppendRune(buf, r)
				}
			}

			// Single quoted strings must be a single character.
			if quote == '\'' {
				break
			}
		}

		// Verify that the string ends with a terminating quote.
		if !(len(in) > 0 && in[0] == quote) {
			return "", in0, ErrSyntax
		}
		in = in[1:] // skip terminating quote

		if unescape {
			return string(buf), in, nil
		}
		// Verbatim mode: return everything consumed so far, quotes included.
		return in0[:len(in0)-len(in)], in, nil
	default:
		return "", in, ErrSyntax
	}
}
495
// bsearch is semantically the same as [slices.BinarySearch] (without NaN checks).
// It returns the smallest index at which s holds a value that is not less
// than v (len(s) if there is none), and whether the value at that index
// equals v. The slice s must be sorted in increasing order.
// We copied this function because we can not import "slices" here.
func bsearch[S ~[]E, E ~uint16 | ~uint32](s S, v E) (int, bool) {
	lo, hi := 0, len(s)
	for hi > lo {
		mid := lo + (hi-lo)/2
		if s[mid] >= v {
			hi = mid
			continue
		}
		lo = mid + 1
	}
	found := lo < len(s) && s[lo] == v
	return lo, found
}
511
512// TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
513// to give the same answer. It allows this package not to depend on unicode,
514// and therefore not pull in all the Unicode tables. If the linker were better
515// at tossing unused tables, we could get rid of this implementation.
516// That would be nice.
517
518// IsPrint reports whether the rune is defined as printable by Go, with
519// the same definition as [unicode.IsPrint]: letters, numbers, punctuation,
520// symbols and ASCII space.
521func IsPrint(r rune) bool {
522	// Fast check for Latin-1
523	if r <= 0xFF {
524		if 0x20 <= r && r <= 0x7E {
525			// All the ASCII is printable from space through DEL-1.
526			return true
527		}
528		if 0xA1 <= r && r <= 0xFF {
529			// Similarly for ¡ through ÿ...
530			return r != 0xAD // ...except for the bizarre soft hyphen.
531		}
532		return false
533	}
534
535	// Same algorithm, either on uint16 or uint32 value.
536	// First, find first i such that isPrint[i] >= x.
537	// This is the index of either the start or end of a pair that might span x.
538	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
539	// If we find x in a range, make sure x is not in isNotPrint list.
540
541	if 0 <= r && r < 1<<16 {
542		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
543		i, _ := bsearch(isPrint, rr)
544		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
545			return false
546		}
547		_, found := bsearch(isNotPrint, rr)
548		return !found
549	}
550
551	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
552	i, _ := bsearch(isPrint, rr)
553	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
554		return false
555	}
556	if r >= 0x20000 {
557		return true
558	}
559	r -= 0x10000
560	_, found := bsearch(isNotPrint, uint16(r))
561	return !found
562}
563
564// IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
565// characters include letters, marks, numbers, punctuation, symbols, and
566// spaces, from categories L, M, N, P, S, and Zs.
567func IsGraphic(r rune) bool {
568	if IsPrint(r) {
569		return true
570	}
571	return isInGraphicList(r)
572}
573
574// isInGraphicList reports whether the rune is in the isGraphic list. This separation
575// from IsGraphic allows quoteWith to avoid two calls to IsPrint.
576// Should be called only if IsPrint fails.
577func isInGraphicList(r rune) bool {
578	// We know r must fit in 16 bits - see makeisprint.go.
579	if r > 0xFFFF {
580		return false
581	}
582	_, found := bsearch(isGraphic, uint16(r))
583	return found
584}
585