1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package utf8_test
6
7import (
8	"bytes"
9	"strings"
10	"testing"
11	"unicode"
12	. "unicode/utf8"
13)
14
// init panics at program start if either constant that utf8 redeclares
// no longer matches its definition in package unicode.
func init() {
	switch {
	case MaxRune != unicode.MaxRune:
		panic("utf8.MaxRune is wrong")
	case RuneError != unicode.ReplacementChar:
		panic("utf8.RuneError is wrong")
	}
}
24
// TestConstants reports an error for each constant redeclared by utf8
// whose value differs from the corresponding constant in package unicode.
func TestConstants(t *testing.T) {
	if got, want := MaxRune, unicode.MaxRune; got != want {
		t.Errorf("utf8.MaxRune is wrong: %x should be %x", got, want)
	}
	if got, want := RuneError, unicode.ReplacementChar; got != want {
		t.Errorf("utf8.RuneError is wrong: %x should be %x", got, want)
	}
}
34
// Utf8Map pairs a code point with the byte sequence the tests expect as
// its UTF-8 encoding.
type Utf8Map struct {
	r   rune   // code point
	str string // expected encoding of r
}

// utf8map is the table of valid rune/encoding pairs shared by the
// encode, decode and sequencing tests. Entry order is preserved because
// the tests iterate over it.
var utf8map = []Utf8Map{
	// One-byte encodings.
	{r: 0x0000, str: "\x00"},
	{r: 0x0001, str: "\x01"},
	{r: 0x007e, str: "\x7e"},
	{r: 0x007f, str: "\x7f"},

	// Two-byte encodings.
	{r: 0x0080, str: "\xc2\x80"},
	{r: 0x0081, str: "\xc2\x81"},
	{r: 0x00bf, str: "\xc2\xbf"},
	{r: 0x00c0, str: "\xc3\x80"},
	{r: 0x00c1, str: "\xc3\x81"},
	{r: 0x00c8, str: "\xc3\x88"},
	{r: 0x00d0, str: "\xc3\x90"},
	{r: 0x00e0, str: "\xc3\xa0"},
	{r: 0x00f0, str: "\xc3\xb0"},
	{r: 0x00f8, str: "\xc3\xb8"},
	{r: 0x00ff, str: "\xc3\xbf"},
	{r: 0x0100, str: "\xc4\x80"},
	{r: 0x07ff, str: "\xdf\xbf"},
	{r: 0x0400, str: "\xd0\x80"},

	// Three-byte encodings.
	{r: 0x0800, str: "\xe0\xa0\x80"},
	{r: 0x0801, str: "\xe0\xa0\x81"},
	{r: 0x1000, str: "\xe1\x80\x80"},
	{r: 0xd000, str: "\xed\x80\x80"},
	{r: 0xd7ff, str: "\xed\x9f\xbf"}, // last code point before surrogate half.
	{r: 0xe000, str: "\xee\x80\x80"}, // first code point after surrogate half.
	{r: 0xfffe, str: "\xef\xbf\xbe"},
	{r: 0xffff, str: "\xef\xbf\xbf"},

	// Four-byte encodings.
	{r: 0x10000, str: "\xf0\x90\x80\x80"},
	{r: 0x10001, str: "\xf0\x90\x80\x81"},
	{r: 0x40000, str: "\xf1\x80\x80\x80"},
	{r: 0x10fffe, str: "\xf4\x8f\xbf\xbe"},
	{r: 0x10ffff, str: "\xf4\x8f\xbf\xbf"},

	// U+FFFD itself, encoded in three bytes.
	{r: 0xFFFD, str: "\xef\xbf\xbd"},
}
74
75var surrogateMap = []Utf8Map{
76	{0xd800, "\xed\xa0\x80"}, // surrogate min decodes to (RuneError, 1)
77	{0xdfff, "\xed\xbf\xbf"}, // surrogate max decodes to (RuneError, 1)
78}
79
// testStrings are the context strings that the sequencing and runtime
// conversion tests combine with or iterate over: empty, ASCII, symbols,
// a mixed-script string, that same string three times over, and a run of
// 0x80 bytes.
var testStrings = func() []string {
	const mixed = "日a本b語ç日ð本Ê語þ日¥本¼語i日©"
	return []string{
		"",
		"abcd",
		"☺☻☹",
		mixed,
		mixed + mixed + mixed,
		"\x80\x80\x80\x80",
	}
}()
88
89func TestFullRune(t *testing.T) {
90	for _, m := range utf8map {
91		b := []byte(m.str)
92		if !FullRune(b) {
93			t.Errorf("FullRune(%q) (%U) = false, want true", b, m.r)
94		}
95		s := m.str
96		if !FullRuneInString(s) {
97			t.Errorf("FullRuneInString(%q) (%U) = false, want true", s, m.r)
98		}
99		b1 := b[0 : len(b)-1]
100		if FullRune(b1) {
101			t.Errorf("FullRune(%q) = true, want false", b1)
102		}
103		s1 := string(b1)
104		if FullRuneInString(s1) {
105			t.Errorf("FullRune(%q) = true, want false", s1)
106		}
107	}
108	for _, s := range []string{"\xc0", "\xc1"} {
109		b := []byte(s)
110		if !FullRune(b) {
111			t.Errorf("FullRune(%q) = false, want true", s)
112		}
113		if !FullRuneInString(s) {
114			t.Errorf("FullRuneInString(%q) = false, want true", s)
115		}
116	}
117}
118
119func TestEncodeRune(t *testing.T) {
120	for _, m := range utf8map {
121		b := []byte(m.str)
122		var buf [10]byte
123		n := EncodeRune(buf[0:], m.r)
124		b1 := buf[0:n]
125		if !bytes.Equal(b, b1) {
126			t.Errorf("EncodeRune(%#04x) = %q want %q", m.r, b1, b)
127		}
128	}
129}
130
131func TestAppendRune(t *testing.T) {
132	for _, m := range utf8map {
133		if buf := AppendRune(nil, m.r); string(buf) != m.str {
134			t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str)
135		}
136		if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str {
137			t.Errorf("AppendRune(init, %#04x) = %s, want %s", m.r, buf, "init"+m.str)
138		}
139	}
140}
141
142func TestDecodeRune(t *testing.T) {
143	for _, m := range utf8map {
144		b := []byte(m.str)
145		r, size := DecodeRune(b)
146		if r != m.r || size != len(b) {
147			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
148		}
149		s := m.str
150		r, size = DecodeRuneInString(s)
151		if r != m.r || size != len(b) {
152			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
153		}
154
155		// there's an extra byte that bytes left behind - make sure trailing byte works
156		r, size = DecodeRune(b[0:cap(b)])
157		if r != m.r || size != len(b) {
158			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, m.r, len(b))
159		}
160		s = m.str + "\x00"
161		r, size = DecodeRuneInString(s)
162		if r != m.r || size != len(b) {
163			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, m.r, len(b))
164		}
165
166		// make sure missing bytes fail
167		wantsize := 1
168		if wantsize >= len(b) {
169			wantsize = 0
170		}
171		r, size = DecodeRune(b[0 : len(b)-1])
172		if r != RuneError || size != wantsize {
173			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b[0:len(b)-1], r, size, RuneError, wantsize)
174		}
175		s = m.str[0 : len(m.str)-1]
176		r, size = DecodeRuneInString(s)
177		if r != RuneError || size != wantsize {
178			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, wantsize)
179		}
180
181		// make sure bad sequences fail
182		if len(b) == 1 {
183			b[0] = 0x80
184		} else {
185			b[len(b)-1] = 0x7F
186		}
187		r, size = DecodeRune(b)
188		if r != RuneError || size != 1 {
189			t.Errorf("DecodeRune(%q) = %#04x, %d want %#04x, %d", b, r, size, RuneError, 1)
190		}
191		s = string(b)
192		r, size = DecodeRuneInString(s)
193		if r != RuneError || size != 1 {
194			t.Errorf("DecodeRuneInString(%q) = %#04x, %d want %#04x, %d", s, r, size, RuneError, 1)
195		}
196
197	}
198}
199
200func TestDecodeSurrogateRune(t *testing.T) {
201	for _, m := range surrogateMap {
202		b := []byte(m.str)
203		r, size := DecodeRune(b)
204		if r != RuneError || size != 1 {
205			t.Errorf("DecodeRune(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
206		}
207		s := m.str
208		r, size = DecodeRuneInString(s)
209		if r != RuneError || size != 1 {
210			t.Errorf("DecodeRuneInString(%q) = %x, %d want %x, %d", b, r, size, RuneError, 1)
211		}
212	}
213}
214
215// Check that DecodeRune and DecodeLastRune correspond to
216// the equivalent range loop.
217func TestSequencing(t *testing.T) {
218	for _, ts := range testStrings {
219		for _, m := range utf8map {
220			for _, s := range []string{ts + m.str, m.str + ts, ts + m.str + ts} {
221				testSequence(t, s)
222			}
223		}
224	}
225}
226
// runtimeRuneCount returns the number of runes in s as computed by the
// len([]rune(s)) expression.
func runtimeRuneCount(s string) int {
	count := len([]rune(s)) // Replaced by gc with call to runtime.countrunes(s).
	return count
}
230
231// Check that a range loop, len([]rune(string)) optimization and
232// []rune conversions visit the same runes.
233// Not really a test of this package, but the assumption is used here and
234// it's good to verify.
235func TestRuntimeConversion(t *testing.T) {
236	for _, ts := range testStrings {
237		count := RuneCountInString(ts)
238		if n := runtimeRuneCount(ts); n != count {
239			t.Errorf("%q: len([]rune()) counted %d runes; got %d from RuneCountInString", ts, n, count)
240			break
241		}
242
243		runes := []rune(ts)
244		if n := len(runes); n != count {
245			t.Errorf("%q: []rune() has length %d; got %d from RuneCountInString", ts, n, count)
246			break
247		}
248		i := 0
249		for _, r := range ts {
250			if r != runes[i] {
251				t.Errorf("%q[%d]: expected %c (%U); got %c (%U)", ts, i, runes[i], runes[i], r, r)
252			}
253			i++
254		}
255	}
256}
257
// invalidSequenceTests lists four-byte inputs whose first rune the
// decode tests expect to be RuneError. The group labels (xx, s1..s7) are
// kept from the original table.
// NOTE(review): the labels presumably mirror state names used inside the
// utf8 implementation — confirm against that package.
var invalidSequenceTests = []string{
	"\xed\xa0\x80\x80", // surrogate min
	"\xed\xbf\xbf\x80", // surrogate max

	// xx: first byte 0x91
	"\x91\x80\x80\x80",

	// s1: first byte 0xc2 or 0xdf
	"\xc2\x7f\x80\x80",
	"\xc2\xc0\x80\x80",
	"\xdf\x7f\x80\x80",
	"\xdf\xc0\x80\x80",

	// s2: first byte 0xe0
	"\xe0\x9f\xbf\x80",
	"\xe0\xa0\x7f\x80",
	"\xe0\xbf\xc0\x80",
	"\xe0\xc0\x80\x80",

	// s3: first byte 0xe1
	"\xe1\x7f\xbf\x80",
	"\xe1\x80\x7f\x80",
	"\xe1\xbf\xc0\x80",
	"\xe1\xc0\x80\x80",

	// s4: first byte 0xed
	"\xed\x7f\xbf\x80",
	"\xed\x80\x7f\x80",
	"\xed\x9f\xc0\x80",
	"\xed\xa0\x80\x80",

	// s5: first byte 0xf0
	"\xf0\x8f\xbf\xbf",
	"\xf0\x90\x7f\xbf",
	"\xf0\x90\x80\x7f",
	"\xf0\xbf\xbf\xc0",
	"\xf0\xbf\xc0\x80",
	"\xf0\xc0\x80\x80",

	// s6: first byte 0xf1
	"\xf1\x7f\xbf\xbf",
	"\xf1\x80\x7f\xbf",
	"\xf1\x80\x80\x7f",
	"\xf1\xbf\xbf\xc0",
	"\xf1\xbf\xc0\x80",
	"\xf1\xc0\x80\x80",

	// s7: first byte 0xf4
	"\xf4\x7f\xbf\xbf",
	"\xf4\x80\x7f\xbf",
	"\xf4\x80\x80\x7f",
	"\xf4\x8f\xbf\xc0",
	"\xf4\x8f\xc0\x80",
	"\xf4\x90\x80\x80",
}
313
// runtimeDecodeRune returns the first rune produced by ranging over s,
// or -1 when s is empty.
func runtimeDecodeRune(s string) rune {
	first := rune(-1)
	for _, r := range s {
		first = r
		break
	}
	return first
}
320
321func TestDecodeInvalidSequence(t *testing.T) {
322	for _, s := range invalidSequenceTests {
323		r1, _ := DecodeRune([]byte(s))
324		if want := RuneError; r1 != want {
325			t.Errorf("DecodeRune(%#x) = %#04x, want %#04x", s, r1, want)
326			return
327		}
328		r2, _ := DecodeRuneInString(s)
329		if want := RuneError; r2 != want {
330			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s, r2, want)
331			return
332		}
333		if r1 != r2 {
334			t.Errorf("DecodeRune(%#x) = %#04x mismatch with DecodeRuneInString(%q) = %#04x", s, r1, s, r2)
335			return
336		}
337		r3 := runtimeDecodeRune(s)
338		if r2 != r3 {
339			t.Errorf("DecodeRuneInString(%q) = %#04x mismatch with runtime.decoderune(%q) = %#04x", s, r2, s, r3)
340			return
341		}
342	}
343}
344
// testSequence walks s forward with a range loop, checking that
// DecodeRune and DecodeRuneInString return the same rune and width at
// every offset, and then walks it backward with DecodeLastRune and
// DecodeLastRuneInString, checking that the runes and offsets recorded
// on the way forward are revisited in reverse. It returns at the first
// discrepancy.
func testSequence(t *testing.T, s string) {
	type step struct {
		offset int
		r      rune
	}
	raw := []byte(s)
	steps := make([]step, 0, len(s))

	// Forward pass: the running offset must track the range index.
	next := 0
	for i, r := range s {
		if next != i {
			t.Errorf("Sequence(%q) mismatched index %d, want %d", s, next, i)
			return
		}
		steps = append(steps, step{offset: i, r: r})

		gotB, sizeB := DecodeRune(raw[i:])
		if gotB != r {
			t.Errorf("DecodeRune(%q) = %#04x, want %#04x", s[i:], gotB, r)
			return
		}
		gotS, sizeS := DecodeRuneInString(s[i:])
		if gotS != r {
			t.Errorf("DecodeRuneInString(%q) = %#04x, want %#04x", s[i:], gotS, r)
			return
		}
		if sizeB != sizeS {
			t.Errorf("DecodeRune/DecodeRuneInString(%q) size mismatch %d/%d", s[i:], sizeB, sizeS)
			return
		}
		next += sizeB
	}

	// Backward pass: peel runes off the end and match them against the
	// recorded steps, last to first.
	end := len(s)
	for k := len(steps) - 1; end > 0; k-- {
		gotB, sizeB := DecodeLastRune(raw[:end])
		gotS, sizeS := DecodeLastRuneInString(s[:end])
		if sizeB != sizeS {
			t.Errorf("DecodeLastRune/DecodeLastRuneInString(%q, %d) size mismatch %d/%d", s, end, sizeB, sizeS)
			return
		}
		if want := steps[k].r; gotB != want {
			t.Errorf("DecodeLastRune(%q, %d) = %#04x, want %#04x", s, end, gotB, want)
			return
		}
		if want := steps[k].r; gotS != want {
			t.Errorf("DecodeLastRuneInString(%q, %d) = %#04x, want %#04x", s, end, gotS, want)
			return
		}
		end -= sizeB
		if want := steps[k].offset; end != want {
			t.Errorf("DecodeLastRune(%q) index mismatch at %d, want %d", s, end, want)
			return
		}
	}
	if end != 0 {
		t.Errorf("DecodeLastRune(%q) finished at %d, not 0", s, end)
	}
}
404
// TestNegativeRune checks that encoding the rune -1 writes the same
// bytes as encoding RuneError (U+FFFD).
func TestNegativeRune(t *testing.T) {
	want := make([]byte, UTFMax)
	wantLen := EncodeRune(want, RuneError)
	want = want[:wantLen]

	got := make([]byte, UTFMax)
	gotLen := EncodeRune(got, -1)
	got = got[:gotLen]

	if !bytes.Equal(got, want) {
		t.Errorf("incorrect encoding [% x] for -1; expected [% x]", got, want)
	}
}
415
// RuneCountTest is one rune-counting case: an input string and the
// number of runes the test expects it to contain.
type RuneCountTest struct {
	in  string // input to count
	out int    // expected rune count
}

// runecounttests are the cases used by TestRuneCount.
var runecounttests = []RuneCountTest{
	{in: "abcd", out: 4},
	{in: "☺☻☹", out: 3},
	{in: "1,2,3,4", out: 7},
	{in: "\xe2\x00", out: 2},
	{in: "\xe2\x80", out: 2},
	{in: "a\xe2\x80", out: 3},
}
429
430func TestRuneCount(t *testing.T) {
431	for _, tt := range runecounttests {
432		if out := RuneCountInString(tt.in); out != tt.out {
433			t.Errorf("RuneCountInString(%q) = %d, want %d", tt.in, out, tt.out)
434		}
435		if out := RuneCount([]byte(tt.in)); out != tt.out {
436			t.Errorf("RuneCount(%q) = %d, want %d", tt.in, out, tt.out)
437		}
438	}
439}
440
// RuneLenTest is one RuneLen case: a rune and the size the test expects
// RuneLen to report for it (-1 for runes the test treats as unencodable).
type RuneLenTest struct {
	r    rune // rune to measure
	size int  // expected RuneLen result
}

// runelentests are the cases used by TestRuneLen.
var runelentests = []RuneLenTest{
	{r: 0, size: 1},
	{r: 'e', size: 1},
	{r: 'é', size: 2},
	{r: '☺', size: 3},
	{r: RuneError, size: 3},
	{r: MaxRune, size: 4},
	{r: 0xD800, size: -1},
	{r: 0xDFFF, size: -1},
	{r: MaxRune + 1, size: -1},
	{r: -1, size: -1},
}
458
459func TestRuneLen(t *testing.T) {
460	for _, tt := range runelentests {
461		if size := RuneLen(tt.r); size != tt.size {
462			t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, size, tt.size)
463		}
464	}
465}
466
// ValidTest is one validity case: an input and whether the test expects
// Valid/ValidString to accept it.
type ValidTest struct {
	in  string // input to validate
	out bool   // expected result
}

// validTests are the cases used by TestValid.
var validTests = []ValidTest{
	{in: "", out: true},
	{in: "a", out: true},
	{in: "abc", out: true},
	{in: "Ж", out: true},
	{in: "ЖЖ", out: true},
	{in: "брэд-ЛГТМ", out: true},
	{in: "☺☻☹", out: true},
	{in: "aa\xe2", out: false},
	{in: "B\xfa", out: false},  // bytes 66, 250
	{in: "B\xfaC", out: false}, // bytes 66, 250, 67
	{in: "a\uFFFDb", out: true},
	{in: "\xF4\x8F\xBF\xBF", out: true},      // U+10FFFF
	{in: "\xF4\x90\x80\x80", out: false},     // U+10FFFF+1; out of range
	{in: "\xF7\xBF\xBF\xBF", out: false},     // 0x1FFFFF; out of range
	{in: "\xFB\xBF\xBF\xBF\xBF", out: false}, // 0x3FFFFFF; out of range
	{in: "\xc0\x80", out: false},             // U+0000 encoded in two bytes: incorrect
	{in: "\xed\xa0\x80", out: false},         // U+D800 high surrogate (sic)
	{in: "\xed\xbf\xbf", out: false},         // U+DFFF low surrogate (sic)
}
492
493func TestValid(t *testing.T) {
494	for _, tt := range validTests {
495		if Valid([]byte(tt.in)) != tt.out {
496			t.Errorf("Valid(%q) = %v; want %v", tt.in, !tt.out, tt.out)
497		}
498		if ValidString(tt.in) != tt.out {
499			t.Errorf("ValidString(%q) = %v; want %v", tt.in, !tt.out, tt.out)
500		}
501	}
502}
503
// ValidRuneTest is one ValidRune case: a rune and whether the test
// expects ValidRune to accept it.
type ValidRuneTest struct {
	r  rune // rune to check
	ok bool // expected ValidRune result
}

// validrunetests are the cases used by TestValidRune, including the
// values on either side of the surrogate range and of MaxRune.
var validrunetests = []ValidRuneTest{
	{r: 0, ok: true},
	{r: 'e', ok: true},
	{r: 'é', ok: true},
	{r: '☺', ok: true},
	{r: RuneError, ok: true},
	{r: MaxRune, ok: true},
	{r: 0xD7FF, ok: true},
	{r: 0xD800, ok: false},
	{r: 0xDFFF, ok: false},
	{r: 0xE000, ok: true},
	{r: MaxRune + 1, ok: false},
	{r: -1, ok: false},
}
523
524func TestValidRune(t *testing.T) {
525	for _, tt := range validrunetests {
526		if ok := ValidRune(tt.r); ok != tt.ok {
527			t.Errorf("ValidRune(%#U) = %t, want %t", tt.r, ok, tt.ok)
528		}
529	}
530}
531
// BenchmarkRuneCountTenASCIIChars measures RuneCount on a ten-byte
// slice of ASCII digits.
func BenchmarkRuneCountTenASCIIChars(b *testing.B) {
	input := []byte("0123456789")
	for n := b.N; n > 0; n-- {
		RuneCount(input)
	}
}
538
// BenchmarkRuneCountTenJapaneseChars measures RuneCount on a slice
// holding ten Japanese characters.
func BenchmarkRuneCountTenJapaneseChars(b *testing.B) {
	input := []byte("日本語日本語日本語日")
	for n := b.N; n > 0; n-- {
		RuneCount(input)
	}
}
545
// BenchmarkRuneCountInStringTenASCIIChars measures RuneCountInString on
// a constant string of ten ASCII digits.
func BenchmarkRuneCountInStringTenASCIIChars(b *testing.B) {
	const input = "0123456789"
	for n := b.N; n > 0; n-- {
		RuneCountInString(input)
	}
}
551
// BenchmarkRuneCountInStringTenJapaneseChars measures RuneCountInString
// on a constant string of ten Japanese characters.
func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
	const input = "日本語日本語日本語日"
	for n := b.N; n > 0; n-- {
		RuneCountInString(input)
	}
}
557
// ascii100000 is the ten ASCII digits repeated until the string is
// 100,000 bytes long; the 100K benchmarks use it as input.
var ascii100000 = strings.Repeat("0123456789", 100_000/len("0123456789"))
559
// BenchmarkValidTenASCIIChars measures Valid on a ten-byte slice of
// ASCII digits.
func BenchmarkValidTenASCIIChars(b *testing.B) {
	input := []byte("0123456789")
	for n := b.N; n > 0; n-- {
		Valid(input)
	}
}
566
567func BenchmarkValid100KASCIIChars(b *testing.B) {
568	s := []byte(ascii100000)
569	for i := 0; i < b.N; i++ {
570		Valid(s)
571	}
572}
573
// BenchmarkValidTenJapaneseChars measures Valid on a slice holding ten
// Japanese characters.
func BenchmarkValidTenJapaneseChars(b *testing.B) {
	input := []byte("日本語日本語日本語日")
	for n := b.N; n > 0; n-- {
		Valid(input)
	}
}
580func BenchmarkValidLongMostlyASCII(b *testing.B) {
581	longMostlyASCII := []byte(longStringMostlyASCII)
582	for i := 0; i < b.N; i++ {
583		Valid(longMostlyASCII)
584	}
585}
586
587func BenchmarkValidLongJapanese(b *testing.B) {
588	longJapanese := []byte(longStringJapanese)
589	for i := 0; i < b.N; i++ {
590		Valid(longJapanese)
591	}
592}
593
// BenchmarkValidStringTenASCIIChars measures ValidString on a constant
// string of ten ASCII digits.
func BenchmarkValidStringTenASCIIChars(b *testing.B) {
	const input = "0123456789"
	for n := b.N; n > 0; n-- {
		ValidString(input)
	}
}
599
600func BenchmarkValidString100KASCIIChars(b *testing.B) {
601	for i := 0; i < b.N; i++ {
602		ValidString(ascii100000)
603	}
604}
605
// BenchmarkValidStringTenJapaneseChars measures ValidString on a
// constant string of ten Japanese characters.
func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
	const input = "日本語日本語日本語日"
	for n := b.N; n > 0; n-- {
		ValidString(input)
	}
}
611
612func BenchmarkValidStringLongMostlyASCII(b *testing.B) {
613	for i := 0; i < b.N; i++ {
614		ValidString(longStringMostlyASCII)
615	}
616}
617
618func BenchmarkValidStringLongJapanese(b *testing.B) {
619	for i := 0; i < b.N; i++ {
620		ValidString(longStringJapanese)
621	}
622}
623
// Long benchmark inputs, filled in by the init function below.
var (
	longStringMostlyASCII string // ~100KB, ~97% ASCII
	longStringJapanese    string // ~100KB, non-ASCII
)

// init builds the two long benchmark inputs. The mostly-ASCII string is
// assembled from ten-digit chunks, with every hundredth chunk replaced
// by ten Japanese characters, until it reaches 100,000 bytes. The
// Japanese string repeats those ten characters as many whole times as
// fit in 100,000 bytes.
func init() {
	const (
		targetLen = 100_000
		japanese  = "日本語日本語日本語日"
		digits    = "0123456789"
	)
	var sb strings.Builder
	for n := 0; sb.Len() < targetLen; n++ {
		chunk := digits
		if n%100 == 0 {
			chunk = japanese
		}
		sb.WriteString(chunk)
	}
	longStringMostlyASCII = sb.String()
	longStringJapanese = strings.Repeat(japanese, targetLen/len(japanese))
}
640
// BenchmarkEncodeASCIIRune measures EncodeRune writing 'a' into a
// reused UTFMax-byte buffer.
func BenchmarkEncodeASCIIRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for n := b.N; n > 0; n-- {
		EncodeRune(dst, 'a')
	}
}
647
// BenchmarkEncodeJapaneseRune measures EncodeRune writing '本' into a
// reused UTFMax-byte buffer.
func BenchmarkEncodeJapaneseRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for n := b.N; n > 0; n-- {
		EncodeRune(dst, '本')
	}
}
654
// BenchmarkAppendASCIIRune measures AppendRune appending 'a' to the
// zero-length prefix of a reused UTFMax-byte buffer.
func BenchmarkAppendASCIIRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for n := b.N; n > 0; n-- {
		AppendRune(dst[:0], 'a')
	}
}
661
// BenchmarkAppendJapaneseRune measures AppendRune appending '本' to the
// zero-length prefix of a reused UTFMax-byte buffer.
func BenchmarkAppendJapaneseRune(b *testing.B) {
	dst := make([]byte, UTFMax)
	for n := b.N; n > 0; n-- {
		AppendRune(dst[:0], '本')
	}
}
668
// BenchmarkDecodeASCIIRune measures DecodeRune on a one-byte slice
// holding 'a'.
func BenchmarkDecodeASCIIRune(b *testing.B) {
	input := []byte{'a'}
	for n := b.N; n > 0; n-- {
		DecodeRune(input)
	}
}
675
// BenchmarkDecodeJapaneseRune measures DecodeRune on the bytes of "本".
func BenchmarkDecodeJapaneseRune(b *testing.B) {
	input := []byte("本")
	for n := b.N; n > 0; n-- {
		DecodeRune(input)
	}
}
682
// boolSink receives the result of benchmarked functions so that the
// calls are referenced and not removed as dead code.
var boolSink bool

// BenchmarkFullRune measures FullRune in three sub-benchmarks: a single
// ASCII byte, the first three bytes of a four-byte encoding, and a
// complete Japanese character. Each result is stored in boolSink.
func BenchmarkFullRune(b *testing.B) {
	type benchCase struct {
		name string
		data []byte
	}
	cases := []benchCase{
		{name: "ASCII", data: []byte("a")},
		{name: "Incomplete", data: []byte("\xf0\x90\x80")},
		{name: "Japanese", data: []byte("本")},
	}
	for _, c := range cases {
		b.Run(c.name, func(b *testing.B) {
			for n := b.N; n > 0; n-- {
				boolSink = FullRune(c.data)
			}
		})
	}
}
704