1 #include "utf8_validity.h"
2
3 #include <gtest/gtest.h>
4 #include "absl/strings/string_view.h"
5
6 namespace utf8_range {
7
// Exercises SpanStructurallyValid() with well-formed and malformed UTF-8.
//
// For the well-formed inputs the expected value equals the input length; for
// the malformed inputs it equals the number of bytes that precede the first
// offending sequence (0 when the input starts with one).
TEST(Utf8Validity, SpanStructurallyValid) {
  // Well-formed strings: the whole input is accepted.
  EXPECT_EQ(4, SpanStructurallyValid("abcd"));
  // Embedded NUL byte; the explicit length keeps the bytes after it in view.
  EXPECT_EQ(4, SpanStructurallyValid(absl::string_view("a\0cd", 4)));
  EXPECT_EQ(4, SpanStructurallyValid("ab\xc2\x81"));        // 2-byte sequence
  EXPECT_EQ(4, SpanStructurallyValid("a\xe2\x81\x81"));     // 3-byte sequence
  EXPECT_EQ(4, SpanStructurallyValid("\xf2\x81\x81\x81"));  // 4-byte sequence

  // Malformed strings: only the leading well-formed bytes are counted.
  EXPECT_EQ(3, SpanStructurallyValid("abc\x80"));       // stray continuation
  EXPECT_EQ(3, SpanStructurallyValid("abc\xc2"));       // truncated 2-byte
  EXPECT_EQ(2, SpanStructurallyValid("ab\xe2\x81"));    // truncated 3-byte
  EXPECT_EQ(1, SpanStructurallyValid("a\xf2\x81\x81")); // truncated 4-byte
  EXPECT_EQ(2, SpanStructurallyValid("ab\xc0\x81"));    // overlong 2-byte form
  EXPECT_EQ(1, SpanStructurallyValid("a\xe0\x81\x81")); // overlong 3-byte form
  EXPECT_EQ(0, SpanStructurallyValid("\xf0\x81\x81\x81"));  // overlong 4-byte
  EXPECT_EQ(0, SpanStructurallyValid("\xf4\xbf\xbf\xbf"));  // above U+10FFFF

  // Surrogate code points (minimum and maximum) are rejected.
  EXPECT_EQ(0, SpanStructurallyValid("\xED\xA0\x80"));  // U+D800
  EXPECT_EQ(0, SpanStructurallyValid("\xED\xBF\xBF"));  // U+DFFF

  // Non-shortest (overlong) forms: lowest and highest encodable value for
  // each of the 2-, 3- and 4-byte overlong ranges. None may be accepted.
  EXPECT_EQ(0, SpanStructurallyValid("\xc0\x80"));
  EXPECT_EQ(0, SpanStructurallyValid("\xc1\xbf"));
  EXPECT_EQ(0, SpanStructurallyValid("\xe0\x80\x80"));
  EXPECT_EQ(0, SpanStructurallyValid("\xe0\x9f\xbf"));
  EXPECT_EQ(0, SpanStructurallyValid("\xf0\x80\x80\x80"));
  EXPECT_EQ(0, SpanStructurallyValid("\xf0\x83\xbf\xbf"));

  // This string unchecked caused GWS to crash 7/2006:
  // invalid sequence 0xc7 0xc8 0xcd 0xcb
  EXPECT_EQ(0, SpanStructurallyValid("\xc7\xc8\xcd\xcb"));
}
41
// Exercises IsStructurallyValid() with well-formed and malformed UTF-8.
//
// Well-formed inputs must be reported as valid; every input that contains a
// stray, truncated, overlong, surrogate or out-of-range sequence must be
// reported as invalid, even when it begins with valid bytes.
TEST(Utf8Validity, IsStructurallyValid) {
  // Well-formed strings.
  EXPECT_TRUE(IsStructurallyValid("abcd"));
  // Embedded NUL byte; the explicit length keeps the bytes after it in view.
  EXPECT_TRUE(IsStructurallyValid(absl::string_view("a\0cd", 4)));
  EXPECT_TRUE(IsStructurallyValid("ab\xc2\x81"));        // 2-byte sequence
  EXPECT_TRUE(IsStructurallyValid("a\xe2\x81\x81"));     // 3-byte sequence
  EXPECT_TRUE(IsStructurallyValid("\xf2\x81\x81\x81"));  // 4-byte sequence

  // Malformed strings.
  EXPECT_FALSE(IsStructurallyValid("abc\x80"));           // stray continuation
  EXPECT_FALSE(IsStructurallyValid("abc\xc2"));           // truncated 2-byte
  EXPECT_FALSE(IsStructurallyValid("ab\xe2\x81"));        // truncated 3-byte
  EXPECT_FALSE(IsStructurallyValid("a\xf2\x81\x81"));     // truncated 4-byte
  EXPECT_FALSE(IsStructurallyValid("ab\xc0\x81"));        // overlong 2-byte
  EXPECT_FALSE(IsStructurallyValid("a\xe0\x81\x81"));     // overlong 3-byte
  EXPECT_FALSE(IsStructurallyValid("\xf0\x81\x81\x81"));  // overlong 4-byte
  EXPECT_FALSE(IsStructurallyValid("\xf4\xbf\xbf\xbf"));  // above U+10FFFF

  // Surrogate code points (minimum and maximum) are rejected.
  EXPECT_FALSE(IsStructurallyValid("\xED\xA0\x80"));  // U+D800
  EXPECT_FALSE(IsStructurallyValid("\xED\xBF\xBF"));  // U+DFFF

  // Non-shortest (overlong) forms: lowest and highest encodable value for
  // each of the 2-, 3- and 4-byte overlong ranges. All must be rejected.
  EXPECT_FALSE(IsStructurallyValid("\xc0\x80"));
  EXPECT_FALSE(IsStructurallyValid("\xc1\xbf"));
  EXPECT_FALSE(IsStructurallyValid("\xe0\x80\x80"));
  EXPECT_FALSE(IsStructurallyValid("\xe0\x9f\xbf"));
  EXPECT_FALSE(IsStructurallyValid("\xf0\x80\x80\x80"));
  EXPECT_FALSE(IsStructurallyValid("\xf0\x83\xbf\xbf"));

  // This string unchecked caused GWS to crash 7/2006:
  // invalid sequence 0xc7 0xc8 0xcd 0xcb
  EXPECT_FALSE(IsStructurallyValid("\xc7\xc8\xcd\xcb"));
}
75
76 } // namespace utf8_range
77