1
2 /*
3 * Copyright (C) 2024 The Android Open Source Project
4 *
5 * Licensed under the Apache License, Version 2.0 (the "License");
6 * you may not use this file except in compliance with the License.
7 * You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
#include "perfetto/protozero/field.h"
#include "src/trace_processor/importers/proto/string_encoding_utils.h"

#include <cstdint>
#include <cstring>
#include <limits>
#include <string>
#include <vector>

#include "perfetto/ext/base/string_view.h"
#include "test/gtest_and_gmock.h"
28
29 namespace perfetto {
30 namespace trace_processor {
31 namespace {
32
33 using ::protozero::ConstBytes;
34 using ::testing::Eq;
35 using ::testing::SizeIs;
36
// Converts a sample of the Latin-1 code page (every fifth byte value from 0x00
// up to 0xff) and compares the result with output precomputed by iconv.
TEST(ConvertLatin1ToUtf8, FullCodePage) {
  std::vector<uint8_t> latin1;
  // 0, 5, ..., 255 is 255 / 5 + 1 = 52 values. Reserving only 256 / 5 = 51
  // would leave the final push_back without reserved capacity.
  latin1.reserve(std::numeric_limits<uint8_t>::max() / 5 + 1);
  // The counter is wider than uint8_t so that it can step past 255 and end the
  // loop instead of wrapping around.
  for (uint16_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i += 5) {
    latin1.push_back(static_cast<uint8_t>(i));
  }

  std::string utf8 = ConvertLatin1ToUtf8({latin1.data(), latin1.size()});

  // Obtained via:
  // for i in $(seq 0 5 255); do printf '\\\\x%x' $i ; done | xargs echo -en |
  // iconv -f latin1 -t utf8| hexdump -e '1/1 "0x%02x,\n"'
  const uint8_t kExpected[] = {
      0x00, 0x05, 0x0a, 0x0f, 0x14, 0x19, 0x1e, 0x23, 0x28, 0x2d, 0x32, 0x37,
      0x3c, 0x41, 0x46, 0x4b, 0x50, 0x55, 0x5a, 0x5f, 0x64, 0x69, 0x6e, 0x73,
      0x78, 0x7d, 0xc2, 0x82, 0xc2, 0x87, 0xc2, 0x8c, 0xc2, 0x91, 0xc2, 0x96,
      0xc2, 0x9b, 0xc2, 0xa0, 0xc2, 0xa5, 0xc2, 0xaa, 0xc2, 0xaf, 0xc2, 0xb4,
      0xc2, 0xb9, 0xc2, 0xbe, 0xc3, 0x83, 0xc3, 0x88, 0xc3, 0x8d, 0xc3, 0x92,
      0xc3, 0x97, 0xc3, 0x9c, 0xc3, 0xa1, 0xc3, 0xa6, 0xc3, 0xab, 0xc3, 0xb0,
      0xc3, 0xb5, 0xc3, 0xba, 0xc3, 0xbf};

  // The size is passed explicitly because the expected bytes start with 0x00.
  EXPECT_THAT(utf8, Eq(std::string(reinterpret_cast<const char*>(kExpected),
                                   sizeof(kExpected))));
}
61
// The following arrays are different encodings of the same sequence of code
// points:
// \u0000, \u0001, \u0002, \u0005, \u000A, \u0015, \u002A, \u0055, \u00AA,
// \u0155, \u02AA, \u0555, \u0AAA, \u1555, \u2AAA, \u5555, \uAAAA,
// \U00015555, \U0002AAAA, \U00055555, \U000AAAAA, \U0010AAAA
// This gives reasonable coverage of the entire code point range, so that we
// force all types of encoding, i.e. UTF-8 with 1 to 4 bytes and UTF-16 with
// and without surrogate pairs.
// UTF-16LE test input: every 16-bit code unit is stored low byte first. One
// row per code point; the last five rows are surrogate pairs (high unit, then
// low unit).
constexpr uint8_t kUtf16Le[] = {
    0x00, 0x00,              // U+0000
    0x01, 0x00,              // U+0001
    0x02, 0x00,              // U+0002
    0x05, 0x00,              // U+0005
    0x0a, 0x00,              // U+000A
    0x15, 0x00,              // U+0015
    0x2a, 0x00,              // U+002A
    0x55, 0x00,              // U+0055
    0xaa, 0x00,              // U+00AA
    0x55, 0x01,              // U+0155
    0xaa, 0x02,              // U+02AA
    0x55, 0x05,              // U+0555
    0xaa, 0x0a,              // U+0AAA
    0x55, 0x15,              // U+1555
    0xaa, 0x2a,              // U+2AAA
    0x55, 0x55,              // U+5555
    0xaa, 0xaa,              // U+AAAA
    0x15, 0xd8, 0x55, 0xdd,  // U+15555  (D815 DD55)
    0x6a, 0xd8, 0xaa, 0xde,  // U+2AAAA  (D86A DEAA)
    0x15, 0xd9, 0x55, 0xdd,  // U+55555  (D915 DD55)
    0x6a, 0xda, 0xaa, 0xde,  // U+AAAAA  (DA6A DEAA)
    0xea, 0xdb, 0xaa, 0xde,  // U+10AAAA (DBEA DEAA)
};
75
// UTF-16BE test input: every 16-bit code unit is stored high byte first. One
// row per code point; the last five rows are surrogate pairs (high unit, then
// low unit).
constexpr uint8_t kUtf16Be[] = {
    0x00, 0x00,              // U+0000
    0x00, 0x01,              // U+0001
    0x00, 0x02,              // U+0002
    0x00, 0x05,              // U+0005
    0x00, 0x0a,              // U+000A
    0x00, 0x15,              // U+0015
    0x00, 0x2a,              // U+002A
    0x00, 0x55,              // U+0055
    0x00, 0xaa,              // U+00AA
    0x01, 0x55,              // U+0155
    0x02, 0xaa,              // U+02AA
    0x05, 0x55,              // U+0555
    0x0a, 0xaa,              // U+0AAA
    0x15, 0x55,              // U+1555
    0x2a, 0xaa,              // U+2AAA
    0x55, 0x55,              // U+5555
    0xaa, 0xaa,              // U+AAAA
    0xd8, 0x15, 0xdd, 0x55,  // U+15555  (D815 DD55)
    0xd8, 0x6a, 0xde, 0xaa,  // U+2AAAA  (D86A DEAA)
    0xd9, 0x15, 0xdd, 0x55,  // U+55555  (D915 DD55)
    0xda, 0x6a, 0xde, 0xaa,  // U+AAAAA  (DA6A DEAA)
    0xdb, 0xea, 0xde, 0xaa,  // U+10AAAA (DBEA DEAA)
};
82
// UTF-8 encoding of the code points stored in the UTF-16 test inputs. One row
// per code point, ordered from 1-byte up to 4-byte sequences.
constexpr uint8_t kExpectedUtf8[] = {
    // 1-byte sequences.
    0x00,  // U+0000
    0x01,  // U+0001
    0x02,  // U+0002
    0x05,  // U+0005
    0x0a,  // U+000A
    0x15,  // U+0015
    0x2a,  // U+002A
    0x55,  // U+0055
    // 2-byte sequences.
    0xc2, 0xaa,  // U+00AA
    0xc5, 0x95,  // U+0155
    0xca, 0xaa,  // U+02AA
    0xd5, 0x95,  // U+0555
    // 3-byte sequences.
    0xe0, 0xaa, 0xaa,  // U+0AAA
    0xe1, 0x95, 0x95,  // U+1555
    0xe2, 0xaa, 0xaa,  // U+2AAA
    0xe5, 0x95, 0x95,  // U+5555
    0xea, 0xaa, 0xaa,  // U+AAAA
    // 4-byte sequences.
    0xf0, 0x95, 0x95, 0x95,  // U+15555
    0xf0, 0xaa, 0xaa, 0xaa,  // U+2AAAA
    0xf1, 0x95, 0x95, 0x95,  // U+55555
    0xf2, 0xaa, 0xaa, 0xaa,  // U+AAAAA
    0xf4, 0x8a, 0xaa, 0xaa,  // U+10AAAA
};
89
// Malformed UTF-16LE input made of three invalid pieces:
//  - a high surrogate followed by a code unit that is not a low surrogate,
//  - a low surrogate on its own,
//  - one trailing byte, which is not enough to read a 2-byte code unit.
constexpr uint8_t kInvalidUtf16Le[] = {
    0xea, 0xdb, 0x00, 0x00,  // high surrogate 0xDBEA, then 0x0000
    0xaa, 0xde,              // low surrogate 0xDEAA
    0x00,                    // single leftover byte
};
// Malformed UTF-16BE input: a high surrogate followed by a code unit that is
// not a low surrogate, then a low surrogate on its own, then one trailing byte
// (not enough to read a 2-byte code unit).
constexpr uint8_t kInvalidUtf16Be[] = {
    0xdb, 0xea, 0x00, 0x00,  // high surrogate 0xDBEA, then 0x0000
    0xde, 0xaa,              // low surrogate 0xDEAA
    0x00,                    // single leftover byte
};
95
// Expected output for the malformed UTF-16 inputs: three invalid-character
// code points, each one being 0xEF 0xBF 0xBD (U+FFFD encoded as UTF-8).
constexpr uint8_t kExpectedUtf8ForInvalidUtf16[] = {
    0xef, 0xbf, 0xbd,  // 1st U+FFFD
    0xef, 0xbf, 0xbd,  // 2nd U+FFFD
    0xef, 0xbf, 0xbd,  // 3rd U+FFFD
};
100
// Well-formed little-endian UTF-16 input must yield the reference UTF-8 bytes.
TEST(ConvertUtf16LeToUtf8, ValidInput) {
  const std::string actual = ConvertUtf16LeToUtf8({kUtf16Le, sizeof(kUtf16Le)});

  // Built with an explicit length because the reference bytes start with 0x00.
  const std::string expected(reinterpret_cast<const char*>(kExpectedUtf8),
                             sizeof(kExpectedUtf8));

  EXPECT_THAT(actual, Eq(expected));
}
106
// Well-formed big-endian UTF-16 input must yield the reference UTF-8 bytes.
TEST(ConvertUtf16BeToUtf8, ValidInput) {
  const std::string actual = ConvertUtf16BeToUtf8({kUtf16Be, sizeof(kUtf16Be)});

  // Built with an explicit length because the reference bytes start with 0x00.
  const std::string expected(reinterpret_cast<const char*>(kExpectedUtf8),
                             sizeof(kExpectedUtf8));

  EXPECT_THAT(actual, Eq(expected));
}
112
// Malformed little-endian UTF-16 input must yield the expected sequence of
// invalid-character code points.
TEST(ConvertUtf16LeToUtf8, InvalidValidInput) {
  const std::string actual =
      ConvertUtf16LeToUtf8({kInvalidUtf16Le, sizeof(kInvalidUtf16Le)});

  const std::string expected(
      reinterpret_cast<const char*>(kExpectedUtf8ForInvalidUtf16),
      sizeof(kExpectedUtf8ForInvalidUtf16));

  EXPECT_THAT(actual, Eq(expected));
}
120
// Malformed big-endian UTF-16 input must yield the expected sequence of
// invalid-character code points.
TEST(ConvertUtf16BeToUtf8, InvalidValidInput) {
  const std::string actual =
      ConvertUtf16BeToUtf8({kInvalidUtf16Be, sizeof(kInvalidUtf16Be)});

  const std::string expected(
      reinterpret_cast<const char*>(kExpectedUtf8ForInvalidUtf16),
      sizeof(kExpectedUtf8ForInvalidUtf16));

  EXPECT_THAT(actual, Eq(expected));
}
128
129 } // namespace
130
131 } // namespace trace_processor
132 } // namespace perfetto
133