1 
2 /*
3  * Copyright (C) 2024 The Android Open Source Project
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *      http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17 
#include "perfetto/protozero/field.h"
#include "src/trace_processor/importers/proto/string_encoding_utils.h"

#include <cstdint>
#include <cstring>
#include <limits>
#include <string>
#include <vector>

#include "perfetto/ext/base/string_view.h"
#include "test/gtest_and_gmock.h"
28 
29 namespace perfetto {
30 namespace trace_processor {
31 namespace {
32 
33 using ::protozero::ConstBytes;
34 using ::testing::Eq;
35 using ::testing::SizeIs;
36 
// Converts a sample of the Latin-1 code page (every fifth byte value from 0 to
// 255) and compares the result with the UTF-8 bytes produced by iconv for the
// same input.
TEST(ConvertLatin1ToUtf8, FullCodePage) {
  std::vector<uint8_t> latin1;
  // The loop below pushes 0, 5, ..., 255, which is 256 / 5 + 1 = 52 values.
  latin1.reserve(256 / 5 + 1);
  for (uint16_t i = 0; i <= std::numeric_limits<uint8_t>::max(); i += 5) {
    latin1.push_back(static_cast<uint8_t>(i));
  }

  std::string utf8 = ConvertLatin1ToUtf8({latin1.data(), latin1.size()});

  //  Obtained via:
  //  for i in $(seq 0 5 255); do printf '\\\\x%x' $i ; done | xargs echo -en |
  //     iconv -f latin1 -t utf8| hexdump -e '1/1 "0x%02x,\n"'
  const uint8_t kExpected[] = {
      0x00, 0x05, 0x0a, 0x0f, 0x14, 0x19, 0x1e, 0x23, 0x28, 0x2d, 0x32, 0x37,
      0x3c, 0x41, 0x46, 0x4b, 0x50, 0x55, 0x5a, 0x5f, 0x64, 0x69, 0x6e, 0x73,
      0x78, 0x7d, 0xc2, 0x82, 0xc2, 0x87, 0xc2, 0x8c, 0xc2, 0x91, 0xc2, 0x96,
      0xc2, 0x9b, 0xc2, 0xa0, 0xc2, 0xa5, 0xc2, 0xaa, 0xc2, 0xaf, 0xc2, 0xb4,
      0xc2, 0xb9, 0xc2, 0xbe, 0xc3, 0x83, 0xc3, 0x88, 0xc3, 0x8d, 0xc3, 0x92,
      0xc3, 0x97, 0xc3, 0x9c, 0xc3, 0xa1, 0xc3, 0xa6, 0xc3, 0xab, 0xc3, 0xb0,
      0xc3, 0xb5, 0xc3, 0xba, 0xc3, 0xbf};

  // kExpected contains an embedded NUL (first byte), so the std::string must
  // be built with an explicit length rather than from a C string.
  EXPECT_THAT(utf8, Eq(std::string(reinterpret_cast<const char*>(kExpected),
                                   sizeof(kExpected))));
}
61 
// The following strings are different encodings of the following code points:
//     \u0000, \u0001, \u0002, \u0005, \u000A, \u0015, \u002A, \u0055, \u00AA,
//     \u0155, \u02AA, \u0555, \u0AAA, \u1555, \u2AAA, \u5555, \uAAAA,
//     \U00015555, \U0002AAAA, \U00055555, \U000AAAAA, \U0010AAAA
// This gives reasonable coverage of the entire code point range, so that we
// exercise every encoding form, i.e. UTF-8 sequences of 1 to 4 bytes and
// UTF-16 with and without surrogate pairs.
const uint8_t kUtf16Le[] = {
    // Little-endian: the low byte of each 16-bit code unit comes first.
    // One code point per line; surrogate pairs occupy four bytes.
    0x00, 0x00,              // U+0000
    0x01, 0x00,              // U+0001
    0x02, 0x00,              // U+0002
    0x05, 0x00,              // U+0005
    0x0a, 0x00,              // U+000A
    0x15, 0x00,              // U+0015
    0x2a, 0x00,              // U+002A
    0x55, 0x00,              // U+0055
    0xaa, 0x00,              // U+00AA
    0x55, 0x01,              // U+0155
    0xaa, 0x02,              // U+02AA
    0x55, 0x05,              // U+0555
    0xaa, 0x0a,              // U+0AAA
    0x55, 0x15,              // U+1555
    0xaa, 0x2a,              // U+2AAA
    0x55, 0x55,              // U+5555
    0xaa, 0xaa,              // U+AAAA
    0x15, 0xd8, 0x55, 0xdd,  // U+15555  (D815 DD55)
    0x6a, 0xd8, 0xaa, 0xde,  // U+2AAAA  (D86A DEAA)
    0x15, 0xd9, 0x55, 0xdd,  // U+55555  (D915 DD55)
    0x6a, 0xda, 0xaa, 0xde,  // U+AAAAA  (DA6A DEAA)
    0xea, 0xdb, 0xaa, 0xde,  // U+10AAAA (DBEA DEAA)
};
75 
// Big-endian UTF-16: the high byte of each 16-bit code unit comes first.
// One code point per line; surrogate pairs occupy four bytes.
const uint8_t kUtf16Be[] = {
    0x00, 0x00,              // U+0000
    0x00, 0x01,              // U+0001
    0x00, 0x02,              // U+0002
    0x00, 0x05,              // U+0005
    0x00, 0x0a,              // U+000A
    0x00, 0x15,              // U+0015
    0x00, 0x2a,              // U+002A
    0x00, 0x55,              // U+0055
    0x00, 0xaa,              // U+00AA
    0x01, 0x55,              // U+0155
    0x02, 0xaa,              // U+02AA
    0x05, 0x55,              // U+0555
    0x0a, 0xaa,              // U+0AAA
    0x15, 0x55,              // U+1555
    0x2a, 0xaa,              // U+2AAA
    0x55, 0x55,              // U+5555
    0xaa, 0xaa,              // U+AAAA
    0xd8, 0x15, 0xdd, 0x55,  // U+15555
    0xd8, 0x6a, 0xde, 0xaa,  // U+2AAAA
    0xd9, 0x15, 0xdd, 0x55,  // U+55555
    0xda, 0x6a, 0xde, 0xaa,  // U+AAAAA
    0xdb, 0xea, 0xde, 0xaa,  // U+10AAAA
};
82 
// UTF-8 encoding of the same code points, one code point per line. Sequence
// lengths grow from one byte (ASCII range) up to four bytes (supplementary
// planes).
const uint8_t kExpectedUtf8[] = {
    0x00,                    // U+0000
    0x01,                    // U+0001
    0x02,                    // U+0002
    0x05,                    // U+0005
    0x0a,                    // U+000A
    0x15,                    // U+0015
    0x2a,                    // U+002A
    0x55,                    // U+0055
    0xc2, 0xaa,              // U+00AA
    0xc5, 0x95,              // U+0155
    0xca, 0xaa,              // U+02AA
    0xd5, 0x95,              // U+0555
    0xe0, 0xaa, 0xaa,        // U+0AAA
    0xe1, 0x95, 0x95,        // U+1555
    0xe2, 0xaa, 0xaa,        // U+2AAA
    0xe5, 0x95, 0x95,        // U+5555
    0xea, 0xaa, 0xaa,        // U+AAAA
    0xf0, 0x95, 0x95, 0x95,  // U+15555
    0xf0, 0xaa, 0xaa, 0xaa,  // U+2AAAA
    0xf1, 0x95, 0x95, 0x95,  // U+55555
    0xf2, 0xaa, 0xaa, 0xaa,  // U+AAAAA
    0xf4, 0x8a, 0xaa, 0xaa,  // U+10AAAA
};
89 
// Malformed UTF-16 input, in both byte orders. In order, each array holds:
//   1. a high surrogate followed by a code unit that is not a low surrogate,
//   2. a low surrogate with no preceding high surrogate,
//   3. a single trailing byte, which is too short to form a 2-byte code unit.
const uint8_t kInvalidUtf16Le[] = {
    0xea, 0xdb, 0x00, 0x00,  // high surrogate + non low surrogate
    0xaa, 0xde,              // low surrogate
    0x00,                    // truncated code unit
};
const uint8_t kInvalidUtf16Be[] = {
    0xdb, 0xea, 0x00, 0x00,  // high surrogate + non low surrogate
    0xde, 0xaa,              // low surrogate
    0x00,                    // truncated code unit
};
95 
// Expected output for the malformed inputs: three copies of the byte sequence
// ef bf bd (the UTF-8 encoding of U+FFFD), one per invalid sequence.
const uint8_t kExpectedUtf8ForInvalidUtf16[] = {
    0xef, 0xbf, 0xbd,  // 1st invalid sequence
    0xef, 0xbf, 0xbd,  // 2nd invalid sequence
    0xef, 0xbf, 0xbd,  // 3rd invalid sequence
};
100 
// Well-formed little-endian UTF-16 must convert to exactly kExpectedUtf8.
TEST(ConvertUtf16LeToUtf8, ValidInput) {
  // Built with an explicit length because the expected bytes start with NUL.
  const std::string expected(reinterpret_cast<const char*>(kExpectedUtf8),
                             sizeof(kExpectedUtf8));

  const std::string actual = ConvertUtf16LeToUtf8({kUtf16Le, sizeof(kUtf16Le)});

  EXPECT_THAT(actual, Eq(expected));
}
106 
// Well-formed big-endian UTF-16 must convert to exactly kExpectedUtf8.
TEST(ConvertUtf16BeToUtf8, ValidInput) {
  // Built with an explicit length because the expected bytes start with NUL.
  const std::string expected(reinterpret_cast<const char*>(kExpectedUtf8),
                             sizeof(kExpectedUtf8));

  const std::string actual = ConvertUtf16BeToUtf8({kUtf16Be, sizeof(kUtf16Be)});

  EXPECT_THAT(actual, Eq(expected));
}
112 
// Malformed little-endian UTF-16 must convert to exactly the bytes in
// kExpectedUtf8ForInvalidUtf16.
TEST(ConvertUtf16LeToUtf8, InvalidValidInput) {
  const std::string expected(
      reinterpret_cast<const char*>(kExpectedUtf8ForInvalidUtf16),
      sizeof(kExpectedUtf8ForInvalidUtf16));

  const std::string actual =
      ConvertUtf16LeToUtf8({kInvalidUtf16Le, sizeof(kInvalidUtf16Le)});

  EXPECT_THAT(actual, Eq(expected));
}
120 
// Malformed big-endian UTF-16 must convert to exactly the bytes in
// kExpectedUtf8ForInvalidUtf16.
TEST(ConvertUtf16BeToUtf8, InvalidValidInput) {
  const std::string expected(
      reinterpret_cast<const char*>(kExpectedUtf8ForInvalidUtf16),
      sizeof(kExpectedUtf8ForInvalidUtf16));

  const std::string actual =
      ConvertUtf16BeToUtf8({kInvalidUtf16Be, sizeof(kInvalidUtf16Be)});

  EXPECT_THAT(actual, Eq(expected));
}
128 
129 }  // namespace
130 
131 }  // namespace trace_processor
132 }  // namespace perfetto
133