xref: /aosp_15_r20/external/brotli/c/enc/utf8_util.c (revision f4ee7fba7774faf2a30f13154332c0a06550dbc4)
1*f4ee7fbaSAndroid Build Coastguard Worker /* Copyright 2013 Google Inc. All Rights Reserved.
2*f4ee7fbaSAndroid Build Coastguard Worker 
3*f4ee7fbaSAndroid Build Coastguard Worker    Distributed under MIT license.
4*f4ee7fbaSAndroid Build Coastguard Worker    See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*f4ee7fbaSAndroid Build Coastguard Worker */
6*f4ee7fbaSAndroid Build Coastguard Worker 
7*f4ee7fbaSAndroid Build Coastguard Worker /* Heuristics for deciding about the UTF8-ness of strings. */
8*f4ee7fbaSAndroid Build Coastguard Worker 
9*f4ee7fbaSAndroid Build Coastguard Worker #include "./utf8_util.h"
10*f4ee7fbaSAndroid Build Coastguard Worker 
11*f4ee7fbaSAndroid Build Coastguard Worker #include <brotli/types.h>
12*f4ee7fbaSAndroid Build Coastguard Worker 
13*f4ee7fbaSAndroid Build Coastguard Worker #if defined(__cplusplus) || defined(c_plusplus)
14*f4ee7fbaSAndroid Build Coastguard Worker extern "C" {
15*f4ee7fbaSAndroid Build Coastguard Worker #endif
16*f4ee7fbaSAndroid Build Coastguard Worker 
/* Decodes one symbol from the start of |input|, looking at no more than |size|
   bytes, and stores it in |*symbol|. Returns the number of bytes consumed.

   A well-formed sequence yields its code point and its length (1 to 4).
   Anything else consumes exactly one byte and yields 0x110000 | input[0],
   a value above the Unicode code space, so callers can detect it with
   "symbol >= 0x110000".

   Overlong encodings, truncated sequences, stray continuation bytes, values
   above 0x10FFFF and the NUL byte all count as "not UTF8". Surrogate code
   points (0xD800..0xDFFF) are not filtered out.

   |input[0]| is read unconditionally, so |size| must be at least 1. */
static size_t BrotliParseAsUTF8(
    int* symbol, const uint8_t* input, size_t size) {
  const int lead = input[0];
  int code;
  /* The four lead byte patterns are mutually exclusive, so at most one branch
     can match; every failed check drops to the common fallback below. */
  if (lead < 0x80) {
    /* ASCII; a NUL byte is deliberately not accepted. */
    if (lead != 0) {
      *symbol = lead;
      return 1;
    }
  } else if ((lead & 0xE0) == 0xC0) {
    /* 2-byte UTF8 */
    if (size >= 2u && (input[1] & 0xC0) == 0x80) {
      code = ((lead & 0x1F) << 6) | (input[1] & 0x3F);
      /* Values that fit into 7 bits are overlong encodings. */
      if (code >= 0x80) {
        *symbol = code;
        return 2;
      }
    }
  } else if ((lead & 0xF0) == 0xE0) {
    /* 3-byte UTF8 */
    if (size >= 3u &&
        (input[1] & 0xC0) == 0x80 &&
        (input[2] & 0xC0) == 0x80) {
      code = ((lead & 0x0F) << 12) |
             ((input[1] & 0x3F) << 6) |
             (input[2] & 0x3F);
      /* Values that fit into 11 bits are overlong encodings. */
      if (code >= 0x800) {
        *symbol = code;
        return 3;
      }
    }
  } else if ((lead & 0xF8) == 0xF0) {
    /* 4-byte UTF8 */
    if (size >= 4u &&
        (input[1] & 0xC0) == 0x80 &&
        (input[2] & 0xC0) == 0x80 &&
        (input[3] & 0xC0) == 0x80) {
      code = ((lead & 0x07) << 18) |
             ((input[1] & 0x3F) << 12) |
             ((input[2] & 0x3F) << 6) |
             (input[3] & 0x3F);
      /* Reject overlong encodings and values beyond the last code point. */
      if (code >= 0x10000 && code <= 0x10FFFF) {
        *symbol = code;
        return 4;
      }
    }
  }
  /* Not UTF8, emit a special symbol above the UTF8-code space */
  *symbol = 0x110000 | lead;
  return 1;
}
66*f4ee7fbaSAndroid Build Coastguard Worker 
67*f4ee7fbaSAndroid Build Coastguard Worker /* Returns 1 if at least min_fraction of the data is UTF8-encoded.*/
BrotliIsMostlyUTF8(const uint8_t * data,const size_t pos,const size_t mask,const size_t length,const double min_fraction)68*f4ee7fbaSAndroid Build Coastguard Worker BROTLI_BOOL BrotliIsMostlyUTF8(
69*f4ee7fbaSAndroid Build Coastguard Worker     const uint8_t* data, const size_t pos, const size_t mask,
70*f4ee7fbaSAndroid Build Coastguard Worker     const size_t length, const double min_fraction) {
71*f4ee7fbaSAndroid Build Coastguard Worker   size_t size_utf8 = 0;
72*f4ee7fbaSAndroid Build Coastguard Worker   size_t i = 0;
73*f4ee7fbaSAndroid Build Coastguard Worker   while (i < length) {
74*f4ee7fbaSAndroid Build Coastguard Worker     int symbol;
75*f4ee7fbaSAndroid Build Coastguard Worker     size_t bytes_read =
76*f4ee7fbaSAndroid Build Coastguard Worker         BrotliParseAsUTF8(&symbol, &data[(pos + i) & mask], length - i);
77*f4ee7fbaSAndroid Build Coastguard Worker     i += bytes_read;
78*f4ee7fbaSAndroid Build Coastguard Worker     if (symbol < 0x110000) size_utf8 += bytes_read;
79*f4ee7fbaSAndroid Build Coastguard Worker   }
80*f4ee7fbaSAndroid Build Coastguard Worker   return TO_BROTLI_BOOL((double)size_utf8 > min_fraction * (double)length);
81*f4ee7fbaSAndroid Build Coastguard Worker }
82*f4ee7fbaSAndroid Build Coastguard Worker 
83*f4ee7fbaSAndroid Build Coastguard Worker #if defined(__cplusplus) || defined(c_plusplus)
84*f4ee7fbaSAndroid Build Coastguard Worker }  /* extern "C" */
85*f4ee7fbaSAndroid Build Coastguard Worker #endif
86