xref: /aosp_15_r20/external/cronet/third_party/icu/scripts/sjis_gen.sh (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1#!/bin/sh
2# Copyright 2014 The Chromium Authors. All rights reserved.
3# Use of this source code is governed by a BSD-style license that can be
4# found in the LICENSE file.
5
6# References:
7#   https://encoding.spec.whatwg.org/#shift_jis
8
# Run this script in the source/data/mappings directory and save its output
# to shift_jis-html5.ucm. It downloads (or reuses) the following index file:
#   https://encoding.spec.whatwg.org/index-jis0208.txt
12
# Print the fixed header of the generated .ucm file: the copyright banner,
# the <...> attribute lines, the <icu:state> table and the opening CHARMAP
# line. Takes no arguments; writes to stdout.
#
# Defined with the POSIX "name()" form because this script runs under
# /bin/sh, where the "function" keyword is not guaranteed to exist.
preamble() {
# The quoted delimiter makes the here-document literal, so the backslash in
# \x3F (and anything added later) is never subject to shell expansion.
cat <<'PREAMBLE'
# ***************************************************************************
# *
# *   Copyright (C) 1995-2014, International Business Machines
# *   Corporation and others.  All Rights Reserved.
# *
# *   Generated per the algorithm for Shift_JIS
# *   described at https://encoding.spec.whatwg.org/#shift_jis
# *
# ***************************************************************************
<code_set_name>               "shift_jis-html5"
<char_name_mask>              "AXXXX"
<mb_cur_max>                  2
<mb_cur_min>                  1
<uconv_class>                 "MBCS"
<subchar>                     \x3F
<icu:charsetFamily>           "ASCII"

<icu:state>                   0-80, 81-9f:1, a1-df, e0-fc:1, 82:3, 84:4, 85-86:2, 87:5, 88:2, 98:6, eb-ec:2, ef:2, f9:2, fc:7

<icu:state>                   40-7e, 80-fc
<icu:state>                   80-fc
<icu:state>                   4f-7e, 80-fc, 59-5f.i, 7a-7e.i
<icu:state>                   40-7e, 80-fc, 61-6f.i
<icu:state>                   40-7e, 80-fc, 76-7d.i
<icu:state>                   40-7e, 80-fc, 73-7e.i
<icu:state>                   40-4b, 80-fc


CHARMAP
PREAMBLE
}
46
# The encoding spec for Shift_JIS says U+0080 has to be round-tripped with
# 0x80, so this emits one mapping more than plain ASCII: a "|0" line for
# every byte from 0x00 through 0x80 (129 lines), each mapped to the code
# point with the same value. Takes no arguments; writes to stdout.
ascii() {
  # A POSIX counting loop replaces seq(1), which is not a standard utility.
  i=0
  while [ "$i" -le 128 ]; do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}
55
56
# Map the single bytes 0xA1-0xDF to U+FF61-U+FF9F, emitting one "|0" line
# per byte. Takes no arguments; writes to stdout.
half_width_kana() {
  # Shell arithmetic parses the hexadecimal bounds itself, so the loop does
  # not depend on seq(1) accepting hex operands.
  i=$((0xA1))
  last=$((0xDF))
  while [ "$i" -le "$last" ]; do
    # Code point = byte + (0xFF61 - 0xA1).
    printf '<U%04X> \\x%02X |0\n' "$((i + 0xFF61 - 0xA1))" "$i"
    i=$((i + 1))
  done
}
65
66
# Convert the WHATWG index-jis0208 listing into two-byte mapping lines.
#
# From https://encoding.spec.whatwg.org/#index-shift_jis-pointer
# The index shift_jis pointer for a code point is the return value of
# these steps, and is used for the round-trip code points (tag = 0):
#
#   Let index be index jis0208 excluding all pointers in the range 8272 to 8835.
#   Return the index pointer for code point in index.
#
# Entries whose pointer ($1) lies inside 8272..8835 are therefore for
# decoding only and are tagged '3'.
# Besides, there are 24 more characters with multiple SJIS representations.
# Only the first of multiple is tagged with '0' (bi-directional mapping)
# while the rest is tagged with '3'.
#
# Arguments: $1 - optional path of the index file
#                 (default: index-jis0208.txt in the current directory)
# Outputs:   one "<Uxxxx> \xLL\xTT |tag" line per index entry, in file order;
#            lines starting with '#' and empty lines are skipped.
jis208() {
  # The awk program uses decimal constants only: hexadecimal literals in awk
  # source are an extension that not every awk implements, and where they are
  # missing "0x1F" silently parses as 0 concatenated with an empty variable.
  awk '
    !/^#/ && !/^$/ {
      pointer = $1 + 0
      lead = int(pointer / 188)
      lead_offset = (lead < 31) ? 129 : 193      # lead < 0x1F ? 0x81 : 0xC1
      trail = pointer % 188
      trail_offset = (trail < 63) ? 64 : 65      # trail < 0x3F ? 0x40 : 0x41
      encodable = (pointer < 8272 || pointer > 8835)
      tag = (encodable && !($2 in seen)) ? 0 : 3
      # $2 looks like 0x3000; drop the 0x prefix for the <U....> field.
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             lead + lead_offset, trail + trail_offset, tag)
      # Only encodable entries claim the round-trip slot for a code point.
      if (encodable) seen[$2] = 1
    }
  ' "${1:-index-jis0208.txt}"
}
93
# EUDC (End User Defined Characters) is for decoding only
# (use '|3' to denote that).
# See https://encoding.spec.whatwg.org/#shift_jis-decoder - step 5
# This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41}
# to implement it.
#
# Arguments: $1 - first trail byte of the range (decimal or 0x-prefixed hex)
#            $2 - last trail byte of the range, inclusive
#            $3 - trail offset subtracted from the byte to form the pointer
# Outputs:   one "<Uxxxx> \xLL\xTT |3" line per lead/trail pair, to stdout.
eudc() {
  # Normalise the arguments to decimal once; shell arithmetic accepts hex,
  # whereas the [ ... -le ... ] test below does not.
  first=$(($1))
  last=$(($2))
  offset=$(($3))

  # The upper bound for the lead byte is 0xF8 because each lead can
  # have 188 characters and the total # of characters in the EUDC
  # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder
  # step 3.5 in the encoding spec.)
  lead=$((0xF0))
  while [ "$lead" -le $((0xF8)) ]; do
    byte=$first
    while [ "$byte" -le "$last" ]; do
      pointer=$(( (lead - 0xC1) * 188 + byte - offset ))
      # Pointer 8836 corresponds to U+E000.
      printf '<U%4X> \\x%02X\\x%02X |3\n' \
        "$((pointer - 8836 + 0xE000))" "$lead" "$byte"
      byte=$((byte + 1))
    done
    lead=$((lead + 1))
  done
}
116
# Emit every mapping line of the table in generation order (not yet sorted
# or de-duplicated): the single-byte range, half-width kana, the JIS X 0208
# index, both EUDC trail-byte ranges, and three hand-written "|1" entries
# for U+00A5, U+203E and U+2212.
# Takes no arguments; writes to stdout.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  eudc "0x40" "0x7E" "0x40"
  eudc "0x80" "0xFC" "0x41"
  # printf '%s\n' prints the backslashes verbatim; whether echo expands
  # backslash sequences differs between sh implementations.
  printf '%s\n' \
    '<U00A5> \x5C |1' \
    '<U203E> \x7E |1' \
    '<U2212> \x81\x7C |1'
}
127
# Refresh the index file. A failed download is tolerated only when a usable
# copy is already present, so the table is never generated without its
# JIS X 0208 rows.
if ! wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0208.txt; then
  echo 'warning: could not download index-jis0208.txt; trying a local copy' >&2
fi
if [ ! -s index-jis0208.txt ]; then
  echo 'error: index-jis0208.txt is missing or empty' >&2
  exit 1
fi

preamble
# LC_ALL=C gives a byte-wise, locale-independent order and exact duplicate
# removal, so the generated file is the same on every machine.
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'
131echo 'END CHARMAP'
132