#!/bin/sh
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Generates the ICU mapping table (.ucm) for the WHATWG flavour of Shift_JIS.
#
# References:
#   https://encoding.spec.whatwg.org/#shift_jis
#
# Usage: run this script in the source/data/mappings directory and save its
# standard output to shift_jis-html5.ucm. The script reads the following
# index file from the current directory (it also tries to fetch it with
# wget before generating the table):
#   https://encoding.spec.whatwg.org/index-jis0208.txt

# Print the fixed .ucm header: the license banner, the converter properties,
# the MBCS state table and the opening CHARMAP line.
# Arguments: none
# Outputs:   the header on stdout
#
# The here-doc delimiter is quoted so that the template is emitted verbatim
# (nothing in it may ever be subject to parameter or command expansion).
preamble() {
cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for Shift_JIS
# * described at https://encoding.spec.whatwg.org/#shift_jis
# *
# ***************************************************************************
<code_set_name> "shift_jis-html5"
<char_name_mask> "AXXXX"
<mb_cur_max> 2
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \x3F
<icu:charsetFamily> "ASCII"

<icu:state> 0-80, 81-9f:1, a1-df, e0-fc:1, 82:3, 84:4, 85-86:2, 87:5, 88:2, 98:6, eb-ec:2, ef:2, f9:2, fc:7

<icu:state> 40-7e, 80-fc
<icu:state> 80-fc
<icu:state> 4f-7e, 80-fc, 59-5f.i, 7a-7e.i
<icu:state> 40-7e, 80-fc, 61-6f.i
<icu:state> 40-7e, 80-fc, 76-7d.i
<icu:state> 40-7e, 80-fc, 73-7e.i
<icu:state> 40-4b, 80-fc


CHARMAP
PREAMBLE
}

# The encoding spec for Shift_JIS says U+0080 has to be round-tripped with
# 0x80. So, this is one character more than ASCII up to 128 (0x80).
# Emit round-trip (|0) mappings U+0000..U+0080 <-> 0x00..0x80.
# Outputs: one .ucm mapping line per code point on stdout.
ascii() {
  i=0
  while [ "$i" -le 128 ]; do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Map 0x[A1-DF] to U+FF61 to U+FF9F (round-trip, |0).
# Outputs: one .ucm mapping line per byte on stdout.
half_width_kana() {
  i=$((0xA1))
  last=$((0xDF))
  while [ "$i" -le "$last" ]; do
    # The code point is the byte value shifted so that 0xA1 lands on U+FF61.
    printf '<U%04X> \\x%02X |0\n' "$((i + 0xFF61 - 0xA1))" "$i"
    i=$((i + 1))
  done
}


# From https://encoding.spec.whatwg.org/#index-shift_jis-pointer
# The index shift_jis pointer for code point is the return value of
# these steps for the round-trip code points (tag = 0)
#
#   Let index be index jis0208 excluding all pointers in the range 8272 to 8835.
#   Return the index pointer for code point in index.
#
# A pointer ($1) inside the excluded range 8272..8835 is for decoding only
# and is tagged with '3'.
# Besides, there are 24 more characters with multiple SJIS representations.
# Only the first of multiple is tagged with '0' (bi-directional mapping)
# while the rest is tagged with '3'.
#
# Reads index-jis0208.txt from the current directory ("pointer 0xXXXX ..."
# per line; comment lines starting with '#' and empty lines are skipped).
# Outputs: one .ucm mapping line per index entry on stdout.
jis208() {
  # Only decimal constants are used inside awk: hexadecimal literals in
  # program text are not part of POSIX awk.
  awk '!/^#/ && !/^$/ {
      pointer = $1
      lead = int(pointer / 188)
      # Lead bytes run 0x81.. for lead < 0x1F and 0xE0.. after that.
      lead_offset = (lead < 31) ? 129 : 193
      trail = pointer % 188
      # Trail bytes skip 0x7F: 0x40.. for trail < 0x3F, 0x80.. after that.
      trail_offset = (trail < 63) ? 64 : 65
      is_encodable = (pointer < 8272 || pointer > 8835)
      tag = (is_encodable && has_seen[$2] == 0) ? 0 : 3
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             lead + lead_offset, trail + trail_offset, tag)
      if (is_encodable) has_seen[$2] = 1
    }' index-jis0208.txt
}

# EUDC (End User Defined Characters) is for decoding only
# (use '|3' to denote that).
# See https://encoding.spec.whatwg.org/#shift_jis-decoder - step 5
# This function is called twice with {0x40, 0x7E, 0x40} and {0x80, 0xFC, 0x41}
# to implement it.
#
# Arguments: $1 - first trail byte of the range (inclusive)
#            $2 - last trail byte of the range (inclusive)
#            $3 - offset subtracted from the trail byte to get its index
#            (each may be written in decimal or as a 0x-prefixed hex number)
# Outputs:   one .ucm mapping line per (lead, trail) pair on stdout
# Returns:   2 when not called with exactly three arguments
eudc() {
  if [ "$#" -ne 3 ]; then
    echo 'usage: eudc FIRST_TRAIL LAST_TRAIL TRAIL_OFFSET' >&2
    return 2
  fi
  first_byte=$(($1))
  last_byte=$(($2))
  offset=$(($3))

  # The upper bound for the lead byte is 0xF8 because each lead can
  # have 188 characters and the total # of characters in the EUDC
  # is 1692 = 188 * (0xF9 - 0xF0) = 10528 - 8836 (see Shift_JIS decoder
  # step 3.5 in the encoding spec.)
  lead=$((0xF0))
  last_lead=$((0xF8))
  while [ "$lead" -le "$last_lead" ]; do
    byte=$first_byte
    while [ "$byte" -le "$last_byte" ]; do
      pointer=$(( (lead - 0xC1) * 188 + byte - offset ))
      unicode=$((pointer - 8836 + 0xE000))
      printf '<U%04X> \\x%02X\\x%02X |3\n' "$unicode" "$lead" "$byte"
      byte=$((byte + 1))
    done
    lead=$((lead + 1))
  done
}

# Emit every mapping line of the table, unsorted and possibly with
# duplicates; the caller is expected to sort and de-duplicate the result.
# Outputs: .ucm mapping lines on stdout.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  eudc "0x40" "0x7E" "0x40"
  eudc "0x80" "0xFC" "0x41"
  # Extra mappings tagged '|1'. printf '%s\n' is used instead of echo
  # because some echo implementations interpret backslash sequences such
  # as \x5C instead of printing them literally.
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
  printf '%s\n' '<U2212> \x81\x7C |1'
}

# Refresh the index file. A failed download is not fatal as long as a
# previously downloaded copy is available in the current directory.
wget -N -r -nd https://encoding.spec.whatwg.org/index-jis0208.txt ||
  echo 'warning: could not download index-jis0208.txt' >&2
if [ ! -s index-jis0208.txt ]; then
  echo 'error: index-jis0208.txt is missing or empty' >&2
  exit 1
fi

preamble
# Sort byte-wise so that the generated table does not depend on the locale.
unsorted_table | LC_ALL=C sort -u
echo 'END CHARMAP'