1// Copyright 2023 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Windows UTF-16 strings can contain unpaired surrogates, which can't be 6// decoded into a valid UTF-8 string. This file defines a set of functions 7// that can be used to encode and decode potentially ill-formed UTF-16 strings 8// by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). 9// 10// WTF-8 is a strict superset of UTF-8, i.e. any string that is 11// well-formed in UTF-8 is also well-formed in WTF-8 and the content 12// is unchanged. Also, the conversion never fails and is lossless. 13// 14// The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string 15// is that the conversion is lossless even for ill-formed UTF-16 strings. 16// This property allows to read an ill-formed UTF-16 string, convert it 17// to a Go string, and convert it back to the same original UTF-16 string. 18// 19// See go.dev/issues/59971 for more info. 20 21package syscall 22 23import ( 24 "unicode/utf16" 25 "unicode/utf8" 26) 27 28const ( 29 surr1 = 0xd800 30 surr2 = 0xdc00 31 surr3 = 0xe000 32 33 tx = 0b10000000 34 t3 = 0b11100000 35 maskx = 0b00111111 36 mask3 = 0b00001111 37 38 rune1Max = 1<<7 - 1 39 rune2Max = 1<<11 - 1 40) 41 42// encodeWTF16 returns the potentially ill-formed 43// UTF-16 encoding of s. 44func encodeWTF16(s string, buf []uint16) []uint16 { 45 for i := 0; i < len(s); { 46 // Cannot use 'for range s' because it expects valid 47 // UTF-8 runes. 48 r, size := utf8.DecodeRuneInString(s[i:]) 49 if r == utf8.RuneError { 50 // Check if s[i:] contains a valid WTF-8 encoded surrogate. 51 if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF { 52 r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx) 53 buf = append(buf, uint16(r)) 54 i += 3 55 continue 56 } 57 } 58 i += size 59 buf = utf16.AppendRune(buf, r) 60 } 61 return buf 62} 63 64// decodeWTF16 returns the WTF-8 encoding of 65// the potentially ill-formed UTF-16 s. 66func decodeWTF16(s []uint16, buf []byte) []byte { 67 for i := 0; i < len(s); i++ { 68 var ar rune 69 switch r := s[i]; { 70 case r < surr1, surr3 <= r: 71 // normal rune 72 ar = rune(r) 73 case surr1 <= r && r < surr2 && i+1 < len(s) && 74 surr2 <= s[i+1] && s[i+1] < surr3: 75 // valid surrogate sequence 76 ar = utf16.DecodeRune(rune(r), rune(s[i+1])) 77 i++ 78 default: 79 // WTF-8 fallback. 80 // This only handles the 3-byte case of utf8.AppendRune, 81 // as surrogates always fall in that case. 82 ar = rune(r) 83 if ar > utf8.MaxRune { 84 ar = utf8.RuneError 85 } 86 buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx) 87 continue 88 } 89 buf = utf8.AppendRune(buf, ar) 90 } 91 return buf 92} 93