1// Copyright 2015 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package mime 6 7import ( 8 "bytes" 9 "encoding/base64" 10 "errors" 11 "fmt" 12 "io" 13 "strings" 14 "unicode" 15 "unicode/utf8" 16) 17 18// A WordEncoder is an RFC 2047 encoded-word encoder. 19type WordEncoder byte 20 21const ( 22 // BEncoding represents Base64 encoding scheme as defined by RFC 2045. 23 BEncoding = WordEncoder('b') 24 // QEncoding represents the Q-encoding scheme as defined by RFC 2047. 25 QEncoding = WordEncoder('q') 26) 27 28var ( 29 errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word") 30) 31 32// Encode returns the encoded-word form of s. If s is ASCII without special 33// characters, it is returned unchanged. The provided charset is the IANA 34// charset name of s. It is case insensitive. 35func (e WordEncoder) Encode(charset, s string) string { 36 if !needsEncoding(s) { 37 return s 38 } 39 return e.encodeWord(charset, s) 40} 41 42func needsEncoding(s string) bool { 43 for _, b := range s { 44 if (b < ' ' || b > '~') && b != '\t' { 45 return true 46 } 47 } 48 return false 49} 50 51// encodeWord encodes a string into an encoded-word. 52func (e WordEncoder) encodeWord(charset, s string) string { 53 var buf strings.Builder 54 // Could use a hint like len(s)*3, but that's not enough for cases 55 // with word splits and too much for simpler inputs. 56 // 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class. 57 buf.Grow(48) 58 59 e.openWord(&buf, charset) 60 if e == BEncoding { 61 e.bEncode(&buf, charset, s) 62 } else { 63 e.qEncode(&buf, charset, s) 64 } 65 closeWord(&buf) 66 67 return buf.String() 68} 69 70const ( 71 // The maximum length of an encoded-word is 75 characters. 72 // See RFC 2047, section 2. 73 maxEncodedWordLen = 75 74 // maxContentLen is how much content can be encoded, ignoring the header and 75 // 2-byte footer. 76 maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=") 77) 78 79var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen) 80 81// bEncode encodes s using base64 encoding and writes it to buf. 82func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) { 83 w := base64.NewEncoder(base64.StdEncoding, buf) 84 // If the charset is not UTF-8 or if the content is short, do not bother 85 // splitting the encoded-word. 86 if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen { 87 io.WriteString(w, s) 88 w.Close() 89 return 90 } 91 92 var currentLen, last, runeLen int 93 for i := 0; i < len(s); i += runeLen { 94 // Multi-byte characters must not be split across encoded-words. 95 // See RFC 2047, section 5.3. 96 _, runeLen = utf8.DecodeRuneInString(s[i:]) 97 98 if currentLen+runeLen <= maxBase64Len { 99 currentLen += runeLen 100 } else { 101 io.WriteString(w, s[last:i]) 102 w.Close() 103 e.splitWord(buf, charset) 104 last = i 105 currentLen = runeLen 106 } 107 } 108 io.WriteString(w, s[last:]) 109 w.Close() 110} 111 112// qEncode encodes s using Q encoding and writes it to buf. It splits the 113// encoded-words when necessary. 114func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) { 115 // We only split encoded-words when the charset is UTF-8. 116 if !isUTF8(charset) { 117 writeQString(buf, s) 118 return 119 } 120 121 var currentLen, runeLen int 122 for i := 0; i < len(s); i += runeLen { 123 b := s[i] 124 // Multi-byte characters must not be split across encoded-words. 125 // See RFC 2047, section 5.3. 126 var encLen int 127 if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' { 128 runeLen, encLen = 1, 1 129 } else { 130 _, runeLen = utf8.DecodeRuneInString(s[i:]) 131 encLen = 3 * runeLen 132 } 133 134 if currentLen+encLen > maxContentLen { 135 e.splitWord(buf, charset) 136 currentLen = 0 137 } 138 writeQString(buf, s[i:i+runeLen]) 139 currentLen += encLen 140 } 141} 142 143// writeQString encodes s using Q encoding and writes it to buf. 144func writeQString(buf *strings.Builder, s string) { 145 for i := 0; i < len(s); i++ { 146 switch b := s[i]; { 147 case b == ' ': 148 buf.WriteByte('_') 149 case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_': 150 buf.WriteByte(b) 151 default: 152 buf.WriteByte('=') 153 buf.WriteByte(upperhex[b>>4]) 154 buf.WriteByte(upperhex[b&0x0f]) 155 } 156 } 157} 158 159// openWord writes the beginning of an encoded-word into buf. 160func (e WordEncoder) openWord(buf *strings.Builder, charset string) { 161 buf.WriteString("=?") 162 buf.WriteString(charset) 163 buf.WriteByte('?') 164 buf.WriteByte(byte(e)) 165 buf.WriteByte('?') 166} 167 168// closeWord writes the end of an encoded-word into buf. 169func closeWord(buf *strings.Builder) { 170 buf.WriteString("?=") 171} 172 173// splitWord closes the current encoded-word and opens a new one. 174func (e WordEncoder) splitWord(buf *strings.Builder, charset string) { 175 closeWord(buf) 176 buf.WriteByte(' ') 177 e.openWord(buf, charset) 178} 179 180func isUTF8(charset string) bool { 181 return strings.EqualFold(charset, "UTF-8") 182} 183 184const upperhex = "0123456789ABCDEF" 185 186// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words. 187type WordDecoder struct { 188 // CharsetReader, if non-nil, defines a function to generate 189 // charset-conversion readers, converting from the provided 190 // charset into UTF-8. 191 // Charsets are always lower-case. utf-8, iso-8859-1 and us-ascii charsets 192 // are handled by default. 193 // One of the CharsetReader's result values must be non-nil. 194 CharsetReader func(charset string, input io.Reader) (io.Reader, error) 195} 196 197// Decode decodes an RFC 2047 encoded-word. 198func (d *WordDecoder) Decode(word string) (string, error) { 199 // See https://tools.ietf.org/html/rfc2047#section-2 for details. 200 // Our decoder is permissive, we accept empty encoded-text. 201 if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 { 202 return "", errInvalidWord 203 } 204 word = word[2 : len(word)-2] 205 206 // split word "UTF-8?q?text" into "UTF-8", 'q', and "text" 207 charset, text, _ := strings.Cut(word, "?") 208 if charset == "" { 209 return "", errInvalidWord 210 } 211 encoding, text, _ := strings.Cut(text, "?") 212 if len(encoding) != 1 { 213 return "", errInvalidWord 214 } 215 216 content, err := decode(encoding[0], text) 217 if err != nil { 218 return "", err 219 } 220 221 var buf strings.Builder 222 if err := d.convert(&buf, charset, content); err != nil { 223 return "", err 224 } 225 return buf.String(), nil 226} 227 228// DecodeHeader decodes all encoded-words of the given string. It returns an 229// error if and only if WordDecoder.CharsetReader of d returns an error. 230func (d *WordDecoder) DecodeHeader(header string) (string, error) { 231 // If there is no encoded-word, returns before creating a buffer. 232 i := strings.Index(header, "=?") 233 if i == -1 { 234 return header, nil 235 } 236 237 var buf strings.Builder 238 239 buf.WriteString(header[:i]) 240 header = header[i:] 241 242 betweenWords := false 243 for { 244 start := strings.Index(header, "=?") 245 if start == -1 { 246 break 247 } 248 cur := start + len("=?") 249 250 i := strings.Index(header[cur:], "?") 251 if i == -1 { 252 break 253 } 254 charset := header[cur : cur+i] 255 cur += i + len("?") 256 257 if len(header) < cur+len("Q??=") { 258 break 259 } 260 encoding := header[cur] 261 cur++ 262 263 if header[cur] != '?' { 264 break 265 } 266 cur++ 267 268 j := strings.Index(header[cur:], "?=") 269 if j == -1 { 270 break 271 } 272 text := header[cur : cur+j] 273 end := cur + j + len("?=") 274 275 content, err := decode(encoding, text) 276 if err != nil { 277 betweenWords = false 278 buf.WriteString(header[:start+2]) 279 header = header[start+2:] 280 continue 281 } 282 283 // Write characters before the encoded-word. White-space and newline 284 // characters separating two encoded-words must be deleted. 285 if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) { 286 buf.WriteString(header[:start]) 287 } 288 289 if err := d.convert(&buf, charset, content); err != nil { 290 return "", err 291 } 292 293 header = header[end:] 294 betweenWords = true 295 } 296 297 if len(header) > 0 { 298 buf.WriteString(header) 299 } 300 301 return buf.String(), nil 302} 303 304func decode(encoding byte, text string) ([]byte, error) { 305 switch encoding { 306 case 'B', 'b': 307 return base64.StdEncoding.DecodeString(text) 308 case 'Q', 'q': 309 return qDecode(text) 310 default: 311 return nil, errInvalidWord 312 } 313} 314 315func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error { 316 switch { 317 case strings.EqualFold("utf-8", charset): 318 buf.Write(content) 319 case strings.EqualFold("iso-8859-1", charset): 320 for _, c := range content { 321 buf.WriteRune(rune(c)) 322 } 323 case strings.EqualFold("us-ascii", charset): 324 for _, c := range content { 325 if c >= utf8.RuneSelf { 326 buf.WriteRune(unicode.ReplacementChar) 327 } else { 328 buf.WriteByte(c) 329 } 330 } 331 default: 332 if d.CharsetReader == nil { 333 return fmt.Errorf("mime: unhandled charset %q", charset) 334 } 335 r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content)) 336 if err != nil { 337 return err 338 } 339 if _, err = io.Copy(buf, r); err != nil { 340 return err 341 } 342 } 343 return nil 344} 345 346// hasNonWhitespace reports whether s (assumed to be ASCII) contains at least 347// one byte of non-whitespace. 348func hasNonWhitespace(s string) bool { 349 for _, b := range s { 350 switch b { 351 // Encoded-words can only be separated by linear white spaces which does 352 // not include vertical tabs (\v). 353 case ' ', '\t', '\n', '\r': 354 default: 355 return true 356 } 357 } 358 return false 359} 360 361// qDecode decodes a Q encoded string. 362func qDecode(s string) ([]byte, error) { 363 dec := make([]byte, len(s)) 364 n := 0 365 for i := 0; i < len(s); i++ { 366 switch c := s[i]; { 367 case c == '_': 368 dec[n] = ' ' 369 case c == '=': 370 if i+2 >= len(s) { 371 return nil, errInvalidWord 372 } 373 b, err := readHexByte(s[i+1], s[i+2]) 374 if err != nil { 375 return nil, err 376 } 377 dec[n] = b 378 i += 2 379 case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t': 380 dec[n] = c 381 default: 382 return nil, errInvalidWord 383 } 384 n++ 385 } 386 387 return dec[:n], nil 388} 389 390// readHexByte returns the byte from its quoted-printable representation. 391func readHexByte(a, b byte) (byte, error) { 392 var hb, lb byte 393 var err error 394 if hb, err = fromHex(a); err != nil { 395 return 0, err 396 } 397 if lb, err = fromHex(b); err != nil { 398 return 0, err 399 } 400 return hb<<4 | lb, nil 401} 402 403func fromHex(b byte) (byte, error) { 404 switch { 405 case b >= '0' && b <= '9': 406 return b - '0', nil 407 case b >= 'A' && b <= 'F': 408 return b - 'A' + 10, nil 409 // Accept badly encoded bytes. 410 case b >= 'a' && b <= 'f': 411 return b - 'a' + 10, nil 412 } 413 return 0, fmt.Errorf("mime: invalid hex byte %#02x", b) 414} 415