1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package mime
6
7import (
8	"bytes"
9	"encoding/base64"
10	"errors"
11	"fmt"
12	"io"
13	"strings"
14	"unicode"
15	"unicode/utf8"
16)
17
// A WordEncoder is an RFC 2047 encoded-word encoder.
// Its underlying byte is the encoding letter that is written into the header
// of every encoded-word it produces ('b' for BEncoding, 'q' for QEncoding).
type WordEncoder byte
20
21const (
22	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
23	BEncoding = WordEncoder('b')
24	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
25	QEncoding = WordEncoder('q')
26)
27
// errInvalidWord is reported when the input cannot be parsed or decoded as an
// RFC 2047 encoded-word.
var errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
31
32// Encode returns the encoded-word form of s. If s is ASCII without special
33// characters, it is returned unchanged. The provided charset is the IANA
34// charset name of s. It is case insensitive.
35func (e WordEncoder) Encode(charset, s string) string {
36	if !needsEncoding(s) {
37		return s
38	}
39	return e.encodeWord(charset, s)
40}
41
// needsEncoding reports whether s contains any rune other than a tab or a
// printable ASCII character (' ' through '~').
func needsEncoding(s string) bool {
	for _, r := range s {
		switch {
		case r == '\t':
			// Tabs are allowed to pass through unencoded.
		case r < ' ', r > '~':
			return true
		}
	}
	return false
}
50
51// encodeWord encodes a string into an encoded-word.
52func (e WordEncoder) encodeWord(charset, s string) string {
53	var buf strings.Builder
54	// Could use a hint like len(s)*3, but that's not enough for cases
55	// with word splits and too much for simpler inputs.
56	// 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
57	buf.Grow(48)
58
59	e.openWord(&buf, charset)
60	if e == BEncoding {
61		e.bEncode(&buf, charset, s)
62	} else {
63		e.qEncode(&buf, charset, s)
64	}
65	closeWord(&buf)
66
67	return buf.String()
68}
69
// maxEncodedWordLen is the maximum length of an encoded-word: 75 characters.
// See RFC 2047, section 2.
const maxEncodedWordLen = 75

// maxContentLen is how much encoded content fits in a single encoded-word
// once the "=?UTF-8?q?" header and the 2-byte "?=" footer are accounted for.
const maxContentLen = maxEncodedWordLen - (len("=?UTF-8?q?") + len("?="))
78
// maxBase64Len is the number of decoded bytes that corresponds to
// maxContentLen characters of base64 text. bEncode uses it as the cap on how
// many raw bytes go into a single encoded-word.
var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
80
81// bEncode encodes s using base64 encoding and writes it to buf.
82func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
83	w := base64.NewEncoder(base64.StdEncoding, buf)
84	// If the charset is not UTF-8 or if the content is short, do not bother
85	// splitting the encoded-word.
86	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
87		io.WriteString(w, s)
88		w.Close()
89		return
90	}
91
92	var currentLen, last, runeLen int
93	for i := 0; i < len(s); i += runeLen {
94		// Multi-byte characters must not be split across encoded-words.
95		// See RFC 2047, section 5.3.
96		_, runeLen = utf8.DecodeRuneInString(s[i:])
97
98		if currentLen+runeLen <= maxBase64Len {
99			currentLen += runeLen
100		} else {
101			io.WriteString(w, s[last:i])
102			w.Close()
103			e.splitWord(buf, charset)
104			last = i
105			currentLen = runeLen
106		}
107	}
108	io.WriteString(w, s[last:])
109	w.Close()
110}
111
112// qEncode encodes s using Q encoding and writes it to buf. It splits the
113// encoded-words when necessary.
114func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
115	// We only split encoded-words when the charset is UTF-8.
116	if !isUTF8(charset) {
117		writeQString(buf, s)
118		return
119	}
120
121	var currentLen, runeLen int
122	for i := 0; i < len(s); i += runeLen {
123		b := s[i]
124		// Multi-byte characters must not be split across encoded-words.
125		// See RFC 2047, section 5.3.
126		var encLen int
127		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
128			runeLen, encLen = 1, 1
129		} else {
130			_, runeLen = utf8.DecodeRuneInString(s[i:])
131			encLen = 3 * runeLen
132		}
133
134		if currentLen+encLen > maxContentLen {
135			e.splitWord(buf, charset)
136			currentLen = 0
137		}
138		writeQString(buf, s[i:i+runeLen])
139		currentLen += encLen
140	}
141}
142
143// writeQString encodes s using Q encoding and writes it to buf.
144func writeQString(buf *strings.Builder, s string) {
145	for i := 0; i < len(s); i++ {
146		switch b := s[i]; {
147		case b == ' ':
148			buf.WriteByte('_')
149		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
150			buf.WriteByte(b)
151		default:
152			buf.WriteByte('=')
153			buf.WriteByte(upperhex[b>>4])
154			buf.WriteByte(upperhex[b&0x0f])
155		}
156	}
157}
158
159// openWord writes the beginning of an encoded-word into buf.
160func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
161	buf.WriteString("=?")
162	buf.WriteString(charset)
163	buf.WriteByte('?')
164	buf.WriteByte(byte(e))
165	buf.WriteByte('?')
166}
167
// closeWord terminates the current encoded-word by appending "?=" to buf.
func closeWord(buf *strings.Builder) {
	buf.WriteByte('?')
	buf.WriteByte('=')
}
172
173// splitWord closes the current encoded-word and opens a new one.
174func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
175	closeWord(buf)
176	buf.WriteByte(' ')
177	e.openWord(buf, charset)
178}
179
// isUTF8 reports whether charset names UTF-8, ignoring case.
func isUTF8(charset string) bool {
	const utf8Name = "UTF-8"
	return strings.EqualFold(utf8Name, charset)
}
183
// upperhex maps a 4-bit value to its upper-case hexadecimal digit. It is
// indexed by writeQString when emitting "=XX" escapes.
const upperhex = "0123456789ABCDEF"
185
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
// With a nil CharsetReader only the utf-8, iso-8859-1 and us-ascii charsets
// can be decoded; any other charset results in an error.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// Charsets are always lower-case. utf-8, iso-8859-1 and us-ascii charsets
	// are handled by default.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
196
197// Decode decodes an RFC 2047 encoded-word.
198func (d *WordDecoder) Decode(word string) (string, error) {
199	// See https://tools.ietf.org/html/rfc2047#section-2 for details.
200	// Our decoder is permissive, we accept empty encoded-text.
201	if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
202		return "", errInvalidWord
203	}
204	word = word[2 : len(word)-2]
205
206	// split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
207	charset, text, _ := strings.Cut(word, "?")
208	if charset == "" {
209		return "", errInvalidWord
210	}
211	encoding, text, _ := strings.Cut(text, "?")
212	if len(encoding) != 1 {
213		return "", errInvalidWord
214	}
215
216	content, err := decode(encoding[0], text)
217	if err != nil {
218		return "", err
219	}
220
221	var buf strings.Builder
222	if err := d.convert(&buf, charset, content); err != nil {
223		return "", err
224	}
225	return buf.String(), nil
226}
227
// DecodeHeader decodes all encoded-words of the given string. Text outside
// encoded-words, and anything that cannot be parsed or decoded as an
// encoded-word, is copied to the result unchanged. White space that only
// separates two adjacent encoded-words is dropped.
//
// It returns an error only when charset conversion fails: when
// WordDecoder.CharsetReader of d (or the reader it returns) reports an error,
// or when the charset is not handled by default and CharsetReader is nil.
func (d *WordDecoder) DecodeHeader(header string) (string, error) {
	// If there is no encoded-word, returns before creating a buffer.
	i := strings.Index(header, "=?")
	if i == -1 {
		return header, nil
	}

	var buf strings.Builder

	// Copy everything that precedes the first "=?" verbatim.
	buf.WriteString(header[:i])
	header = header[i:]

	// betweenWords is true when the text consumed immediately before the
	// current position was a successfully decoded encoded-word.
	betweenWords := false
	for {
		// Every break below leaves the unparsed remainder in header; it is
		// appended unchanged after the loop.
		start := strings.Index(header, "=?")
		if start == -1 {
			break
		}
		cur := start + len("=?")

		// The charset runs up to the next '?'.
		i := strings.Index(header[cur:], "?")
		if i == -1 {
			break
		}
		charset := header[cur : cur+i]
		cur += i + len("?")

		// At least the encoding letter, a '?', and the closing "?=" must
		// remain; this also keeps the two index expressions below in range.
		if len(header) < cur+len("Q??=") {
			break
		}
		encoding := header[cur]
		cur++

		if header[cur] != '?' {
			break
		}
		cur++

		// The encoded text runs up to the closing "?=".
		j := strings.Index(header[cur:], "?=")
		if j == -1 {
			break
		}
		text := header[cur : cur+j]
		end := cur + j + len("?=")

		content, err := decode(encoding, text)
		if err != nil {
			// Not a valid encoded-word: emit everything through this "=?"
			// literally and resume scanning right after it.
			betweenWords = false
			buf.WriteString(header[:start+2])
			header = header[start+2:]
			continue
		}

		// Write characters before the encoded-word. White-space and newline
		// characters separating two encoded-words must be deleted.
		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
			buf.WriteString(header[:start])
		}

		if err := d.convert(&buf, charset, content); err != nil {
			return "", err
		}

		header = header[end:]
		betweenWords = true
	}

	// Append whatever is left after the last decoded encoded-word.
	if len(header) > 0 {
		buf.WriteString(header)
	}

	return buf.String(), nil
}
303
304func decode(encoding byte, text string) ([]byte, error) {
305	switch encoding {
306	case 'B', 'b':
307		return base64.StdEncoding.DecodeString(text)
308	case 'Q', 'q':
309		return qDecode(text)
310	default:
311		return nil, errInvalidWord
312	}
313}
314
315func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
316	switch {
317	case strings.EqualFold("utf-8", charset):
318		buf.Write(content)
319	case strings.EqualFold("iso-8859-1", charset):
320		for _, c := range content {
321			buf.WriteRune(rune(c))
322		}
323	case strings.EqualFold("us-ascii", charset):
324		for _, c := range content {
325			if c >= utf8.RuneSelf {
326				buf.WriteRune(unicode.ReplacementChar)
327			} else {
328				buf.WriteByte(c)
329			}
330		}
331	default:
332		if d.CharsetReader == nil {
333			return fmt.Errorf("mime: unhandled charset %q", charset)
334		}
335		r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
336		if err != nil {
337			return err
338		}
339		if _, err = io.Copy(buf, r); err != nil {
340			return err
341		}
342	}
343	return nil
344}
345
// hasNonWhitespace reports whether s (assumed to be ASCII) contains anything
// other than space, tab, newline, or carriage return.
func hasNonWhitespace(s string) bool {
	for _, r := range s {
		// Encoded-words can only be separated by linear white space, which
		// does not include vertical tabs (\v).
		if r != ' ' && r != '\t' && r != '\n' && r != '\r' {
			return true
		}
	}
	return false
}
360
361// qDecode decodes a Q encoded string.
362func qDecode(s string) ([]byte, error) {
363	dec := make([]byte, len(s))
364	n := 0
365	for i := 0; i < len(s); i++ {
366		switch c := s[i]; {
367		case c == '_':
368			dec[n] = ' '
369		case c == '=':
370			if i+2 >= len(s) {
371				return nil, errInvalidWord
372			}
373			b, err := readHexByte(s[i+1], s[i+2])
374			if err != nil {
375				return nil, err
376			}
377			dec[n] = b
378			i += 2
379		case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
380			dec[n] = c
381		default:
382			return nil, errInvalidWord
383		}
384		n++
385	}
386
387	return dec[:n], nil
388}
389
390// readHexByte returns the byte from its quoted-printable representation.
391func readHexByte(a, b byte) (byte, error) {
392	var hb, lb byte
393	var err error
394	if hb, err = fromHex(a); err != nil {
395		return 0, err
396	}
397	if lb, err = fromHex(b); err != nil {
398		return 0, err
399	}
400	return hb<<4 | lb, nil
401}
402
// fromHex returns the numeric value of the hexadecimal digit b, or an error
// if b is not a hexadecimal digit.
func fromHex(b byte) (byte, error) {
	if '0' <= b && b <= '9' {
		return b - '0', nil
	}
	if 'A' <= b && b <= 'F' {
		return b - 'A' + 10, nil
	}
	// Accept badly encoded (lower-case) bytes.
	if 'a' <= b && b <= 'f' {
		return b - 'a' + 10, nil
	}
	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
}
415