xref: /aosp_15_r20/prebuilts/build-tools/common/py3-stdlib/email/charset.py (revision cda5da8d549138a6648c5ee6d7a49cf8f4a657be)
1*cda5da8dSAndroid Build Coastguard Worker# Copyright (C) 2001-2007 Python Software Foundation
2*cda5da8dSAndroid Build Coastguard Worker# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org
4*cda5da8dSAndroid Build Coastguard Worker
# Names exported by ``from email.charset import *``.
__all__ = ['Charset', 'add_alias', 'add_charset', 'add_codec']
11*cda5da8dSAndroid Build Coastguard Worker
12*cda5da8dSAndroid Build Coastguard Workerfrom functools import partial
13*cda5da8dSAndroid Build Coastguard Worker
14*cda5da8dSAndroid Build Coastguard Workerimport email.base64mime
15*cda5da8dSAndroid Build Coastguard Workerimport email.quoprimime
16*cda5da8dSAndroid Build Coastguard Worker
17*cda5da8dSAndroid Build Coastguard Workerfrom email import errors
18*cda5da8dSAndroid Build Coastguard Workerfrom email.encoders import encode_7or8bit
19*cda5da8dSAndroid Build Coastguard Worker
20*cda5da8dSAndroid Build Coastguard Worker
21*cda5da8dSAndroid Build Coastguard Worker
# Selectors for how a header (or body) in a given charset gets encoded:
#   QP       -- quoted-printable
#   BASE64   -- base64
#   SHORTEST -- whichever of QP and base64 is shorter; headers only
QP, BASE64, SHORTEST = 1, 2, 3

# Fixed overhead of an RFC 2047 encoded word such as
# "=?charset?q?hello_world?=": the "=?", "?q?" and "?=" delimiters.
RFC2047_CHROME_LEN = len('=?') + len('?q?') + len('?=')

DEFAULT_CHARSET = 'us-ascii'
UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''
33*cda5da8dSAndroid Build Coastguard Worker
34*cda5da8dSAndroid Build Coastguard Worker
35*cda5da8dSAndroid Build Coastguard Worker
# Default registry: input charset -> (header encoding, body encoding,
# output conversion charset).
#
# The first group is quoted-printable for both headers and bodies, with no
# output conversion.  Deliberately absent from it:
#   iso-8859-5  (Cyrillic) and iso-8859-6 (Arabic): not especially used
#   iso-8859-7  (Greek), iso-8859-8 (Hebrew), iso-8859-11 (Thai):
#               QP will not make them readable
CHARSETS = {
    name: (QP, QP, None)
    for name in (
        'iso-8859-1',
        'iso-8859-2',
        'iso-8859-3',
        'iso-8859-4',
        'iso-8859-9',
        'iso-8859-10',
        'iso-8859-13',
        'iso-8859-14',
        'iso-8859-15',
        'iso-8859-16',
        'windows-1252',
        'viscii',
    )
}
CHARSETS.update({
    'us-ascii': (None, None, None),
    'big5': (BASE64, BASE64, None),
    'gb2312': (BASE64, BASE64, None),
    'euc-jp': (BASE64, None, 'iso-2022-jp'),
    'shift_jis': (BASE64, None, 'iso-2022-jp'),
    'iso-2022-jp': (BASE64, None, None),
    'koi8-r': (BASE64, BASE64, None),
    'utf-8': (SHORTEST, BASE64, 'utf-8'),
})
65*cda5da8dSAndroid Build Coastguard Worker
# Other commonly used names for character sets, mapped to the names that
# are actually used in email.
#
# Each "Latin N" alphabet is accepted in both its underscore and hyphen
# spellings; the pairs below give (Latin number, ISO 8859 part number).
ALIASES = {
    'latin' + separator + str(latin_number): 'iso-8859-' + str(iso_part)
    for latin_number, iso_part in (
        (1, 1), (2, 2), (3, 3), (4, 4), (5, 9),
        (6, 10), (7, 13), (8, 14), (9, 15), (10, 16),
    )
    for separator in ('_', '-')
}
ALIASES.update({
    'cp949': 'ks_c_5601-1987',
    'euc_jp': 'euc-jp',
    'euc_kr': 'euc-kr',
    'ascii': 'us-ascii',
})
94*cda5da8dSAndroid Build Coastguard Worker
95*cda5da8dSAndroid Build Coastguard Worker
# Charset name -> name of the Python codec that converts it to and from
# Unicode.
#
# 'us-ascii' maps to None on purpose: all sorts of garbage may arrive
# labelled as 7-bit us-ascii, so such content is passed through without
# any conversion to or from Unicode.
CODEC_MAP = {
    'gb2312': 'eucgb2312_cn',
    'big5': 'big5_tw',
    'us-ascii': None,
}
105*cda5da8dSAndroid Build Coastguard Worker
106*cda5da8dSAndroid Build Coastguard Worker
107*cda5da8dSAndroid Build Coastguard Worker
108*cda5da8dSAndroid Build Coastguard Worker# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register the email properties of a character set.

    The entry is stored in the module-level CHARSETS registry under
    charset, replacing any existing entry for that name.

    charset is the canonical name of the input character set.

    header_enc and body_enc say how message headers and message bodies in
    that charset are to be encoded: QP for quoted-printable, BASE64 for
    base64, or None (the default) for no encoding.  header_enc may also be
    SHORTEST, meaning the shorter of quoted-printable and base64.

    output_charset names the character set the output should be in; when
    it is None (the default) the output stays in the input charset.

    Both charset and output_charset need a Unicode codec entry in the
    module's charset-to-codec mapping; use add_codec(charset, codecname)
    for codecs the module does not know about.

    Raises ValueError if body_enc is SHORTEST.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    properties = (header_enc, body_enc, output_charset)
    CHARSETS[charset] = properties
135*cda5da8dSAndroid Build Coastguard Worker
136*cda5da8dSAndroid Build Coastguard Worker
def add_alias(alias, canonical):
    """Register alias as another name for the charset canonical.

    For example, alias could be 'latin-1' and canonical 'iso-8859-1'.
    The mapping is stored in the module-level ALIASES table, replacing any
    existing entry for alias.
    """
    ALIASES[alias] = canonical
144*cda5da8dSAndroid Build Coastguard Worker
145*cda5da8dSAndroid Build Coastguard Worker
def add_codec(charset, codecname):
    """Register the codec that maps charset to and from Unicode.

    charset is the canonical name of a character set.  codecname is the
    name of a Python codec, in the form accepted by str.encode().  The
    mapping is stored in the module-level CODEC_MAP table, replacing any
    existing entry for charset.
    """
    CODEC_MAP[charset] = codecname
154*cda5da8dSAndroid Build Coastguard Worker
155*cda5da8dSAndroid Build Coastguard Worker
156*cda5da8dSAndroid Build Coastguard Worker
def _encode(string, codec):
    """Encode string to bytes with codec.

    Strings whose codec is the unknown-8bit pseudo-charset may carry
    surrogate-escaped bytes, so they are encoded as ASCII with the
    'surrogateescape' error handler, which restores the original bytes.
    """
    if codec != UNKNOWN8BIT:
        return string.encode(codec)
    return string.encode('ascii', 'surrogateescape')
164*cda5da8dSAndroid Build Coastguard Worker
165*cda5da8dSAndroid Build Coastguard Worker
166*cda5da8dSAndroid Build Coastguard Worker
class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified, lower-cased.
                   Common aliases are converted to their `official' email
                   names (e.g. latin_1 is converted to iso-8859-1).
                   Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it is
                    set to the same value as input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """

    def __init__(self, input_charset=DEFAULT_CHARSET):
        """Look up the email properties of input_charset.

        input_charset may be a str or an ASCII-encoded bytes-like object.
        Raises errors.CharsetError if the name contains non-ASCII
        characters.
        """
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  Coerce to
        # str because its .lower() is locale insensitive.  A str argument is
        # kept as is, but must still be pure ASCII.
        try:
            if isinstance(input_charset, str):
                input_charset.encode('ascii')
            else:
                input_charset = str(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower()
        # Set the input charset after filtering through the aliases.
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Take the encodings and conversion from the CHARSETS registry.
        # Unregistered charsets get SHORTEST headers and base64 bodies.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No conversion registered: output in the input charset.
            conv = self.input_charset
        # These are plain attributes; callers may override them afterwards.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If none is registered for a charset, guess
        # that a Python codec with the same name as the charset exists.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __repr__(self):
        """Return the lower-cased input charset name."""
        return self.input_charset.lower()

    # NOTE: defining __eq__ without __hash__ leaves instances unhashable.
    def __eq__(self, other):
        """Compare by charset name, ignoring the case of other's string form."""
        return str(self) == str(other).lower()

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is not None (or otherwise
        false), otherwise it is self.input_charset.
        """
        return self.output_charset or self.input_charset

    def header_encode(self, string):
        """Header-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        this charset's `header_encoding`.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :return: The encoded string, with RFC 2047 chrome.  If
            `header_encoding` selects no encoder, the string is returned
            unchanged.
        """
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
            return string
        return encoder_module.header_encode(header_bytes, codec)

    def header_encode_lines(self, string, maxlengths):
        """Header-encode a string by converting it first to bytes.

        This is similar to `header_encode()` except that the string is fit
        into maximum line lengths as given by the argument.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :param maxlengths: Maximum line length iterator.  Each element
            returned from this iterator will provide the next maximum line
            length.  This parameter is used as an argument to built-in next()
            and should never be exhausted.  The maximum line lengths should
            not count the RFC 2047 chrome.  These line lengths are only a
            hint; the splitter does the best it can.
        :return: Lines of encoded strings, each with RFC 2047 chrome.  The
            first element is None when not even one character fits on the
            first line.  If `header_encoding` selects no encoder, the result
            is a single line holding the string unchanged, mirroring
            `header_encode()`; `maxlengths` is not consulted in that case.
        """
        # See which encoding we should use.
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
            # Nothing to encode with; avoid dereferencing None below.
            return [string]
        encoder = partial(encoder_module.header_encode, charset=codec)
        # Calculate the number of characters that the RFC 2047 chrome will
        # contribute to each line.
        charset = self.get_output_charset()
        extra = len(charset) + RFC2047_CHROME_LEN
        # Now comes the hard part.  We must encode bytes but we can't split on
        # bytes because some character sets are variable length and each
        # encoded word must stand on its own.  So the problem is you have to
        # encode to bytes to figure out this word's length, but you must split
        # on characters.  This causes two problems: first, we don't know how
        # many octets a specific substring of unicode characters will get
        # encoded to, and second, we don't know how many ASCII characters
        # those octets will get encoded to.  Unless we try it.  Which seems
        # inefficient.  In the interest of being correct rather than fast (and
        # in the hope that there will be few encoded headers in any such
        # message), brute force it. :(
        lines = []
        current_line = []
        maxlen = next(maxlengths) - extra
        for character in string:
            current_line.append(character)
            this_line = EMPTYSTRING.join(current_line)
            # Measure with the same codec that produces the final bytes.  The
            # charset *name* is not necessarily a valid Python codec name
            # (see add_codec), so it must not be used for encoding here.
            length = encoder_module.header_length(_encode(this_line, codec))
            if length > maxlen:
                # This last character doesn't fit so pop it off.
                current_line.pop()
                # Does nothing fit on the first line?
                if not lines and not current_line:
                    lines.append(None)
                else:
                    joined_line = EMPTYSTRING.join(current_line)
                    lines.append(encoder(_encode(joined_line, codec)))
                # The popped character starts the next line.
                current_line = [character]
                maxlen = next(maxlengths) - extra
        # Flush whatever is left as the final line.
        joined_line = EMPTYSTRING.join(current_line)
        lines.append(encoder(_encode(joined_line, codec)))
        return lines

    def _get_encoder(self, header_bytes):
        """Return the module that encodes headers for this charset.

        Returns email.base64mime or email.quoprimime according to
        self.header_encoding.  For SHORTEST, both encoded lengths of
        header_bytes are computed and base64 is chosen only when it is
        strictly shorter.  Returns None for any other header_encoding.
        """
        if self.header_encoding == BASE64:
            return email.base64mime
        elif self.header_encoding == QP:
            return email.quoprimime
        elif self.header_encoding == SHORTEST:
            len64 = email.base64mime.header_length(header_bytes)
            lenqp = email.quoprimime.header_length(header_bytes)
            if len64 < lenqp:
                return email.base64mime
            else:
                return email.quoprimime
        else:
            return None

    def body_encode(self, string):
        """Body-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.  If body_encoding is None, we assume the
        output charset is a 7bit encoding, so re-encoding the decoded
        string using the ascii codec produces the correct string version
        of the content.

        An empty (false) string is returned unchanged.  A str argument is
        first encoded to bytes with self.output_charset; a non-str argument
        is used as given.
        """
        if not string:
            return string
        # Compare the flags by value, as get_body_encoding() and
        # _get_encoder() do, rather than by object identity.
        if self.body_encoding == BASE64:
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            return email.base64mime.body_encode(string)
        elif self.body_encoding == QP:
            # quoprimime.body_encode takes a string, but operates on it as if
            # it were a list of byte codes.  For a (minimal) history on why
            # this is so, see changeset 0cf700464177.  To correctly encode a
            # character set, then, we must turn it into pseudo bytes via the
            # latin1 charset, which will encode any byte as a single code point
            # between 0 and 255, which is what body_encode is expecting.
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            string = string.decode('latin1')
            return email.quoprimime.body_encode(string)
        else:
            if isinstance(string, str):
                string = string.encode(self.output_charset).decode('ascii')
            return string
405