# Copyright (C) 2001-2007 Python Software Foundation
# Author: Ben Gertzfield, Barry Warsaw
# Contact: email-sig@python.org

__all__ = [
    'Charset',
    'add_alias',
    'add_charset',
    'add_codec',
    ]

from functools import partial

import email.base64mime
import email.quoprimime

from email import errors
from email.encoders import encode_7or8bit



# Flags for types of header encodings
QP          = 1 # Quoted-Printable
BASE64      = 2 # Base64
SHORTEST    = 3 # the shorter of QP and base64, but only for headers

# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7
RFC2047_CHROME_LEN = 7

DEFAULT_CHARSET = 'us-ascii'
UNKNOWN8BIT = 'unknown-8bit'
EMPTYSTRING = ''



# Defaults
CHARSETS = {
    # input        header enc  body enc output conv
    'iso-8859-1':  (QP,        QP,      None),
    'iso-8859-2':  (QP,        QP,      None),
    'iso-8859-3':  (QP,        QP,      None),
    'iso-8859-4':  (QP,        QP,      None),
    # iso-8859-5 is Cyrillic, and not especially used
    # iso-8859-6 is Arabic, also not particularly used
    # iso-8859-7 is Greek, QP will not make it readable
    # iso-8859-8 is Hebrew, QP will not make it readable
    'iso-8859-9':  (QP,        QP,      None),
    'iso-8859-10': (QP,        QP,      None),
    # iso-8859-11 is Thai, QP will not make it readable
    'iso-8859-13': (QP,        QP,      None),
    'iso-8859-14': (QP,        QP,      None),
    'iso-8859-15': (QP,        QP,      None),
    'iso-8859-16': (QP,        QP,      None),
    'windows-1252':(QP,        QP,      None),
    'viscii':      (QP,        QP,      None),
    'us-ascii':    (None,      None,    None),
    'big5':        (BASE64,    BASE64,  None),
    'gb2312':      (BASE64,    BASE64,  None),
    'euc-jp':      (BASE64,    None,    'iso-2022-jp'),
    'shift_jis':   (BASE64,    None,    'iso-2022-jp'),
    'iso-2022-jp': (BASE64,    None,    None),
    'koi8-r':      (BASE64,    BASE64,  None),
    'utf-8':       (SHORTEST,  BASE64, 'utf-8'),
    }

# Aliases for other commonly-used names for character sets.  Map
# them to the real ones used in email.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10':'iso-8859-16',
    'latin-10':'iso-8859-16',
    'cp949':   'ks_c_5601-1987',
    'euc_jp':  'euc-jp',
    'euc_kr':  'euc-kr',
    'ascii':   'us-ascii',
    }


# Map charsets to their Unicode codec strings.
CODEC_MAP = {
    'gb2312':      'eucgb2312_cn',
    'big5':        'big5_tw',
    # Hack: We don't want *any* conversion for stuff marked us-ascii, as all
    # sorts of garbage might be sent to us in the guise of 7-bit us-ascii.
    # Let that stuff pass through without conversion to/from Unicode.
    'us-ascii':    None,
    }



# Convenience functions for extending the above mappings
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Add character set properties to the global registry.

    charset is the input character set, and must be the canonical name of a
    character set.

    Optional header_enc and body_enc is either Charset.QP for
    quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for
    the shortest of qp or base64 encoding, or None for no encoding.  SHORTEST
    is only valid for header_enc.  It describes how message headers and
    message bodies in the input charset are to be encoded.  Default is no
    encoding.

    Optional output_charset is the character set that the output should be
    in.  Conversions will proceed from input charset, to Unicode, to the
    output charset when the method Charset.convert() is called.  The default
    is to output in the same character set as the input.

    Both input_charset and output_charset must have Unicode codec entries in
    the module's charset-to-codec mapping; use add_codec(charset, codecname)
    to add codecs the module does not know about.  See the codecs module's
    documentation for more information.

    Raises ValueError if body_enc is SHORTEST.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    CHARSETS[charset] = (header_enc, body_enc, output_charset)


def add_alias(alias, canonical):
    """Add a character set alias.

    alias is the alias name, e.g. latin-1
    canonical is the character set's canonical name, e.g. iso-8859-1
    """
    ALIASES[alias] = canonical


def add_codec(charset, codecname):
    """Add a codec that maps characters in the given charset to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, as appropriate for the encoding argument of the str()
    built-in, or of the encode() method of a string.
    """
    CODEC_MAP[charset] = codecname



# Convenience function for encoding strings, taking into account
# that they might be unknown-8bit (ie: have surrogate-escaped bytes)
def _encode(string, codec):
    """Return `string` encoded to bytes with `codec`.

    The pseudo-codec 'unknown-8bit' is encoded as ASCII with the
    surrogateescape error handler instead of being looked up as a codec.
    """
    if codec == UNKNOWN8BIT:
        return string.encode('ascii', 'surrogateescape')
    else:
        return string.encode(codec)



class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified.  Common aliases
                   are converted to their `official' email names (e.g. latin_1
                   is converted to iso-8859-1).  Defaults to 7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it will
                    be the same as the input_charset.

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  The name
        # must be pure ASCII: a str is checked by trial-encoding it, anything
        # else is decoded from ASCII bytes.  Either failure is reported as a
        # CharsetError.
        try:
            if isinstance(input_charset, str):
                input_charset.encode('ascii')
            else:
                input_charset = str(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower()
        # Set the input charset after filtering through the aliases
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Look up the default encodings and conversion in the CHARSETS
        # registry.  Charsets that are not registered fall back to SHORTEST
        # for headers and BASE64 for bodies, with no conversion.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            conv = self.input_charset
        # Set the attributes from the registry defaults.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined for a charset, guess and
        # try a Unicode codec with the same name as the charset itself.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __repr__(self):
        return self.input_charset.lower()

    def __eq__(self, other):
        # Compare by (lower-cased) name so a Charset equals its name string.
        return str(self) == str(other).lower()

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns conversion function otherwise.
        """
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset if that is not None, otherwise it is
        self.input_charset.
        """
        return self.output_charset or self.input_charset

    def header_encode(self, string):
        """Header-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        this charset's `header_encoding`.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :return: The encoded string, with RFC 2047 chrome.
        """
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        encoder_module = self._get_encoder(header_bytes)
        if encoder_module is None:
            return string
        return encoder_module.header_encode(header_bytes, codec)

    def header_encode_lines(self, string, maxlengths):
        """Header-encode a string by converting it first to bytes.

        This is similar to `header_encode()` except that the string is fit
        into maximum line lengths as given by the argument.

        Unlike `header_encode()`, this method has no pass-through for a
        `header_encoding` of None: it needs an encoder module and raises
        AttributeError without one.

        :param string: A unicode string for the header.  It must be possible
            to encode this string to bytes using the character set's
            output codec.
        :param maxlengths: Maximum line length iterator.  Each element
            returned from this iterator will provide the next maximum line
            length.  This parameter is used as an argument to built-in next()
            and should never be exhausted.  The maximum line lengths should
            not count the RFC 2047 chrome.  These line lengths are only a
            hint; the splitter does the best it can.
        :return: Lines of encoded strings, each with RFC 2047 chrome.  The
            first element is None when not even one character fits on the
            first line.
        """
        # See which encoding we should use.
        codec = self.output_codec or 'us-ascii'
        header_bytes = _encode(string, codec)
        encoder_module = self._get_encoder(header_bytes)
        encoder = partial(encoder_module.header_encode, charset=codec)
        # Calculate the number of characters that the RFC 2047 chrome will
        # contribute to each line.
        charset = self.get_output_charset()
        extra = len(charset) + RFC2047_CHROME_LEN
        # Now comes the hard part.  We must encode bytes but we can't split on
        # bytes because some character sets are variable length and each
        # encoded word must stand on its own.  So the problem is you have to
        # encode to bytes to figure out this word's length, but you must split
        # on characters.  This causes two problems: first, we don't know how
        # many octets a specific substring of unicode characters will get
        # encoded to, and second, we don't know how many ASCII characters
        # those octets will get encoded to.  Unless we try it.  Which seems
        # inefficient.  In the interest of being correct rather than fast (and
        # in the hope that there will be few encoded headers in any such
        # message), brute force it. :(
        lines = []
        current_line = []
        maxlen = next(maxlengths) - extra
        for character in string:
            current_line.append(character)
            this_line = EMPTYSTRING.join(current_line)
            # Measure with the same codec that produces the emitted bytes;
            # the charset name is not necessarily a valid Python codec name.
            length = encoder_module.header_length(_encode(this_line, codec))
            if length > maxlen:
                # This last character doesn't fit so pop it off.
                current_line.pop()
                # Does nothing fit on the first line?
                if not lines and not current_line:
                    lines.append(None)
                else:
                    joined_line = EMPTYSTRING.join(current_line)
                    header_bytes = _encode(joined_line, codec)
                    lines.append(encoder(header_bytes))
                current_line = [character]
                maxlen = next(maxlengths) - extra
        joined_line = EMPTYSTRING.join(current_line)
        header_bytes = _encode(joined_line, codec)
        lines.append(encoder(header_bytes))
        return lines

    def _get_encoder(self, header_bytes):
        """Return the module that should encode `header_bytes`, or None.

        BASE64 and QP select email.base64mime and email.quoprimime
        respectively.  SHORTEST picks whichever of the two reports the
        smaller header_length() for these bytes, preferring quoted-printable
        on a tie.  Any other header_encoding yields None.
        """
        if self.header_encoding == BASE64:
            return email.base64mime
        elif self.header_encoding == QP:
            return email.quoprimime
        elif self.header_encoding == SHORTEST:
            len64 = email.base64mime.header_length(header_bytes)
            lenqp = email.quoprimime.header_length(header_bytes)
            if len64 < lenqp:
                return email.base64mime
            else:
                return email.quoprimime
        else:
            return None

    def body_encode(self, string):
        """Body-encode a string by converting it first to bytes.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.  If body_encoding is None, we assume the
        output charset is a 7bit encoding, so re-encoding the decoded
        string using the ascii codec produces the correct string version
        of the content.

        An empty (falsy) `string` is returned unchanged.
        """
        if not string:
            return string
        # Compare the flags by value, as the rest of the class does.
        if self.body_encoding == BASE64:
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            return email.base64mime.body_encode(string)
        elif self.body_encoding == QP:
            # quoprimime.body_encode takes a string, but operates on it as if
            # it were a list of byte codes.  For a (minimal) history on why
            # this is so, see changeset 0cf700464177.  To correctly encode a
            # character set, then, we must turn it into pseudo bytes via the
            # latin1 charset, which will encode any byte as a single code point
            # between 0 and 255, which is what body_encode is expecting.
            if isinstance(string, str):
                string = string.encode(self.output_charset)
            string = string.decode('latin1')
            return email.quoprimime.body_encode(string)
        else:
            if isinstance(string, str):
                string = string.encode(self.output_charset).decode('ascii')
            return string