[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Lib / email / charset.py

# Copyright (C) 2001-2006 Python Software Foundation\r
# Author: Ben Gertzfield, Barry Warsaw\r
# Contact: email-sig@python.org\r
\r
# Public names exported by a star-import of this module.
__all__ = ['Charset', 'add_alias', 'add_charset', 'add_codec']
\r
import codecs\r
import email.base64mime\r
import email.quoprimime\r
\r
from email import errors\r
from email.encoders import encode_7or8bit\r
\r
\r
\f\r
# Selectors describing how a header or a body gets encoded:
#   QP       -- quoted-printable
#   BASE64   -- base64
#   SHORTEST -- whichever of QP and base64 is shorter; headers only
QP, BASE64, SHORTEST = 1, 2, 3

# Overhead of the "=?", "?q?" and "?=" markers wrapped around an encoded
# word such as "=?charset?q?hello_world?=": 2 + 3 + 2 characters.
MISC_LEN = 7

DEFAULT_CHARSET = 'us-ascii'


# Default registry.  Each entry maps an input charset to the triple
# (header encoding, body encoding, output charset).
CHARSETS = {
    'iso-8859-1': (QP, QP, None),
    'iso-8859-2': (QP, QP, None),
    'iso-8859-3': (QP, QP, None),
    'iso-8859-4': (QP, QP, None),
    # Deliberately absent:
    #   iso-8859-5 (Cyrillic) -- not especially used
    #   iso-8859-6 (Arabic)   -- also not particularly used
    #   iso-8859-7 (Greek)    -- QP will not make it readable
    #   iso-8859-8 (Hebrew)   -- QP will not make it readable
    'iso-8859-9': (QP, QP, None),
    'iso-8859-10': (QP, QP, None),
    # iso-8859-11 (Thai) is absent too: QP will not make it readable.
    'iso-8859-13': (QP, QP, None),
    'iso-8859-14': (QP, QP, None),
    'iso-8859-15': (QP, QP, None),
    'iso-8859-16': (QP, QP, None),
    'windows-1252': (QP, QP, None),
    'viscii': (QP, QP, None),
    'us-ascii': (None, None, None),
    'big5': (BASE64, BASE64, None),
    'gb2312': (BASE64, BASE64, None),
    'euc-jp': (BASE64, None, 'iso-2022-jp'),
    'shift_jis': (BASE64, None, 'iso-2022-jp'),
    'iso-2022-jp': (BASE64, None, None),
    'koi8-r': (BASE64, BASE64, None),
    'utf-8': (SHORTEST, BASE64, 'utf-8'),
    # Not a real charset: an invented name standing for raw, unencoded
    # 8-bit data.
    '8bit': (None, BASE64, 'utf-8'),
}

# Other commonly used spellings of charset names, mapped to the names
# that are actually used in email.
ALIASES = {
    'latin_1': 'iso-8859-1',
    'latin-1': 'iso-8859-1',
    'latin_2': 'iso-8859-2',
    'latin-2': 'iso-8859-2',
    'latin_3': 'iso-8859-3',
    'latin-3': 'iso-8859-3',
    'latin_4': 'iso-8859-4',
    'latin-4': 'iso-8859-4',
    'latin_5': 'iso-8859-9',
    'latin-5': 'iso-8859-9',
    'latin_6': 'iso-8859-10',
    'latin-6': 'iso-8859-10',
    'latin_7': 'iso-8859-13',
    'latin-7': 'iso-8859-13',
    'latin_8': 'iso-8859-14',
    'latin-8': 'iso-8859-14',
    'latin_9': 'iso-8859-15',
    'latin-9': 'iso-8859-15',
    'latin_10': 'iso-8859-16',
    'latin-10': 'iso-8859-16',
    'cp949': 'ks_c_5601-1987',
    'euc_jp': 'euc-jp',
    'euc_kr': 'euc-kr',
    'ascii': 'us-ascii',
}


# Charset name -> name of the Unicode codec used for it.
CODEC_MAP = {
    'gb2312': 'eucgb2312_cn',
    'big5': 'big5_tw',
    # Hack: data labelled us-ascii is never converted.  All sorts of garbage
    # may arrive in the guise of 7-bit us-ascii, so it is passed through
    # without any conversion to or from Unicode.
    'us-ascii': None,
}
\r
\r
\f\r
# Convenience functions for extending the above mappings\r
def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):
    """Register the email properties of a character set.

    The entry is stored in the module-wide CHARSETS registry under the key
    charset, which is the input character set and must be given by its
    canonical name.

    header_enc and body_enc say how message headers and message bodies in
    that charset are to be encoded.  Each one is Charset.QP
    (quoted-printable), Charset.BASE64 (base64), Charset.SHORTEST (the
    shorter of the two) or None (no encoding, the default).  SHORTEST is
    accepted for header_enc only; passing it as body_enc raises ValueError.

    output_charset names the character set the output should be in.  When
    Charset.convert() is called, conversion goes from the input charset to
    Unicode and from there to the output charset.  By default the output
    stays in the input character set.

    Both the input and the output charset need a Unicode codec entry in this
    module's charset-to-codec mapping; codecs the module does not know about
    are added with add_codec(charset, codecname).  See the documentation of
    the codecs module for more information.
    """
    if body_enc == SHORTEST:
        raise ValueError('SHORTEST not allowed for body_enc')
    properties = (header_enc, body_enc, output_charset)
    CHARSETS[charset] = properties
\r
\r
def add_alias(alias, canonical):
    """Register alias as another name for a character set.

    alias is the alternative spelling (for example latin-1) and canonical is
    the canonical name of the character set it stands for (for example
    iso-8859-1).  An existing entry for the same alias is replaced.
    """
    ALIASES[alias] = canonical
\r
\r
def add_codec(charset, codecname):
    """Register the codec that maps a charset's characters to/from Unicode.

    charset is the canonical name of a character set.  codecname is the name
    of a Python codec, in the form accepted as the second argument of the
    unicode() built-in or by the encode() method of a Unicode string.  An
    existing entry for the same charset is replaced.
    """
    CODEC_MAP[charset] = codecname
\r
\r
\f\r
class Charset:
    """Map character sets to their email properties.

    This class provides information about the requirements imposed on email
    for a specific character set.  It also provides convenience routines for
    converting between character sets, given the availability of the
    applicable codecs.  Given a character set, it will do its best to provide
    information on how to use that character set in an email in an
    RFC-compliant way.

    Certain character sets must be encoded with quoted-printable or base64
    when used in email headers or bodies.  Certain character sets must be
    converted outright, and are not allowed in email.  Instances of this
    class expose the following information about a character set:

    input_charset: The initial character set specified, lower-cased.  Common
                   aliases are converted to their `official' email names
                   (e.g. latin_1 is converted to iso-8859-1).  Defaults to
                   7-bit us-ascii.

    header_encoding: If the character set must be encoded before it can be
                     used in an email header, this attribute will be set to
                     Charset.QP (for quoted-printable), Charset.BASE64 (for
                     base64 encoding), or Charset.SHORTEST for the shortest of
                     QP or BASE64 encoding.  Otherwise, it will be None.

    body_encoding: Same as header_encoding, but describes the encoding for the
                   mail message's body, which indeed may be different than the
                   header encoding.  Charset.SHORTEST is not allowed for
                   body_encoding.

    output_charset: Some character sets must be converted before they can be
                    used in email headers or bodies.  If the input_charset is
                    one of them, this attribute will contain the name of the
                    charset output will be converted to.  Otherwise, it names
                    the input charset (looked up in the alias table once
                    more).

    input_codec: The name of the Python codec used to convert the
                 input_charset to Unicode.  If no conversion codec is
                 necessary, this attribute will be None.

    output_codec: The name of the Python codec used to convert Unicode
                  to the output_charset.  If no conversion codec is necessary,
                  this attribute will have the same value as the input_codec.

    A charset that is not in the registry gets SHORTEST header encoding,
    BASE64 body encoding and no output conversion.
    """
    def __init__(self, input_charset=DEFAULT_CHARSET):
        """Look up the email properties of input_charset.

        input_charset is a byte string or a unicode string and must contain
        ASCII characters only; otherwise errors.CharsetError is raised.
        """
        # RFC 2046, $4.1.2 says charsets are not case sensitive.  We coerce to
        # unicode because its .lower() is locale insensitive.  If the argument
        # is already a unicode, we leave it at that, but ensure that the
        # charset name is pure ASCII.
        try:
            if isinstance(input_charset, unicode):
                input_charset.encode('ascii')
            else:
                input_charset = unicode(input_charset, 'ascii')
        except UnicodeError:
            raise errors.CharsetError(input_charset)
        input_charset = input_charset.lower().encode('ascii')
        # Set the input charset after filtering through the aliases and/or
        # codecs.  Names unknown to this module are normalized to the name of
        # the matching Python codec, if there is one.
        if not (input_charset in ALIASES or input_charset in CHARSETS):
            try:
                input_charset = codecs.lookup(input_charset).name
            except LookupError:
                # No such codec either: keep the name exactly as given.
                pass
        self.input_charset = ALIASES.get(input_charset, input_charset)
        # Take the encodings and the conversion from the CHARSETS registry;
        # charsets that are not registered get a generic default.
        henc, benc, conv = CHARSETS.get(self.input_charset,
                                        (SHORTEST, BASE64, None))
        if not conv:
            # No conversion registered: output stays in the input charset.
            conv = self.input_charset
        # Set the attributes.
        self.header_encoding = henc
        self.body_encoding = benc
        self.output_charset = ALIASES.get(conv, conv)
        # Now set the codecs.  If one isn't defined for a charset, guess and
        # try a Unicode codec with the same name as the charset.
        self.input_codec = CODEC_MAP.get(self.input_charset,
                                         self.input_charset)
        self.output_codec = CODEC_MAP.get(self.output_charset,
                                          self.output_charset)

    def __str__(self):
        """Return the lower-cased name of the input charset."""
        return self.input_charset.lower()

    __repr__ = __str__

    def __eq__(self, other):
        """Compare by charset name, ignoring the case of other's name."""
        return str(self) == str(other).lower()

    def __ne__(self, other):
        """Return the negation of __eq__()."""
        return not self.__eq__(other)

    def get_body_encoding(self):
        """Return the content-transfer-encoding used for body encoding.

        This is either the string `quoted-printable' or `base64' depending on
        the encoding used, or it is a function in which case you should call
        the function with a single argument, the Message object being
        encoded.  The function should then set the Content-Transfer-Encoding
        header itself to whatever is appropriate.

        Returns "quoted-printable" if self.body_encoding is QP.
        Returns "base64" if self.body_encoding is BASE64.
        Returns the encode_7or8bit function otherwise.
        """
        # SHORTEST is only meaningful for headers; add_charset() refuses to
        # register it as a body encoding.
        assert self.body_encoding != SHORTEST
        if self.body_encoding == QP:
            return 'quoted-printable'
        elif self.body_encoding == BASE64:
            return 'base64'
        else:
            return encode_7or8bit

    def convert(self, s):
        """Convert a string from the input_codec to the output_codec.

        The string is returned unchanged when both codecs are the same.
        """
        if self.input_codec != self.output_codec:
            return unicode(s, self.input_codec).encode(self.output_codec)
        else:
            return s

    def to_splittable(self, s):
        """Convert a possibly multibyte string to a safely splittable format.

        Uses the input_codec to try and convert the string to Unicode, so it
        can be safely split on character boundaries (even for multibyte
        characters).

        Returns the string as-is if it already is Unicode, or if it isn't
        known how to convert it to Unicode with the input_charset.

        Characters that could not be converted to Unicode will be replaced
        with the Unicode replacement character U+FFFD.
        """
        if isinstance(s, unicode) or self.input_codec is None:
            return s
        try:
            return unicode(s, self.input_codec, 'replace')
        except LookupError:
            # Input codec not installed on system, so return the original
            # string unchanged.
            return s

    def from_splittable(self, ustr, to_output=True):
        """Convert a splittable string back into an encoded string.

        Uses the proper codec to try and convert the string from Unicode back
        into an encoded format.  Return the string as-is if it is not Unicode,
        or if it could not be converted from Unicode.

        Characters that could not be converted from Unicode will be replaced
        with an appropriate character (usually '?').

        If to_output is True (the default), uses output_codec to convert to an
        encoded format.  If to_output is False, uses input_codec.
        """
        if to_output:
            codec = self.output_codec
        else:
            codec = self.input_codec
        if not isinstance(ustr, unicode) or codec is None:
            return ustr
        try:
            return ustr.encode(codec, 'replace')
        except LookupError:
            # Codec not installed, so return the string unchanged.
            return ustr

    def get_output_charset(self):
        """Return the output character set.

        This is self.output_charset unless that is None or empty, in which
        case it is self.input_charset.
        """
        return self.output_charset or self.input_charset

    def encoded_header_len(self, s):
        """Return the length of the encoded header string."""
        cset = self.get_output_charset()
        # An encoded word costs the encoded payload plus the charset name
        # plus the MISC_LEN marker characters.
        if self.header_encoding == BASE64:
            return email.base64mime.base64_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == QP:
            return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64mime.base64_len(s)
            lenqp = email.quoprimime.header_quopri_len(s)
            return min(lenb64, lenqp) + len(cset) + MISC_LEN
        else:
            # The length of an unencoded (7bit) header is just len(s).
            return len(s)

    def header_encode(self, s, convert=False):
        """Header-encode a string, optionally converting it to output_charset.

        If convert is True, the string will be converted from the input
        charset to the output charset automatically.  This is not useful for
        multibyte character sets, which have line length issues (multibyte
        characters must be split on a character, not a byte boundary); use the
        high-level Header class to deal with these issues.  convert defaults
        to False.

        The type of encoding (base64 or quoted-printable) will be based on
        self.header_encoding.
        """
        cset = self.get_output_charset()
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        if self.header_encoding == BASE64:
            return email.base64mime.header_encode(s, cset)
        elif self.header_encoding == QP:
            return email.quoprimime.header_encode(s, cset, maxlinelen=None)
        elif self.header_encoding == SHORTEST:
            lenb64 = email.base64mime.base64_len(s)
            lenqp = email.quoprimime.header_quopri_len(s)
            # On a tie quoted-printable wins.
            if lenb64 < lenqp:
                return email.base64mime.header_encode(s, cset)
            else:
                return email.quoprimime.header_encode(s, cset, maxlinelen=None)
        else:
            return s

    def body_encode(self, s, convert=True):
        """Body-encode a string and convert it to output_charset.

        If convert is True (the default), the string will be converted from
        the input charset to output charset automatically.  Unlike
        header_encode(), there are no issues with byte boundaries and
        multibyte charsets in email bodies, so this is usually pretty safe.

        The type of encoding (base64 or quoted-printable) will be based on
        self.body_encoding.
        """
        if convert:
            s = self.convert(s)
        # 7bit/8bit encodings return the string unchanged (modulo conversions)
        # Compare the flags by value, exactly as get_body_encoding() does, so
        # that the encoding applied here always matches the one announced
        # there.
        if self.body_encoding == BASE64:
            return email.base64mime.body_encode(s)
        elif self.body_encoding == QP:
            return email.quoprimime.body_encode(s)
        else:
            return s
Commit	Line	Data
4710c53d	1	# Copyright (C) 2001-2006 Python Software Foundation\r
	2	# Author: Ben Gertzfield, Barry Warsaw\r
	3	# Contact: email-sig@python.org\r
	4	\r
	5	__all__ = [\r
	6	'Charset',\r
	7	'add_alias',\r
	8	'add_charset',\r
	9	'add_codec',\r
	10	]\r
	11	\r
	12	import codecs\r
	13	import email.base64mime\r
	14	import email.quoprimime\r
	15	\r
	16	from email import errors\r
	17	from email.encoders import encode_7or8bit\r
	18	\r
	19	\r
	20	\f\r
	21	# Flags for types of header encodings\r
	22	QP = 1 # Quoted-Printable\r
	23	BASE64 = 2 # Base64\r
	24	SHORTEST = 3 # the shorter of QP and base64, but only for headers\r
	25	\r
	26	# In "=?charset?q?hello_world?=", the =?, ?q?, and ?= add up to 7\r
	27	MISC_LEN = 7\r
	28	\r
	29	DEFAULT_CHARSET = 'us-ascii'\r
	30	\r
	31	\r
	32	\f\r
	33	# Defaults\r
	34	CHARSETS = {\r
	35	# input header enc body enc output conv\r
	36	'iso-8859-1': (QP, QP, None),\r
	37	'iso-8859-2': (QP, QP, None),\r
	38	'iso-8859-3': (QP, QP, None),\r
	39	'iso-8859-4': (QP, QP, None),\r
	40	# iso-8859-5 is Cyrillic, and not especially used\r
	41	# iso-8859-6 is Arabic, also not particularly used\r
	42	# iso-8859-7 is Greek, QP will not make it readable\r
	43	# iso-8859-8 is Hebrew, QP will not make it readable\r
	44	'iso-8859-9': (QP, QP, None),\r
	45	'iso-8859-10': (QP, QP, None),\r
	46	# iso-8859-11 is Thai, QP will not make it readable\r
	47	'iso-8859-13': (QP, QP, None),\r
	48	'iso-8859-14': (QP, QP, None),\r
	49	'iso-8859-15': (QP, QP, None),\r
	50	'iso-8859-16': (QP, QP, None),\r
	51	'windows-1252':(QP, QP, None),\r
	52	'viscii': (QP, QP, None),\r
	53	'us-ascii': (None, None, None),\r
	54	'big5': (BASE64, BASE64, None),\r
	55	'gb2312': (BASE64, BASE64, None),\r
	56	'euc-jp': (BASE64, None, 'iso-2022-jp'),\r
	57	'shift_jis': (BASE64, None, 'iso-2022-jp'),\r
	58	'iso-2022-jp': (BASE64, None, None),\r
	59	'koi8-r': (BASE64, BASE64, None),\r
	60	'utf-8': (SHORTEST, BASE64, 'utf-8'),\r
	61	# We're making this one up to represent raw unencoded 8-bit\r
	62	'8bit': (None, BASE64, 'utf-8'),\r
	63	}\r
	64	\r
65	# Aliases for other commonly-used names for character sets. Map\r
66	# them to the real ones used in email.\r
67	ALIASES = {\r
68	'latin_1': 'iso-8859-1',\r
69	'latin-1': 'iso-8859-1',\r
70	'latin_2': 'iso-8859-2',\r
71	'latin-2': 'iso-8859-2',\r
72	'latin_3': 'iso-8859-3',\r
73	'latin-3': 'iso-8859-3',\r
74	'latin_4': 'iso-8859-4',\r
75	'latin-4': 'iso-8859-4',\r
76	'latin_5': 'iso-8859-9',\r
77	'latin-5': 'iso-8859-9',\r
78	'latin_6': 'iso-8859-10',\r
79	'latin-6': 'iso-8859-10',\r
80	'latin_7': 'iso-8859-13',\r
81	'latin-7': 'iso-8859-13',\r
82	'latin_8': 'iso-8859-14',\r
83	'latin-8': 'iso-8859-14',\r
84	'latin_9': 'iso-8859-15',\r
85	'latin-9': 'iso-8859-15',\r
86	'latin_10':'iso-8859-16',\r
87	'latin-10':'iso-8859-16',\r
88	'cp949': 'ks_c_5601-1987',\r
89	'euc_jp': 'euc-jp',\r
90	'euc_kr': 'euc-kr',\r
91	'ascii': 'us-ascii',\r
92	}\r
93	\r
94	\r
95	# Map charsets to their Unicode codec strings.\r
96	CODEC_MAP = {\r
97	'gb2312': 'eucgb2312_cn',\r
98	'big5': 'big5_tw',\r
99	# Hack: We don't want any conversion for stuff marked us-ascii, as all\r
100	# sorts of garbage might be sent to us in the guise of 7-bit us-ascii.\r
101	# Let that stuff pass through without conversion to/from Unicode.\r
102	'us-ascii': None,\r
103	}\r
104	\r
105	\r
106	\f\r
107	# Convenience functions for extending the above mappings\r
108	def add_charset(charset, header_enc=None, body_enc=None, output_charset=None):\r
109	"""Add character set properties to the global registry.\r
110	\r
111	charset is the input character set, and must be the canonical name of a\r
112	character set.\r
113	\r
114	Optional header_enc and body_enc is either Charset.QP for\r
115	quoted-printable, Charset.BASE64 for base64 encoding, Charset.SHORTEST for\r
116	the shortest of qp or base64 encoding, or None for no encoding. SHORTEST\r
117	is only valid for header_enc. It describes how message headers and\r
118	message bodies in the input charset are to be encoded. Default is no\r
119	encoding.\r
120	\r
121	Optional output_charset is the character set that the output should be\r
122	in. Conversions will proceed from input charset, to Unicode, to the\r
123	output charset when the method Charset.convert() is called. The default\r
124	is to output in the same character set as the input.\r
125	\r
126	Both input_charset and output_charset must have Unicode codec entries in\r
127	the module's charset-to-codec mapping; use add_codec(charset, codecname)\r
128	to add codecs the module does not know about. See the codecs module's\r
129	documentation for more information.\r
130	"""\r
131	if body_enc == SHORTEST:\r
132	raise ValueError('SHORTEST not allowed for body_enc')\r
133	CHARSETS[charset] = (header_enc, body_enc, output_charset)\r
134	\r
135	\r
136	def add_alias(alias, canonical):\r
137	"""Add a character set alias.\r
138	\r
139	alias is the alias name, e.g. latin-1\r
140	canonical is the character set's canonical name, e.g. iso-8859-1\r
141	"""\r
142	ALIASES[alias] = canonical\r
143	\r
144	\r
145	def add_codec(charset, codecname):\r
146	"""Add a codec that map characters in the given charset to/from Unicode.\r
147	\r
148	charset is the canonical name of a character set. codecname is the name\r
149	of a Python codec, as appropriate for the second argument to the unicode()\r
150	built-in, or to the encode() method of a Unicode string.\r
151	"""\r
152	CODEC_MAP[charset] = codecname\r
153	\r
154	\r
155	\f\r
156	class Charset:\r
157	"""Map character sets to their email properties.\r
158	\r
159	This class provides information about the requirements imposed on email\r
160	for a specific character set. It also provides convenience routines for\r
161	converting between character sets, given the availability of the\r
162	applicable codecs. Given a character set, it will do its best to provide\r
163	information on how to use that character set in an email in an\r
164	RFC-compliant way.\r
165	\r
166	Certain character sets must be encoded with quoted-printable or base64\r
167	when used in email headers or bodies. Certain character sets must be\r
168	converted outright, and are not allowed in email. Instances of this\r
169	module expose the following information about a character set:\r
170	\r
171	input_charset: The initial character set specified. Common aliases\r
172	are converted to their `official' email names (e.g. latin_1\r
173	is converted to iso-8859-1). Defaults to 7-bit us-ascii.\r
174	\r
175	header_encoding: If the character set must be encoded before it can be\r
176	used in an email header, this attribute will be set to\r
177	Charset.QP (for quoted-printable), Charset.BASE64 (for\r
178	base64 encoding), or Charset.SHORTEST for the shortest of\r
179	QP or BASE64 encoding. Otherwise, it will be None.\r
180	\r
181	body_encoding: Same as header_encoding, but describes the encoding for the\r
182	mail message's body, which indeed may be different than the\r
183	header encoding. Charset.SHORTEST is not allowed for\r
184	body_encoding.\r
185	\r
186	output_charset: Some character sets must be converted before the can be\r
187	used in email headers or bodies. If the input_charset is\r
188	one of them, this attribute will contain the name of the\r
189	charset output will be converted to. Otherwise, it will\r
190	be None.\r
191	\r
192	input_codec: The name of the Python codec used to convert the\r
193	input_charset to Unicode. If no conversion codec is\r
194	necessary, this attribute will be None.\r
195	\r
196	output_codec: The name of the Python codec used to convert Unicode\r
197	to the output_charset. If no conversion codec is necessary,\r
198	this attribute will have the same value as the input_codec.\r
199	"""\r
200	def __init__(self, input_charset=DEFAULT_CHARSET):\r
201	# RFC 2046, $4.1.2 says charsets are not case sensitive. We coerce to\r
202	# unicode because its .lower() is locale insensitive. If the argument\r
203	# is already a unicode, we leave it at that, but ensure that the\r
204	# charset is ASCII, as the standard (RFC XXX) requires.\r
205	try:\r
206	if isinstance(input_charset, unicode):\r
207	input_charset.encode('ascii')\r
208	else:\r
209	input_charset = unicode(input_charset, 'ascii')\r
210	except UnicodeError:\r
211	raise errors.CharsetError(input_charset)\r
212	input_charset = input_charset.lower().encode('ascii')\r
213	# Set the input charset after filtering through the aliases and/or codecs\r
214	if not (input_charset in ALIASES or input_charset in CHARSETS):\r
215	try:\r
216	input_charset = codecs.lookup(input_charset).name\r
217	except LookupError:\r
218	pass\r
219	self.input_charset = ALIASES.get(input_charset, input_charset)\r
220	# We can try to guess which encoding and conversion to use by the\r
221	# charset_map dictionary. Try that first, but let the user override\r
222	# it.\r
223	henc, benc, conv = CHARSETS.get(self.input_charset,\r
224	(SHORTEST, BASE64, None))\r
225	if not conv:\r
226	conv = self.input_charset\r
227	# Set the attributes, allowing the arguments to override the default.\r
228	self.header_encoding = henc\r
229	self.body_encoding = benc\r
230	self.output_charset = ALIASES.get(conv, conv)\r
231	# Now set the codecs. If one isn't defined for input_charset,\r
232	# guess and try a Unicode codec with the same name as input_codec.\r
233	self.input_codec = CODEC_MAP.get(self.input_charset,\r
234	self.input_charset)\r
235	self.output_codec = CODEC_MAP.get(self.output_charset,\r
236	self.output_charset)\r
237	\r
238	def __str__(self):\r
239	return self.input_charset.lower()\r
240	\r
241	__repr__ = __str__\r
242	\r
243	def __eq__(self, other):\r
244	return str(self) == str(other).lower()\r
245	\r
246	def __ne__(self, other):\r
247	return not self.__eq__(other)\r
248	\r
249	def get_body_encoding(self):\r
250	"""Return the content-transfer-encoding used for body encoding.\r
251	\r
252	This is either the string `quoted-printable' or `base64' depending on\r
253	the encoding used, or it is a function in which case you should call\r
254	the function with a single argument, the Message object being\r
255	encoded. The function should then set the Content-Transfer-Encoding\r
256	header itself to whatever is appropriate.\r
257	\r
258	Returns "quoted-printable" if self.body_encoding is QP.\r
259	Returns "base64" if self.body_encoding is BASE64.\r
260	Returns "7bit" otherwise.\r
261	"""\r
262	assert self.body_encoding != SHORTEST\r
263	if self.body_encoding == QP:\r
264	return 'quoted-printable'\r
265	elif self.body_encoding == BASE64:\r
266	return 'base64'\r
267	else:\r
268	return encode_7or8bit\r
269	\r
270	def convert(self, s):\r
271	"""Convert a string from the input_codec to the output_codec."""\r
272	if self.input_codec != self.output_codec:\r
273	return unicode(s, self.input_codec).encode(self.output_codec)\r
274	else:\r
275	return s\r
276	\r
277	def to_splittable(self, s):\r
278	"""Convert a possibly multibyte string to a safely splittable format.\r
279	\r
280	Uses the input_codec to try and convert the string to Unicode, so it\r
281	can be safely split on character boundaries (even for multibyte\r
282	characters).\r
283	\r
284	Returns the string as-is if it isn't known how to convert it to\r
285	Unicode with the input_charset.\r
286	\r
287	Characters that could not be converted to Unicode will be replaced\r
288	with the Unicode replacement character U+FFFD.\r
289	"""\r
290	if isinstance(s, unicode) or self.input_codec is None:\r
291	return s\r
292	try:\r
293	return unicode(s, self.input_codec, 'replace')\r
294	except LookupError:\r
295	# Input codec not installed on system, so return the original\r
296	# string unchanged.\r
297	return s\r
298	\r
299	def from_splittable(self, ustr, to_output=True):\r
300	"""Convert a splittable string back into an encoded string.\r
301	\r
302	Uses the proper codec to try and convert the string from Unicode back\r
303	into an encoded format. Return the string as-is if it is not Unicode,\r
304	or if it could not be converted from Unicode.\r
305	\r
306	Characters that could not be converted from Unicode will be replaced\r
307	with an appropriate character (usually '?').\r
308	\r
309	If to_output is True (the default), uses output_codec to convert to an\r
310	encoded format. If to_output is False, uses input_codec.\r
311	"""\r
312	if to_output:\r
313	codec = self.output_codec\r
314	else:\r
315	codec = self.input_codec\r
316	if not isinstance(ustr, unicode) or codec is None:\r
317	return ustr\r
318	try:\r
319	return ustr.encode(codec, 'replace')\r
320	except LookupError:\r
321	# Output codec not installed\r
322	return ustr\r
323	\r
324	def get_output_charset(self):\r
325	"""Return the output character set.\r
326	\r
327	This is self.output_charset if that is not None, otherwise it is\r
328	self.input_charset.\r
329	"""\r
330	return self.output_charset or self.input_charset\r
331	\r
332	def encoded_header_len(self, s):\r
333	"""Return the length of the encoded header string."""\r
334	cset = self.get_output_charset()\r
335	# The len(s) of a 7bit encoding is len(s)\r
336	if self.header_encoding == BASE64:\r
337	return email.base64mime.base64_len(s) + len(cset) + MISC_LEN\r
338	elif self.header_encoding == QP:\r
339	return email.quoprimime.header_quopri_len(s) + len(cset) + MISC_LEN\r
340	elif self.header_encoding == SHORTEST:\r
341	lenb64 = email.base64mime.base64_len(s)\r
342	lenqp = email.quoprimime.header_quopri_len(s)\r
343	return min(lenb64, lenqp) + len(cset) + MISC_LEN\r
344	else:\r
345	return len(s)\r
346	\r
347	def header_encode(self, s, convert=False):\r
348	"""Header-encode a string, optionally converting it to output_charset.\r
349	\r
350	If convert is True, the string will be converted from the input\r
351	charset to the output charset automatically. This is not useful for\r
352	multibyte character sets, which have line length issues (multibyte\r
353	characters must be split on a character, not a byte boundary); use the\r
354	high-level Header class to deal with these issues. convert defaults\r
355	to False.\r
356	\r
357	The type of encoding (base64 or quoted-printable) will be based on\r
358	self.header_encoding.\r
359	"""\r
360	cset = self.get_output_charset()\r
361	if convert:\r
362	s = self.convert(s)\r
363	# 7bit/8bit encodings return the string unchanged (modulo conversions)\r
364	if self.header_encoding == BASE64:\r
365	return email.base64mime.header_encode(s, cset)\r
366	elif self.header_encoding == QP:\r
367	return email.quoprimime.header_encode(s, cset, maxlinelen=None)\r
368	elif self.header_encoding == SHORTEST:\r
369	lenb64 = email.base64mime.base64_len(s)\r
370	lenqp = email.quoprimime.header_quopri_len(s)\r
371	if lenb64 < lenqp:\r
372	return email.base64mime.header_encode(s, cset)\r
373	else:\r
374	return email.quoprimime.header_encode(s, cset, maxlinelen=None)\r
375	else:\r
376	return s\r
377	\r
378	def body_encode(self, s, convert=True):\r
379	"""Body-encode a string and convert it to output_charset.\r
380	\r
381	If convert is True (the default), the string will be converted from\r
382	the input charset to output charset automatically. Unlike\r
383	header_encode(), there are no issues with byte boundaries and\r
384	multibyte charsets in email bodies, so this is usually pretty safe.\r
385	\r
386	The type of encoding (base64 or quoted-printable) will be based on\r
387	self.body_encoding.\r
388	"""\r
389	if convert:\r
390	s = self.convert(s)\r
391	# 7bit/8bit encodings return the string unchanged (module conversions)\r
392	if self.body_encoding is BASE64:\r
393	return email.base64mime.body_encode(s)\r
394	elif self.body_encoding is QP:\r
395	return email.quoprimime.body_encode(s)\r
396	else:\r
397	return s\r