1 import re
2 from binascii import b2a_base64, a2b_base64
# ecre matches a single MIME "encoded word" of the form
#     =?charset?encoding?encoded-text?=
# and exposes three named groups: 'charset', 'encoding' (the letter q or b,
# matched case-insensitively because of re.IGNORECASE) and 'encoded'.
# _decode_header() relies on ecre.split() returning those three groups
# between the surrounding non-encoded text fragments.
ecre = re.compile(r'''
=\? # literal =?
(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
\? # literal ?
(?P<encoding>[qb]) # either a "q" or a "b", case insensitive
\? # literal ?
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
\?= # literal ?=
''', re.VERBOSE | re.IGNORECASE)

# hqre matches a string made up solely of characters that encode_header()
# emits verbatim (ASCII letters, digits, space and a few punctuation marks).
# encode_header() applies it both to the whole header and to single characters.
# NOTE: '$' also matches just before a trailing newline, so a value ending
# in '\n' can still match.
hqre = re.compile(r'^[-a-zA-Z0-9!*+/\[\]., ]+$')
def base64_decode(s, convert_eols=None):
    """Decode a raw base64 string.

    If convert_eols is set to a string value, all canonical email linefeeds,
    e.g. "\\r\\n", in the decoded text will be converted to the value of
    convert_eols.  os.linesep is a good choice for convert_eols if you are
    decoding a text attachment.

    A false value for s (None or an empty string) is returned unchanged.

    This function does not parse a full MIME header value encoded with
    base64 (like =?iso-8859-1?b?bmloISBuaWgh?=) -- use decode_header() or
    the high level email header classes for that functionality.

    Taken from 'email' module
    """
    if not s:
        return s

    decoded = a2b_base64(s)
    if convert_eols:
        # The CRLF sequence is spelled out here because this module defines
        # no CRLF constant; a byte literal matches the type a2b_base64 returns.
        return decoded.replace(b'\r\n', convert_eols)
    return decoded
def unquote_match(match):
    """Return the character denoted by a matched "=XX" escape.

    match is a regular-expression match object whose full text starts with
    '=' followed by two hexadecimal digits; those digits are parsed as the
    character code.

    Taken from 'email' module
    """
    hex_digits = match.group(0)[1:3]
    char_code = int(hex_digits, 16)
    return chr(char_code)
def qp_decode(s):
    """Decode a string encoded with the MIME header `Q' encoding (RFC 2047).

    Underscores are turned into spaces and every "=XX" escape, where XX is a
    pair of hexadecimal digits, is replaced by the character with that code.
    An '=' that is not followed by two hex digits is left untouched rather
    than raising an error.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8859-1?q?Hello_World?=) -- use
    decode_header() or the high level email header classes for that.

    Taken from 'email' module
    """
    def _escape_to_char(match):
        # match.group(0) is "=XX"; the two hex digits give the character code.
        return chr(int(match.group(0)[1:3], 16))

    s = s.replace('_', ' ')
    # Only real hex pairs are accepted: a looser pattern such as =\w{2} would
    # hand "=zz" to int(..., 16) and blow up with ValueError.
    return re.sub(r'=[0-9a-fA-F]{2}', _escape_to_char, s)
def _decode_header(header):
    """Split a header value into decoded parts, leaving charsets unconverted.

    The result is a list of (decoded_string, charset) tuples.  For text that
    was not inside an encoded word the charset is None; for encoded words it
    is the lower-cased charset name taken from the word itself.  Consecutive
    encoded words that name the same charset are merged into one tuple, and
    whitespace-only text between encoded words is dropped.

    Taken from 'email' module
    """
    text = str(header)
    # Fast path: nothing in the value looks like an encoded word.
    if ecre.search(text) is None:
        return [(text, None)]

    chunks = []
    for row in text.splitlines():
        # A physical line without any encoded word is kept verbatim.
        if ecre.search(row) is None:
            chunks.append((row, None))
            continue

        # ecre has three groups, so split() yields
        #   plain, charset, encoding, encoded, plain, charset, ...
        # i.e. one plain fragment followed by zero or more 4-item strides.
        fields = ecre.split(row)
        for start in range(0, len(fields), 4):
            plain = fields[start]
            if plain and plain.strip():
                chunks.append((plain, None))

            if start + 1 >= len(fields):
                # Trailing plain fragment: no encoded word follows it.
                continue

            charset = fields[start + 1].lower()
            encoding = fields[start + 2].lower()
            payload = fields[start + 3]

            if encoding == 'q':
                piece = qp_decode(payload)
            elif encoding == 'b':
                piece = base64_decode(payload)
            else:
                piece = payload

            previous = chunks[-1] if chunks else None
            if previous is not None and previous[1] == charset:
                # Same charset as the preceding part: extend it in place.
                chunks[-1] = (previous[0] + piece, previous[1])
            else:
                chunks.append((piece, charset))
    return chunks
def decode_header(hdr):
    """Decode a header containing MIME encoded words into a UTF-8 string.

    The header is split with _decode_header(); each part is converted to
    unicode using its own charset (after unaliasCharset() normalisation),
    with iso-8859-1 assumed for parts that carry no charset.  Undecodable
    bytes are replaced rather than raising.  If a part names a charset that
    Python does not know, iso-8859-1 is used for that part instead of
    letting the LookupError abort the whole header.

    Returns None when hdr is empty or None, otherwise the decoded text
    encoded as UTF-8.
    """
    if not hdr:
        return None

    fallback = 'iso-8859-1'
    pieces = []
    for text, raw_charset in _decode_header(hdr):
        charset = unaliasCharset(raw_charset) or fallback
        try:
            pieces.append(unicode(text, charset, 'replace'))
        except LookupError:
            # Unknown or misspelled charset label: keep decoding best-effort,
            # consistent with the 'replace' error policy used above.
            pieces.append(unicode(text, fallback, 'replace'))
    return u''.join(pieces).encode('utf-8')
def encode_header(header):
    """Encode a header value as a UTF-8 `Q' encoded word when needed.

    Empty values, and values for which hqre matches (only characters that
    are emitted verbatim), are returned unchanged.  Anything else is
    returned as a single "=?utf-8?q?...?=" encoded word: spaces become '_',
    characters accepted by hqre are copied, and every other octet is written
    as "=XX" in upper-case hex.

    Text (unicode) input is first encoded to UTF-8 so that the escapes are
    real UTF-8 octets matching the declared charset; byte-string input is
    used as given and is expected to be UTF-8 already.  No line-length
    limit or folding is applied.
    """
    # Return empty headers unchanged
    if not header:
        return header

    # Return the header as-is when it needs no encoding
    if hqre.match(header):
        return header

    charset = 'utf-8'
    # Q encoding works on octets, not code points: escaping ord() of a
    # unicode character would produce output that is not valid UTF-8.
    if isinstance(header, bytes):
        octets = header
    else:
        octets = header.encode(charset)

    quoted = []
    # bytearray() yields integer octets on both Python 2 and Python 3.
    for octet in bytearray(octets):
        char = chr(octet)
        if char == ' ':
            # Space may be represented as _ instead of =20 for readability
            quoted.append('_')
        elif hqre.match(char):
            # These characters can be included verbatim
            quoted.append(char)
        else:
            # Otherwise, replace with hex value like =E2
            quoted.append('=%02X' % octet)

    return '=?%s?q?%s?=' % (charset, ''.join(quoted))
def unaliasCharset(charset):
    """Normalise a charset label for use as a Python codec name.

    The label is lower-cased and every "windows-" is rewritten to "cp"
    (so 'Windows-1251' becomes 'cp1251').  A false value (None or an empty
    string) yields None.
    """
    if not charset:
        return None
    lowered = charset.lower()
    return lowered.replace("windows-", 'cp')
def test():
    """Smoke test: print the result of encode_header() for a sample value."""
    sample = "Contrary, Mary"
    print(encode_header(sample))
# Run the smoke test only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    test()
159 # vim: et