1 import re
2 from string import letters, digits
3 from binascii import b2a_base64, a2b_base64
# Matches a single MIME "encoded word" of the form =?charset?encoding?text?=
# and captures its three fields as the named groups charset, encoding and
# encoded.  re.IGNORECASE lets the encoding letter be given as q/Q or b/B.
ecre = re.compile(r'''
=\? # literal =?
(?P<charset>[^?]*?) # non-greedy up to the next ? is the charset
\? # literal ?
(?P<encoding>[qb]) # either a "q" or a "b", case insensitive
\? # literal ?
(?P<encoded>.*?) # non-greedy up to the next ?= is the encoded string
\?= # literal ?=
''', re.VERBOSE | re.IGNORECASE)

# Matches a non-empty string made up only of printable ASCII characters
# (space through "~").  Note that the A-z range is wider than the letters: it
# also spans the punctuation between "Z" and "a", and is the only thing that
# lets a backslash through.  The doubled "%" is redundant inside a character
# class.  Because of how "$" works, one trailing newline is also tolerated.
hqre = re.compile(r'^[A-z0-9!"#$%%&\'()*+,-./:;<=>?@\[\]^_`{|}~ ]+$')
def base64_decode(s, convert_eols=None):
    """Decode a raw base64 string.

    If convert_eols is set to a string value, all canonical email linefeeds,
    e.g. "\\r\\n", in the decoded text will be converted to the value of
    convert_eols.  os.linesep is a good choice for convert_eols if you are
    decoding a text attachment.

    This function does not parse a full MIME header value encoded with
    base64 (like =?iso-8895-1?b?bmloISBuaWgh?=) -- please use the high
    level email.Header class for that functionality.

    Parameters:
        s: the base64 text to decode.  Any false value (empty string, None)
            is returned unchanged.
        convert_eols: optional replacement for every "\\r\\n" pair in the
            decoded data.  It must be the same string type that
            binascii.a2b_base64() returns (a byte string).

    Returns the decoded byte string.

    Raises whatever binascii.a2b_base64() raises for malformed input.

    Taken from 'email' module
    """
    if not s:
        return s

    dec = a2b_base64(s)
    if convert_eols:
        # The canonical line ending is spelled out here because this module
        # defines no CRLF constant.  A bytes literal is an ordinary str on
        # Python 2 and matches a2b_base64()'s return type everywhere.
        return dec.replace(b'\r\n', convert_eols)
    return dec
def unquote_match(match):
    """Convert a matched ``=XX`` escape into the character with code 0xXX.

    Meant to be used as a replacement callback for re.sub(): the two
    characters that follow the leading "=" of the matched text are read as a
    hexadecimal number and the character with that ordinal is returned.

    Taken from 'email' module
    """
    hex_pair = match.group(0)[1:3]
    return chr(int(hex_pair, 16))
def qp_decode(s):
    """Decode a string encoded with RFC 2045 MIME header `Q' encoding.

    Every "_" becomes a space and every "=XX" escape, where XX is a pair of
    hexadecimal digits, becomes the character with that ordinal.  An "="
    that is not followed by two hex digits is left in the output untouched
    instead of aborting the whole decode.

    This function does not parse a full MIME header value encoded with
    quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
    the high level email.Header class for that functionality.

    Parameters:
        s: the encoded text (the part between "?q?" and "?=").

    Returns the decoded string.

    Taken from 'email' module
    """
    def _unquote(match):
        # match.group(0) is "=XX"; drop the "=" and read the hex pair.
        return chr(int(match.group(0)[1:], 16))

    # Only genuine hex pairs are matched: a looser pattern such as \w{2}
    # would hand pairs like "zz" to int(..., 16), which raises ValueError.
    return re.sub(r'=[0-9A-Fa-f]{2}', _unquote, s.replace('_', ' '))
def _decode_header(header):
    """Split a header value into decoded chunks, leaving charsets untouched.

    The result is a list of (decoded_string, charset) pairs.  Text outside
    any encoded word is reported with charset None; text taken from an
    encoded word is reported with the lower-cased charset name found in that
    word.  Consecutive chunks that carry the same charset are merged into
    one pair, and whitespace-only text next to an encoded word is dropped.

    Taken from 'email' module
    """
    header = str(header)

    # Nothing encoded anywhere: hand the whole value back as a single chunk.
    if ecre.search(header) is None:
        return [(header, None)]

    chunks = []
    for line in header.splitlines():
        # A line without an encoded word is kept as-is.
        if ecre.search(line) is None:
            chunks.append((line, None))
            continue

        # With three groups in the pattern, split() yields
        # [plain, charset, encoding, encoded, plain, charset, ...].
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start]
            if plain.strip():
                chunks.append((plain, None))

            word = fields[start + 1:start + 4]
            if not word:
                # Trailing plain text: no encoded word follows it.
                break

            charset, encoding = word[0].lower(), word[1].lower()
            payload = word[2]
            if encoding == 'b':
                text = base64_decode(payload)
            elif encoding == 'q':
                text = qp_decode(payload)
            else:
                text = payload

            if chunks and chunks[-1][1] == charset:
                # Same charset as the previous chunk: extend it in place.
                previous_text, previous_charset = chunks[-1]
                chunks[-1] = (previous_text + text, previous_charset)
            else:
                chunks.append((text, charset))
    return chunks
def decode_header(hdr):
    """Decode a header containing MIME encoded words into a UTF-8 string.

    The header is split with _decode_header(); each chunk is converted to
    unicode using its own charset (after unaliasCharset() normalisation) and
    the pieces are concatenated.  Chunks without a charset are read as
    iso-8859-1.  Undecodable bytes are replaced rather than raising.

    A charset label that Python has no codec for is also read as iso-8859-1
    instead of raising LookupError, so that one bogus label cannot make the
    whole header unreadable.

    Parameters:
        hdr: the raw header value.

    Returns the decoded header as a UTF-8 encoded byte string, or None when
    hdr is empty or None.
    """
    if not hdr:
        return None

    fallback = 'iso-8859-1'
    pieces = []
    for text, raw_charset in _decode_header(hdr):
        charset = unaliasCharset(raw_charset) or fallback
        try:
            pieces.append(unicode(text, charset, 'replace'))
        except LookupError:
            # Unknown codec name: every byte is valid iso-8859-1, so this
            # second attempt cannot fail on the data itself.
            pieces.append(unicode(text, fallback, 'replace'))
    return u"".join(pieces).encode('utf-8')
# Printable ASCII octets that may appear literally inside a "Q" encoded word.
# "=", "?" and "_" are deliberately absent: they are the escape introducer,
# the encoded-word delimiter and the stand-in for a space respectively, so
# emitting them verbatim would change what the word decodes to.
_Q_SAFE_OCTETS = frozenset(bytearray(
    b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
    b"!\"#$%&'()*+,-./:;<>@[\\]^`{|}~"
))


def encode_header(header):
    """Encode a header as a UTF-8 "Q" encoded word, but only when needed.

    A header made up solely of printable ASCII characters (space through
    "~") is returned unchanged.  Anything else is returned in the form
    ``=?utf-8?q?...?=``, where spaces become "_", safe printable characters
    are copied, and every other octet is written as "=XX".

    Parameters:
        header: the header value.  A byte string is taken to be UTF-8
            already; a text (unicode) string is encoded to UTF-8 first so
            that every escape describes exactly one octet.

    Returns the header unchanged when it is empty or plain, otherwise the
    encoded word as a native string.
    """
    # Return empty headers unchanged
    if not header:
        return header

    if isinstance(header, bytes):
        octets = bytearray(header)
    else:
        octets = bytearray(header.encode('utf-8'))

    # Plain means: one or more printable ASCII characters.  A single trailing
    # newline is tolerated, which keeps the behaviour of the anchored-regex
    # check ("^[...]+$") this function has always applied.
    visible = octets[:-1] if octets.endswith(b'\n') else octets
    if visible and all(0x20 <= octet <= 0x7E for octet in visible):
        return header

    charset = 'utf-8'
    quoted = []
    for octet in octets:
        if octet == 0x20:
            # Space may be represented as _ instead of =20 for readability
            quoted.append('_')
        elif octet in _Q_SAFE_OCTETS:
            # These characters can be included verbatim
            quoted.append(chr(octet))
        else:
            # Otherwise, replace with hex value like =E2
            quoted.append('=%02X' % octet)

    return '=?%s?q?%s?=' % (charset, ''.join(quoted))
def unaliasCharset(charset):
    """Normalise a charset label to a lower-case codec name.

    The label is lower-cased and every "windows-" in it is rewritten to
    "cp" (so "Windows-1251" becomes "cp1251").  An empty or None label
    yields None.
    """
    if not charset:
        return None
    lowered = charset.lower()
    return lowered.replace("windows-", 'cp')
def test():
    """Smoke check: print what encode_header() returns for an ASCII header."""
    sample = "Contrary, Mary"
    print(encode_header(sample))
# Run the smoke check only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    test()
160 # vim: et