1 """Some rfc822 functions taken from the new (python2.3) "email" module.
2 """
3 __docformat__ = 'restructuredtext'
5 import re
6 from string import letters, digits
7 from binascii import b2a_base64, a2b_base64
# Matches one encoded-word of the form ``=?charset?encoding?encoded-text?=``.
# Splitting on this pattern yields the three named groups between the
# surrounding plain-text pieces.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Matches a non-empty string consisting solely of printable ASCII characters
# (space through tilde).  The range is spelled out explicitly instead of
# relying on accidental ranges such as ``A-z`` or ``,-.``; the set of matched
# characters is unchanged.
hqre = re.compile(r'^[\x20-\x7e]+$')

# Canonical email line terminator.
CRLF = '\r\n'
def base64_decode(s, convert_eols=None):
    """Decode a raw base64 string.

    A false value for ``s`` (empty string, ``None``) is handed back
    unchanged.

    When ``convert_eols`` is a non-empty string, every canonical email line
    ending ("\\r\\n") in the decoded text is replaced by it.  os.linesep is
    a sensible value when decoding a text attachment.

    This function does not parse a complete base64-encoded MIME header
    value (like =?iso-8859-1?b?bmloISBuaWgh?=); use the high level
    email.Header class for that.

    Taken from 'email' module
    """
    if s:
        decoded = a2b_base64(s)
        if not convert_eols:
            return decoded
        return decoded.replace(CRLF, convert_eols)
    return s
def unquote_match(match):
    """Turn a match in the form ``=AB`` into the character with value 0xab.

    ``match`` is a regular expression match object whose matched text is an
    equals sign followed by two hexadecimal digits.

    Taken from 'email' module
    """
    s = match.group(0)
    return chr(int(s[1:3], 16))


def qp_decode(s):
    """Decode a string encoded with the RFC 2047 MIME header 'Q' encoding.

    Underscores are turned into spaces and every ``=XX`` escape, where
    ``XX`` is a pair of hexadecimal digits, is replaced by the character
    with that value.  An equals sign that is not followed by two hex digits
    is left in the output untouched.

    This function does not parse a complete quoted-printable MIME header
    value (like =?iso-8859-1?q?Hello_World?=); use the high level
    email.Header class for that.

    Taken from 'email' module
    """
    s = s.replace('_', ' ')
    # Only genuine hex pairs are escapes.  Matching ``\w{2}`` would also pick
    # up sequences such as ``=zz`` and make int(..., 16) raise ValueError.
    return re.sub(r'=[0-9A-Fa-f]{2}', unquote_match, s)
def _decode_header(header):
    """Decode a message header value without converting charset.

    The result is a list of ``(decoded_string, charset)`` pairs, one per
    decoded part of the header.  For parts that were not encoded the
    charset is None; for encoded parts it is the lower-cased character set
    name given in the encoded string.  Consecutive encoded parts that share
    a charset are joined into a single pair.

    Taken from 'email' module
    """
    header = str(header)
    # Nothing encoded anywhere: hand the whole value back as one plain part.
    if ecre.search(header) is None:
        return [(header, None)]

    chunks = []
    for line in header.splitlines():
        # A line of a multi-line header may contain no encoded part at all.
        if ecre.search(line) is None:
            chunks.append((line, None))
            continue

        # The pattern has three groups, so split() produces
        # [plain, charset, encoding, payload, plain, charset, ...].
        fields = ecre.split(line)
        for start in range(0, len(fields), 4):
            plain = fields[start]
            # Plain text made only of whitespace (or empty) is dropped.
            if plain.strip():
                chunks.append((plain, None))

            group = fields[start + 1:start + 4]
            if not group:
                # Trailing plain piece; no encoded part follows it.
                continue

            charset = group[0].lower()
            encoding = group[1].lower()
            payload = group[2]
            if encoding == 'q':
                text = qp_decode(payload)
            elif encoding == 'b':
                text = base64_decode(payload)
            else:
                text = payload

            if chunks and chunks[-1][1] == charset:
                # Same charset as the previous part: extend it in place.
                prev_text, prev_charset = chunks[-1]
                chunks[-1] = (prev_text + text, prev_charset)
            else:
                chunks.append((text, charset))
    return chunks
def decode_header(hdr):
    """Decode an RFC 2047 encoded header and return a UTF-8 encoded string.

    Each part returned by ``_decode_header`` is converted to unicode using
    its own charset (after ``unaliasCharset``); parts without a charset are
    treated as iso-8859-1.  Undecodable bytes are substituted rather than
    raising.  A charset name that Python does not know is handled the same
    lenient way: the part is decoded as iso-8859-1 instead of letting
    LookupError escape.

    Returns None when ``hdr`` is empty or None.
    """
    if not hdr:
        return None
    pieces = []
    for text, raw_charset in _decode_header(hdr):
        charset = unaliasCharset(raw_charset) or 'iso-8859-1'
        try:
            piece = unicode(text, charset, 'replace')
        except LookupError:
            # Unknown or bogus charset label in the header; iso-8859-1 can
            # decode any byte sequence, so use it as a last resort.
            piece = unicode(text, 'iso-8859-1', 'replace')
        pieces.append(piece)
    return u"".join(pieces).encode('utf-8')
def encode_header(header, charset='utf-8'):
    """Encode a header value as an RFC 2047 'Q' encoded-word when needed.

    Empty values are returned unchanged, and so are values consisting only
    of printable ASCII characters (as matched by ``hqre``).  Anything else
    is returned as ``=?charset?q?...?=`` where spaces become underscores,
    safe printable characters are copied verbatim and every other byte is
    written as ``=XX``.

    ``header`` is normally a byte string already encoded in ``charset``.
    A unicode value is encoded to ``charset`` first (unencodable characters
    are substituted), so that each escape describes one byte of the
    declared charset.  For unicode input an unknown ``charset`` raises
    LookupError.
    """
    # Return empty headers unchanged
    if not header:
        return header

    # Plain printable-ASCII headers need no encoding at all
    if hqre.match(header):
        return header

    # Escapes must describe bytes in ``charset``, not unicode code points:
    # ord() of a code point above 0xFF would produce an invalid "=XXXX".
    if isinstance(header, unicode):
        header = header.encode(charset, 'replace')

    quoted = []
    for c in header:
        if c == ' ':
            # Space may be represented as _ instead of =20 for readability
            quoted.append('_')
        elif hqre.match(c) and c not in '_=?':
            # These characters can be included verbatim
            quoted.append(c)
        else:
            # Otherwise, replace with hex value like =E2
            quoted.append("=%02X" % ord(c))

    return '=?%s?q?%s?=' % (charset, ''.join(quoted))
def unaliasCharset(charset):
    """Return a lower-cased charset name with ``windows-`` turned into ``cp``.

    For example ``Windows-1251`` becomes ``cp1251``.  A false value (None
    or an empty string) gives None.
    """
    if not charset:
        return None
    lowered = charset.lower()
    return lowered.replace("windows-", 'cp')
def test():
    """Print the result of encoding a sample plain-ASCII header."""
    sample = encode_header("Contrary, Mary")
    print(sample)


if __name__ == '__main__':
    test()
166 # vim: et