roundup/rfc2822.py

   1 """Some rfc822 functions taken from the new (python2.3) "email" module.
   2 """
   3 __docformat__ = 'restructuredtext'
   4
   5 import re
   6 from string import letters, digits
   7 from binascii import b2a_base64, a2b_base64
   8
# Matches a single RFC 2047 encoded word, ``=?charset?encoding?encoded?=``.
# The pattern has three groups, so ecre.split() returns the plain text
# around each encoded word interleaved with its charset, encoding and
# encoded payload.
ecre = re.compile(r'''
  =\?                   # literal =?
  (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
  \?                    # literal ?
  (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  \?                    # literal ?
  (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  \?=                   # literal ?=
  ''', re.VERBOSE | re.IGNORECASE)

# Matches a non-empty string consisting only of printable ASCII characters,
# space through "~" (``$`` also tolerates one trailing newline).
# NOTE: ``A-z`` is a single range that covers the punctuation between "Z"
# and "a" as well, which is how backslash ends up being accepted; ``,-.``
# is likewise a range, and ``%%`` just lists "%" twice because no
# %-formatting is ever applied to this pattern.
hqre = re.compile(r'^[A-z0-9!"#$%%&\'()*+,-./:;<=>?@\[\]^_`{|}~ ]+$')

# Canonical email line ending; base64_decode() replaces it on request.
CRLF = '\r\n'
  22
  23 def base64_decode(s, convert_eols=None):
  24     """Decode a raw base64 string.
  25
  26     If convert_eols is set to a string value, all canonical email linefeeds,
  27     e.g. "\\r\\n", in the decoded text will be converted to the value of
  28     convert_eols.  os.linesep is a good choice for convert_eols if you are
  29     decoding a text attachment.
  30
  31     This function does not parse a full MIME header value encoded with
  32     base64 (like =?iso-8895-1?b?bmloISBuaWgh?=) -- please use the high
  33     level email.Header class for that functionality.
  34
  35     Taken from 'email' module
  36     """
  37     if not s:
  38         return s
  39
  40     dec = a2b_base64(s)
  41     if convert_eols:
  42         return dec.replace(CRLF, convert_eols)
  43     return dec
  44
  45 def unquote_match(match):
  46     """Turn a match in the form ``=AB`` to the ASCII character with value
  47     0xab.
  48
  49     Taken from 'email' module
  50     """
  51     s = match.group(0)
  52     return chr(int(s[1:3], 16))
  53
  54 def qp_decode(s):
  55     """Decode a string encoded with RFC 2045 MIME header 'Q' encoding.
  56
  57     This function does not parse a full MIME header value encoded with
  58     quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
  59     the high level email.Header class for that functionality.
  60
  61     Taken from 'email' module
  62     """
  63     s = s.replace('_', ' ')
  64     return re.sub(r'=\w{2}', unquote_match, s)
  65
  66 def _decode_header(header):
  67     """Decode a message header value without converting charset.
  68
  69     Returns a list of (decoded_string, charset) pairs containing each of the
  70     decoded parts of the header.  Charset is None for non-encoded parts of the
  71     header, otherwise a lower-case string containing the name of the character
  72     set specified in the encoded string.
  73
  74     Taken from 'email' module
  75     """
  76     # If no encoding, just return the header
  77     header = str(header)
  78     if not ecre.search(header):
  79         return [(header, None)]
  80
  81     decoded = []
  82     dec = ''
  83     for line in header.splitlines():
  84         # This line might not have an encoding in it
  85         if not ecre.search(line):
  86             decoded.append((line, None))
  87             continue
  88
  89         parts = ecre.split(line)
  90         while parts:
  91             unenc = parts.pop(0)
  92             if unenc:
  93                 if unenc.strip():
  94                     decoded.append((unenc, None))
  95             if parts:
  96                 charset, encoding = [s.lower() for s in parts[0:2]]
  97                 encoded = parts[2]
  98                 dec = ''
  99                 if encoding == 'q':
 100                     dec = qp_decode(encoded)
 101                 elif encoding == 'b':
 102                     dec = base64_decode(encoded)
 103                 else:
 104                     dec = encoded
 105
 106                 if decoded and decoded[-1][1] == charset:
 107                     decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
 108                 else:
 109                     decoded.append((dec, charset))
 110             del parts[0:3]
 111     return decoded
 112
 113 def decode_header(hdr):
 114     """ Decodes rfc2822 encoded header and return utf-8 encoded string
 115     """
 116     if not hdr:
 117         return None
 118     outs = u""
 119     for section in _decode_header(hdr):
 120         charset = unaliasCharset(section[1])
 121         outs += unicode(section[0], charset or 'iso-8859-1', 'replace')
 122     return outs.encode('utf-8')
 123
 124 def encode_header(header, charset='utf-8'):
 125     """ Will encode in quoted-printable encoding only if header
 126     contains non latin characters
 127     """
 128
 129     # Return empty headers unchanged
 130     if not header:
 131         return header
 132
 133     # return plain header if it is not contains non-ascii characters
 134     if hqre.match(header):
 135         return header
 136
 137     quoted = ''
 138     #max_encoded = 76 - len(charset) - 7
 139     for c in header:
 140         # Space may be represented as _ instead of =20 for readability
 141         if c == ' ':
 142             quoted += '_'
 143         # These characters can be included verbatim
 144         elif hqre.match(c) and c not in '_=?':
 145             quoted += c
 146         # Otherwise, replace with hex value like =E2
 147         else:
 148             quoted += "=%02X" % ord(c)
 149             plain = 0
 150
 151     return '=?%s?q?%s?=' % (charset, quoted)
 152
 153 def unaliasCharset(charset):
 154     if charset:
 155         return charset.lower().replace("windows-", 'cp')
 156         #return charset_table.get(charset.lower(), charset)
 157     return None
 158
 159 def test():
 160     print encode_header("Contrary, Mary")
 161     #print unaliasCharset('Windows-1251')
 162
 163 if __name__ == '__main__':
 164     test()
 165
 166 # vim: et