roundup/rfc2822.py

   1 import re
   2 from string import letters, digits
   3 from binascii import b2a_base64, a2b_base64
   4
   5 ecre = re.compile(r'''
   6   =\?                   # literal =?
   7   (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
   8   \?                    # literal ?
   9   (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
  10   \?                    # literal ?
  11   (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  12   \?=                   # literal ?=
  13   ''', re.VERBOSE | re.IGNORECASE)
  14
  15 hqre = re.compile(r'^[A-z0-9!"#$%%&\'()*+,-./:;<=>?@\[\]^_`{|}~ ]+$')
  16
  17 def base64_decode(s, convert_eols=None):
  18     """Decode a raw base64 string.
  19
  20     If convert_eols is set to a string value, all canonical email linefeeds,
  21     e.g. "\\r\\n", in the decoded text will be converted to the value of
  22     convert_eols.  os.linesep is a good choice for convert_eols if you are
  23     decoding a text attachment.
  24
  25     This function does not parse a full MIME header value encoded with
  26     base64 (like =?iso-8895-1?b?bmloISBuaWgh?=) -- please use the high
  27     level email.Header class for that functionality.
  28
  29     Taken from 'email' module
  30     """
  31     if not s:
  32         return s
  33
  34     dec = a2b_base64(s)
  35     if convert_eols:
  36         return dec.replace(CRLF, convert_eols)
  37     return dec
  38
  39 def unquote_match(match):
  40     """Turn a match in the form =AB to the ASCII character with value 0xab
  41
  42     Taken from 'email' module
  43     """
  44     s = match.group(0)
  45     return chr(int(s[1:3], 16))
  46
  47 def qp_decode(s):
  48     """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
  49
  50     This function does not parse a full MIME header value encoded with
  51     quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
  52     the high level email.Header class for that functionality.
  53
  54     Taken from 'email' module
  55     """
  56     s = s.replace('_', ' ')
  57     return re.sub(r'=\w{2}', unquote_match, s)
  58
  59 def _decode_header(header):
  60     """Decode a message header value without converting charset.
  61
  62     Returns a list of (decoded_string, charset) pairs containing each of the
  63     decoded parts of the header.  Charset is None for non-encoded parts of the
  64     header, otherwise a lower-case string containing the name of the character
  65     set specified in the encoded string.
  66
  67     Taken from 'email' module
  68     """
  69     # If no encoding, just return the header
  70     header = str(header)
  71     if not ecre.search(header):
  72         return [(header, None)]
  73
  74     decoded = []
  75     dec = ''
  76     for line in header.splitlines():
  77         # This line might not have an encoding in it
  78         if not ecre.search(line):
  79             decoded.append((line, None))
  80             continue
  81
  82         parts = ecre.split(line)
  83         while parts:
  84             unenc = parts.pop(0)
  85             if unenc:
  86                 if unenc.strip():
  87                     decoded.append((unenc, None))
  88             if parts:
  89                 charset, encoding = [s.lower() for s in parts[0:2]]
  90                 encoded = parts[2]
  91                 dec = ''
  92                 if encoding == 'q':
  93                     dec = qp_decode(encoded)
  94                 elif encoding == 'b':
  95                     dec = base64_decode(encoded)
  96                 else:
  97                     dec = encoded
  98
  99                 if decoded and decoded[-1][1] == charset:
 100                     decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
 101                 else:
 102                     decoded.append((dec, charset))
 103             del parts[0:3]
 104     return decoded
 105
 106 def decode_header(hdr):
 107     """ Decodes rfc2822 encoded header and return utf-8 encoded string
 108     """
 109     if not hdr:
 110         return None
 111     outs = u""
 112     for section in _decode_header(hdr):
 113         charset = unaliasCharset(section[1])
 114         outs += unicode(section[0], charset or 'iso-8859-1', 'replace')
 115     return outs.encode('utf-8')
 116
 117 def encode_header(header):
 118     """ Will encode in quoted-printable encoding only if header
 119     contains non latin characters
 120     """
 121
 122     # Return empty headers unchanged
 123     if not header:
 124         return header
 125
 126     # return plain header if it is not contains non-ascii characters
 127     if hqre.match(header):
 128         return header
 129
 130     charset = 'utf-8'
 131     quoted = ''
 132     #max_encoded = 76 - len(charset) - 7
 133     for c in header:
 134         # Space may be represented as _ instead of =20 for readability
 135         if c == ' ':
 136             quoted += '_'
 137         # These characters can be included verbatim
 138         elif hqre.match(c):
 139             quoted += c
 140         # Otherwise, replace with hex value like =E2
 141         else:
 142             quoted += "=%02X" % ord(c)
 143             plain = 0
 144
 145     return '=?%s?q?%s?=' % (charset, quoted)
 146
 147 def unaliasCharset(charset):
 148     if charset:
 149         return charset.lower().replace("windows-", 'cp')
 150         #return charset_table.get(charset.lower(), charset)
 151     return None
 152
 153 def test():
 154     print encode_header("Contrary, Mary")
 155     #print unaliasCharset('Windows-1251')
 156
 157 if __name__ == '__main__':
 158     test()
 159
 160 # vim: et