roundup/rfc2822.py

   1 import re
   2 from binascii import b2a_base64, a2b_base64
   3
   4 ecre = re.compile(r'''
   5   =\?                   # literal =?
   6   (?P<charset>[^?]*?)   # non-greedy up to the next ? is the charset
   7   \?                    # literal ?
   8   (?P<encoding>[qb])    # either a "q" or a "b", case insensitive
   9   \?                    # literal ?
  10   (?P<encoded>.*?)      # non-greedy up to the next ?= is the encoded string
  11   \?=                   # literal ?=
  12   ''', re.VERBOSE | re.IGNORECASE)
  13
  14 hqre = re.compile(r'^[-a-zA-Z0-9!*+/\[\]., ]+$')
  15
  16 def base64_decode(s, convert_eols=None):
  17     """Decode a raw base64 string.
  18
  19     If convert_eols is set to a string value, all canonical email linefeeds,
  20     e.g. "\\r\\n", in the decoded text will be converted to the value of
  21     convert_eols.  os.linesep is a good choice for convert_eols if you are
  22     decoding a text attachment.
  23
  24     This function does not parse a full MIME header value encoded with
  25     base64 (like =?iso-8895-1?b?bmloISBuaWgh?=) -- please use the high
  26     level email.Header class for that functionality.
  27
  28     Taken from 'email' module
  29     """
  30     if not s:
  31         return s
  32
  33     dec = a2b_base64(s)
  34     if convert_eols:
  35         return dec.replace(CRLF, convert_eols)
  36     return dec
  37
  38 def unquote_match(match):
  39     """Turn a match in the form =AB to the ASCII character with value 0xab
  40
  41     Taken from 'email' module
  42     """
  43     s = match.group(0)
  44     return chr(int(s[1:3], 16))
  45
  46 def qp_decode(s):
  47     """Decode a string encoded with RFC 2045 MIME header `Q' encoding.
  48
  49     This function does not parse a full MIME header value encoded with
  50     quoted-printable (like =?iso-8895-1?q?Hello_World?=) -- please use
  51     the high level email.Header class for that functionality.
  52
  53     Taken from 'email' module
  54     """
  55     s = s.replace('_', ' ')
  56     return re.sub(r'=\w{2}', unquote_match, s)
  57
  58 def _decode_header(header):
  59     """Decode a message header value without converting charset.
  60
  61     Returns a list of (decoded_string, charset) pairs containing each of the
  62     decoded parts of the header.  Charset is None for non-encoded parts of the
  63     header, otherwise a lower-case string containing the name of the character
  64     set specified in the encoded string.
  65
  66     Taken from 'email' module
  67     """
  68     # If no encoding, just return the header
  69     header = str(header)
  70     if not ecre.search(header):
  71         return [(header, None)]
  72
  73     decoded = []
  74     dec = ''
  75     for line in header.splitlines():
  76         # This line might not have an encoding in it
  77         if not ecre.search(line):
  78             decoded.append((line, None))
  79             continue
  80
  81         parts = ecre.split(line)
  82         while parts:
  83             unenc = parts.pop(0)
  84             if unenc:
  85                 if unenc.strip():
  86                     decoded.append((unenc, None))
  87             if parts:
  88                 charset, encoding = [s.lower() for s in parts[0:2]]
  89                 encoded = parts[2]
  90                 dec = ''
  91                 if encoding == 'q':
  92                     dec = qp_decode(encoded)
  93                 elif encoding == 'b':
  94                     dec = base64_decode(encoded)
  95                 else:
  96                     dec = encoded
  97
  98                 if decoded and decoded[-1][1] == charset:
  99                     decoded[-1] = (decoded[-1][0] + dec, decoded[-1][1])
 100                 else:
 101                     decoded.append((dec, charset))
 102             del parts[0:3]
 103     return decoded
 104
 105 def decode_header(hdr):
 106     """ Decodes rfc2822 encoded header and return utf-8 encoded string
 107     """
 108     if not hdr:
 109         return None
 110     outs = u""
 111     for section in _decode_header(hdr):
 112         charset = unaliasCharset(section[1])
 113         outs += unicode(section[0], charset or 'iso-8859-1', 'replace')
 114     return outs.encode('utf-8')
 115
 116 def encode_header(header):
 117     """ Will encode in quoted-printable encoding only if header
 118     contains non latin characters
 119     """
 120
 121     # Return empty headers unchanged
 122     if not header:
 123         return header
 124
 125     # return plain header if it is not contains non-ascii characters
 126     if hqre.match(header):
 127         return header
 128
 129     charset = 'utf-8'
 130     quoted = ''
 131     #max_encoded = 76 - len(charset) - 7
 132     for c in header:
 133         # Space may be represented as _ instead of =20 for readability
 134         if c == ' ':
 135             quoted += '_'
 136         # These characters can be included verbatim
 137         elif hqre.match(c):
 138             quoted += c
 139         # Otherwise, replace with hex value like =E2
 140         else:
 141             quoted += "=%02X" % ord(c)
 142             plain = 0
 143
 144     return '=?%s?q?%s?=' % (charset, quoted)
 145
 146 def unaliasCharset(charset):
 147     if charset:
 148         return charset.lower().replace("windows-", 'cp')
 149         #return charset_table.get(charset.lower(), charset)
 150     return None
 151
 152 def test():
 153     print encode_header("Contrary, Mary")
 154     #print unaliasCharset('Windows-1251')
 155
 156 if __name__ == '__main__':
 157     test()
 158
 159 # vim: et