roundup/cgi/TAL/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13
  14 # Regular expressions used for parsing
  15
  16 interesting_normal = re.compile('[&<]')
  17 interesting_cdata = re.compile(r'<(/|\Z)')
  18 incomplete = re.compile('&[a-zA-Z#]')
  19
  20 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  21 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  22
  23 starttagopen = re.compile('<[a-zA-Z]')
  24 piclose = re.compile('>')
  25 endtagopen = re.compile('</')
  26 commentclose = re.compile(r'--\s*>')
  27 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  28 attrfind = re.compile(
  29     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  30     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
  31
  32 locatestarttagend = re.compile(r"""
  33   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  34   (?:\s+                             # whitespace before attribute name
  35     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  36       (?:\s*=\s*                     # value indicator
  37         (?:'[^']*'                   # LITA-enclosed value
  38           |\"[^\"]*\"                # LIT-enclosed value
  39           |[^'\">\s]+                # bare value
  40          )
  41        )?
  42      )
  43    )*
  44   \s*                                # trailing whitespace
  45 """, re.VERBOSE)
  46 endendtag = re.compile('>')
  47 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  48
  49
  50 class HTMLParseError(Exception):
  51     """Exception raised for all parse errors."""
  52
  53     def __init__(self, msg, position=(None, None)):
  54         assert msg
  55         self.msg = msg
  56         self.lineno = position[0]
  57         self.offset = position[1]
  58
  59     def __str__(self):
  60         result = self.msg
  61         if self.lineno is not None:
  62             result = result + ", at line %d" % self.lineno
  63         if self.offset is not None:
  64             result = result + ", column %d" % (self.offset + 1)
  65         return result
  66
  67
  68 def _contains_at(s, sub, pos):
  69     return s[pos:pos+len(sub)] == sub
  70
  71
  72 class HTMLParser(markupbase.ParserBase):
  73     """Find tags and other markup and call handler functions.
  74
  75     Usage:
  76         p = HTMLParser()
  77         p.feed(data)
  78         ...
  79         p.close()
  80
  81     Start tags are handled by calling self.handle_starttag() or
  82     self.handle_startendtag(); end tags by self.handle_endtag().  The
  83     data between tags is passed from the parser to the derived class
  84     by calling self.handle_data() with the data as argument (the data
  85     may be split up in arbitrary chunks).  Entity references are
  86     passed by calling self.handle_entityref() with the entity
  87     reference as the argument.  Numeric character references are
  88     passed to self.handle_charref() with the string containing the
  89     reference as the argument.
  90     """
  91
  92     CDATA_CONTENT_ELEMENTS = ("script", "style")
  93
  94
  95     def __init__(self):
  96         """Initialize and reset this instance."""
  97         self.reset()
  98
  99     def reset(self):
 100         """Reset this instance.  Loses all unprocessed data."""
 101         self.rawdata = ''
 102         self.stack = []
 103         self.lasttag = '???'
 104         self.interesting = interesting_normal
 105         markupbase.ParserBase.reset(self)
 106
 107     def feed(self, data):
 108         """Feed data to the parser.
 109
 110         Call this as often as you want, with as little or as much text
 111         as you want (may include '\n').
 112         """
 113         self.rawdata = self.rawdata + data
 114         self.goahead(0)
 115
 116     def close(self):
 117         """Handle any buffered data."""
 118         self.goahead(1)
 119
 120     def error(self, message):
 121         raise HTMLParseError(message, self.getpos())
 122
 123     __starttag_text = None
 124
 125     def get_starttag_text(self):
 126         """Return full source of start tag: '<...>'."""
 127         return self.__starttag_text
 128
 129     cdata_endtag = None
 130
 131     def set_cdata_mode(self, endtag=None):
 132         self.cdata_endtag = endtag
 133         self.interesting = interesting_cdata
 134
 135     def clear_cdata_mode(self):
 136         self.cdata_endtag = None
 137         self.interesting = interesting_normal
 138
 139     # Internal -- handle data as far as reasonable.  May leave state
 140     # and data to be processed by a subsequent call.  If 'end' is
 141     # true, force handling all data as if followed by EOF marker.
 142     def goahead(self, end):
 143         rawdata = self.rawdata
 144         i = 0
 145         n = len(rawdata)
 146         while i < n:
 147             match = self.interesting.search(rawdata, i) # < or &
 148             if match:
 149                 j = match.start()
 150             else:
 151                 j = n
 152             if i < j: self.handle_data(rawdata[i:j])
 153             i = self.updatepos(i, j)
 154             if i == n: break
 155             if rawdata[i] == '<':
 156                 if starttagopen.match(rawdata, i): # < + letter
 157                     k = self.parse_starttag(i)
 158                 elif endtagopen.match(rawdata, i): # </
 159                     k = self.parse_endtag(i)
 160                 elif _contains_at(rawdata, "<!--", i): # <!--
 161                     k = self.parse_comment(i)
 162                 elif _contains_at(rawdata, "<!", i): # <!
 163                     k = self.parse_declaration(i)
 164                 elif _contains_at(rawdata, "<?", i): # <?
 165                     k = self.parse_pi(i)
 166                 elif _contains_at(rawdata, "<?", i): # <!
 167                     k = self.parse_declaration(i)
 168                 elif (i + 1) < n:
 169                     self.handle_data("<")
 170                     k = i + 1
 171                 else:
 172                     break
 173                 if k < 0:
 174                     if end:
 175                         self.error("EOF in middle of construct")
 176                     break
 177                 i = self.updatepos(i, k)
 178             elif rawdata[i:i+2] == "&#":
 179                 match = charref.match(rawdata, i)
 180                 if match:
 181                     name = match.group()[2:-1]
 182                     self.handle_charref(name)
 183                     k = match.end()
 184                     if rawdata[k-1] != ';':
 185                         k = k - 1
 186                     i = self.updatepos(i, k)
 187                     continue
 188                 else:
 189                     break
 190             elif rawdata[i] == '&':
 191                 match = entityref.match(rawdata, i)
 192                 if match:
 193                     name = match.group(1)
 194                     self.handle_entityref(name)
 195                     k = match.end()
 196                     if rawdata[k-1] != ';':
 197                         k = k - 1
 198                     i = self.updatepos(i, k)
 199                     continue
 200                 match = incomplete.match(rawdata, i)
 201                 if match:
 202                     # match.group() will contain at least 2 chars
 203                     rest = rawdata[i:]
 204                     if end and match.group() == rest:
 205                         self.error("EOF in middle of entity or char ref")
 206                     # incomplete
 207                     break
 208                 elif (i + 1) < n:
 209                     # not the end of the buffer, and can't be confused
 210                     # with some other construct
 211                     self.handle_data("&")
 212                     i = self.updatepos(i, i + 1)
 213                 else:
 214                     break
 215             else:
 216                 assert 0, "interesting.search() lied"
 217         # end while
 218         if end and i < n:
 219             self.handle_data(rawdata[i:n])
 220             i = self.updatepos(i, n)
 221         self.rawdata = rawdata[i:]
 222
 223     # Internal -- parse comment, return end or -1 if not terminated
 224     def parse_comment(self, i, report=1):
 225         rawdata = self.rawdata
 226         assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
 227         match = commentclose.search(rawdata, i+4)
 228         if not match:
 229             return -1
 230         if report:
 231             j = match.start()
 232             self.handle_comment(rawdata[i+4: j])
 233         j = match.end()
 234         return j
 235
 236     # Internal -- parse processing instr, return end or -1 if not terminated
 237     def parse_pi(self, i):
 238         rawdata = self.rawdata
 239         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 240         match = piclose.search(rawdata, i+2) # >
 241         if not match:
 242             return -1
 243         j = match.start()
 244         self.handle_pi(rawdata[i+2: j])
 245         j = match.end()
 246         return j
 247
 248     # Internal -- handle starttag, return end or -1 if not terminated
 249     def parse_starttag(self, i):
 250         self.__starttag_text = None
 251         endpos = self.check_for_whole_start_tag(i)
 252         if endpos < 0:
 253             return endpos
 254         rawdata = self.rawdata
 255         self.__starttag_text = rawdata[i:endpos]
 256
 257         # Now parse the data between i+1 and j into a tag and attrs
 258         attrs = []
 259         match = tagfind.match(rawdata, i+1)
 260         assert match, 'unexpected call to parse_starttag()'
 261         k = match.end()
 262         self.lasttag = tag = rawdata[i+1:k].lower()
 263
 264         while k < endpos:
 265             m = attrfind.match(rawdata, k)
 266             if not m:
 267                 break
 268             attrname, rest, attrvalue = m.group(1, 2, 3)
 269             if not rest:
 270                 attrvalue = None
 271             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 272                  attrvalue[:1] == '"' == attrvalue[-1:]:
 273                 attrvalue = attrvalue[1:-1]
 274                 attrvalue = self.unescape(attrvalue)
 275             attrs.append((attrname.lower(), attrvalue))
 276             k = m.end()
 277
 278         end = rawdata[k:endpos].strip()
 279         if end not in (">", "/>"):
 280             lineno, offset = self.getpos()
 281             if "\n" in self.__starttag_text:
 282                 lineno = lineno + self.__starttag_text.count("\n")
 283                 offset = len(self.__starttag_text) \
 284                          - self.__starttag_text.rfind("\n")
 285             else:
 286                 offset = offset + len(self.__starttag_text)
 287             self.error("junk characters in start tag: %s"
 288                        % `rawdata[k:endpos][:20]`)
 289         if end[-2:] == '/>':
 290             # XHTML-style empty tag: <span attr="value" />
 291             self.handle_startendtag(tag, attrs)
 292         else:
 293             self.handle_starttag(tag, attrs)
 294             if tag in self.CDATA_CONTENT_ELEMENTS:
 295                 self.set_cdata_mode(tag)
 296         return endpos
 297
 298     # Internal -- check to see if we have a complete starttag; return end
 299     # or -1 if incomplete.
 300     def check_for_whole_start_tag(self, i):
 301         rawdata = self.rawdata
 302         m = locatestarttagend.match(rawdata, i)
 303         if m:
 304             j = m.end()
 305             next = rawdata[j:j+1]
 306             if next == ">":
 307                 return j + 1
 308             if next == "/":
 309                 s = rawdata[j:j+2]
 310                 if s == "/>":
 311                     return j + 2
 312                 if s == "/":
 313                     # buffer boundary
 314                     return -1
 315                 # else bogus input
 316                 self.updatepos(i, j + 1)
 317                 self.error("malformed empty start tag")
 318             if next == "":
 319                 # end of input
 320                 return -1
 321             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 322                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 323                 # end of input in or before attribute value, or we have the
 324                 # '/' from a '/>' ending
 325                 return -1
 326             self.updatepos(i, j)
 327             self.error("malformed start tag")
 328         raise AssertionError("we should not get here!")
 329
 330     # Internal -- parse endtag, return end or -1 if incomplete
 331     def parse_endtag(self, i):
 332         rawdata = self.rawdata
 333         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 334         match = endendtag.search(rawdata, i+1) # >
 335         if not match:
 336             return -1
 337         j = match.end()
 338         match = endtagfind.match(rawdata, i) # </ + tag + >
 339         if not match:
 340             self.error("bad end tag: %s" % `rawdata[i:j]`)
 341         tag = match.group(1).lower()
 342         if (  self.cdata_endtag is not None
 343               and tag != self.cdata_endtag):
 344             # Should be a mismatched end tag, but we'll treat it
 345             # as text anyway, since most HTML authors aren't
 346             # interested in the finer points of syntax.
 347             self.handle_data(match.group(0))
 348         else:
 349             self.handle_endtag(tag)
 350             self.clear_cdata_mode()
 351         return j
 352
 353     # Overridable -- finish processing of start+end tag: <tag.../>
 354     def handle_startendtag(self, tag, attrs):
 355         self.handle_starttag(tag, attrs)
 356         self.handle_endtag(tag)
 357
 358     # Overridable -- handle start tag
 359     def handle_starttag(self, tag, attrs):
 360         pass
 361
 362     # Overridable -- handle end tag
 363     def handle_endtag(self, tag):
 364         pass
 365
 366     # Overridable -- handle character reference
 367     def handle_charref(self, name):
 368         pass
 369
 370     # Overridable -- handle entity reference
 371     def handle_entityref(self, name):
 372         pass
 373
 374     # Overridable -- handle data
 375     def handle_data(self, data):
 376         pass
 377
 378     # Overridable -- handle comment
 379     def handle_comment(self, data):
 380         pass
 381
 382     # Overridable -- handle declaration
 383     def handle_decl(self, decl):
 384         pass
 385
 386     # Overridable -- handle processing instruction
 387     def handle_pi(self, data):
 388         pass
 389
 390     def unknown_decl(self, data):
 391         self.error("unknown declaration: " + `data`)
 392
 393     # Internal -- helper to remove special character quoting
 394     def unescape(self, s):
 395         if '&' not in s:
 396             return s
 397         s = s.replace("&lt;", "<")
 398         s = s.replace("&gt;", ">")
 399         s = s.replace("&apos;", "'")
 400         s = s.replace("&quot;", '"')
 401         s = s.replace("&amp;", "&") # Must be last
 402         return s