TAL/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13 import string
  14
  15 # Regular expressions used for parsing
  16
  17 interesting_normal = re.compile('[&<]')
  18 interesting_cdata = re.compile(r'<(/|\Z)')
  19 incomplete = re.compile('&[a-zA-Z#]')
  20
  21 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  22 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  23
  24 starttagopen = re.compile('<[a-zA-Z]')
  25 piclose = re.compile('>')
  26 endtagopen = re.compile('</')
  27 commentclose = re.compile(r'--\s*>')
  28 tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
  29 attrfind = re.compile(
  30     r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
  31     r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')
  32
  33 locatestarttagend = re.compile(r"""
  34   <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  35   (?:\s+                             # whitespace before attribute name
  36     (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
  37       (?:\s*=\s*                     # value indicator
  38         (?:'[^']*'                   # LITA-enclosed value
  39           |\"[^\"]*\"                # LIT-enclosed value
  40           |[^'\">\s]+                # bare value
  41          )
  42        )?
  43      )
  44    )*
  45   \s*                                # trailing whitespace
  46 """, re.VERBOSE)
  47 endendtag = re.compile('>')
  48 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  49
  50
  51 class HTMLParseError(Exception):
  52     """Exception raised for all parse errors."""
  53
  54     def __init__(self, msg, position=(None, None)):
  55         assert msg
  56         self.msg = msg
  57         self.lineno = position[0]
  58         self.offset = position[1]
  59
  60     def __str__(self):
  61         result = self.msg
  62         if self.lineno is not None:
  63             result = result + ", at line %d" % self.lineno
  64         if self.offset is not None:
  65             result = result + ", column %d" % (self.offset + 1)
  66         return result
  67
  68
  69 def _contains_at(s, sub, pos):
  70     return s[pos:pos+len(sub)] == sub
  71
  72
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Lower-case names of the elements whose start tag makes
    # parse_starttag() switch the parser into CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")
  94
  95
    def __init__(self):
        """Initialize and reset this instance.

        All parser state is created by reset(), so a new instance starts
        out exactly like one that has just been reset.
        """
        self.reset()
  99
 100     def reset(self):
 101         """Reset this instance.  Loses all unprocessed data."""
 102         self.rawdata = ''
 103         self.stack = []
 104         self.lasttag = '???'
 105         self.interesting = interesting_normal
 106         markupbase.ParserBase.reset(self)
 107
 108     def feed(self, data):
 109         """Feed data to the parser.
 110
 111         Call this as often as you want, with as little or as much text
 112         as you want (may include '\n').
 113         """
 114         self.rawdata = self.rawdata + data
 115         self.goahead(0)
 116
    def close(self):
        """Handle any buffered data.

        Runs goahead() with its 'end' flag set, so the buffered input is
        processed as if it were followed by end of file.
        """
        self.goahead(1)
 120
    def error(self, message):
        """Raise HTMLParseError for message at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())
 123
    # Source text of the start tag most recently handled by
    # parse_starttag(); None until a complete start tag has been seen, and
    # reset to None whenever parse_starttag() begins a new attempt.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'.

        The value is None when no complete start tag has been parsed by
        the latest parse_starttag() call.
        """
        return self.__starttag_text
 129
    # Tag name given to set_cdata_mode(); parse_endtag() compares end
    # tags against it.  None whenever CDATA mode is off.
    cdata_endtag = None

    def set_cdata_mode(self, endtag=None):
        """Switch to CDATA scanning and remember endtag in cdata_endtag.

        From now on goahead() searches with interesting_cdata instead of
        interesting_normal.
        """
        self.cdata_endtag = endtag
        self.interesting = interesting_cdata
 134         self.interesting = interesting_cdata
 135
    def clear_cdata_mode(self):
        """Leave CDATA mode: forget cdata_endtag and scan normally again."""
        self.cdata_endtag = None
        self.interesting = interesting_normal
 139
 140     # Internal -- handle data as far as reasonable.  May leave state
 141     # and data to be processed by a subsequent call.  If 'end' is
 142     # true, force handling all data as if followed by EOF marker.
 143     def goahead(self, end):
 144         rawdata = self.rawdata
 145         i = 0
 146         n = len(rawdata)
 147         while i < n:
 148             match = self.interesting.search(rawdata, i) # < or &
 149             if match:
 150                 j = match.start()
 151             else:
 152                 j = n
 153             if i < j: self.handle_data(rawdata[i:j])
 154             i = self.updatepos(i, j)
 155             if i == n: break
 156             if rawdata[i] == '<':
 157                 if starttagopen.match(rawdata, i): # < + letter
 158                     k = self.parse_starttag(i)
 159                 elif endtagopen.match(rawdata, i): # </
 160                     k = self.parse_endtag(i)
 161                 elif _contains_at(rawdata, "<!--", i): # <!--
 162                     k = self.parse_comment(i)
 163                 elif _contains_at(rawdata, "<!", i): # <!
 164                     k = self.parse_declaration(i)
 165                 elif _contains_at(rawdata, "<?", i): # <?
 166                     k = self.parse_pi(i)
 167                 elif _contains_at(rawdata, "<?", i): # <!
 168                     k = self.parse_declaration(i)
 169                 elif (i + 1) < n:
 170                     self.handle_data("<")
 171                     k = i + 1
 172                 else:
 173                     break
 174                 if k < 0:
 175                     if end:
 176                         self.error("EOF in middle of construct")
 177                     break
 178                 i = self.updatepos(i, k)
 179             elif rawdata[i:i+2] == "&#":
 180                 match = charref.match(rawdata, i)
 181                 if match:
 182                     name = match.group()[2:-1]
 183                     self.handle_charref(name)
 184                     k = match.end()
 185                     if rawdata[k-1] != ';':
 186                         k = k - 1
 187                     i = self.updatepos(i, k)
 188                     continue
 189                 else:
 190                     break
 191             elif rawdata[i] == '&':
 192                 match = entityref.match(rawdata, i)
 193                 if match:
 194                     name = match.group(1)
 195                     self.handle_entityref(name)
 196                     k = match.end()
 197                     if rawdata[k-1] != ';':
 198                         k = k - 1
 199                     i = self.updatepos(i, k)
 200                     continue
 201                 match = incomplete.match(rawdata, i)
 202                 if match:
 203                     # match.group() will contain at least 2 chars
 204                     rest = rawdata[i:]
 205                     if end and match.group() == rest:
 206                         self.error("EOF in middle of entity or char ref")
 207                     # incomplete
 208                     break
 209                 elif (i + 1) < n:
 210                     # not the end of the buffer, and can't be confused
 211                     # with some other construct
 212                     self.handle_data("&")
 213                     i = self.updatepos(i, i + 1)
 214                 else:
 215                     break
 216             else:
 217                 assert 0, "interesting.search() lied"
 218         # end while
 219         if end and i < n:
 220             self.handle_data(rawdata[i:n])
 221             i = self.updatepos(i, n)
 222         self.rawdata = rawdata[i:]
 223
 224     # Internal -- parse comment, return end or -1 if not terminated
 225     def parse_comment(self, i, report=1):
 226         rawdata = self.rawdata
 227         assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
 228         match = commentclose.search(rawdata, i+4)
 229         if not match:
 230             return -1
 231         if report:
 232             j = match.start()
 233             self.handle_comment(rawdata[i+4: j])
 234         j = match.end()
 235         return j
 236
 237     # Internal -- parse processing instr, return end or -1 if not terminated
 238     def parse_pi(self, i):
 239         rawdata = self.rawdata
 240         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 241         match = piclose.search(rawdata, i+2) # >
 242         if not match:
 243             return -1
 244         j = match.start()
 245         self.handle_pi(rawdata[i+2: j])
 246         j = match.end()
 247         return j
 248
 249     # Internal -- handle starttag, return end or -1 if not terminated
 250     def parse_starttag(self, i):
 251         self.__starttag_text = None
 252         endpos = self.check_for_whole_start_tag(i)
 253         if endpos < 0:
 254             return endpos
 255         rawdata = self.rawdata
 256         self.__starttag_text = rawdata[i:endpos]
 257
 258         # Now parse the data between i+1 and j into a tag and attrs
 259         attrs = []
 260         match = tagfind.match(rawdata, i+1)
 261         assert match, 'unexpected call to parse_starttag()'
 262         k = match.end()
 263         self.lasttag = tag = string.lower(rawdata[i+1:k])
 264
 265         while k < endpos:
 266             m = attrfind.match(rawdata, k)
 267             if not m:
 268                 break
 269             attrname, rest, attrvalue = m.group(1, 2, 3)
 270             if not rest:
 271                 attrvalue = None
 272             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 273                  attrvalue[:1] == '"' == attrvalue[-1:]:
 274                 attrvalue = attrvalue[1:-1]
 275                 attrvalue = self.unescape(attrvalue)
 276             attrs.append((string.lower(attrname), attrvalue))
 277             k = m.end()
 278
 279         end = string.strip(rawdata[k:endpos])
 280         if end not in (">", "/>"):
 281             lineno, offset = self.getpos()
 282             if "\n" in self.__starttag_text:
 283                 lineno = lineno + string.count(self.__starttag_text, "\n")
 284                 offset = len(self.__starttag_text) \
 285                          - string.rfind(self.__starttag_text, "\n")
 286             else:
 287                 offset = offset + len(self.__starttag_text)
 288             self.error("junk characters in start tag: %s"
 289                        % `rawdata[k:endpos][:20]`)
 290         if end[-2:] == '/>':
 291             # XHTML-style empty tag: <span attr="value" />
 292             self.handle_startendtag(tag, attrs)
 293         else:
 294             self.handle_starttag(tag, attrs)
 295             if tag in self.CDATA_CONTENT_ELEMENTS:
 296                 self.set_cdata_mode(tag)
 297         return endpos
 298
 299     # Internal -- check to see if we have a complete starttag; return end
 300     # or -1 if incomplete.
 301     def check_for_whole_start_tag(self, i):
 302         rawdata = self.rawdata
 303         m = locatestarttagend.match(rawdata, i)
 304         if m:
 305             j = m.end()
 306             next = rawdata[j:j+1]
 307             if next == ">":
 308                 return j + 1
 309             if next == "/":
 310                 s = rawdata[j:j+2]
 311                 if s == "/>":
 312                     return j + 2
 313                 if s == "/":
 314                     # buffer boundary
 315                     return -1
 316                 # else bogus input
 317                 self.updatepos(i, j + 1)
 318                 self.error("malformed empty start tag")
 319             if next == "":
 320                 # end of input
 321                 return -1
 322             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 323                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 324                 # end of input in or before attribute value, or we have the
 325                 # '/' from a '/>' ending
 326                 return -1
 327             self.updatepos(i, j)
 328             self.error("malformed start tag")
 329         raise AssertionError("we should not get here!")
 330
 331     # Internal -- parse endtag, return end or -1 if incomplete
 332     def parse_endtag(self, i):
 333         rawdata = self.rawdata
 334         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 335         match = endendtag.search(rawdata, i+1) # >
 336         if not match:
 337             return -1
 338         j = match.end()
 339         match = endtagfind.match(rawdata, i) # </ + tag + >
 340         if not match:
 341             self.error("bad end tag: %s" % `rawdata[i:j]`)
 342         tag = string.lower(match.group(1))
 343         if (  self.cdata_endtag is not None
 344               and tag != self.cdata_endtag):
 345             # Should be a mismatched end tag, but we'll treat it
 346             # as text anyway, since most HTML authors aren't
 347             # interested in the finer points of syntax.
 348             self.handle_data(match.group(0))
 349         else:
 350             self.handle_endtag(tag)
 351             self.clear_cdata_mode()
 352         return j
 353
 354     # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle an XHTML-style empty tag such as '<tag ... />'.

        The default implementation treats it as a start tag immediately
        followed by the matching end tag.
        """
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)
 358
 359     # Overridable -- handle start tag
 360     def handle_starttag(self, tag, attrs):
 361         pass
 362
 363     # Overridable -- handle end tag
 364     def handle_endtag(self, tag):
 365         pass
 366
 367     # Overridable -- handle character reference
 368     def handle_charref(self, name):
 369         pass
 370
 371     # Overridable -- handle entity reference
 372     def handle_entityref(self, name):
 373         pass
 374
 375     # Overridable -- handle data
 376     def handle_data(self, data):
 377         pass
 378
 379     # Overridable -- handle comment
 380     def handle_comment(self, data):
 381         pass
 382
 383     # Overridable -- handle declaration
 384     def handle_decl(self, decl):
 385         pass
 386
 387     # Overridable -- handle processing instruction
 388     def handle_pi(self, data):
 389         pass
 390
 391     def unknown_decl(self, data):
 392         self.error("unknown declaration: " + `data`)
 393
 394     # Internal -- helper to remove special character quoting
 395     def unescape(self, s):
 396         if '&' not in s:
 397             return s
 398         s = string.replace(s, "&lt;", "<")
 399         s = string.replace(s, "&gt;", ">")
 400         s = string.replace(s, "&apos;", "'")
 401         s = string.replace(s, "&quot;", '"')
 402         s = string.replace(s, "&amp;", "&") # Must be last
 403         return s