1 """A parser for HTML and XHTML."""
# This file is based on sgmllib.py, but the API is slightly different.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).
11 import markupbase
12 import re
13 import string
# Regular expressions used for parsing

# Next character that needs attention in normal (PCDATA) mode.
interesting_normal = re.compile('[&<]')
# In CDATA mode only '</' (or a '<' at the very end of the buffer) matters.
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that might still become an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Complete references.  Both patterns also consume ONE trailing character
# (the terminator); callers check whether it was ';' and give it back if not.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
endtagopen = re.compile('</')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 = name, group 2 = '=value' part (optional),
# group 3 = the value, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: the pattern contains '\s', which is not a valid escape in a
# plain string literal.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class HTMLParseError(Exception):
    """Error raised by the parser whenever the input cannot be parsed.

    Instance attributes:
      msg    -- the error message (asserted to be truthy)
      lineno -- first item of *position*, or None when unknown
      offset -- second item of *position*, or None when unknown
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        # Build "<msg>[, at line N][, column M]"; the column is shown
        # one-based, i.e. as offset + 1.
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
def _contains_at(s, sub, pos):
    """Return true if the text of `s` starting at index `pos` begins with `sub`."""
    stop = pos + len(sub)
    return sub == s[pos:stop]
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose start tag switches the parser into CDATA mode.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for `message` at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (None if there
    # is none, or while a start tag is still incomplete).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    # Lower-cased tag name that ends CDATA mode, or None outside it.
    cdata_endtag = None

    def set_cdata_mode(self, endtag=None):
        """Enter CDATA mode; `endtag` is the tag name expected to end it."""
        self.cdata_endtag = endtag
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Leave CDATA mode and scan for '&' and '<' again."""
        self.cdata_endtag = None
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                elif _contains_at(rawdata, "<!--", i):  # <!--
                    k = self.parse_comment(i)
                elif _contains_at(rawdata, "<!", i):  # <!
                    k = self.parse_declaration(i)
                elif _contains_at(rawdata, "<?", i):  # <?
                    k = self.parse_pi(i)
                elif (i + 1) < n:
                    # A lone '<' that starts no known construct is data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # '<' is the last buffered character; wait for more.
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the one-char terminator.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        # The terminator was not ';' -- leave it in the
                        # buffer to be processed as ordinary input.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        # Same give-back of a non-';' terminator as above.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    rest = rawdata[i:]
                    if end and match.group() == rest:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF whatever is left over is delivered as plain data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            j = match.start()
            self.handle_comment(rawdata[i+4: j])
        j = match.end()
        return j

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '=value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Only quoted values are unquoted and unescaped.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # The error position is whatever self.error() obtains from
            # getpos(); show at most 20 characters of the junk.
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
        tag = match.group(1).lower()
        if (self.cdata_endtag is not None
                and tag != self.cdata_endtag):
            # Should be a mismatched end tag, but we'll treat it
            # as text anyway, since most HTML authors aren't
            # interested in the finer points of syntax.
            self.handle_data(match.group(0))
        else:
            self.handle_endtag(tag)
            self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report an unrecognized declaration through self.error()."""
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return `s` with the five predefined entities replaced.

        Handles &lt; &gt; &apos; &quot; and &amp; only; any other
        reference is left untouched.
        """
        if '&' not in s:
            return s
        s = s.replace("&lt;", "<")
        s = s.replace("&gt;", ">")
        s = s.replace("&apos;", "'")
        s = s.replace("&quot;", '"')
        # Must be last: otherwise "&amp;lt;" would first become "&lt;"
        # and then be unescaped a second time into "<".
        s = s.replace("&amp;", "&")
        return s