1 """A parser for HTML and XHTML."""
3 # This file is based on sgmllib.py, but the API is slightly different.
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special).
11 import markupbase
12 import re
# Regular expressions used for parsing

# What goahead() scans for next: in normal mode any '<' or '&'; in CDATA
# mode only '</' (or a lone '<' at the very end of the buffer, which might
# turn into '</' once more data arrives).
interesting_normal = re.compile('[&<]')
interesting_cdata = re.compile(r'<(/|\Z)')
# Prefix of something that could still become an entity or char reference.
incomplete = re.compile('&[a-zA-Z#]')

# Both reference patterns also consume the single character that follows
# the reference (normally ';'); callers look at that character themselves.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile('<[a-zA-Z]')
piclose = re.compile('>')
endtagopen = re.compile('</')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional '= value' part,
# group 3 the value itself (still wrapped in its quotes, if any).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[-a-zA-Z0-9./:;+*%?!&$\(\)_#=~]*))?')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile('>')
# Raw string: '\s' in a plain literal is an invalid escape sequence.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message (must be truthy)
        lineno -- line number of the error, or None when unknown
        offset -- zero-based column of the error, or None when unknown
    """

    def __init__(self, msg, position=(None, None)):
        # 'position' is a (lineno, offset) pair; both default to None.
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        # Build "<msg>[, at line N][, column M]"; the column is shown
        # one-based even though 'offset' is stored zero-based.
        text = self.msg
        line, column = self.lineno, self.offset
        if line is not None:
            text += ", at line %d" % line
        if column is not None:
            text += ", column %d" % (column + 1)
        return text
68 def _contains_at(s, sub, pos):
69 return s[pos:pos+len(sub)] == sub
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    Position tracking (updatepos(), getpos()) and parse_declaration()
    are inherited from markupbase.ParserBase.
    """

    # Start tags with these (lower-cased) names switch the parser into
    # CDATA mode, where only '</' is looked for in the following data.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.stack = []
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newlines).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for message at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag; None until a
    # complete start tag has been seen.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    # Tag name that ends CDATA mode; None when not restricted to one tag.
    cdata_endtag = None

    def set_cdata_mode(self, endtag=None):
        """Enter CDATA mode, remembering endtag as the tag that ends it."""
        self.cdata_endtag = endtag
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Leave CDATA mode and go back to scanning for '<' and '&'."""
        self.cdata_endtag = None
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if rawdata[i] == '<':
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif endtagopen.match(rawdata, i):  # </
                    k = self.parse_endtag(i)
                elif _contains_at(rawdata, "<!--", i):  # <!--
                    k = self.parse_comment(i)
                elif _contains_at(rawdata, "<!", i):  # <!
                    k = self.parse_declaration(i)
                elif _contains_at(rawdata, "<?", i):  # <?
                    k = self.parse_pi(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more.
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif rawdata[i:i+2] == "&#":
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the terminating character.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # Only a ';' terminator belongs to the reference;
                    # anything else is left in the buffer as data.
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif rawdata[i] == '&':
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if rawdata[k-1] != ';':
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    rest = rawdata[i:]
                    if end and match.group() == rest:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            # At EOF whatever is left over is reported as plain data.
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse comment, return end or -1 if not terminated
    def parse_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+4] == '<!--', 'unexpected call to parse_comment()'
        match = commentclose.search(rawdata, i+4)
        if not match:
            return -1
        if report:
            # Pass on the text between '<!--' and the closing '--'.
            self.handle_comment(rawdata[i+4:match.start()])
        return match.end()

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # Pass on the text between '<?' and the closing '>'.
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without '= value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        # Whatever follows the last attribute must be the tag terminator.
        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %s"
                       % repr(rawdata[k:endpos][:20]))
        if end[-2:] == '/>':
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                s = rawdata[j:j+2]
                if s == "/>":
                    return j + 2
                if s == "/":
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            self.error("bad end tag: %s" % repr(rawdata[i:j]))
        tag = match.group(1).lower()
        if (self.cdata_endtag is not None
                and tag != self.cdata_endtag):
            # Should be a mismatched end tag, but we'll treat it
            # as text anyway, since most HTML authors aren't
            # interested in the finer points of syntax.
            self.handle_data(match.group(0))
        else:
            self.handle_endtag(tag)
            self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report an unrecognized declaration as a parse error."""
        self.error("unknown declaration: " + repr(data))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return s with the five predefined entity references decoded.

        Handles the lt, gt, apos, quot and amp entities only; any other
        reference is left untouched.
        """
        if '&' not in s:
            return s
        # 'amp' must be last so that an escaped ampersand followed by
        # e.g. 'lt;' is decoded once, not twice.
        replacements = (("lt", "<"), ("gt", ">"), ("apos", "'"),
                        ("quot", '"'), ("amp", "&"))
        for name, char in replacements:
            s = s.replace("&%s;" % name, char)
        return s