1 ##############################################################################
2 #
3 # Copyright (c) 2001, 2002 Zope Corporation and Contributors.
4 # All Rights Reserved.
5 #
6 # This software is subject to the provisions of the Zope Public License,
7 # Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution.
8 # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED
9 # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
10 # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS
11 # FOR A PARTICULAR PURPOSE.
12 #
13 ##############################################################################
14 """
15 Parse HTML and compile to TALInterpreter intermediate code.
16 """
18 import sys
20 from TALGenerator import TALGenerator
21 from HTMLParser import HTMLParser, HTMLParseError
22 from TALDefs import \
23 ZOPE_METAL_NS, ZOPE_TAL_NS, ZOPE_I18N_NS, METALError, TALError, I18NError
BOOLEAN_HTML_ATTRS = [
    # List of Boolean attributes in HTML that may be given in
    # minimized form (e.g. <img ismap> rather than <img ismap="">)
    # From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
    "compact", "nowrap", "ismap", "declare", "noshade", "checked",
    "disabled", "readonly", "multiple", "selected", "noresize",
    "defer"
    ]

EMPTY_HTML_TAGS = [
    # List of HTML tags with an empty content model; these are
    # rendered in minimized form, e.g. <img />.
    # From http://www.w3.org/TR/xhtml1/#dtds
    "base", "meta", "link", "hr", "br", "param", "img", "area",
    "input", "col", "basefont", "isindex", "frame",
    ]

PARA_LEVEL_HTML_TAGS = [
    # List of HTML elements that close open paragraph-level elements
    # and are themselves paragraph-level.
    "h1", "h2", "h3", "h4", "h5", "h6", "p",
    ]

# Maps a start tag to the names of open elements that the start tag
# implicitly closes (e.g. a new <li> closes a still-open <li>).
BLOCK_CLOSING_TAG_MAP = {
    "tr": ("tr", "td", "th"),
    "td": ("td", "th"),
    "th": ("td", "th"),
    "li": ("li",),
    "dd": ("dd", "dt"),
    "dt": ("dd", "dt"),
    }

BLOCK_LEVEL_HTML_TAGS = [
    # List of HTML tags that denote larger sections than paragraphs.
    # NOTE(review): "noframe" is not an HTML element name (HTML 4 defines
    # "noframes") -- confirm whether the spelling here is intentional.
    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
    "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
    ]

# Tags whose implied end tag is emitted in front of any trailing
# whitespace instead of after it.  The keys are wrapped in list() because
# dict.keys() is only a list on Python 2; on Python 3 it is a view object
# that cannot be concatenated to a list.
TIGHTEN_IMPLICIT_CLOSE_TAGS = (PARA_LEVEL_HTML_TAGS
                               + list(BLOCK_CLOSING_TAG_MAP.keys()))
class NestingError(HTMLParseError):
    """Exception raised when elements aren't properly nested.

    The close tag that could not be matched is kept in ``endtag``.
    """

    def __init__(self, tagstack, endtag, position=(None, None)):
        """Build the error message from the open tags and the close tag.

        tagstack -- sequence of the names of the currently open tags
        endtag   -- name of the close tag that does not match
        position -- passed through unchanged to HTMLParseError
        """
        self.endtag = endtag
        if not tagstack:
            # Nothing is open at all.
            msg = 'No tags are open to match </%s>' % endtag
        elif len(tagstack) == 1:
            # Exactly one open tag: singular wording.
            msg = ('Open tag <%s> does not match close tag </%s>'
                   % (tagstack[0], endtag))
        else:
            # Several open tags: list all of them, plural wording.
            open_tags = '>, <'.join(tagstack)
            msg = ('Open tags <%s> do not match close tag </%s>'
                   % (open_tags, endtag))
        HTMLParseError.__init__(self, msg, position)
class EmptyTagError(NestingError):
    """Exception raised when empty elements have an end tag.

    The offending tag name is available as ``tag`` and, for consistency
    with NestingError, also as ``endtag``.
    """

    def __init__(self, tag, position=(None, None)):
        """Record the tag and build the error message.

        tag      -- name of the empty element that was given a close tag
        position -- passed through unchanged to HTMLParseError
        """
        self.tag = tag
        # NestingError instances always carry ``endtag``; set it here as
        # well so code that catches NestingError and reads ``endtag`` does
        # not fail with AttributeError for this subclass.
        self.endtag = tag
        msg = 'Close tag </%s> should be removed' % tag
        # NestingError.__init__ is bypassed on purpose: it takes a tag
        # stack and would produce a different message.
        HTMLParseError.__init__(self, msg, position)
class OpenTagError(NestingError):
    """Exception raised when a tag is not allowed in another tag.

    The rejected tag name is kept in ``tag``.
    """

    def __init__(self, tagstack, tag, position=(None, None)):
        """Record the tag and build the error message.

        tagstack -- sequence of open tag names; its last entry is reported
                    as the element the new tag is not allowed in
        tag      -- name of the start tag that was rejected
        position -- passed through unchanged to HTMLParseError
        """
        self.tag = tag
        enclosing = tagstack[-1]
        HTMLParseError.__init__(
            self,
            'Tag <%s> is not allowed in <%s>' % (tag, enclosing),
            position)
class HTMLTALParser(HTMLParser):
    """HTML parser that feeds a TAL code generator.

    The HTMLParser callbacks are translated into emit*() calls on the
    generator stored in ``self.gen``.  Along the way the parser tracks the
    open elements (``tagstack``), closes elements whose end tag is implied
    by HTML rules, and sorts attributes into plain, TAL, METAL and i18n
    ones according to the namespace prefixes in effect.
    """

    # External API

    def __init__(self, gen=None):
        """Create a parser.

        gen -- generator receiving the emit*() calls; when omitted a
               ``TALGenerator(xml=0)`` is created.
        """
        HTMLParser.__init__(self)
        if gen is None:
            gen = TALGenerator(xml=0)
        self.gen = gen
        # Names of the currently open elements, outermost first.
        self.tagstack = []
        # Namespace dictionaries saved by scan_xmlns() and restored by
        # pop_xmlns().
        self.nsstack = []
        # Prefix -> namespace URI mapping currently in effect.
        self.nsdict = {'tal': ZOPE_TAL_NS,
                       'metal': ZOPE_METAL_NS,
                       'i18n': ZOPE_I18N_NS,
                       }

    def parseFile(self, file):
        """Read the file named `file` and parse its contents.

        A TALError raised while parsing gets the file name attached through
        its setFile() method and is then re-raised.
        """
        f = open(file)
        try:
            # try/finally guarantees the handle is closed even when
            # read() fails.
            data = f.read()
        finally:
            f.close()
        try:
            self.parseString(data)
        except TALError:
            # The exception is fetched through sys.exc_info() rather than
            # bound in the except clause so that the syntax is valid on
            # both Python 2 and Python 3.
            sys.exc_info()[1].setFile(file)
            raise

    def parseString(self, data):
        """Parse the complete document contained in the string `data`."""
        self.feed(data)
        self.close()
        # Whatever is still open at the end of the input is closed with an
        # implied end tag (implied=2).
        while self.tagstack:
            self.implied_endtag(self.tagstack[-1], 2)
        assert self.nsstack == [], self.nsstack

    def getCode(self):
        """Return the result of the generator's getCode()."""
        return self.gen.getCode()

    def getWarnings(self):
        """Return the warnings collected while parsing (always empty)."""
        return ()

    # Overriding HTMLParser methods

    def handle_starttag(self, tag, attrs):
        """Handle a start tag: close implied elements and emit the start.

        Raises TALError when an empty HTML tag carries tal:content.
        """
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict, i18ndict \
            = self.process_ns(tag, attrs)
        if tag in EMPTY_HTML_TAGS and taldict.get("content"):
            raise TALError(
                "empty HTML tags cannot use tal:content: %s" % repr(tag),
                self.getpos())
        self.tagstack.append(tag)
        self.gen.emitStartElement(tag, attrlist, taldict, metaldict, i18ndict,
                                  self.getpos())
        if tag in EMPTY_HTML_TAGS:
            # Empty elements never get an explicit end tag; close them
            # immediately.
            self.implied_endtag(tag, -1)

    def handle_startendtag(self, tag, attrs):
        """Handle a minimized tag such as <foo/>.

        Raises TALError when an empty HTML tag carries tal:content.
        """
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict, i18ndict \
            = self.process_ns(tag, attrs)
        if taldict.get("content"):
            if tag in EMPTY_HTML_TAGS:
                raise TALError(
                    "empty HTML tags cannot use tal:content: %s" % repr(tag),
                    self.getpos())
            # With tal:content the element is emitted as a separate start
            # and end element instead of a single minimized one.
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      i18ndict, self.getpos())
            self.gen.emitEndElement(tag, implied=-1)
        else:
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      i18ndict, self.getpos(), isend=1)
        # The tag was never pushed on tagstack, so only the namespace
        # scope opened by scan_xmlns() has to be undone.
        self.pop_xmlns()

    def handle_endtag(self, tag):
        """Handle an explicit end tag.

        Raises EmptyTagError for end tags of empty HTML elements and
        NestingError (via close_enclosed_tags) when `tag` is not open.
        """
        if tag in EMPTY_HTML_TAGS:
            # </img> etc. in the source is an error
            raise EmptyTagError(tag, self.getpos())
        self.close_enclosed_tags(tag)
        self.gen.emitEndElement(tag)
        self.pop_xmlns()
        self.tagstack.pop()

    def close_para_tags(self, tag):
        """Close open elements whose end is implied by start tag `tag`.

        Raises OpenTagError when `tag` is paragraph- or block-level and
        would have to close an open paragraph-level element other than
        <p>.
        """
        if tag in EMPTY_HTML_TAGS:
            return
        # Index into tagstack from which everything is closed; -1 means
        # nothing is closed.
        close_to = -1
        if tag in BLOCK_CLOSING_TAG_MAP:
            blocks_to_close = BLOCK_CLOSING_TAG_MAP[tag]
            # Walk from the outermost element inwards.  The first closable
            # element seen is remembered; any other block-level element
            # further in discards that candidate again.
            for i, t in enumerate(self.tagstack):
                if t in blocks_to_close:
                    if close_to == -1:
                        close_to = i
                elif t in BLOCK_LEVEL_HTML_TAGS:
                    close_to = -1
        elif tag in PARA_LEVEL_HTML_TAGS or tag in BLOCK_LEVEL_HTML_TAGS:
            # Walk from the innermost element outwards until a block-level
            # element is reached.
            i = len(self.tagstack) - 1
            while i >= 0:
                closetag = self.tagstack[i]
                if closetag in BLOCK_LEVEL_HTML_TAGS:
                    break
                if closetag in PARA_LEVEL_HTML_TAGS:
                    if closetag != "p":
                        raise OpenTagError(self.tagstack, tag, self.getpos())
                    close_to = i
                i = i - 1
        if close_to >= 0:
            while len(self.tagstack) > close_to:
                self.implied_endtag(self.tagstack[-1], 1)

    def close_enclosed_tags(self, tag):
        """Close every element opened inside the innermost open `tag`.

        Raises NestingError when `tag` is not open at all.
        """
        if tag not in self.tagstack:
            raise NestingError(self.tagstack, tag, self.getpos())
        while tag != self.tagstack[-1]:
            self.implied_endtag(self.tagstack[-1], 1)
        assert self.tagstack[-1] == tag

    def implied_endtag(self, tag, implied):
        """Emit an end tag that does not appear in the source.

        tag     -- must be the innermost open element
        implied -- -1, 1 or 2; passed on to the generator.  In this class
                   -1 is used for empty elements, 1 for elements closed by
                   another tag and 2 for elements closed at end of input.
        """
        assert tag == self.tagstack[-1]
        assert implied in (-1, 1, 2)
        isend = (implied < 0)
        if tag in TIGHTEN_IMPLICIT_CLOSE_TAGS:
            # Pick out trailing whitespace from the program, and
            # insert the close tag before the whitespace.
            white = self.gen.unEmitWhitespace()
        else:
            white = None
        self.gen.emitEndElement(tag, isend=isend, implied=implied)
        if white:
            self.gen.emitRawText(white)
        self.tagstack.pop()
        self.pop_xmlns()

    def handle_charref(self, name):
        """Emit a character reference unchanged."""
        self.gen.emitRawText("&#%s;" % name)

    def handle_entityref(self, name):
        """Emit an entity reference unchanged."""
        self.gen.emitRawText("&%s;" % name)

    def handle_data(self, data):
        """Emit character data unchanged."""
        self.gen.emitRawText(data)

    def handle_comment(self, data):
        """Emit a comment, restoring its delimiters."""
        self.gen.emitRawText("<!--%s-->" % data)

    def handle_decl(self, data):
        """Emit a declaration, restoring its delimiters."""
        self.gen.emitRawText("<!%s>" % data)

    def handle_pi(self, data):
        """Emit a processing instruction, restoring its delimiters."""
        self.gen.emitRawText("<?%s>" % data)

    # Internal thingies

    def scan_xmlns(self, attrs):
        """Open a namespace scope for an element with attributes `attrs`.

        The current mapping is always pushed on nsstack, so that every call
        is balanced by exactly one pop_xmlns().  Only when the element
        declares prefixes is a modified copy installed as current mapping.
        """
        nsnew = {}
        for key, value in attrs:
            if key.startswith("xmlns:"):
                nsnew[key[6:]] = value
        self.nsstack.append(self.nsdict)
        if nsnew:
            # Copy before updating: the pushed dictionary must stay
            # untouched so pop_xmlns() can restore it.
            self.nsdict = self.nsdict.copy()
            self.nsdict.update(nsnew)

    def pop_xmlns(self):
        """Restore the namespace mapping saved by the last scan_xmlns()."""
        self.nsdict = self.nsstack.pop()

    def fixname(self, name):
        """Classify a tag or attribute name by its namespace prefix.

        Returns a tuple (name, basename, ns):
          * prefix bound to the TAL, METAL or i18n namespace:
            (name, part after the colon, 'tal' / 'metal' / 'i18n')
          * 'xmlns:<prefix>' where <prefix> is bound to one of those
            namespaces: (name, name, 'xmlns')
          * anything else: (name, name, 0)
        """
        if ':' in name:
            prefix, suffix = name.split(':', 1)
            if prefix == 'xmlns':
                nsuri = self.nsdict.get(suffix)
                if nsuri in (ZOPE_TAL_NS, ZOPE_METAL_NS, ZOPE_I18N_NS):
                    return name, name, prefix
            else:
                nsuri = self.nsdict.get(prefix)
                if nsuri == ZOPE_TAL_NS:
                    return name, suffix, 'tal'
                elif nsuri == ZOPE_METAL_NS:
                    return name, suffix, 'metal'
                elif nsuri == ZOPE_I18N_NS:
                    return name, suffix, 'i18n'
        return name, name, 0

    def process_ns(self, name, attrs):
        """Sort the attributes of an element by namespace.

        Returns (name, attrlist, taldict, metaldict, i18ndict).  attrlist
        holds every attribute: as the original item when it has no
        namespace, otherwise as (key, value, ns).  The three dictionaries
        map the un-prefixed attribute name to its value.  When the tag
        itself is in the TAL or METAL namespace, taldict['tal tag'] is set
        to that namespace.

        Raises TALError, METALError or I18NError when an attribute occurs
        twice in the respective namespace.
        """
        attrlist = []
        taldict = {}
        metaldict = {}
        i18ndict = {}
        # namespace -> (target dictionary, error class, message prefix)
        special = {
            'tal': (taldict, TALError, "duplicate TAL attribute "),
            'metal': (metaldict, METALError, "duplicate METAL attribute "),
            'i18n': (i18ndict, I18NError, "duplicate i18n attribute "),
            }
        name, namebase, namens = self.fixname(name)
        for item in attrs:
            key, value = item
            key, keybase, keyns = self.fixname(key)
            ns = keyns or namens  # default to tag namespace
            if ns and ns != 'unknown':
                item = (key, value, ns)
            if ns in special:
                target, error, prefix = special[ns]
                if keybase in target:
                    raise error(prefix + repr(keybase), self.getpos())
                target[keybase] = value
            attrlist.append(item)
        if namens in ('metal', 'tal'):
            taldict['tal tag'] = namens
        return name, attrlist, taldict, metaldict, i18ndict