1 """Shared support for scanning document type declarations in HTML and XHTML.
2 """
3 __docformat__ = 'restructuredtext'
5 import re
6 import string
# Matches a declaration name token plus any trailing whitespace.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matches a single- or double-quoted string literal plus trailing whitespace.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

del re


class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read ``self.rawdata`` (the input buffer) and call
    ``self.error(message)``, ``self.handle_decl(data)``,
    ``self.unknown_decl(data)`` and ``self.parse_comment(i, report=0)``;
    none of these are defined in this class, so a subclass must supply them.
    The code following each ``self.error(...)`` call assumes that ``error``
    does not return (presumably it raises) -- confirm against the subclass.

    All scanning methods return the index just past what they consumed, or
    -1 when the buffer ends before the construct is complete.
    """

    _LETTERS = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    def reset(self):
        """Reset the position bookkeeping to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    def updatepos(self, i, j):
        """Account for rawdata[i:j] in lineno/offset and return j."""
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: nlines > 0 guarantees a newline in [i, j).
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters that parse_declaration() silently skips; empty here.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    def parse_declaration(self, i):
        """Parse the "<!...>" declaration starting at rawdata[i].

        Reports the declaration text (without "<!" and ">") through
        handle_decl() for DOCTYPE declarations and unknown_decl() for
        anything else.  Returns the index just past the closing '>', or
        -1 if the declaration is incomplete in the current buffer.
        """
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        is_doctype = decltype == "doctype"
        if is_doctype:
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if is_doctype:
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c in self._LETTERS:
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if is_doctype:
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %s char in declaration" % repr(rawdata[j]))
            if j < 0:
                return j
        return -1  # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    def _parse_doctype_subset(self, i, declstartpos):
        """Scan the internal subset that starts at rawdata[i] (after '[').

        On success the returned index is that of the '>' which closes the
        enclosing declaration; -1 means the buffer ended too early.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %s)"
                               % repr(s))
                if (j + 4) > n:
                    # end of buffer; incomplete (also covers j + 2 == n)
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %s in internal subset"
                        % repr(name))
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                # _scan_name never returns the end-of-buffer index, so
                # rawdata[j] is safe here.
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    return -1
                if rawdata[j] == ">":
                    return j
                self.updatepos(declstartpos, j)
                self.error("unexpected char after internal subset")
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %s in internal subset" % repr(c))
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    def _parse_doctype_element(self, i, declstartpos):
        """Skip an <!ELEMENT ...> body starting at the element name."""
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    def _parse_doctype_attlist(self, i, declstartpos):
        """Skip an <!ATTLIST ...> body starting at the element name."""
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # Bounded by n: an empty slice is "in" every string, so an
                # unbounded membership test would never stop at the end of
                # the buffer.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if rawdata[j:] == "#":
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    def _parse_doctype_notation(self, i, declstartpos):
        """Skip a <!NOTATION ...> body starting at the notation name."""
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    def _parse_doctype_entity(self, i, declstartpos):
        """Skip an <!ENTITY ...> body, allowing a leading '%' marker."""
        rawdata = self.rawdata
        j = i
        if rawdata[i:i+1] == "%":
            # parameter entity: skip the whitespace after the '%'
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c not in string.whitespace:
                    break
                j = j + 1
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1  # incomplete
                j = m.end()
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token starting at rawdata[i].
    def _scan_name(self, i, declstartpos):
        """Return (lowercased name, index past name and trailing whitespace).

        Returns (None, -1) if the buffer ends at i or the token runs up to
        the end of the buffer (it might continue in the next chunk).  If
        no name starts at i, the position is updated and error() is called.
        """
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            # One argument, like every other error() call in this class;
            # the position has just been recorded by updatepos().
            self.error("expected name token")