Code

documentation cleanup
[roundup.git] / roundup / cgi / TAL / markupbase.py
1 """Shared support for scanning document type declarations in HTML and XHTML.
2 """
3 __docformat__ = 'restructuredtext'
5 import re
6 import string
# Matcher for a declaration name token: an ASCII letter followed by any
# run of letters, digits, '-', '_' or '.', plus any whitespace after it.
_declname_match = re.compile(r'[a-zA-Z][-_.a-zA-Z0-9]*\s*').match
# Matcher for a string literal in single or double quotes (the quote
# character cannot appear inside it), plus any whitespace after it.
_declstringlit_match = re.compile(r'(\'[^\']*\'|"[^"]*")\s*').match

# The two bound match methods above keep working after this; the module
# name itself is simply removed from this namespace.
del re
class ParserBase:
    """Parser base class which provides some common support methods used
    by the SGML/HTML and XHTML parsers.

    The methods here read the input buffer from ``self.rawdata`` and keep
    the current position in ``self.lineno`` / ``self.offset``.  They call
    ``error()``, ``handle_decl()``, ``unknown_decl()`` and
    ``parse_comment()``, none of which is defined in this class; a
    subclass is expected to supply them.

    Scanning methods take an index into ``rawdata`` and return the index
    at which scanning should resume, or a negative number when the buffer
    ends before the construct is complete (so the caller can retry once
    more data is available).

    NOTE(review): the code that follows each ``error()`` call appears to
    assume that ``error()`` does not return (presumably it raises) --
    confirm against the subclasses.
    """

    def reset(self):
        """Reset position tracking to line 1, offset 0."""
        self.lineno = 1
        self.offset = 0

    def getpos(self):
        """Return current line number and offset."""
        return self.lineno, self.offset

    # Internal -- update line number and offset.  This should be
    # called for each piece of data exactly once, in order -- in other
    # words the concatenation of all the input strings to this
    # function should be exactly the entire input.
    #
    # Accounts for rawdata[i:j] and returns j.
    def updatepos(self, i, j):
        if i >= j:
            return j
        rawdata = self.rawdata
        nlines = rawdata.count("\n", i, j)
        if nlines:
            self.lineno = self.lineno + nlines
            # Cannot fail: nlines > 0 means there is a newline in [i, j).
            pos = rawdata.rindex("\n", i, j)
            self.offset = j - (pos + 1)
        else:
            self.offset = self.offset + j - i
        return j

    # Extra characters that are skipped silently inside a declaration.
    # Empty here; parse_declaration() resets it to '' on the instance
    # whenever it sees a doctype declaration.
    _decl_otherchars = ''

    # Internal -- parse declaration (for use by subclasses).
    #
    # `i` must be the index of the "<!" that opens the declaration.
    # Returns the index just past the closing '>', or a negative number
    # if the declaration is incomplete (or is the start of a comment).
    def parse_declaration(self, i):
        # This is some sort of declaration; in "HTML as
        # deployed," this should only be the document type
        # declaration ("<!DOCTYPE html...>").
        rawdata = self.rawdata
        j = i + 2
        assert rawdata[i:j] == "<!", "unexpected call to parse_declaration"
        if rawdata[j:j+1] in ("-", ""):
            # Start of comment followed by buffer boundary,
            # or just a buffer boundary.
            return -1
        # in practice, this should look like: ((name|stringlit) S*)+ '>'
        n = len(rawdata)
        decltype, j = self._scan_name(j, i)
        if j < 0:
            return j
        if decltype == "doctype":
            self._decl_otherchars = ''
        while j < n:
            c = rawdata[j]
            if c == ">":
                # end of declaration syntax
                data = rawdata[i+2:j]
                if decltype == "doctype":
                    self.handle_decl(data)
                else:
                    self.unknown_decl(data)
                return j + 1
            if c in "\"'":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1 # incomplete
                j = m.end()
            elif c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
                name, j = self._scan_name(j, i)
            elif c in self._decl_otherchars:
                j = j + 1
            elif c == "[":
                if decltype == "doctype":
                    j = self._parse_doctype_subset(j + 1, i)
                else:
                    self.error("unexpected '[' char in declaration")
            else:
                self.error(
                    "unexpected %s char in declaration" % repr(rawdata[j]))
            if j < 0:
                return j
        return -1 # incomplete

    # Internal -- scan past the internal subset in a <!DOCTYPE declaration,
    # returning the index just past any whitespace following the trailing ']'.
    #
    # `i` is the index just past the opening '['; `declstartpos` is the
    # start of the enclosing declaration and is only used to update the
    # position before reporting an error.
    def _parse_doctype_subset(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        j = i
        while j < n:
            c = rawdata[j]
            if c == "<":
                s = rawdata[j:j+2]
                if s == "<":
                    # end of buffer; incomplete
                    return -1
                if s != "<!":
                    self.updatepos(declstartpos, j + 1)
                    self.error("unexpected char in internal subset (in %s)"
                               % repr(s))
                if (j + 4) > n:
                    # Fewer than four characters left, so we cannot yet
                    # tell "<!--" from a markup declaration; incomplete.
                    return -1
                if rawdata[j:j+4] == "<!--":
                    j = self.parse_comment(j, report=0)
                    if j < 0:
                        return j
                    continue
                name, j = self._scan_name(j + 2, declstartpos)
                if j == -1:
                    return -1
                if name not in ("attlist", "element", "entity", "notation"):
                    self.updatepos(declstartpos, j + 2)
                    self.error(
                        "unknown declaration %s in internal subset"
                        % repr(name))
                # handle the individual names
                meth = getattr(self, "_parse_doctype_" + name)
                j = meth(j, declstartpos)
                if j < 0:
                    return j
            elif c == "%":
                # parameter entity reference
                if (j + 1) == n:
                    # end of buffer; incomplete
                    return -1
                s, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                if rawdata[j] == ";":
                    j = j + 1
            elif c == "]":
                j = j + 1
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j < n:
                    if rawdata[j] == ">":
                        # Leave the '>' for the caller to consume.
                        return j
                    self.updatepos(declstartpos, j)
                    self.error("unexpected char after internal subset")
                else:
                    return -1
            elif c in string.whitespace:
                j = j + 1
            else:
                self.updatepos(declstartpos, j)
                self.error("unexpected char %s in internal subset" % repr(c))
        # end of buffer reached
        return -1

    # Internal -- scan past <!ELEMENT declarations
    #
    # `i` is the index of the element name.  Returns the index just past
    # the closing '>', or -1 if it is not in the buffer yet.
    def _parse_doctype_element(self, i, declstartpos):
        rawdata = self.rawdata
        name, j = self._scan_name(i, declstartpos)
        if j == -1:
            return -1
        # style content model; just skip until '>'
        end = rawdata.find(">", j)
        if end < 0:
            return -1
        return end + 1

    # Internal -- scan past <!ATTLIST declarations
    #
    # `i` is the index of the element name.  Returns the index just past
    # the closing '>', or a negative number if the declaration is
    # incomplete.
    def _parse_doctype_attlist(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        c = rawdata[j:j+1]
        if c == "":
            return -1
        if c == ">":
            return j + 1
        while 1:
            # scan a series of attribute descriptions; simplified:
            #   name type [value] [#constraint]
            name, j = self._scan_name(j, declstartpos)
            if j < 0:
                return j
            c = rawdata[j:j+1]
            if c == "":
                return -1
            if c == "(":
                # an enumerated type; look for ')'
                close = rawdata.find(")", j)
                if close < 0:
                    return -1
                j = close + 1
                # Bound the scan by the buffer length: an empty slice at
                # the end of the buffer tests as "in string.whitespace",
                # which would otherwise keep this loop running forever.
                while j < n and rawdata[j] in string.whitespace:
                    j = j + 1
                if j >= n:
                    # end of buffer, incomplete
                    return -1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == "#":
                if (j + 1) == n:
                    # end of buffer
                    return -1
                name, j = self._scan_name(j + 1, declstartpos)
                if j < 0:
                    return j
                c = rawdata[j:j+1]
                if not c:
                    return -1
            if c == '>':
                # all done
                return j + 1

    # Internal -- scan past <!NOTATION declarations
    #
    # `i` is the index of the notation name.  Returns the index just past
    # the closing '>', or a negative number if the declaration is
    # incomplete.
    def _parse_doctype_notation(self, i, declstartpos):
        name, j = self._scan_name(i, declstartpos)
        if j < 0:
            return j
        rawdata = self.rawdata
        while 1:
            c = rawdata[j:j+1]
            if not c:
                # end of buffer; incomplete
                return -1
            if c == '>':
                return j + 1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if not m:
                    return -1
                j = m.end()
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan past <!ENTITY declarations
    #
    # `i` is the index of the entity name, or of the '%' that marks a
    # parameter entity.  Returns the index just past the closing '>', or a
    # negative number if the declaration is incomplete.
    def _parse_doctype_entity(self, i, declstartpos):
        rawdata = self.rawdata
        if rawdata[i:i+1] == "%":
            # Skip the '%' and the whitespace between it and the name.
            j = i + 1
            while 1:
                c = rawdata[j:j+1]
                if not c:
                    return -1
                if c in string.whitespace:
                    j = j + 1
                else:
                    break
        else:
            j = i
        name, j = self._scan_name(j, declstartpos)
        if j < 0:
            return j
        while 1:
            c = rawdata[j:j+1]
            if not c:
                return -1
            if c in "'\"":
                m = _declstringlit_match(rawdata, j)
                if m:
                    j = m.end()
                else:
                    return -1    # incomplete
            elif c == ">":
                return j + 1
            else:
                name, j = self._scan_name(j, declstartpos)
                if j < 0:
                    return j

    # Internal -- scan a name token starting at index `i`.  Returns a
    # (name, index) pair: the lower-cased name and the index just past the
    # name and any whitespace following it, or (None, -1) if the buffer
    # ends before we can be sure the token is complete.
    def _scan_name(self, i, declstartpos):
        rawdata = self.rawdata
        n = len(rawdata)
        if i == n:
            return None, -1
        m = _declname_match(rawdata, i)
        if m:
            s = m.group()
            name = s.strip()
            if (i + len(s)) == n:
                # The match runs to the end of the buffer, so more of
                # the token may still arrive.
                return None, -1  # end of buffer
            return name.lower(), m.end()
        else:
            self.updatepos(declstartpos, i)
            # NOTE(review): this is the only call in the class that passes
            # a second argument to error(); every other call site passes
            # just the message.  Confirm error()'s signature accepts it.
            self.error("expected name token", self.getpos())