Commit | Line | Data |
---|---|---|
c638d827 CR |
1 | ############################################################################## |
2 | # | |
3 | # Copyright (c) 2001, 2002 Zope Corporation and Contributors. | |
4 | # All Rights Reserved. | |
5 | # | |
6 | # This software is subject to the provisions of the Zope Public License, | |
7 | # Version 2.0 (ZPL). A copy of the ZPL should accompany this distribution. | |
8 | # THIS SOFTWARE IS PROVIDED "AS IS" AND ANY AND ALL EXPRESS OR IMPLIED | |
9 | # WARRANTIES ARE DISCLAIMED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
10 | # WARRANTIES OF TITLE, MERCHANTABILITY, AGAINST INFRINGEMENT, AND FITNESS | |
11 | # FOR A PARTICULAR PURPOSE. | |
12 | # | |
13 | ############################################################################## | |
14 | """ | |
15 | Parse HTML and compile to TALInterpreter intermediate code. | |
16 | """ | |
17 | ||
18 | import sys | |
19 | ||
20 | from TALGenerator import TALGenerator | |
21 | from HTMLParser import HTMLParser, HTMLParseError | |
22 | from TALDefs import \ | |
23 | ZOPE_METAL_NS, ZOPE_TAL_NS, ZOPE_I18N_NS, METALError, TALError, I18NError | |
24 | ||
BOOLEAN_HTML_ATTRS = [
    # List of Boolean attributes in HTML that may be given in
    # minimized form (e.g. <img ismap> rather than <img ismap="">)
    # From http://www.w3.org/TR/xhtml1/#guidelines (C.10)
    "compact", "nowrap", "ismap", "declare", "noshade", "checked",
    "disabled", "readonly", "multiple", "selected", "noresize",
    "defer",
    ]

EMPTY_HTML_TAGS = [
    # List of HTML tags with an empty content model; these are
    # rendered in minimized form, e.g. <img />.
    # From http://www.w3.org/TR/xhtml1/#dtds
    "base", "meta", "link", "hr", "br", "param", "img", "area",
    "input", "col", "basefont", "isindex", "frame",
    ]

PARA_LEVEL_HTML_TAGS = [
    # List of HTML elements that close open paragraph-level elements
    # and are themselves paragraph-level.
    "h1", "h2", "h3", "h4", "h5", "h6", "p",
    ]

# Maps a tag being opened to the still-open tags that it implicitly
# closes (e.g. a new <td> closes a preceding open <td> or <th>).
BLOCK_CLOSING_TAG_MAP = {
    "tr": ("tr", "td", "th"),
    "td": ("td", "th"),
    "th": ("td", "th"),
    "li": ("li",),
    "dd": ("dd", "dt"),
    "dt": ("dd", "dt"),
    }

BLOCK_LEVEL_HTML_TAGS = [
    # List of HTML tags that denote larger sections than paragraphs.
    # NOTE(review): HTML 4 names the element "noframes"; "noframe" looks
    # like a typo.  Left unchanged because correcting it would alter which
    # tags stop the implicit-close scan -- confirm before changing.
    "blockquote", "table", "tr", "th", "td", "thead", "tfoot", "tbody",
    "noframe", "ul", "ol", "li", "dl", "dt", "dd", "div",
    ]

# Tags whose implied close tag is emitted before any trailing whitespace.
# The keys are wrapped in list() so the concatenation also works where
# dict.keys() returns a view rather than a list.
TIGHTEN_IMPLICIT_CLOSE_TAGS = (PARA_LEVEL_HTML_TAGS
                               + list(BLOCK_CLOSING_TAG_MAP.keys()))
65 | ||
66 | ||
class NestingError(HTMLParseError):
    """Raised when a close tag does not pair up with the open tags.

    The offending close tag name is kept on the instance as ``endtag``.
    The message depends on how many tags are open in ``tagstack``:
    none, exactly one, or several.
    """

    def __init__(self, tagstack, endtag, position=(None, None)):
        self.endtag = endtag
        if not tagstack:
            msg = 'No tags are open to match </%s>' % endtag
        elif len(tagstack) == 1:
            msg = ('Open tag <%s> does not match close tag </%s>'
                   % (tagstack[0], endtag))
        else:
            # Several tags are open: list them all, outermost first.
            open_tags = '>, <'.join(tagstack)
            msg = ('Open tags <%s> do not match close tag </%s>'
                   % (open_tags, endtag))
        HTMLParseError.__init__(self, msg, position)
82 | ||
class EmptyTagError(NestingError):
    """Exception raised when empty elements have an end tag.

    ``tag`` is the name of the empty element whose close tag was found.
    The same name is also stored as ``endtag`` so that code handling the
    NestingError base class can read ``endtag`` on every instance; this
    constructor does not call NestingError.__init__, which is where that
    attribute is normally set.
    """

    def __init__(self, tag, position=(None, None)):
        self.tag = tag
        # The offending tag *is* a close tag, so expose it under the
        # attribute name the base class uses as well.
        self.endtag = tag
        msg = 'Close tag </%s> should be removed' % tag
        # Deliberately bypasses NestingError.__init__, which takes a
        # tag stack and builds a different message.
        HTMLParseError.__init__(self, msg, position)
90 | ||
class OpenTagError(NestingError):
    """Raised when a start tag appears inside a tag that cannot hold it.

    ``tag`` is the start tag that was rejected; the message names it
    together with the innermost entry of ``tagstack``.
    """

    def __init__(self, tagstack, tag, position=(None, None)):
        self.tag = tag
        enclosing = tagstack[-1]
        HTMLParseError.__init__(
            self,
            'Tag <%s> is not allowed in <%s>' % (tag, enclosing),
            position)
98 | ||
class HTMLTALParser(HTMLParser):
    """HTMLParser subclass that forwards parse events to a TAL generator.

    Start tags, end tags and text are passed to ``self.gen`` through its
    emitStartElement / emitEndElement / emitRawText methods.  Attributes
    whose prefix is bound to the TAL, METAL or I18N namespace are split
    out into separate dictionaries before being handed over.

    Instance state:
      tagstack -- names of the tags that are currently open
      nsstack  -- namespace dictionaries saved by scan_xmlns()
      nsdict   -- current mapping of namespace prefix to namespace URI
    """

    # External API

    def __init__(self, gen=None):
        """Set up the parser; a TALGenerator(xml=0) is created if no
        generator is passed in."""
        HTMLParser.__init__(self)
        if gen is None:
            gen = TALGenerator(xml=0)
        self.gen = gen
        self.tagstack = []
        self.nsstack = []
        self.nsdict = {'tal': ZOPE_TAL_NS,
                       'metal': ZOPE_METAL_NS,
                       'i18n': ZOPE_I18N_NS,
                       }

    def parseFile(self, file):
        """Read the file named by `file` and parse its contents.

        A TALError raised while parsing gets the file name attached via
        its setFile() method and is then re-raised.
        """
        f = open(file)
        try:
            # Close the handle even when read() fails.
            data = f.read()
        finally:
            f.close()
        try:
            self.parseString(data)
        except TALError:
            # sys.exc_info() is used instead of binding the exception in
            # the except clause so the syntax is accepted by both
            # Python 2 and Python 3.
            e = sys.exc_info()[1]
            e.setFile(file)
            raise

    def parseString(self, data):
        """Parse `data` and implicitly close every tag left open."""
        self.feed(data)
        self.close()
        while self.tagstack:
            # implied=2 marks tags closed by reaching the end of input.
            self.implied_endtag(self.tagstack[-1], 2)
        assert self.nsstack == [], self.nsstack

    def getCode(self):
        """Return whatever the generator's getCode() returns."""
        return self.gen.getCode()

    def getWarnings(self):
        """Return an empty tuple; this parser records no warnings."""
        return ()

    # Overriding HTMLParser methods

    def handle_starttag(self, tag, attrs):
        """Emit a start element, closing paragraph-level tags first.

        Tags listed in EMPTY_HTML_TAGS are closed again immediately.
        Raises TALError when such a tag carries tal:content.
        """
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict, i18ndict \
             = self.process_ns(tag, attrs)
        if tag in EMPTY_HTML_TAGS and taldict.get("content"):
            raise TALError(
                "empty HTML tags cannot use tal:content: %s" % repr(tag),
                self.getpos())
        self.tagstack.append(tag)
        self.gen.emitStartElement(tag, attrlist, taldict, metaldict, i18ndict,
                                  self.getpos())
        if tag in EMPTY_HTML_TAGS:
            self.implied_endtag(tag, -1)

    def handle_startendtag(self, tag, attrs):
        """Emit a self-closed element (``<tag ... />``).

        With tal:content present the element is emitted as a start
        element followed by an end element; otherwise as a single start
        element with isend=1.  Raises TALError when a tag listed in
        EMPTY_HTML_TAGS carries tal:content.
        """
        self.close_para_tags(tag)
        self.scan_xmlns(attrs)
        tag, attrlist, taldict, metaldict, i18ndict \
             = self.process_ns(tag, attrs)
        if taldict.get("content"):
            if tag in EMPTY_HTML_TAGS:
                raise TALError(
                    "empty HTML tags cannot use tal:content: %s" % repr(tag),
                    self.getpos())
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      i18ndict, self.getpos())
            self.gen.emitEndElement(tag, implied=-1)
        else:
            self.gen.emitStartElement(tag, attrlist, taldict, metaldict,
                                      i18ndict, self.getpos(), isend=1)
        # The tag was never pushed on tagstack, so only the namespace
        # scope opened by scan_xmlns() has to be undone.
        self.pop_xmlns()

    def handle_endtag(self, tag):
        """Emit an end element, implicitly closing tags nested inside it.

        Raises EmptyTagError for a close tag of an EMPTY_HTML_TAGS
        element, and NestingError (via close_enclosed_tags) when `tag`
        is not open.
        """
        if tag in EMPTY_HTML_TAGS:
            # </img> etc. in the source is an error
            raise EmptyTagError(tag, self.getpos())
        self.close_enclosed_tags(tag)
        self.gen.emitEndElement(tag)
        self.pop_xmlns()
        self.tagstack.pop()

    def close_para_tags(self, tag):
        """Implicitly close open tags that the start tag `tag` terminates.

        Raises OpenTagError when a paragraph- or block-level tag is
        opened inside an open heading (h1..h6) that is not shielded by a
        block-level tag.
        """
        if tag in EMPTY_HTML_TAGS:
            return
        close_to = -1
        if tag in BLOCK_CLOSING_TAG_MAP:
            blocks_to_close = BLOCK_CLOSING_TAG_MAP[tag]
            for i, t in enumerate(self.tagstack):
                if t in blocks_to_close:
                    # Remember the outermost closable tag of this run.
                    if close_to == -1:
                        close_to = i
                elif t in BLOCK_LEVEL_HTML_TAGS:
                    # Another block-level tag opens a new scope;
                    # anything found before it must stay open.
                    close_to = -1
        elif tag in PARA_LEVEL_HTML_TAGS or tag in BLOCK_LEVEL_HTML_TAGS:
            # Walk from the innermost open tag outwards.
            for i in range(len(self.tagstack) - 1, -1, -1):
                closetag = self.tagstack[i]
                if closetag in BLOCK_LEVEL_HTML_TAGS:
                    break
                if closetag in PARA_LEVEL_HTML_TAGS:
                    if closetag != "p":
                        raise OpenTagError(self.tagstack, tag, self.getpos())
                    close_to = i
        if close_to >= 0:
            while len(self.tagstack) > close_to:
                self.implied_endtag(self.tagstack[-1], 1)

    def close_enclosed_tags(self, tag):
        """Implicitly close every tag opened after the innermost `tag`.

        Raises NestingError when `tag` is not on the tag stack.
        """
        if tag not in self.tagstack:
            raise NestingError(self.tagstack, tag, self.getpos())
        while tag != self.tagstack[-1]:
            self.implied_endtag(self.tagstack[-1], 1)
        assert self.tagstack[-1] == tag

    def implied_endtag(self, tag, implied):
        """Emit an end element for the innermost open tag `tag`.

        `implied` is -1 for an empty element closed right after its
        start tag, 1 for a tag closed by another tag, and 2 for a tag
        still open at the end of the input.
        """
        assert tag == self.tagstack[-1]
        assert implied in (-1, 1, 2)
        isend = (implied < 0)
        if tag in TIGHTEN_IMPLICIT_CLOSE_TAGS:
            # Pick out trailing whitespace from the program, and
            # insert the close tag before the whitespace.
            white = self.gen.unEmitWhitespace()
        else:
            white = None
        self.gen.emitEndElement(tag, isend=isend, implied=implied)
        if white:
            self.gen.emitRawText(white)
        self.tagstack.pop()
        self.pop_xmlns()

    def handle_charref(self, name):
        """Pass a character reference through unchanged."""
        self.gen.emitRawText("&#%s;" % name)

    def handle_entityref(self, name):
        """Pass an entity reference through unchanged."""
        self.gen.emitRawText("&%s;" % name)

    def handle_data(self, data):
        """Pass text through unchanged."""
        self.gen.emitRawText(data)

    def handle_comment(self, data):
        """Re-wrap a comment in its delimiters and emit it as raw text."""
        self.gen.emitRawText("<!--%s-->" % data)

    def handle_decl(self, data):
        """Re-wrap a declaration in ``<!...>`` and emit it as raw text."""
        self.gen.emitRawText("<!%s>" % data)

    def handle_pi(self, data):
        """Re-wrap a processing instruction in ``<?...>`` and emit it."""
        self.gen.emitRawText("<?%s>" % data)

    # Internal thingies

    def scan_xmlns(self, attrs):
        """Open a namespace scope for an element.

        The current prefix mapping is always pushed on nsstack.  When
        `attrs` contains ``xmlns:prefix`` declarations, a copy of the
        mapping extended with them becomes the current one.
        """
        marker = "xmlns:"
        nsnew = {}
        for key, value in attrs:
            if key.startswith(marker):
                nsnew[key[len(marker):]] = value
        self.nsstack.append(self.nsdict)
        if nsnew:
            # Copy so the dictionary saved on the stack stays untouched.
            self.nsdict = self.nsdict.copy()
            self.nsdict.update(nsnew)

    def pop_xmlns(self):
        """Close the scope opened by the matching scan_xmlns() call."""
        self.nsdict = self.nsstack.pop()

    def fixname(self, name):
        """Classify a tag or attribute name by its namespace prefix.

        Returns a tuple (name, basename, ns):
          - prefix bound to the TAL, METAL or I18N namespace:
            (name, part after the colon, 'tal' / 'metal' / 'i18n')
          - ``xmlns:prefix`` declaring one of those three namespaces:
            (name, name, 'xmlns')
          - anything else: (name, name, 0)
        """
        if ':' in name:
            prefix, suffix = name.split(':', 1)
            if prefix == 'xmlns':
                nsuri = self.nsdict.get(suffix)
                if nsuri in (ZOPE_TAL_NS, ZOPE_METAL_NS, ZOPE_I18N_NS):
                    return name, name, prefix
            else:
                nsuri = self.nsdict.get(prefix)
                if nsuri == ZOPE_TAL_NS:
                    return name, suffix, 'tal'
                elif nsuri == ZOPE_METAL_NS:
                    return name, suffix, 'metal'
                elif nsuri == ZOPE_I18N_NS:
                    return name, suffix, 'i18n'
        return name, name, 0

    def process_ns(self, name, attrs):
        """Sort the attributes of element `name` by namespace.

        Returns (name, attrlist, taldict, metaldict, i18ndict).
        attrlist holds every attribute in its original order; entries
        with a recognized namespace become (key, value, ns) triples, the
        others stay as passed in.  The three dictionaries map the
        attribute name without its prefix to the value.  When the tag
        itself is in the TAL or METAL namespace, taldict['tal tag']
        records which one.

        Raises TALError, METALError or I18NError when an attribute is
        given twice in the same namespace.
        """
        attrlist = []
        taldict = {}
        metaldict = {}
        i18ndict = {}
        name, _namebase, namens = self.fixname(name)
        for item in attrs:
            key, value = item
            key, keybase, keyns = self.fixname(key)
            ns = keyns or namens # default to tag namespace
            if ns and ns != 'unknown':
                item = (key, value, ns)
            if ns == 'tal':
                if keybase in taldict:
                    raise TALError("duplicate TAL attribute " +
                                   repr(keybase), self.getpos())
                taldict[keybase] = value
            elif ns == 'metal':
                if keybase in metaldict:
                    raise METALError("duplicate METAL attribute " +
                                     repr(keybase), self.getpos())
                metaldict[keybase] = value
            elif ns == 'i18n':
                if keybase in i18ndict:
                    raise I18NError("duplicate i18n attribute " +
                                    repr(keybase), self.getpos())
                i18ndict[keybase] = value
            attrlist.append(item)
        if namens in ('metal', 'tal'):
            taldict['tal tag'] = namens
        return name, attrlist, taldict, metaldict, i18ndict