1 from __future__ import absolute_import, division, unicode_literals
6 class Filter(_base.Filter):
def slider(self):
    """Walk ``self.source`` yielding ``(previous, current, next)`` triples.

    One triple is produced per token of ``self.source``, in order.
    ``previous`` is None for the first token and ``next`` is None for the
    last one. Nothing is yielded when the source is empty.
    """
    before = current = None
    for upcoming in self.source:
        # A token can only be emitted once its successor is known, so the
        # window lags one step behind the source.
        if current is not None:
            yield before, current, upcoming
        before, current = current, upcoming
    # Flush the last token, which has no successor. The guard keeps an
    # empty source from producing a (None, None, None) triple, which
    # consumers would then try to subscript as a token.
    if current is not None:
        yield before, current, None
def __iter__(self):
    """Yield the tokens of the stream, leaving out omissible tags.

    Each token is examined together with its neighbours (as produced by
    ``self.slider()``). Start and end tags that ``is_optional_start`` /
    ``is_optional_end`` report as optional are dropped; every other token
    is passed through unchanged.
    """
    # NOTE(review): the method header and the first operand of the start-tag
    # condition were not visible in the reviewed excerpt. ``__iter__`` and
    # ``token["data"]`` (presumably the tag's attributes) are assumed here;
    # confirm against the complete file.
    for previous, token, next in self.slider():
        kind = token["type"]
        if kind == "StartTag":
            # A start tag carrying data is always kept; otherwise it is
            # dropped when the omission rules allow it.
            droppable = (not token["data"] and
                         self.is_optional_start(token["name"], previous, next))
        elif kind == "EndTag":
            droppable = self.is_optional_end(token["name"], next)
        else:
            droppable = False
        if not droppable:
            yield token
def is_optional_start(self, tagname, previous, next):
    """Tell whether the start tag of element ``tagname`` may be omitted.

    Tokens are mappings with a ``"type"`` key and, for tags, a ``"name"``
    key.

    :arg tagname: name of the element whose start tag is being examined
    :arg previous: the token just before the start tag, or a false value
        when there is none
    :arg next: the token just after the start tag, or a false value when
        the stream ends there

    :returns: True if the start tag may be dropped, False otherwise;
        always False for elements other than html, head, body, colgroup
        and tbody
    """
    # Type of the token that follows the start tag; None at end of stream.
    next_type = next["type"] if next else None

    if tagname == 'html':
        # An html element's start tag may be omitted if the first thing
        # inside the html element is not a space character or a comment.
        return next_type not in ("Comment", "SpaceCharacters")

    if tagname == 'head':
        # A head element's start tag may be omitted if the first thing
        # inside the head element is an element.
        # XXX: we also omit the start tag if the head element is empty
        if next_type in ("StartTag", "EmptyTag"):
            return True
        return next_type == "EndTag" and next["name"] == "head"

    if tagname == 'body':
        # A body element's start tag may be omitted if the first thing
        # inside the body element is not a space character or a comment,
        # except if the first thing inside the body element is a script
        # or style element and the node immediately preceding the body
        # element is a head element whose end tag has been omitted.
        if next_type in ("Comment", "SpaceCharacters"):
            return False
        if next_type == "StartTag":
            # XXX: we do not look at the preceding event, so we never omit
            # the body element's start tag if it's followed by a script or
            # a style element.
            return next["name"] not in ('script', 'style')
        return True

    if tagname == 'colgroup':
        # A colgroup element's start tag may be omitted if the first thing
        # inside the colgroup element is a col element, and if the element
        # is not immediately preceded by another colgroup element whose
        # end tag has been omitted.
        # XXX: we do not look at the preceding event, so instead we never
        # omit the colgroup element's end tag when it is immediately
        # followed by another colgroup element. See is_optional_end.
        return (next_type in ("StartTag", "EmptyTag") and
                next["name"] == "col")

    if tagname == 'tbody':
        # A tbody element's start tag may be omitted if the first thing
        # inside the tbody element is a tr element, and if the element is
        # not immediately preceded by a tbody, thead, or tfoot element
        # whose end tag has been omitted.
        if next_type != "StartTag":
            return False
        # The end tag of a directly preceding tbody, thead or tfoot may
        # itself be omitted (see is_optional_end), so the start tag is
        # always kept after one of those end tags.
        if previous and previous['type'] == 'EndTag' and \
                previous['name'] in ('tbody', 'thead', 'tfoot'):
            return False
        return next["name"] == 'tr'

    # No other element has an omissible start tag.
    return False
def is_optional_end(self, tagname, next):
    """Tell whether the end tag of element ``tagname`` may be omitted.

    Tokens are mappings with a ``"type"`` key and, for tags, a ``"name"``
    key.

    :arg tagname: name of the element whose end tag is being examined
    :arg next: the token just after the end tag, or a false value when
        the stream ends there

    :returns: True if the end tag may be dropped, False otherwise; always
        False for elements that have no omission rule here
    """
    # Type of the token that follows the end tag; None at end of stream.
    next_type = next["type"] if next else None
    # "No more content in the parent element": the next token closes the
    # parent, or the stream is exhausted.
    parent_ends = next_type == "EndTag" or next_type is None

    if tagname in ('html', 'head', 'body'):
        # An html element's end tag may be omitted if the html element
        # is not immediately followed by a space character or a comment.
        # The same test is applied to head and body.
        return next_type not in ("Comment", "SpaceCharacters")

    if tagname in ('li', 'optgroup', 'tr'):
        # A li, optgroup or tr element's end tag may be omitted if the
        # element is immediately followed by another element of the same
        # name, or if there is no more content in the parent element.
        if next_type == "StartTag":
            return next["name"] == tagname
        return parent_ends

    if tagname in ('dt', 'dd'):
        # A dt element's end tag may be omitted if the dt element is
        # immediately followed by another dt element or a dd element.
        # A dd element's end tag may be omitted if the dd element is
        # immediately followed by another dd element or a dt element,
        # or if there is no more content in the parent element.
        if next_type == "StartTag":
            return next["name"] in ('dt', 'dd')
        return tagname == 'dd' and parent_ends

    if tagname == 'p':
        # A p element's end tag may be omitted if the p element is
        # immediately followed by an address, article, aside,
        # blockquote, datagrid, dialog, dir, div, dl, fieldset,
        # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
        # nav, ol, p, pre, section, table, or ul, element, or if
        # there is no more content in the parent element.
        if next_type in ("StartTag", "EmptyTag"):
            return next["name"] in ('address', 'article', 'aside',
                                    'blockquote', 'datagrid', 'dialog',
                                    'dir', 'div', 'dl', 'fieldset', 'footer',
                                    'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                    'header', 'hr', 'menu', 'nav', 'ol',
                                    'p', 'pre', 'section', 'table', 'ul')
        return parent_ends

    if tagname == 'option':
        # An option element's end tag may be omitted if the option
        # element is immediately followed by another option element,
        # or if it is immediately followed by an optgroup element, or
        # if there is no more content in the parent element.
        if next_type == "StartTag":
            return next["name"] in ('option', 'optgroup')
        return parent_ends

    if tagname in ('rt', 'rp'):
        # An rt or rp element's end tag may be omitted if the element is
        # immediately followed by an rt or rp element, or if there is
        # no more content in the parent element.
        if next_type == "StartTag":
            return next["name"] in ('rt', 'rp')
        return parent_ends

    if tagname == 'colgroup':
        # A colgroup element's end tag may be omitted if the colgroup
        # element is not immediately followed by a space character or
        # a comment.
        if next_type in ("Comment", "SpaceCharacters"):
            return False
        if next_type == "StartTag":
            # XXX: we also look for an immediately following colgroup
            # element. See is_optional_start.
            return next["name"] != 'colgroup'
        return True

    if tagname in ('thead', 'tbody'):
        # A thead element's end tag may be omitted if the thead element
        # is immediately followed by a tbody or tfoot element.
        # A tbody element's end tag may be omitted if the tbody element
        # is immediately followed by a tbody or tfoot element, or if
        # there is no more content in the parent element.
        # When the following element is a tbody, is_optional_start keeps
        # that tbody's start tag, so omitting the end tag here is safe.
        if next_type == "StartTag":
            return next["name"] in ('tbody', 'tfoot')
        return tagname == 'tbody' and parent_ends

    if tagname == 'tfoot':
        # A tfoot element's end tag may be omitted if the tfoot element
        # is immediately followed by a tbody element, or if there is no
        # more content in the parent element.
        # When the following element is a tbody, is_optional_start keeps
        # that tbody's start tag, so omitting the end tag here is safe.
        if next_type == "StartTag":
            return next["name"] == 'tbody'
        return parent_ends

    if tagname in ('td', 'th'):
        # A td or th element's end tag may be omitted if the element is
        # immediately followed by a td or th element, or if there is
        # no more content in the parent element.
        if next_type == "StartTag":
            return next["name"] in ('td', 'th')
        return parent_ends

    # No other element has an omissible end tag.
    return False