#!/usr/bin/env python2
"""
lazylex/html.py - Low-Level HTML Processing.

See lazylex/README.md for details.

TODO: This should be an Oil library eventually.  It's a "lazily-parsed data
structure" like TSV2.
"""
from __future__ import print_function

import cStringIO
import re
import sys


def log(msg, *args):
  msg = msg % args
  print(msg, file=sys.stderr)


class LexError(Exception):
  """For bad lexical elements like <> or &&"""

  def __init__(self, s, pos):
    self.s = s
    self.pos = pos

  def __str__(self):
    return '(LexError %r)' % (self.s[self.pos : self.pos + 20])


class ParseError(Exception):
  """For errors in the tag structure."""

  def __init__(self, msg, *args):
    self.msg = msg
    self.args = args

  def __str__(self):
    return '(ParseError %s)' % (self.msg % self.args)


class Output(object):
  """
  Takes an underlying input buffer and an output file.  Maintains a position
  in the input buffer.

  Print FROM the input or print new text to the output.
  """

  def __init__(self, s, f, left_pos=0, right_pos=0):
    self.s = s
    self.f = f
    self.pos = left_pos
    if right_pos == 0:
      self.right_pos = len(s)
    else:
      self.right_pos = right_pos

  def SkipTo(self, pos):
    """Skip to a position."""
    self.pos = pos

  def PrintUntil(self, pos):
    """Print until a position."""
    piece = self.s[self.pos : pos]
    self.f.write(piece)
    self.pos = pos

  def PrintTheRest(self):
    """Print until the end of the string."""
    self.PrintUntil(self.right_pos)

  def Print(self, s):
    """Print new text to the output file (doesn't advance the position)."""
    self.f.write(s)

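
# Example (sketch, not part of the original file): copy input to output,
# replacing '456' with 'XXX'.
#
#   f = cStringIO.StringIO()
#   out = Output('012 456 89', f)
#   out.PrintUntil(4)   # copies '012 ' from the input
#   out.Print('XXX')    # emits new text
#   out.SkipTo(7)       # skips over '456'
#   out.PrintTheRest()  # copies ' 89'
#   # f.getvalue() == '012 XXX 89'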

# HTML Tokens
( Decl, Comment, Processing,
  StartTag, StartEndTag, EndTag,
  DecChar, HexChar, CharEntity,
  RawData,
  Invalid, EndOfStream ) = range(12)
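
# Example (sketch, not in the original file): the token IDs are plain ints, so
# a reverse table for debugging would have to be built by hand, e.g.
#
#   TOKEN_NAMES = [
#       'Decl', 'Comment', 'Processing', 'StartTag', 'StartEndTag', 'EndTag',
#       'DecChar', 'HexChar', 'CharEntity', 'RawData', 'Invalid',
#       'EndOfStream']
#   # TOKEN_NAMES[Comment] == 'Comment'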


def MakeLexer(rules):
  return [
    # DOTALL is for the comment
    (re.compile(pat, re.VERBOSE | re.DOTALL), i) for
    (pat, i) in rules
  ]

#
# Eggex
#
# Tag = / ~['>']+ /

# Is this valid?  A single character?
# Tag = / ~'>'* /

# Maybe better: / [NOT '>']+/
# capital letters not allowed there?
#
# But then this is confusing:
#   / [NOT ~digit]+/
#
#   / [NOT digit] / is [^\d]
#   / ~digit /      is \D
#
# Or maybe:
#
#   / [~ digit]+ /
#   / [~ '>']+ /
#   / [NOT '>']+ /

# End = / '</' Tag '>' /
# StartEnd = / '<' Tag '/>' /
# Start = / '<' Tag '>' /
#
# EntityRef = / '&' dot{* N} ';' /


LEXER = [
  # TODO: instead of nongreedy matches, the loop can just do .find('-->') and
  # .find('?>')

  # Actually non-greedy matches are regular and can be matched in linear time
  # with RE2.
  #
  # https://news.ycombinator.com/item?id=27099798
  #
  # Maybe try combining all of these for speed.

  (r'<!-- .*? -->', Comment),
  (r'<\? .*? \?>', Processing),

  # NOTE: < is allowed in these.
  (r'<! [^>]+ >', Decl),  # <!DOCTYPE html>

  # EndTag and StartEndTag must come BEFORE StartTag, whose pattern would
  # also match them.
  (r'</ [^>]+ >', EndTag),       # end tag like </a>
  (r'< [^>]+ />', StartEndTag),  # self-closing tag like <br/>
  (r'< [^>]+ >', StartTag),      # start tag like <a>

  (r'&\# [0-9]+ ;', DecChar),
  (r'&\# x[0-9a-fA-F]+ ;', HexChar),
  (r'& [a-zA-Z]+ ;', CharEntity),

  # Note: > is allowed in raw data.
  # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
  (r'[^&<]+', RawData),

  (r'.', Invalid),  # error!
]

LEXER = MakeLexer(LEXER)


def _Tokens(s, left_pos, right_pos):
  """
  Args:
    s: string to parse
    left_pos, right_pos: Span boundaries; a right_pos of 0 means len(s).
  """
  pos = left_pos
  if right_pos == 0:
    n = len(s)
  else:
    n = right_pos

  while pos < n:
    # Find the FIRST pattern that matches.
    for pat, tok_id in LEXER:
      m = pat.match(s, pos)
      if m:
        end_pos = m.end()
        yield tok_id, end_pos
        pos = end_pos
        break

  # Zero-length sentinel
  yield EndOfStream, pos


def ValidTokens(s, left_pos=0, right_pos=0):
  """
  Wrapper around _Tokens to prevent callers from having to handle Invalid.

  I'm not combining the two functions because I might want to do a 'yield'
  transformation on Tokens()?  Exceptions might complicate the issue?
  """
  pos = left_pos
  for tok_id, end_pos in _Tokens(s, left_pos, right_pos):
    if tok_id == Invalid:
      raise LexError(s, pos)
    yield tok_id, end_pos
    pos = end_pos
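
# Example (sketch, not in the original file): each token is (id, end_pos); the
# start position is the previous token's end.
#
#   pos = 0
#   for tok_id, end_pos in ValidTokens('<p>hi &amp; bye</p>'):
#     print(tok_id, repr('<p>hi &amp; bye</p>'[pos : end_pos]))
#     pos = end_pos
#
# This yields StartTag '<p>', RawData 'hi ', CharEntity '&amp;', RawData
# ' bye', EndTag '</p>', and then the zero-length EndOfStream sentinel.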


# To match <a or </a
# <h2 but not <2h ?
_TAG_RE = re.compile(r'/? \s* ([a-zA-Z][a-zA-Z0-9]*)', re.VERBOSE)

# To match href="foo"

_ATTR_RE = re.compile(r'''
\s+        # Leading whitespace is required
([a-z]+)   # Attribute name
(?:        # Optional attribute value
  \s* = \s*
  (?:
    " ([^>"]*) "       # double quoted value
  | ([a-zA-Z0-9_\-]+)  # Just allow unquoted "identifiers"
                       # TODO: relax this?  for href=$foo
  )
)?
''', re.VERBOSE)
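
# Example (sketch, not in the original file): group 1 is the attribute name,
# group 2 a double-quoted value, group 3 an unquoted one.
#
#   m = _ATTR_RE.match(' href="/foo" selected', 0)
#   # m.group(1) == 'href', m.group(2) == '/foo', m.group(3) is None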


TagName, AttrName, UnquotedValue, QuotedValue = range(4)

class TagLexer(object):
  """
  Given a tag like <a href="..."> or <link type="..." />, the TagLexer
  provides a few operations:

  - What is the tag?
  - Iterate through the attributes, giving (name, value_start_pos, value_end_pos)
  """
  def __init__(self, s):
    self.s = s
    self.start_pos = -1  # Invalid
    self.end_pos = -1

  def Reset(self, start_pos, end_pos):
    self.start_pos = start_pos
    self.end_pos = end_pos

  def TagString(self):
    return self.s[self.start_pos : self.end_pos]

  def TagName(self):
    # First event
    tok_id, start, end = next(self.Tokens())
    return self.s[start : end]

  def GetSpanForAttrValue(self, attr_name):
    # Algorithm: search for QuotedValue or UnquotedValue after AttrName
    # TODO: Could also cache these

    events = self.Tokens()
    val = (-1, -1)
    try:
      while True:
        tok_id, start, end = next(events)
        if tok_id == AttrName:
          name = self.s[start:end]
          if name == attr_name:
            # For HasAttr()
            #val = True

            # Now try to get a real value
            tok_id, start, end = next(events)
            if tok_id in (QuotedValue, UnquotedValue):

              # TODO: Unescape this with htmlentitydefs
              # I think we need another lexer!
              #
              # We could make a single pass?
              # Shortcut: 'if '&' in substring'
              # Then we need to unescape it

              val = start, end
              break

    except StopIteration:
      pass
    return val

  def GetAttr(self, attr_name):
    # Algorithm: search for QuotedValue or UnquotedValue after AttrName
    # TODO: Could also cache these
    start, end = self.GetSpanForAttrValue(attr_name)
    if start == -1:
      return None
    return self.s[start : end]

  def Tokens(self):
    """
    Yields a sequence of tokens: Tag (AttrName AttrValue?)*

    Where each Token is (Type, start_pos, end_pos)

    Note that start and end are NOT redundant!  We skip over some unwanted
    characters.
    """
    m = _TAG_RE.match(self.s, self.start_pos + 1)
    if not m:
      raise RuntimeError('Invalid HTML tag: %r' % self.TagString())
    yield TagName, m.start(1), m.end(1)

    pos = m.end(0)

    while True:
      # don't search past the end
      m = _ATTR_RE.match(self.s, pos, self.end_pos)
      if not m:
        # A validating parser would check that > or /> is next -- there's no junk
        break

      yield AttrName, m.start(1), m.end(1)

      # Quoted is group 2, unquoted is group 3.
      if m.group(2) is not None:
        yield QuotedValue, m.start(2), m.end(2)
      elif m.group(3) is not None:
        yield UnquotedValue, m.start(3), m.end(3)

      # Skip past the "
      pos = m.end(0)


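# Example (sketch, not in the original file): a TagLexer is reused; Reset()
# points it at a (start_pos, end_pos) span of the string being lexed.
#
#   s = '<a href="/foo" hidden>'
#   lex = TagLexer(s)
#   lex.Reset(0, len(s))
#   # lex.TagName() == 'a'
#   # lex.GetAttr('href') == '/foo'
#   # lex.GetAttr('hidden') is None -- a name with no value

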
def ReadUntilStartTag(it, tag_lexer, tag_name):
  """Find the next <foo>.

  tag_lexer is RESET.
  """
  pos = 0
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break
    tag_lexer.Reset(pos, end_pos)
    if tok_id == StartTag and tag_lexer.TagName() == tag_name:
      return pos, end_pos

    pos = end_pos

  raise ParseError('No start tag %r', tag_name)


def ReadUntilEndTag(it, tag_lexer, tag_name):
  """Find the next </foo>.

  tag_lexer is RESET.
  """
  pos = 0
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break
    tag_lexer.Reset(pos, end_pos)
    if tok_id == EndTag and tag_lexer.TagName() == tag_name:
      return pos, end_pos

    pos = end_pos

  raise ParseError('No end tag %r', tag_name)


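# Example (sketch, not in the original file): extract the body of the first
# <pre> ... </pre> block by combining ValidTokens with the two readers.
#
#   s = '<html><pre>code</pre></html>'
#   it = ValidTokens(s)
#   tag_lexer = TagLexer(s)
#   _, body_start = ReadUntilStartTag(it, tag_lexer, 'pre')
#   body_end, _ = ReadUntilEndTag(it, tag_lexer, 'pre')
#   # s[body_start : body_end] == 'code'

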
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
}


def ToText(s, left_pos=0, right_pos=0):
  """
  Given HTML, return text by unquoting &gt; and &lt; etc.

  Used by:
    doctools/oils_doc.py: PygmentsPlugin
    doctools/make_help.py: HelpIndexCards

  In the latter case, we could process some tags, like:

  - Blue Link (not clickable, but still useful)
  - Red X

  That should be html.ToAnsi.
  """
  f = cStringIO.StringIO()
  out = Output(s, f, left_pos, right_pos)

  pos = left_pos
  for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
    if tok_id == RawData:
      out.SkipTo(pos)
      out.PrintUntil(end_pos)

    elif tok_id == CharEntity:  # &amp;

      entity = s[pos+1 : end_pos-1]

      out.SkipTo(pos)
      out.Print(CHAR_ENTITY[entity])
      out.SkipTo(end_pos)

    # Not handling these yet
    elif tok_id == HexChar:
      raise AssertionError('Hex Char %r' % s[pos : pos + 20])

    elif tok_id == DecChar:
      raise AssertionError('Dec Char %r' % s[pos : pos + 20])

    pos = end_pos

  out.PrintTheRest()
  return f.getvalue()
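
# Example (sketch, not in the original file):
#
#   ToText('x &lt; y')  # returns 'x < y'
#
# Note that out.pos only advances on RawData and CharEntity, so PrintTheRest()
# will also copy any trailing markup (e.g. a final </b>) verbatim.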