lazylex/html.py

OILS / lazylex / html.py View on Github | oilshell.org

416 lines, 188 significant

1	#!/usr/bin/env python2
2	"""
3	lazylex/html.py - Low-Level HTML Processing.
4
5	See lazylex/README.md for details.
6
7	TODO: This should be an Oil library eventually. It's a "lazily-parsed data
8	structure" like TSV2.
9	"""
10	from __future__ import print_function
11
12	import cStringIO
13	import re
14	import sys
15
16
def log(msg, *args):
    """Write a printf-style message, followed by a newline, to stderr.

    Args:
      msg: format string.  It is always interpolated with args using the %
        operator, even when no args are given.
      *args: values substituted into msg
    """
    print(msg % args, file=sys.stderr)
20
21
class LexError(Exception):
    """Raised for bad lexical elements like <> or &&.

    Attributes:
      s: the string that was being lexed
      pos: position in s where the bad element starts
    """

    def __init__(self, s, pos):
        self.s, self.pos = s, pos

    def __str__(self):
        # Show at most 20 characters of the input, starting at the error.
        snippet = self.s[self.pos : self.pos + 20]
        return '(LexError %r)' % snippet
31
32
class ParseError(Exception):
    """For errors in the tag structure.

    Args:
      msg: printf-style format string
      *args: values interpolated into msg when the error is rendered

    Attributes:
      msg: the format string, unformatted
      args: the tuple of format arguments
    """

    def __init__(self, msg, *args):
        self.msg = msg
        self.args = args

    def __str__(self):
        try:
            text = self.msg % self.args
        except (TypeError, ValueError):
            # Rendering an error must not raise a second one.  This happens
            # when msg contains a literal '%' or doesn't agree with args, so
            # fall back to the raw message.
            text = self.msg
        return '(ParseError %s)' % text
42
43
class Output(object):
    """
    Pairs an input string with an output file, and tracks a cursor into the
    input.

    Callers either copy spans FROM the input to the output, or write new text
    to the output.
    """

    def __init__(self, s, f, left_pos=0, right_pos=0):
        self.s = s
        self.f = f
        self.pos = left_pos
        # A right_pos of 0 means "up to the end of s".
        self.right_pos = len(s) if right_pos == 0 else right_pos

    def SkipTo(self, pos):
        """Move the cursor to pos without writing anything."""
        self.pos = pos

    def PrintUntil(self, pos):
        """Write the input from the cursor up to pos; the cursor ends at pos."""
        self.f.write(self.s[self.pos : pos])
        self.pos = pos

    def PrintTheRest(self):
        """Write the input from the cursor up to right_pos."""
        self.PrintUntil(self.right_pos)

    def Print(self, s):
        """Write the text s to the output file.  The cursor doesn't move."""
        self.f.write(s)
78
79
# HTML Tokens -- consecutive integer IDs, starting at 0.
Decl = 0          # <!DOCTYPE html>
Comment = 1       # <!-- ... -->
Processing = 2    # <? ... ?>
StartTag = 3      # <a>
StartEndTag = 4   # <br/>
EndTag = 5        # </a>
DecChar = 6       # decimal character reference: &# digits ;
HexChar = 7       # hex character reference: &#x hex-digits ;
CharEntity = 8    # named entity: & letters ;
RawData = 9       # text between tags and entities
Invalid = 10      # a character that no other rule matches
EndOfStream = 11  # zero-length sentinel at the end of the input
86
87
def MakeLexer(rules):
    """Compile a list of lexer rules.

    Args:
      rules: iterable of (pat, tok_id) pairs, where pat is the source of a
        regular expression

    Returns:
      A list of (compiled pattern, tok_id) pairs, in the same order as rules.
    """
    # VERBOSE allows insignificant whitespace inside the patterns.
    # DOTALL is for the comment pattern, so that '.' also matches newlines.
    flags = re.VERBOSE | re.DOTALL
    return [(re.compile(pat, flags), tok_id) for pat, tok_id in rules]
94
95	#
96	# Eggex
97	#
98	# Tag = / ~['>']+ /
99
100	# Is this valid? A single character?
101	# Tag = / ~'>'* /
102
103	# Maybe better: / [NOT '>']+/
104	# capital letters not allowed there?
105	#
106	# But then this is confusing:
107	# / [NOT ~digit]+/
108	#
109	# / [NOT digit] / is [^\d]
110	# / ~digit / is \D
111	#
112	# Or maybe:
113	#
114	# / [~ digit]+ /
115	# / [~ '>']+ /
116	# / [NOT '>']+ /
117
118	# End = / '</' Tag '>' /
119	# StartEnd = / '<' Tag '/>' /
120	# Start = / '<' Tag '>' /
121	#
122	# EntityRef = / '&' dot{* N} ';' /
123
124
# Rules are tried in order and the first match wins, so the more specific
# patterns have to come before the more general ones.
LEXER = MakeLexer([
    # TODO: instead of nongreedy matches, the loop can just do .find('-->')
    # and .find('?>')

    # Actually non-greedy matches are regular and can be matched in linear
    # time with RE2.
    #
    # https://news.ycombinator.com/item?id=27099798
    #
    # Maybe try combining all of these for speed.

    (r'<!-- .*? -->', Comment),
    (r'<\? .*? \?>', Processing),

    # NOTE: < is allowed in these.
    (r'<! [^>]+ >', Decl),  # <!DOCTYPE html>

    # The start tag pattern also matches </a> and <br/>, so those two rules
    # come FIRST.
    (r'</ [^>]+ >', EndTag),  # end </a>
    (r'< [^>]+ />', StartEndTag),  # self-closing <br/>
    (r'< [^>]+ >', StartTag),  # start <a>

    (r'&\# [0-9]+ ;', DecChar),
    (r'&\# x[0-9a-fA-F]+ ;', HexChar),
    (r'& [a-zA-Z]+ ;', CharEntity),

    # Note: > is allowed in raw data.
    # https://stackoverflow.com/questions/10462348/right-angle-bracket-in-html
    (r'[^&<]+', RawData),

    # Anything else is an error, e.g. a stray & or <.
    (r'.', Invalid),
])
158
159
def _Tokens(s, left_pos, right_pos):
    """
    Lex s[left_pos : right_pos] with the rules in LEXER.

    Args:
      s: string to parse
      left_pos, right_pos: Optional span boundaries.  A right_pos of 0 means
        the end of s.

    Yields:
      A (tok_id, end_pos) pair for each token, Invalid tokens included,
      followed by a final zero-length (EndOfStream, pos).
    """
    pos = left_pos
    n = right_pos if right_pos != 0 else len(s)

    while pos < n:
        # Find the FIRST pattern that matches.  Passing n as endpos keeps a
        # token from running past the right boundary of the span.
        for pat, tok_id in LEXER:
            m = pat.match(s, pos, n)
            if m:
                pos = m.end()
                yield tok_id, pos
                break
        else:
            # The catch-all Invalid rule should make this unreachable, but
            # looping forever at the same position would be worse.
            raise AssertionError('No lexer rule matched at position %d' % pos)

    # Zero length sentinel
    yield EndOfStream, pos
184
185
def ValidTokens(s, left_pos=0, right_pos=0):
    """
    Yield the same (tok_id, end_pos) pairs as _Tokens(), so that callers don't
    have to handle Invalid themselves.

    This stays separate from _Tokens() on purpose: a 'yield' transformation
    of that function may be wanted later, and exceptions might complicate it.

    Raises:
      LexError: at the start position of the first Invalid token
    """
    tok_start = left_pos
    for tok in _Tokens(s, left_pos, right_pos):
        tok_id, tok_end = tok
        if tok_id == Invalid:
            raise LexError(s, tok_start)
        yield tok
        # The next token begins where this one ended.
        tok_start = tok_end
199
200
201	# To match <a or </a
202	# <h2 but not <2h ?
203	_TAG_RE = re.compile(r'/? \s* ([a-zA-Z][a-zA-Z0-9]*)', re.VERBOSE)
204
205	# To match href="foo"
206
207	_ATTR_RE = re.compile(r'''
208	\s+ # Leading whitespace is required
209	([a-z]+) # Attribute name
210	(?: # Optional attribute value
211	\s* = \s*
212	(?:
213	" ([^>"]*) " # double quoted value
214	\| ([a-zA-Z0-9_\-]+) # Just allow unquoted "identifiers"
215	# TODO: relax this? for href=$foo
216	)
217	)?
218	''', re.VERBOSE)
219
220
221	TagName, AttrName, UnquotedValue, QuotedValue = range(4)
222
class TagLexer(object):
    """
    Lexes a single tag, like <a href="..."> or <link type="..." />.

    Once Reset() has pointed it at the span of a tag, it answers:

    - What is the tag's name?
    - What are the attributes?  Tokens() yields (token id, start_pos, end_pos)
      for the tag name, and for each attribute name and value.
    """

    def __init__(self, s):
        self.s = s
        # -1 marks both ends of the span as invalid, until Reset() is called
        self.start_pos = -1
        self.end_pos = -1

    def Reset(self, start_pos, end_pos):
        """Select s[start_pos : end_pos] as the current tag."""
        self.start_pos, self.end_pos = start_pos, end_pos

    def TagString(self):
        """Return the text of the current span."""
        return self.s[self.start_pos : self.end_pos]

    def TagName(self):
        """Return the name of the current tag."""
        # Tokens() always yields the tag name first
        _, name_start, name_end = next(self.Tokens())
        return self.s[name_start : name_end]

    def GetSpanForAttrValue(self, attr_name):
        """Return the (start, end) span of an attribute's value.

        The result is (-1, -1) when the attribute isn't there, or when its
        name isn't directly followed by a value.
        """
        # Algorithm: search for QuotedValue or UnquotedValue after AttrName
        # TODO: Could also cache these
        events = self.Tokens()
        for tok_id, start, end in events:
            if tok_id != AttrName or self.s[start:end] != attr_name:
                continue

            # The name matched.  Now try to get a real value, which has to be
            # the very next token.
            following = next(events, None)
            if following is None:
                break  # the name was the last token of the tag

            next_id, val_start, val_end = following
            if next_id in (QuotedValue, UnquotedValue):
                # TODO: Unescape this with htmlentitydefs
                # I think we need another lexer!
                #
                # We could make a single pass?
                # Shortcut: 'if '&' in substring'
                # Then we need to unescape it
                return val_start, val_end

        return -1, -1

    def GetAttr(self, attr_name):
        """Return the text of an attribute's value, or None if there is none."""
        start, end = self.GetSpanForAttrValue(attr_name)
        return None if start == -1 else self.s[start : end]

    def Tokens(self):
        """
        Yields a sequence of tokens: Tag (AttrName AttrValue?)*

        Where each Token is (Type, start_pos, end_pos)

        Note that start and end are NOT redundant!  We skip over some unwanted
        characters.

        Raises:
          RuntimeError: if no tag name is found after the first character of
            the span
        """
        # start_pos + 1 skips the first character of the span (the '<' of a
        # tag).  Note that TagName in the 'yield' below is the module-level
        # token ID, not the method of this class.
        tag_match = _TAG_RE.match(self.s, self.start_pos+1)
        if not tag_match:
            raise RuntimeError('Invalid HTML tag: %r' % self.TagString())
        yield TagName, tag_match.start(1), tag_match.end(1)

        pos = tag_match.end(0)

        # don't search past the end
        attr = _ATTR_RE.match(self.s, pos, self.end_pos)
        while attr:
            yield AttrName, attr.start(1), attr.end(1)

            # Quoted is group 2, unquoted is group 3.  At most one is set.
            for val_id, group in ((QuotedValue, 2), (UnquotedValue, 3)):
                if attr.group(group) is not None:
                    yield val_id, attr.start(group), attr.end(group)
                    break

            # Resume after the whole attribute, which skips past the "
            pos = attr.end(0)
            attr = _ATTR_RE.match(self.s, pos, self.end_pos)

        # A validating parser would check that > or /> is next -- there's no
        # junk
322
323
def _ReadUntilTag(it, tag_lexer, tag_name, want_tok_id, pos):
    """Advance `it` to the next token with the given ID and tag name.

    Returns (start_pos, end_pos) of that token, or None when `it` runs out
    first.  tag_lexer is Reset() to every token that is read.
    """
    for tok_id, end_pos in it:
        tag_lexer.Reset(pos, end_pos)
        # Only tokens of the wanted kind are handed to TagName(), which
        # raises on spans that aren't tags.
        if tok_id == want_tok_id and tag_lexer.TagName() == tag_name:
            return pos, end_pos
        pos = end_pos
    return None


def ReadUntilStartTag(it, tag_lexer, tag_name, start_pos=0):
    """Find the next <foo>.

    tag_lexer is RESET.

    Args:
      it: iterator of (tok_id, end_pos) pairs, as produced by ValidTokens()
      tag_lexer: TagLexer over the same string; it is left pointing at the
        tag that was found
      tag_name: name of the tag to look for, e.g. 'a'
      start_pos: position where the next token of `it` begins.  The default
        of 0 is right for a fresh token stream over the whole string.  Pass
        the end position of the last token read when `it` has already been
        partially consumed, or doesn't start at 0.

    Returns:
      (start_pos, end_pos) of the start tag

    Raises:
      ParseError: if `it` is exhausted before the tag is found
    """
    span = _ReadUntilTag(it, tag_lexer, tag_name, StartTag, start_pos)
    if span is None:
        raise ParseError('No start tag %r', tag_name)
    return span


def ReadUntilEndTag(it, tag_lexer, tag_name, start_pos=0):
    """Find the next </foo>.

    tag_lexer is RESET.

    Args:
      it: iterator of (tok_id, end_pos) pairs, as produced by ValidTokens()
      tag_lexer: TagLexer over the same string; it is left pointing at the
        tag that was found
      tag_name: name of the tag to look for, e.g. 'a'
      start_pos: position where the next token of `it` begins.  The default
        of 0 is right for a fresh token stream over the whole string.  Pass
        the end position of the last token read when `it` has already been
        partially consumed, or doesn't start at 0.

    Returns:
      (start_pos, end_pos) of the end tag

    Raises:
      ParseError: if `it` is exhausted before the tag is found
    """
    span = _ReadUntilTag(it, tag_lexer, tag_name, EndTag, start_pos)
    if span is None:
        raise ParseError('No end tag %r', tag_name)
    return span
362
363
# Maps the name of a character entity (the part between & and ;) to the text
# it stands for.  Looking up any other name raises KeyError.
CHAR_ENTITY = {
    'amp': '&',
    'lt': '<',
    'gt': '>',
    'quot': '"',
    'apos': "'",
}
370
371
def ToText(s, left_pos=0, right_pos=0):
    """
    Given HTML, return text by unquoting &gt; and &lt; etc.

    Raw data is copied, character entities are replaced by the text they stand
    for, and all other tokens (tags, comments, ...) are dropped.

    Used by:
      doctools/oils_doc.py: PygmentsPlugin
      doctool/make_help.py: HelpIndexCards

    In the latter case, we could process some tags, like:

    - Blue Link (not clickable, but still useful)
    - Red X

    That should be html.ToAnsi.

    Args:
      s: string of HTML
      left_pos, right_pos: Optional span boundaries.  A right_pos of 0 means
        the end of s.

    Returns:
      The text, as a string.

    Raises:
      LexError: for invalid input (raised by ValidTokens)
      KeyError: for a character entity that isn't in CHAR_ENTITY
      AssertionError: for decimal and hex character references, which aren't
        handled yet
    """
    f = cStringIO.StringIO()
    out = Output(s, f, left_pos, right_pos)

    pos = left_pos
    for tok_id, end_pos in ValidTokens(s, left_pos, right_pos):
        if tok_id == RawData:
            # Jump over any markup since the last piece of text
            out.SkipTo(pos)
            out.PrintUntil(end_pos)

        elif tok_id == CharEntity:  # &amp;
            # Strip the & and the ;
            entity = s[pos+1 : end_pos-1]

            out.SkipTo(pos)
            out.Print(CHAR_ENTITY[entity])
            out.SkipTo(end_pos)

        # Not handling these yet
        elif tok_id == HexChar:
            raise AssertionError('Hex Char %r' % s[pos : pos + 20])

        elif tok_id == DecChar:
            raise AssertionError('Dec Char %r' % s[pos : pos + 20])

        pos = end_pos

    # Every token has been handled above.  Don't call out.PrintTheRest() here:
    # it would copy the markup that follows the last piece of text, e.g. the
    # </b> in '<b>x</b>'.
    return f.getvalue()
415
416