#!/usr/bin/env python2
from __future__ import print_function
"""
help_gen.py

Ideas for HTML -> ANSI converter:

- `ls` -> <code>ls</code> -> is reverse video?
- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
  - could also be bright blue
- <pre> is also indented 4 spaces, like the markdown
- red X <span class="X">X</span>

- comments in code examples could be green?

What about:

- headings h1, h2, h3, h4
  - Right now cards use reverse video.  Centering didn't look great.

- <ul> - you could use a Unicode bullet here
- <ol>

Word wrapping?  troff/groff doesn't do it, but they do this weird right-justify
thing.


- maybe you could have a prefix for a linked word?
  - or use [] ?
  - [SIGTERM]
  - ^SIGTERM
  .SIGTERM
  X .SIGTERM
  X @DIRSTACK
  .for .while .if

Definition lists would be nice:
  $?   exit status
  $0   first etc.
"""

import cStringIO
import HTMLParser
import os
import pprint
import re
import sys

from doctools import html_lib
from doctools.util import log
from lazylex import html


# Sections have alphanumeric characters, spaces, and '/' for I/O.  They are
# turned into anchors.
SECTION_RE = re.compile(r'''
  \s*
  \[
  ([a-zA-Z0-9 /:]+)  # colon for ysh:upgrade
  \]
''', re.VERBOSE)

TOPIC_RE = re.compile(r'''
  (X[ ])?             # optional deprecation symbol X, then a single space
  @?                  # optional @array, e.g. @BASH_SOURCE
  ([a-zA-Z0-9_\-:]+)  # e.g. osh-usage, all:oil, BASH_REMATCH
  ( [ ]\S+            # optional: single space then punctuation
    |
    \(\)              # or func()
  )?
  ([ ][ ][ ])?        # three spaces means we should keep highlighting
''', re.VERBOSE)
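
# Example of what TOPIC_RE matches (illustrative comment; 'fork' and 'forkwait'
# are just sample topic names): given the fragment 'X fork   forkwait', the
# first match has group(1) = 'X ' (the X marker, rendered dark red below),
# group(2) = 'fork' (the topic to link), and group(4) = '   ' (three spaces,
# so highlighting continues with 'forkwait' on the next match).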


def _StringToHref(s):
  # lower case to match what doctools/cmark.py does
  return s.lower().replace(' ', '-')


# HACK HACK: These happen to have 3 spaces before them!
_NOT_A_TOPIC = ['compatible', 'egrep']

# BUGS:
# - Continuation lines: hacked with ...
# - Some X before punctuation aren't highlighted

X_LEFT_SPAN = '<span style="color: darkred">'

def IndexLineToHtml(chapter, line, debug_out):
  """Convert a line of text to HTML.

  Topics are highlighted and X made red.

  Args:
    line: RAW SPAN of HTML that is already escaped.

  Returns:
    The HTML with some tags inserted.
  """
  f = cStringIO.StringIO()
  out = html.Output(line, f)

  html_page = 'chap-%s.html' % chapter

  pos = 0  # position within line

  section_impl = True

  if line.startswith('X '):
    out.Print(X_LEFT_SPAN)
    out.PrintUntil(2)
    out.Print('</span>')
    pos = 2
    section_impl = False
  elif line.startswith('  '):
    pos = 2
  else:
    return line

  # Highlight [Section] at the start of a line.
  m = SECTION_RE.match(line, pos)
  if m:
    section_name = m.group(1)
    #href = _StringToHref(section_name)
    href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

    out.PrintUntil(m.start(1))
    out.Print('<a href="%s#%s" class="level2">' % (html_page, href))
    out.PrintUntil(m.end(1))  # anchor
    out.Print('</a>')

    pos = m.end(0)  # ADVANCE
  else:
    section_name = None

  line_info = {'section': section_name, 'impl': section_impl, 'topics': []}
  debug_out.append(line_info)

  _WHITESPACE = re.compile(r'[ ]+')
  m = _WHITESPACE.match(line, pos)
  assert m, 'Expected whitespace %r' % line

  pos = m.end(0)

  done = False
  while not done:
    # Now just match one
    m = TOPIC_RE.match(line, pos)
    if not m or m.group(2) in _NOT_A_TOPIC:
      break

    topic_impl = True
    if m.group(1):
      out.PrintUntil(m.start(1))
      out.Print(X_LEFT_SPAN)
      out.PrintUntil(m.end(1))
      out.Print('</span>')
      topic_impl = False

    # The linked topic
    topic = m.group(2)
    line_info['topics'].append((topic, topic_impl))

    out.PrintUntil(m.start(2))
    out.Print('<a href="%s#%s">' % (html_page, topic))
    out.PrintUntil(m.end(2))
    out.Print('</a>')

    # Trailing 3 spaces required to continue.
    if not m.group(4):
      done = True

    pos = m.end(0)

  out.PrintTheRest()

  return f.getvalue()
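
# Illustrative usage (added comment; the index line below is made up):
#
#   debug_out = []
#   html_line = IndexLineToHtml('osh', '  [Commands]   simple-command', debug_out)
#
# The result wraps [Commands] in an <a class="level2"> link to the matching
# anchor in chap-osh.html, and wraps 'simple-command' in a link to
# chap-osh.html#simple-command.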


class Splitter(HTMLParser.HTMLParser):
  """Split an HTML stream starting at each of the heading tags.

  For *-help.html.

  TODO: Rewrite this with lazylex!

  Algorithm:
  - ExtractBody() first, then match balanced tags
  - SPLIT by h2, h3, h4
  - Match <pre><code> blocks and re-indent
  - Later:
    - links <a href="">
    - `` is turned into inline <code></code>
    - ** ** for bold
    - * * for emphasis
    - <p> needs word wrapping!  Oops.
      - actually cmark seems to preserve this?  OK maybe not.
      - we just need space between <p>
  """
  def __init__(self, heading_tags, out):
    HTMLParser.HTMLParser.__init__(self)
    self.heading_tags = heading_tags
    self.out = out

    self.cur_group = None  # type: List[Tuple[str, str, List, List]]
    self.in_heading = False

    self.indent = 0

  def log(self, msg, *args):
    ind = self.indent * ' '
    if 0:
      log(ind + msg, *args)

  def handle_starttag(self, tag, attrs):
    if tag in self.heading_tags:
      self.in_heading = True
      if self.cur_group:
        self.out.append(self.cur_group)

      self.cur_group = (tag, attrs, [], [])

    self.log('[%d] <> %s %s', self.indent, tag, attrs)
    self.indent += 1

  def handle_endtag(self, tag):
    if tag in self.heading_tags:
      self.in_heading = False

    self.log('[%d] </> %s', self.indent, tag)
    self.indent -= 1

  def handle_entityref(self, name):
    """
    From Python docs:
    This method is called to process a named character reference of the form
    &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
    """
    c = html.CHAR_ENTITY[name]
    if self.in_heading:
      self.cur_group[2].append(c)
    else:
      if self.cur_group:
        self.cur_group[3].append(c)

  def handle_data(self, data):
    self.log('data %r', data)
    if self.in_heading:
      self.cur_group[2].append(data)
    else:
      if self.cur_group:
        self.cur_group[3].append(data)

  def end(self):
    if self.cur_group:
      self.out.append(self.cur_group)

    # Maybe detect nesting?
    if self.indent != 0:
      raise RuntimeError(
          'Unbalanced HTML tags: indent=%d, cur_group=%s' % (
              self.indent, self.cur_group))
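
# Minimal usage sketch (added comment; this mirrors what SplitIntoCards()
# below already does):
#
#   groups = []
#   sp = Splitter(['h2', 'h3', 'h4'], groups)
#   sp.feed(contents)  # contents must be balanced HTML, e.g. from ExtractBody()
#   sp.end()
#   # groups now holds (tag, attrs, heading_parts, body_parts) tuples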


def ExtractBody(s):
  """Extract what's in between <body></body>

  The splitter needs balanced tags, and what's in <head> isn't balanced.
  """
  f = cStringIO.StringIO()
  out = html.Output(s, f)
  tag_lexer = html.TagLexer(s)

  pos = 0
  it = html.ValidTokens(s)
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break

    if tok_id == html.StartTag:
      tag_lexer.Reset(pos, end_pos)
      if tag_lexer.TagName() == 'body':
        body_start_right = end_pos  # right after <body>

        out.SkipTo(body_start_right)
        body_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'body')

        out.PrintUntil(body_end_left)
        break

    pos = end_pos

  return f.getvalue()
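
# Roughly: given '<html><head>(unbalanced stuff)</head><body><p>hi</p></body>',
# ExtractBody() returns just '<p>hi</p>', i.e. what's between <body> and
# </body>.  (Illustrative comment, not in the original source.)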


def SplitIntoCards(heading_tags, contents):
  contents = ExtractBody(contents)

  groups = []
  sp = Splitter(heading_tags, groups)
  sp.feed(contents)
  sp.end()

  for tag, attrs, heading_parts, parts in groups:
    heading = ''.join(heading_parts).strip()

    # Don't strip leading space?
    text = ''.join(parts)
    text = text.strip('\n') + '\n'

    #log('text = %r', text[:10])

    yield tag, attrs, heading, text

  #log('make_help.py: Parsed %d parts', len(groups))

def HelpTopics(s):
  """
  Given an HTML page like index-{osh,ysh}.html,

  Yield groups (section_id, section_name, block of text)
  """
  tag_lexer = html.TagLexer(s)

  pos = 0
  it = html.ValidTokens(s)
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break

    if tok_id == html.StartTag:
      tag_lexer.Reset(pos, end_pos)
      #log('%r', tag_lexer.TagString())
      #log('%r', tag_lexer.TagName())

      # Capture <h2 id="foo"> first
      if tag_lexer.TagName() == 'h2':
        h2_start_right = end_pos

        open_tag_right = end_pos
        section_id = tag_lexer.GetAttr('id')
        assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

        h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

        anchor_html = s[h2_start_right : h2_end_left]
        paren_pos = anchor_html.find('(')
        if paren_pos == -1:
          section_name = anchor_html
        else:
          section_name = anchor_html[: paren_pos].strip()

        # Now find the <code></code> span
        _, code_start_right = html.ReadUntilStartTag(it, tag_lexer, 'code')
        css_class = tag_lexer.GetAttr('class')
        assert css_class is not None
        assert css_class.startswith('language-chapter-links-'), tag_lexer.TagString()

        code_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'code')

        text = html.ToText(s, code_start_right, code_end_left)
        yield section_id, section_name, text

    pos = end_pos

class DocNode(object):
  """To visualize doc structure."""

  def __init__(self, name, attrs=None, text=None):
    self.name = name
    self.attrs = attrs  # for h2 and h3 links
    self.text = text
    self.children = []

def CardsFromIndex(sh, out_prefix):
  sections = []
  for section_id, section_name, text in HelpTopics(sys.stdin.read()):
    if 0:
      log('section_id = %r', section_id)
      log('section_name = %r', section_name)
      log('')
      #log('text = %r', text[:20])

    topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

    path = os.path.join(out_prefix, topic)
    with open(path, 'w') as f:
      f.write('%s\n\n' % section_name)  # section_id is printed dynamically
      f.write(text)
      #f.write('\n')  # extra
    log('  Wrote %s', path)
    sections.append(section_id)

  log('  (doctools/make_help) -> %d sections -> %s', len(sections), out_prefix)

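# Each card written above is a small text file: the section name, a blank
# line, then the section's topic text.  (Descriptive comment added for
# clarity; not in the original source.)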

def CardsFromChapters(out_dir, tag_level, paths):
  """
  Args:
    paths: list of chap-*.html to read
  """
  topics = {}

  root_node = DocNode('/')
  cur_h2_node = None

  seen = set()
  for path in paths:
    with open(path) as f:
      contents = f.read()

    filename = os.path.basename(path)

    tmp, _ = os.path.splitext(filename)
    assert tmp.startswith('chap-')
    chapter_name = tmp[len('chap-'):]

    page_node = DocNode(filename)

    cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

    for tag, attrs, heading, text in cards:
      values = [v for k, v in attrs if k == 'id']
      id_value = values[0] if len(values) == 1 else None

      topic_id = id_value if id_value else heading.replace(' ', '-')

      if tag == 'h2':
        name = html_lib.PrettyHref(heading, preserve_anchor_case=True)
        h2 = DocNode(name, attrs=attrs)
        page_node.children.append(h2)
        cur_h2_node = h2
      elif tag == 'h3':
        name = html_lib.PrettyHref(heading, preserve_anchor_case=True)
        # attach text so we can see which topics have empty bodies
        h3 = DocNode(name, attrs=attrs, text=text)
        cur_h2_node.children.append(h3)

      if tag != tag_level:
        continue  # we only care about h3 now

      if 0:
        log('tag = %r', tag)
        log('topic_id = %r', topic_id)
        log('heading = %r', heading)
        log('text = %r', text[:20])

      embed = ('oils-embed', '1') in attrs

      if out_dir is not None and embed:
        # indices start with _
        path = os.path.join(out_dir, topic_id)
        with open(path, 'w') as f:
          f.write(text)

      # help builtin will show URL if there's a chapter name
      topics[topic_id] = None if embed else chapter_name

      if topic_id in seen:
        log('Warning: %r is a duplicate topic', topic_id)
      seen.add(topic_id)

    root_node.children.append(page_node)

  num_sections = sum(len(child.children) for child in root_node.children)

  log('%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
      len(paths), len(topics), num_sections, out_dir)

  return topics, root_node
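
# Shape of the return value (illustrative comment; the topic and chapter names
# here are hypothetical):
#
#   topics = {
#       'some-builtin': 'builtin-cmd',  # lives in chap-builtin-cmd.html
#       '_some-index': None,            # embedded card, so no chapter URL
#   }
#   root_node = DocNode tree: '/' -> chap-*.html pages -> h2 -> h3 nodes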


class StrPool(object):
  def __init__(self):
    self.var_names = {}
    self.global_strs = []
    self.unique_id = 1

  def Add(self, s):
    if s in self.var_names:
      return

    var_name = 'gStr%d' % self.unique_id
    self.unique_id += 1

    import json
    # Use JSON as approximation for C++ string
    self.global_strs.append('GLOBAL_STR(%s, %s)' % (var_name, json.dumps(s)))

    self.var_names[s] = var_name

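# For example (illustrative, not in the original source):
#
#   pool = StrPool()
#   pool.Add('true')
#   # global_strs is now ['GLOBAL_STR(gStr1, "true")'] and
#   # var_names is {'true': 'gStr1'}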

def WriteTopicDict(topic_dict, header_f, cc_f):
  header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr*, BigStr*>* TopicMetadata();
}
''')

  pool = StrPool()

  for k, v in topic_dict.iteritems():
    pool.Add(k)
    if v is not None:
      pool.Add(v)
    #log('%s %s', k, v)

  num_items = len(topic_dict)
  key_names = []
  val_names = []

  for k, v in topic_dict.iteritems():
    key_names.append(pool.var_names[k])
    if v is None:
      v_str = 'nullptr'
    else:
      v_str = pool.var_names[v]
    val_names.append(v_str)

  cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr*, BigStr*, %d, {%s}, {%s});

Dict<BigStr*, BigStr*>* TopicMetadata() {
  return gTopics;
}
}
''' % ('\n'.join(pool.global_strs), num_items, ' COMMA '.join(key_names),
       ' COMMA '.join(val_names)))

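# Sketch of the generated C++ (illustrative comment; assumes a hypothetical
# topic_dict of {'true': 'builtin-cmd', '_abbrev': None}, iterated in that
# order):
#
#   GLOBAL_STR(gStr1, "true")
#   GLOBAL_STR(gStr2, "builtin-cmd")
#   GLOBAL_STR(gStr3, "_abbrev")
#   GLOBAL_DICT(gTopics, BigStr*, BigStr*, 2, {gStr1 COMMA gStr3}, {gStr2 COMMA nullptr});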

def main(argv):
  action = argv[1]

  if action == 'cards-from-index':
    sh = argv[2]  # osh or ysh
    out_prefix = argv[3]

    # Read HTML from stdin
    # TODO: could pass a list of files to speed it up
    CardsFromIndex(sh, out_prefix)

  elif action == 'cards-from-chapters':

    out_dir = argv[2]
    py_out = argv[3]
    cc_prefix = argv[4]
    pages = argv[5:]

    topic_dict, _ = CardsFromChapters(out_dir, 'h3', pages)

    # Write topic dict as Python and C++

    with open(py_out, 'w') as f:
      f.write('TOPICS = %s\n' % pprint.pformat(topic_dict))

      f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

    h_path = cc_prefix + '.h'
    cc_path = cc_prefix + '.cc'

    with open(h_path, 'w') as header_f:
      with open(cc_path, 'w') as cc_f:
        WriteTopicDict(topic_dict, header_f, cc_f)

  elif action == 'ref-check':
    from doctools import cmark
    from doctools import oils_doc
    from doctools import ref_check

    chapters = []
    all_toc_nodes = []

    for path in argv[2:]:
      filename = os.path.basename(path)

      if filename.endswith('.md'):
        assert filename.startswith('toc-'), path

        # First convert to HTML
        with open(path) as in_file:
          html = cmark.md2html(in_file.read())

        # Now highlight code, which gives debug output for the
        # language-chapter-links-*
        box_nodes = []
        html = oils_doc.HighlightCode(html, None,
                                      debug_out=box_nodes)
        all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

      elif filename.endswith('.html'):
        assert filename.startswith('chap-'), path
        chapters.append(path)

      else:
        raise RuntimeError('Expected toc-* or chap-*, got %r' % filename)

    topics, chap_tree = CardsFromChapters(None, 'h3', chapters)

    #log('%d chapters: %s', len(chapters), chapters[:5])
    #log('%d topics: %s', len(topics), topics.keys()[:10])
    log('')

    ref_check.Check(all_toc_nodes, chap_tree)

  else:
    raise RuntimeError('Invalid action %r' % action)

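# Illustrative command lines, based on the argv parsing in main() above
# (file and directory names are hypothetical):
#
#   help_gen.py cards-from-index osh _tmp/cards < index-osh.html
#   help_gen.py cards-from-chapters _tmp/cards topics.py help_meta chap-*.html
#   help_gen.py ref-check toc-*.md chap-*.html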

if __name__ == '__main__':
  try:
    main(sys.argv)
  except RuntimeError as e:
    print('FATAL: %s' % e, file=sys.stderr)
    sys.exit(1)


# vim: sw=2