#!/usr/bin/env python2
from __future__ import print_function
"""
help_gen.py

Ideas for HTML -> ANSI converter:

- `ls` -> <code>ls</code> -> is reverse video?
- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
  - could also be bright blue
- <pre> is also indented 4 spaces, like the markdown
- red X <span class="X">X</span>

- comments in code examples could be green?

What about:

- headings h1, h2, h3, h4
  - Right now cards use reverse video.  Centering didn't look great.

- <ul> - you could use a Unicode bullet here
- <ol>

Word wrapping?  troff/groff doesn't do it, but they do this weird right-justify
thing.


- maybe you could have a prefix for a linked word?
  - or use [] ?
  - [SIGTERM]
  - ^SIGTERM
  .SIGTERM
  X .SIGTERM
  X @DIRSTACK
  .for .while .if

Definition lists would be nice:
  $?   exit status
  $0   first etc.
"""

import cStringIO
import HTMLParser
import os
import pprint
import re
import sys

from doctools import html_lib
from doctools.util import log
from lazylex import html


# Sections have alphanumeric characters, spaces, and '/' for I/O.  They are
# turned into anchors.
SECTION_RE = re.compile(r'''
  \s*
  \[
  ([a-zA-Z0-9 /:]+)  # colon for ysh:upgrade
  \]
''', re.VERBOSE)

TOPIC_RE = re.compile(r'''
  (X[ ])?             # optional deprecation symbol X, then a single space
  @?                  # optional @array, e.g. @BASH_SOURCE
  ([a-zA-Z0-9_\-:]+)  # e.g. osh-usage, all:oil, BASH_REMATCH
  ( [ ]\S+            # optional: single space then punctuation
    |
    \(\)              # or func()
  )?
  ([ ][ ][ ])?        # three spaces means we should keep highlighting
''', re.VERBOSE)
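
# Example of what TOPIC_RE matches (illustrative comment; 'fork' and 'forkwait'
# are just sample topic names): given the fragment 'X fork   forkwait', the
# first match has group(1) = 'X ' (the X marker, rendered dark red below),
# group(2) = 'fork' (the topic to link), and group(4) = '   ' (three spaces,
# so highlighting continues with 'forkwait' on the next match).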


def _StringToHref(s):
  # lower case to match what doctools/cmark.py does
  return s.lower().replace(' ', '-')


# HACK HACK: These happen to have 3 spaces before them!
_NOT_A_TOPIC = ['compatible', 'egrep']

# BUGS:
# - Continuation lines: hacked with ...
# - Some X before punctuation aren't highlighted

X_LEFT_SPAN = '<span style="color: darkred">'

def IndexLineToHtml(chapter, line, debug_out):
  """Convert a line of text to HTML.

  Topics are highlighted and X made red.

  Args:
    line: RAW SPAN of HTML that is already escaped.

  Returns:
    The HTML with some tags inserted.
  """
  f = cStringIO.StringIO()
  out = html.Output(line, f)

  html_page = 'chap-%s.html' % chapter

  pos = 0  # position within line

  section_impl = True

  if line.startswith('X '):
    out.Print(X_LEFT_SPAN)
    out.PrintUntil(2)
    out.Print('</span>')
    pos = 2
    section_impl = False
  elif line.startswith('  '):
    pos = 2
  else:
    return line

  # Highlight [Section] at the start of a line.
  m = SECTION_RE.match(line, pos)
  if m:
    section_name = m.group(1)
    #href = _StringToHref(section_name)
    href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

    out.PrintUntil(m.start(1))
    out.Print('<a href="%s#%s" class="level2">' % (html_page, href))
    out.PrintUntil(m.end(1))  # anchor
    out.Print('</a>')

    pos = m.end(0)  # ADVANCE
  else:
    section_name = None

  line_info = {'section': section_name, 'impl': section_impl, 'topics': []}
  debug_out.append(line_info)

  _WHITESPACE = re.compile(r'[ ]+')
  m = _WHITESPACE.match(line, pos)
  assert m, 'Expected whitespace %r' % line

  pos = m.end(0)

  done = False
  while not done:
    # Now just match one
    m = TOPIC_RE.match(line, pos)
    if not m or m.group(2) in _NOT_A_TOPIC:
      break

    topic_impl = True
    if m.group(1):
      out.PrintUntil(m.start(1))
      out.Print(X_LEFT_SPAN)
      out.PrintUntil(m.end(1))
      out.Print('</span>')
      topic_impl = False

    # The linked topic
    topic = m.group(2)
    line_info['topics'].append((topic, topic_impl))

    out.PrintUntil(m.start(2))
    out.Print('<a href="%s#%s">' % (html_page, topic))
    out.PrintUntil(m.end(2))
    out.Print('</a>')

    # Trailing 3 spaces required to continue.
    if not m.group(4):
      done = True

    pos = m.end(0)

  out.PrintTheRest()

  return f.getvalue()
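
# Illustrative usage (added comment; the index line below is made up):
#
#   debug_out = []
#   html_line = IndexLineToHtml('osh', '  [Commands]   simple-command', debug_out)
#
# The result wraps [Commands] in an <a class="level2"> link to the matching
# anchor in chap-osh.html, and wraps 'simple-command' in a link to
# chap-osh.html#simple-command.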


class Splitter(HTMLParser.HTMLParser):
  """Split an HTML stream starting at each of the heading tags.

  For *-help.html.

  TODO: Rewrite this with lazylex!

  Algorithm:
  - ExtractBody() first, then match balanced tags
  - SPLIT by h2, h3, h4
  - Match <pre><code> blocks and re-indent
  - Later:
    - links <a href="">
    - `` is turned into inline <code></code>
    - ** ** for bold
    - * * for emphasis
    - <p> needs word wrapping!  Oops.
      - actually cmark seems to preserve this?  OK maybe not.
      - we just need space between <p>
  """
  def __init__(self, heading_tags, out):
    HTMLParser.HTMLParser.__init__(self)
    self.heading_tags = heading_tags
    self.out = out

    self.cur_group = None  # type: List[Tuple[str, str, List, List]]
    self.in_heading = False

    self.indent = 0

  def log(self, msg, *args):
    ind = self.indent * ' '
    if 0:
      log(ind + msg, *args)

  def handle_starttag(self, tag, attrs):
    if tag in self.heading_tags:
      self.in_heading = True
      if self.cur_group:
        self.out.append(self.cur_group)

      self.cur_group = (tag, attrs, [], [])

    self.log('[%d] <> %s %s', self.indent, tag, attrs)
    self.indent += 1

  def handle_endtag(self, tag):
    if tag in self.heading_tags:
      self.in_heading = False

    self.log('[%d] </> %s', self.indent, tag)
    self.indent -= 1

  def handle_entityref(self, name):
    """
    From Python docs:
    This method is called to process a named character reference of the form
    &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
    """
    c = html.CHAR_ENTITY[name]
    if self.in_heading:
      self.cur_group[2].append(c)
    else:
      if self.cur_group:
        self.cur_group[3].append(c)

  def handle_data(self, data):
    self.log('data %r', data)
    if self.in_heading:
      self.cur_group[2].append(data)
    else:
      if self.cur_group:
        self.cur_group[3].append(data)

  def end(self):
    if self.cur_group:
      self.out.append(self.cur_group)

    # Maybe detect nesting?
    if self.indent != 0:
      raise RuntimeError(
          'Unbalanced HTML tags: indent=%d, cur_group=%s' % (
              self.indent, self.cur_group))
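
# Minimal usage sketch (added comment; this mirrors what SplitIntoCards()
# below already does):
#
#   groups = []
#   sp = Splitter(['h2', 'h3', 'h4'], groups)
#   sp.feed(contents)  # contents must be balanced HTML, e.g. from ExtractBody()
#   sp.end()
#   # groups now holds (tag, attrs, heading_parts, body_parts) tuples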


def ExtractBody(s):
  """Extract what's in between <body></body>

  The splitter needs balanced tags, and what's in <head> isn't balanced.
  """
  f = cStringIO.StringIO()
  out = html.Output(s, f)
  tag_lexer = html.TagLexer(s)

  pos = 0
  it = html.ValidTokens(s)
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break

    if tok_id == html.StartTag:
      tag_lexer.Reset(pos, end_pos)
      if tag_lexer.TagName() == 'body':
        body_start_right = end_pos  # right after <body>

        out.SkipTo(body_start_right)
        body_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'body')

        out.PrintUntil(body_end_left)
        break

    pos = end_pos

  return f.getvalue()
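
# Roughly: given '<html><head>(unbalanced stuff)</head><body><p>hi</p></body>',
# ExtractBody() returns just '<p>hi</p>', i.e. what's between <body> and
# </body>.  (Illustrative comment, not in the original source.)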


def SplitIntoCards(heading_tags, contents):
  contents = ExtractBody(contents)

  groups = []
  sp = Splitter(heading_tags, groups)
  sp.feed(contents)
  sp.end()

  for tag, attrs, heading_parts, parts in groups:
    heading = ''.join(heading_parts).strip()

    # Don't strip leading space?
    text = ''.join(parts)
    text = text.strip('\n') + '\n'

    #log('text = %r', text[:10])

    yield tag, attrs, heading, text

  #log('make_help.py: Parsed %d parts', len(groups))

def HelpTopics(s):
  """
  Given an HTML page like index-{osh,ysh}.html,

  Yield groups (section_id, section_name, block of text)
  """
  tag_lexer = html.TagLexer(s)

  pos = 0
  it = html.ValidTokens(s)
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break

    if tok_id == html.StartTag:
      tag_lexer.Reset(pos, end_pos)
      #log('%r', tag_lexer.TagString())
      #log('%r', tag_lexer.TagName())

      # Capture <h2 id="foo"> first
      if tag_lexer.TagName() == 'h2':
        h2_start_right = end_pos

        open_tag_right = end_pos
        section_id = tag_lexer.GetAttr('id')
        assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

        h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

        anchor_html = s[h2_start_right : h2_end_left]
        paren_pos = anchor_html.find('(')
        if paren_pos == -1:
          section_name = anchor_html
        else:
          section_name = anchor_html[: paren_pos].strip()

        # Now find the <code></code> span
        _, code_start_right = html.ReadUntilStartTag(it, tag_lexer, 'code')
        css_class = tag_lexer.GetAttr('class')
        assert css_class is not None
        assert css_class.startswith('language-chapter-links-'), tag_lexer.TagString()

        code_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'code')

        text = html.ToText(s, code_start_right, code_end_left)
        yield section_id, section_name, text

    pos = end_pos

class DocNode(object):
  """To visualize doc structure."""

  def __init__(self, name, attrs=None, text=None):
    self.name = name
    self.attrs = attrs  # for h2 and h3 links
    self.text = text
    self.children = []

def CardsFromIndex(sh, out_prefix):
  sections = []
  for section_id, section_name, text in HelpTopics(sys.stdin.read()):
    if 0:
      log('section_id = %r', section_id)
      log('section_name = %r', section_name)
      log('')
      #log('text = %r', text[:20])

    topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

    path = os.path.join(out_prefix, topic)
    with open(path, 'w') as f:
      f.write('%s\n\n' % section_name)  # section_id is printed dynamically
      f.write(text)
      #f.write('\n')  # extra
    log('  Wrote %s', path)
    sections.append(section_id)

  log('  (doctools/make_help) -> %d sections -> %s', len(sections), out_prefix)

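# Each card written above is a small text file: the section name, a blank
# line, then the section's topic text.  (Descriptive comment added for
# clarity; not in the original source.)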

def CardsFromChapters(out_dir, tag_level, paths):
  """
  Args:
    paths: list of chap-*.html to read
  """
  topics = {}

  root_node = DocNode('/')
  cur_h2_node = None

  seen = set()
  for path in paths:
    with open(path) as f:
      contents = f.read()

    filename = os.path.basename(path)

    tmp, _ = os.path.splitext(filename)
    assert tmp.startswith('chap-')
    chapter_name = tmp[len('chap-'):]

    page_node = DocNode(filename)

    cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

    for tag, attrs, heading, text in cards:
      values = [v for k, v in attrs if k == 'id']
      id_value = values[0] if len(values) == 1 else None

      topic_id = id_value if id_value else heading.replace(' ', '-')

      if tag == 'h2':
        name = html_lib.PrettyHref(heading, preserve_anchor_case=True)
        h2 = DocNode(name, attrs=attrs)
        page_node.children.append(h2)
        cur_h2_node = h2
      elif tag == 'h3':
        name = html_lib.PrettyHref(heading, preserve_anchor_case=True)
        # attach text so we can see which topics have empty bodies
        h3 = DocNode(name, attrs=attrs, text=text)
        cur_h2_node.children.append(h3)

      if tag != tag_level:
        continue  # we only care about h3 now

      if 0:
        log('tag = %r', tag)
        log('topic_id = %r', topic_id)
        log('heading = %r', heading)
        log('text = %r', text[:20])

      embed = ('oils-embed', '1') in attrs

      if out_dir is not None and embed:
        # indices start with _
        path = os.path.join(out_dir, topic_id)
        with open(path, 'w') as f:
          f.write(text)

      # help builtin will show URL if there's a chapter name
      topics[topic_id] = None if embed else chapter_name

      if topic_id in seen:
        log('Warning: %r is a duplicate topic', topic_id)
      seen.add(topic_id)

    root_node.children.append(page_node)

  num_sections = sum(len(child.children) for child in root_node.children)

  log('%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
      len(paths), len(topics), num_sections, out_dir)

  return topics, root_node
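
# Shape of the return value (illustrative comment; the topic and chapter names
# here are hypothetical):
#
#   topics = {
#       'some-builtin': 'builtin-cmd',  # lives in chap-builtin-cmd.html
#       '_some-index': None,            # embedded card, so no chapter URL
#   }
#   root_node = DocNode tree: '/' -> chap-*.html pages -> h2 -> h3 nodes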


class StrPool(object):
  def __init__(self):
    self.var_names = {}
    self.global_strs = []
    self.unique_id = 1

  def Add(self, s):
    if s in self.var_names:
      return

    var_name = 'gStr%d' % self.unique_id
    self.unique_id += 1

    import json
    # Use JSON as approximation for C++ string
    self.global_strs.append('GLOBAL_STR(%s, %s)' % (var_name, json.dumps(s)))

    self.var_names[s] = var_name

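# For example (illustrative, not in the original source):
#
#   pool = StrPool()
#   pool.Add('true')
#   # global_strs is now ['GLOBAL_STR(gStr1, "true")'] and
#   # var_names is {'true': 'gStr1'}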

def WriteTopicDict(topic_dict, header_f, cc_f):
  header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr*, BigStr*>* TopicMetadata();
}
''')

  pool = StrPool()

  for k, v in topic_dict.iteritems():
    pool.Add(k)
    if v is not None:
      pool.Add(v)
    #log('%s %s', k, v)

  num_items = len(topic_dict)
  key_names = []
  val_names = []

  for k, v in topic_dict.iteritems():
    key_names.append(pool.var_names[k])
    if v is None:
      v_str = 'nullptr'
    else:
      v_str = pool.var_names[v]
    val_names.append(v_str)

  cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr*, BigStr*, %d, {%s}, {%s});

Dict<BigStr*, BigStr*>* TopicMetadata() {
  return gTopics;
}
}
''' % ('\n'.join(pool.global_strs), num_items, ' COMMA '.join(key_names),
       ' COMMA '.join(val_names)))

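# Sketch of the generated C++ (illustrative comment; assumes a hypothetical
# topic_dict of {'true': 'builtin-cmd', '_abbrev': None}, iterated in that
# order):
#
#   GLOBAL_STR(gStr1, "true")
#   GLOBAL_STR(gStr2, "builtin-cmd")
#   GLOBAL_STR(gStr3, "_abbrev")
#   GLOBAL_DICT(gTopics, BigStr*, BigStr*, 2, {gStr1 COMMA gStr3}, {gStr2 COMMA nullptr});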

def main(argv):
  action = argv[1]

  if action == 'cards-from-index':
    sh = argv[2]  # osh or ysh
    out_prefix = argv[3]

    # Read HTML from stdin
    # TODO: could pass a list of files to speed it up
    CardsFromIndex(sh, out_prefix)

  elif action == 'cards-from-chapters':

    out_dir = argv[2]
    py_out = argv[3]
    cc_prefix = argv[4]
    pages = argv[5:]

    topic_dict, _ = CardsFromChapters(out_dir, 'h3', pages)

    # Write topic dict as Python and C++

    with open(py_out, 'w') as f:
      f.write('TOPICS = %s\n' % pprint.pformat(topic_dict))

      f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

    h_path = cc_prefix + '.h'
    cc_path = cc_prefix + '.cc'

    with open(h_path, 'w') as header_f:
      with open(cc_path, 'w') as cc_f:
        WriteTopicDict(topic_dict, header_f, cc_f)

  elif action == 'ref-check':
    from doctools import cmark
    from doctools import oils_doc
    from doctools import ref_check

    chapters = []
    all_toc_nodes = []

    for path in argv[2:]:
      filename = os.path.basename(path)

      if filename.endswith('.md'):
        assert filename.startswith('toc-'), path

        # First convert to HTML
        with open(path) as in_file:
          html = cmark.md2html(in_file.read())

        # Now highlight code, which gives debug output for the
        # language-chapter-links-*
        box_nodes = []
        html = oils_doc.HighlightCode(html, None,
                                      debug_out=box_nodes)
        all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

      elif filename.endswith('.html'):
        assert filename.startswith('chap-'), path
        chapters.append(path)

      else:
        raise RuntimeError('Expected toc-* or chap-*, got %r' % filename)

    topics, chap_tree = CardsFromChapters(None, 'h3', chapters)

    #log('%d chapters: %s', len(chapters), chapters[:5])
    #log('%d topics: %s', len(topics), topics.keys()[:10])
    log('')

    ref_check.Check(all_toc_nodes, chap_tree)

  else:
    raise RuntimeError('Invalid action %r' % action)

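# Illustrative command lines, based on the argv parsing in main() above
# (file and directory names are hypothetical):
#
#   help_gen.py cards-from-index osh _tmp/cards < index-osh.html
#   help_gen.py cards-from-chapters _tmp/cards topics.py help_meta chap-*.html
#   help_gen.py ref-check toc-*.md chap-*.html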

if __name__ == '__main__':
  try:
    main(sys.argv)
  except RuntimeError as e:
    print('FATAL: %s' % e, file=sys.stderr)
    sys.exit(1)


# vim: sw=2