#!/usr/bin/env python2
from __future__ import print_function
"""
help_gen.py

Ideas for HTML -> ANSI converter:

- `ls` -> <code>ls</code> -> is reverse video?
- [link]() -> <a href=""> -> underlined, and then add a number to the bottom?
  - could also be bright blue
- <pre> is also indented 4 spaces, like the markdown
- red X <span class="X">X</span>

- comments in code examples could be green?

What about:

- headings h1, h2, h3, h4
  - Right now cards use reverse video.  Centering didn't look great.

- <ul> - you could use a Unicode bullet here
- <ol>

Word wrapping?  troff/groff doesn't do it, but they do this weird right-justify
thing.


- maybe you could have a prefix for a linked word?
  - or use [] ?
  - [SIGTERM]
  - ^SIGTERM
  .SIGTERM
  X .SIGTERM
  X @DIRSTACK
  .for .while .if

Definition lists would be nice:
  $?   exit status
  $0   first etc.
"""

import cStringIO
import HTMLParser
import os
import pprint
import re
import sys

from doctools import html_lib
from doctools.util import log
from lazylex import html


# Sections have alphanumeric characters, spaces, '/' for I/O, and ':' for
# ysh:upgrade.  They are turned into anchors.
SECTION_RE = re.compile(r'''
  \s*
  \[
  ([a-zA-Z0-9 /:]+)   # colon for ysh:upgrade
  \]
''', re.VERBOSE)

TOPIC_RE = re.compile(r'''
  (X[ ])?             # optional deprecation symbol X, then a single space
  @?                  # optional @array, e.g. @BASH_SOURCE
  ([a-zA-Z0-9_\-:]+)  # e.g. osh-usage, all:oil, BASH_REMATCH
  ( [ ]\S+            # optional: single space then punctuation
    |
    \(\)              # or func()
  )?
  ([ ][ ][ ])?        # three spaces means we should keep highlighting
''', re.VERBOSE)
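
# These regexes are meant to parse index lines that look roughly like the
# following (hypothetical example; the real index-*.html content may differ):
#
#   '  [Pattern]   glob   extglob   X regex'
#
# SECTION_RE captures the section name 'Pattern'; TOPIC_RE is then applied
# repeatedly to capture 'glob', 'extglob', and 'regex' (the last one marked
# with X, shown in red).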


def _StringToHref(s):
  # lower case to match what doctools/cmark.py does
  return s.lower().replace(' ', '-')


# HACK HACK: These happen to have 3 spaces before them!
_NOT_A_TOPIC = ['compatible', 'egrep']

# BUGS:
# - Continuation lines: hacked with ...
# - Some X before punctuation aren't highlighted

X_LEFT_SPAN = '<span style="color: darkred">'


def IndexLineToHtml(chapter, line, debug_out):
  """Convert a line of text to HTML.

  Topics are highlighted and X made red.

  Args:
    chapter: chapter name; links point at chap-<chapter>.html
    line: RAW SPAN of HTML that is already escaped.
    debug_out: list that a per-line info dict is appended to.

  Returns:
    The HTML with some tags inserted.
  """
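  # Rough sketch of the transformation, with chapter='osh' and a hypothetical
  # input line:
  #
  #   '  [Pattern]   glob   extglob'
  #
  # becomes approximately:
  #
  #   '  [<a href="chap-osh.html#Pattern" class="level2">Pattern</a>]'
  #   '   <a href="chap-osh.html#glob">glob</a>   ...'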
  f = cStringIO.StringIO()
  out = html.Output(line, f)

  html_page = 'chap-%s.html' % chapter

  pos = 0  # position within line

  section_impl = True

  if line.startswith('X '):
    out.Print(X_LEFT_SPAN)
    out.PrintUntil(2)
    out.Print('</span>')
    pos = 2
    section_impl = False
  elif line.startswith('  '):
    pos = 2
  else:
    return line

  # Highlight [Section] at the start of a line.
  m = SECTION_RE.match(line, pos)
  if m:
    section_name = m.group(1)
    #href = _StringToHref(section_name)
    href = html_lib.PrettyHref(section_name, preserve_anchor_case=True)

    out.PrintUntil(m.start(1))
    out.Print('<a href="%s#%s" class="level2">' % (html_page, href))
    out.PrintUntil(m.end(1))  # anchor
    out.Print('</a>')

    pos = m.end(0)  # ADVANCE
  else:
    section_name = None

  line_info = {'section': section_name, 'impl': section_impl, 'topics': []}
  debug_out.append(line_info)

  _WHITESPACE = re.compile(r'[ ]+')
  m = _WHITESPACE.match(line, pos)
  assert m, 'Expected whitespace %r' % line

  pos = m.end(0)

  done = False
  while not done:
    # Now just match one topic
    m = TOPIC_RE.match(line, pos)
    if not m or m.group(2) in _NOT_A_TOPIC:
      break

    topic_impl = True
    if m.group(1):  # the X prefix
      out.PrintUntil(m.start(1))
      out.Print(X_LEFT_SPAN)
      out.PrintUntil(m.end(1))
      out.Print('</span>')
      topic_impl = False

    # The linked topic
    topic = m.group(2)
    line_info['topics'].append((topic, topic_impl))

    out.PrintUntil(m.start(2))
    out.Print('<a href="%s#%s">' % (html_page, topic))
    out.PrintUntil(m.end(2))
    out.Print('</a>')

    # Trailing 3 spaces required to continue.
    if not m.group(4):
      done = True

    pos = m.end(0)

  out.PrintTheRest()

  return f.getvalue()


class Splitter(HTMLParser.HTMLParser):
  """Split an HTML stream starting at each of the heading tags.

  For *-help.html.

  TODO: Rewrite this with lazylex!

  Algorithm:
  - ExtractBody() first, then match balanced tags
  - SPLIT by h2, h3, h4
  - Match <pre><code> blocks and re-indent
  - Later:
    - links <a href="">
    - `` is turned into inline <code></code>
    - ** ** for bold
    - * * for emphasis
    - <p> needs word wrapping!  Oops.
      - actually cmark seems to preserve this?  OK maybe not.
      - we just need space between <p>
  """
  def __init__(self, heading_tags, out):
    HTMLParser.HTMLParser.__init__(self)
    self.heading_tags = heading_tags
    self.out = out

    self.cur_group = None  # type: List[Tuple[str, str, List, List]]
    self.in_heading = False

    self.indent = 0

  def log(self, msg, *args):
    ind = self.indent * ' '
    if 0:
      log(ind + msg, *args)

  def handle_starttag(self, tag, attrs):
    if tag in self.heading_tags:
      self.in_heading = True
      if self.cur_group:
        self.out.append(self.cur_group)

      self.cur_group = (tag, attrs, [], [])

    self.log('[%d] <> %s %s', self.indent, tag, attrs)
    self.indent += 1

  def handle_endtag(self, tag):
    if tag in self.heading_tags:
      self.in_heading = False

    self.log('[%d] </> %s', self.indent, tag)
    self.indent -= 1

  def handle_entityref(self, name):
    """
    From Python docs:
    This method is called to process a named character reference of the form
    &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
    """
    c = html.CHAR_ENTITY[name]
    if self.in_heading:
      self.cur_group[2].append(c)
    else:
      if self.cur_group:
        self.cur_group[3].append(c)

  def handle_data(self, data):
    self.log('data %r', data)
    if self.in_heading:
      self.cur_group[2].append(data)
    else:
      if self.cur_group:
        self.cur_group[3].append(data)

  def end(self):
    if self.cur_group:
      self.out.append(self.cur_group)

    # Maybe detect nesting?
    if self.indent != 0:
      raise RuntimeError(
          'Unbalanced HTML tags: indent=%d, cur_group=%s' % (
          self.indent, self.cur_group))


def ExtractBody(s):
  """Extract what's in between <body></body>.

  The splitter needs balanced tags, and what's in <head> isn't balanced.
  """
  f = cStringIO.StringIO()
  out = html.Output(s, f)
  tag_lexer = html.TagLexer(s)

  pos = 0
  it = html.ValidTokens(s)
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break

    if tok_id == html.StartTag:
      tag_lexer.Reset(pos, end_pos)
      if tag_lexer.TagName() == 'body':
        body_start_right = end_pos  # right after <body>

        out.SkipTo(body_start_right)
        body_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'body')

        out.PrintUntil(body_end_left)
        break

    pos = end_pos

  return f.getvalue()


def SplitIntoCards(heading_tags, contents):
  contents = ExtractBody(contents)

  groups = []
  sp = Splitter(heading_tags, groups)
  sp.feed(contents)
  sp.end()

  for tag, attrs, heading_parts, parts in groups:
    heading = ''.join(heading_parts).strip()

    # Don't strip leading space?
    text = ''.join(parts)
    text = text.strip('\n') + '\n'

    #log('text = %r', text[:10])

    yield tag, attrs, heading, text

  #log('make_help.py: Parsed %d parts', len(groups))
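
  # Note: each yielded card is roughly of this shape (hypothetical values):
  #
  #   ('h3', [('id', 'append')], 'append', 'Append a string to a List.\n')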


def HelpTopics(s):
  """
  Given an HTML page like index-{osh,ysh}.html, yield groups of
  (section_id, section_name, block of text).
  """
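  # The index HTML is assumed to look roughly like this (hypothetical names):
  #
  #   <h2 id="front-end">Front End (<a href="chap-front-end.html">chapter</a>)</h2>
  #   ...
  #   <pre><code class="language-chapter-links-front-end">
  #   [Usage]   osh-usage   config
  #   </code></pre>
  #
  # The section name is the <h2> text up to the first '(', and the topic text
  # comes from the <code> block.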
  tag_lexer = html.TagLexer(s)

  pos = 0
  it = html.ValidTokens(s)
  while True:
    try:
      tok_id, end_pos = next(it)
    except StopIteration:
      break

    if tok_id == html.StartTag:
      tag_lexer.Reset(pos, end_pos)
      #log('%r', tag_lexer.TagString())
      #log('%r', tag_lexer.TagName())

      # Capture <h2 id="foo"> first
      if tag_lexer.TagName() == 'h2':
        h2_start_right = end_pos

        open_tag_right = end_pos
        section_id = tag_lexer.GetAttr('id')
        assert section_id, 'Expected id= in %r' % tag_lexer.TagString()

        h2_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'h2')

        anchor_html = s[h2_start_right : h2_end_left]
        paren_pos = anchor_html.find('(')
        if paren_pos == -1:
          section_name = anchor_html
        else:
          section_name = anchor_html[: paren_pos].strip()

        # Now find the <code></code> span
        _, code_start_right = html.ReadUntilStartTag(it, tag_lexer, 'code')
        css_class = tag_lexer.GetAttr('class')
        assert css_class is not None
        assert css_class.startswith('language-chapter-links-'), tag_lexer.TagString()

        code_end_left, _ = html.ReadUntilEndTag(it, tag_lexer, 'code')

        text = html.ToText(s, code_start_right, code_end_left)
        yield section_id, section_name, text

    pos = end_pos


class DocNode(object):
  """To visualize doc structure."""

  def __init__(self, name, attrs=None, text=None):
    self.name = name
    self.attrs = attrs  # for h2 and h3 links
    self.text = text
    self.children = []


def CardsFromIndex(sh, out_prefix):
  sections = []
  for section_id, section_name, text in HelpTopics(sys.stdin.read()):
    if 0:
      log('section_id = %r', section_id)
      log('section_name = %r', section_name)
      log('')
      #log('text = %r', text[:20])

    topic = '%s-%s' % (sh, section_id)  # e.g. ysh-overview

    path = os.path.join(out_prefix, topic)
    with open(path, 'w') as f:
      f.write('%s\n\n' % section_name)  # section_id is printed dynamically
      f.write(text)
      #f.write('\n')  # extra
    log('  Wrote %s', path)
    sections.append(section_id)

  log('  (doctools/make_help) -> %d sections -> %s', len(sections), out_prefix)


def CardsFromChapters(out_dir, tag_level, paths):
  """
  Args:
    out_dir: directory to write embedded cards to, or None to skip writing
    tag_level: heading tag that defines a card, e.g. 'h3'
    paths: list of chap-*.html to read
  """
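  # Returns (topics, root_node):
  #
  # - topics maps topic_id -> chapter name (the help builtin shows a URL), or
  #   None for cards embedded via the 'oils-embed' attribute
  # - root_node is a DocNode tree over pages / <h2> / <h3>, used by ref-check
  #
  # Sketch of the topics dict, with hypothetical IDs:
  #
  #   {'append': 'type-method', 'some-embedded-topic': None, ...}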
  topics = {}

  root_node = DocNode('/')
  cur_h2_node = None

  seen = set()
  for path in paths:
    with open(path) as f:
      contents = f.read()

    filename = os.path.basename(path)

    tmp, _ = os.path.splitext(filename)
    assert tmp.startswith('chap-')
    chapter_name = tmp[len('chap-'):]

    page_node = DocNode(filename)

    cards = SplitIntoCards(['h2', 'h3', 'h4'], contents)

    for tag, attrs, heading, text in cards:
      values = [v for k, v in attrs if k == 'id']
      id_value = values[0] if len(values) == 1 else None

      topic_id = id_value if id_value else heading.replace(' ', '-')

      if tag == 'h2':
        name = html_lib.PrettyHref(heading, preserve_anchor_case=True)
        h2 = DocNode(name, attrs=attrs)
        page_node.children.append(h2)
        cur_h2_node = h2
      elif tag == 'h3':
        name = html_lib.PrettyHref(heading, preserve_anchor_case=True)
        # attach text so we can see which topics have empty bodies
        h3 = DocNode(name, attrs=attrs, text=text)
        cur_h2_node.children.append(h3)

      if tag != tag_level:
        continue  # we only care about h3 now

      if 0:
        log('tag = %r', tag)
        log('topic_id = %r', topic_id)
        log('heading = %r', heading)
        log('text = %r', text[:20])

      embed = ('oils-embed', '1') in attrs

      if out_dir is not None and embed:
        # indices start with _
        path = os.path.join(out_dir, topic_id)
        with open(path, 'w') as f:
          f.write(text)

      # help builtin will show URL if there's a chapter name
      topics[topic_id] = None if embed else chapter_name

      if topic_id in seen:
        log('Warning: %r is a duplicate topic', topic_id)
      seen.add(topic_id)

    root_node.children.append(page_node)

  num_sections = sum(len(child.children) for child in root_node.children)

  log('%d chapters -> (doctools/make_help) -> %d <h3> cards from %d <h2> sections to %s',
      len(paths), len(topics), num_sections, out_dir)

  return topics, root_node


class StrPool(object):
  """Interns strings and collects GLOBAL_STR() declarations for C++."""

  def __init__(self):
    self.var_names = {}
    self.global_strs = []
    self.unique_id = 1

  def Add(self, s):
    if s in self.var_names:
      return

    var_name = 'gStr%d' % self.unique_id
    self.unique_id += 1

    import json
    # Use JSON as approximation for C++ string
    self.global_strs.append('GLOBAL_STR(%s, %s)' % (var_name, json.dumps(s)))
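    # e.g. Add('osh-usage') records something like (the ID is made up here):
    #
    #   GLOBAL_STR(gStr1, "osh-usage")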

    self.var_names[s] = var_name


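# WriteTopicDict() emits a C++ header/source pair for mycpp.  A rough sketch
# of the generated .cc body for a single entry, with hypothetical names:
#
#   GLOBAL_STR(gStr1, "osh-usage")
#   GLOBAL_STR(gStr2, "front-end")
#
#   GLOBAL_DICT(gTopics, BigStr*, BigStr*, 1, {gStr1}, {gStr2});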
def WriteTopicDict(topic_dict, header_f, cc_f):
  header_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {
Dict<BigStr*, BigStr*>* TopicMetadata();
}
''')

  pool = StrPool()

  for k, v in topic_dict.iteritems():
    pool.Add(k)
    if v is not None:
      pool.Add(v)
    #log('%s %s', k, v)

  num_items = len(topic_dict)
  key_names = []
  val_names = []

  for k, v in topic_dict.iteritems():
    key_names.append(pool.var_names[k])
    if v is None:
      v_str = 'nullptr'
    else:
      v_str = pool.var_names[v]
    val_names.append(v_str)

  cc_f.write('''
#include "mycpp/runtime.h"

namespace help_meta {

%s

GLOBAL_DICT(gTopics, BigStr*, BigStr*, %d, {%s}, {%s});

Dict<BigStr*, BigStr*>* TopicMetadata() {
  return gTopics;
}
}
''' % ('\n'.join(pool.global_strs), num_items, ' COMMA '.join(key_names),
       ' COMMA '.join(val_names)))


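# Hypothetical invocations, following the argv parsing in main() below
# (file names are made up):
#
#   help_gen.py cards-from-index osh _tmp/help < index-osh.html
#   help_gen.py cards-from-chapters _tmp/help topics.py help_meta chap-*.html
#   help_gen.py ref-check toc-osh.md toc-ysh.md chap-*.html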
def main(argv):
  action = argv[1]

  if action == 'cards-from-index':
    sh = argv[2]  # osh or ysh
    out_prefix = argv[3]

    # Read HTML from stdin
    # TODO: could pass a list of files to speed it up
    CardsFromIndex(sh, out_prefix)

  elif action == 'cards-from-chapters':

    out_dir = argv[2]
    py_out = argv[3]
    cc_prefix = argv[4]
    pages = argv[5:]

    topic_dict, _ = CardsFromChapters(out_dir, 'h3', pages)

    # Write topic dict as Python and C++

    with open(py_out, 'w') as f:
      f.write('TOPICS = %s\n' % pprint.pformat(topic_dict))

      f.write('''

from typing import Dict

def TopicMetadata():
  # type: () -> Dict[str, str]
  return TOPICS
''')

    h_path = cc_prefix + '.h'
    cc_path = cc_prefix + '.cc'

    with open(h_path, 'w') as header_f:
      with open(cc_path, 'w') as cc_f:
        WriteTopicDict(topic_dict, header_f, cc_f)

  elif action == 'ref-check':
    from doctools import cmark
    from doctools import oils_doc
    from doctools import ref_check

    chapters = []
    all_toc_nodes = []

    for path in argv[2:]:
      filename = os.path.basename(path)

      if filename.endswith('.md'):
        assert filename.startswith('toc-'), path

        # First convert to HTML
        with open(path) as in_file:
          html = cmark.md2html(in_file.read())

        # Now highlight code, which gives debug output for the
        # language-chapter-links-*

        box_nodes = []
        html = oils_doc.HighlightCode(html, None,
                                      debug_out=box_nodes)
        all_toc_nodes.append({'toc': filename, 'boxes': box_nodes})

      elif filename.endswith('.html'):
        assert filename.startswith('chap-'), path
        chapters.append(path)

      else:
        raise RuntimeError('Expected toc-* or chap-*, got %r' % filename)

    topics, chap_tree = CardsFromChapters(None, 'h3', chapters)

    #log('%d chapters: %s', len(chapters), chapters[:5])
    #log('%d topics: %s', len(topics), topics.keys()[:10])
    log('')

    ref_check.Check(all_toc_nodes, chap_tree)

  else:
    raise RuntimeError('Invalid action %r' % action)


if __name__ == '__main__':
  try:
    main(sys.argv)
  except RuntimeError as e:
    print('FATAL: %s' % e, file=sys.stderr)
    sys.exit(1)


# vim: sw=2