OILS / doctools / cmark.py View on Github | oilshell.org

385 lines, 216 significant
1#!/usr/bin/env python2
2"""
3Convert markdown to HTML, then parse the HTML, generate and insert a TOC, and
4insert anchors.
5
6I started from cmark-0.28.3/wrappers/wrapper.py.
7"""
8from __future__ import print_function
9
10import ctypes
11import HTMLParser
12import json
13import optparse
14import os
15import re
16import sys
17
18from doctools import html_lib
19from doctools import doc_html # templates
20from doctools import oils_doc
21
22# Geez find_library returns the filename and not the path? Just hardcode it as
23# a workaround.
24# https://bugs.python.org/issue21042
25
26#from ctypes.util import find_library
27#libname = find_library("cmark")
28#assert libname, "cmark not found"
29
30# There's some ongoing discussion about how to deal with the same in Nix.
31# I think normally you'd just patch/substitute this path during the Nix build.
32# See note in shell.nix
# Directory containing the script that was invoked; used to find a libcmark.so
# checked out relative to this repo.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, tried in this order.
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

if cmark1 is not None and os.path.exists(cmark1):
  libname = cmark1
elif os.path.exists(cmark2):
  libname = cmark2
elif os.path.exists(cmark3):
  libname = cmark3
else:
  raise AssertionError("Couldn't find libcmark.so")

cmark = ctypes.CDLL(libname)

# C prototype:
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
# Declare the argument types to match it exactly (size_t / int) rather than
# relying on 'long' happening to have a compatible width.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
53
54
def log(msg, *args):
  """Debug logger that is silent by default.

  When extra args are given, msg is %-formatted with them (so a bad format
  string still raises).  The result is only printed if the line below is
  uncommented.
  """
  text = (msg % args) if args else msg

  # Uncomment to debug
  #print(text, file=sys.stderr)
61
62
# Version 0.29.0 disallowed raw HTML by default!
CMARK_OPT_UNSAFE = (1 << 17)

def md2html(text):
  """Convert Markdown text to HTML with libcmark, allowing raw HTML.

  Args:
    text: Markdown source; it is passed to the C function as-is, along with
      its len().

  Returns:
    Whatever cmark_markdown_to_html() returns for the given text.
  """
  return markdown(text, len(text), CMARK_OPT_UNSAFE)
70
71
def demo():
  """Render a tiny Markdown snippet and write the HTML to stdout."""
  html = md2html('*hi*')
  sys.stdout.write(html)
74
75
class TocExtractor(HTMLParser.HTMLParser):
  """
  When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording the
  line number.

  Later, we insert two things:
  - <a name=""> before each heading
  - The TOC after <div id="toc">
  """
  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)

    # make targets for these, regardless of whether the TOC links to them.
    self.h_tags = ['h2', 'h3', 'h4']
    self.indent = 0  # only used for debug logging

    # The TOC will be inserted after this.  -1 means no <div id="toc"> seen.
    self.toc_begin_line = -1
    self.capturing = False  # True while inside one of h_tags

    # Flat list of (line_num, tag, css_id, html_parts, text_parts)
    # html_parts is like innerHTML.  There can be <code> annotations and so
    # forth.  text_parts is only the character data.
    # css_id is optional (None when the heading has no id attribute).
    self.headings = []

  def handle_starttag(self, tag, attrs):
    if tag == 'div' and attrs == [('id', 'toc')]:
      log('%s> %s %s', self.indent * ' ', tag, attrs)
      self.indent += 1
      self.toc_begin_line, _ = self.getpos()

    # Can't have nested <a> tags
    # Note: this runs BEFORE capturing is turned on below, so the heading's own
    # start tag is not part of its captured HTML.
    if self.capturing and tag != 'a':
      self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

    if tag in self.h_tags:
      log('%s> %s %s', self.indent * ' ', tag, attrs)
      self.indent += 1
      line_num, _ = self.getpos()

      css_id = None
      for k, v in attrs:
        if k == 'id':
          css_id = v
          break
      self.headings.append((line_num, tag, css_id, [], []))
      self.capturing = True  # record the text inside <h2></h2> etc.

  def handle_endtag(self, tag):
    # Debug print
    if tag == 'div':
      self.indent -= 1
      log('%s< %s', self.indent * ' ', tag)

    if tag in self.h_tags:
      self.indent -= 1
      log('%s< %s', self.indent * ' ', tag)
      # Turned off BEFORE the check below, so the heading's own end tag is not
      # part of its captured HTML.
      self.capturing = False

    # Can't have nested <a> tags
    if self.capturing and tag != 'a':
      self._AppendHtml('</%s>' % tag)

  def handle_entityref(self, data):
    """
    From Python docs:
    This method is called to process a named character reference of the form
    &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
    """
    # BUG FIX: For when we have say &quot; or &lt; in subheadings
    if self.capturing:
      self._AppendHtml('&%s;' % data)

  def handle_charref(self, name):
    """
    Called for numeric character references of the form &#NNN; or &#xNNN;,
    where name is the part between '&#' and ';' (e.g. '39' or 'x27').
    """
    # Same fix as handle_entityref: without this, numeric references inside a
    # heading are dropped from the HTML shown in the TOC.
    if self.capturing:
      self._AppendHtml('&#%s;' % name)

  def handle_data(self, data):
    # Debug print
    if self.indent > 0:
      log('%s| %r', self.indent * ' ', data)

    if self.capturing:
      self._AppendHtml(data)
      self._AppendText(data)

  def _AppendText(self, text):
    """Accumulate text of the last heading."""
    _, _, _, _, text_parts = self.headings[-1]
    text_parts.append(text)

  def _AppendHtml(self, html):
    """Accumulate HTML of the last heading."""
    _, _, _, html_parts, _ = self.headings[-1]
    html_parts.append(html)
167
168
169TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}
170
171
172def _MakeTocAndAnchors(opts, toc_tags, headings, toc_pos,
173 preserve_anchor_case):
174 """
175 Given a list of extract headings and TOC position, render HTML to insert.
176
177 Args:
178 toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
179 all of them.
180 """
181 # Example:
182 # <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
183 #
184 # Yeah it's just a flat list, and then indentation is done with CSS. Hm
185 # that's easy.
186
187 toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
188 insertions = []
189
190 i = 0
191 for line_num, tag, css_id, html_parts, text_parts in headings:
192 css_class = TAG_TO_CSS[tag]
193
194 # Add BOTH href, for stability.
195 numeric_href = 'toc_%d' % i
196
197 # If there was an explicit CSS ID written by the user, use that as the href.
198 # I used this in the blog a few times.
199
200 pretty_href = html_lib.PrettyHref(''.join(text_parts),
201 preserve_anchor_case=preserve_anchor_case)
202
203 if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
204 toc_href = css_id
205 else:
206 # Always use the pretty version now. The old numeric version is still a
207 # target, but not in the TOC.
208 toc_href = pretty_href
209
210 line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
211 css_class, toc_href, ''.join(html_parts))
212 if tag in toc_tags:
213 toc_lines.append(line)
214
215 # TODO: We should just use the damn <h2 id="foo"> attribute! I didn't know
216 # those are valid anchors. We don't need to add <a name=""> ever.
217 FMT = '<a name="%s"></a>\n'
218
219 targets = []
220 if opts.toc_pretty_href: # NEW WAY
221 targets.append(FMT % pretty_href)
222 elif css_id: # Old blog explicit
223 targets.append(FMT % css_id)
224 targets.append(FMT % numeric_href)
225 else: # Old blog implicit
226 targets.append(FMT % pretty_href) # Include the NEW WAY too
227 targets.append(FMT % numeric_href)
228
229 insertions.append((line_num, ''.join(targets)))
230
231 i += 1
232
233 # +1 to insert AFTER the <div>
234 toc_insert = (toc_pos+1, ''.join(toc_lines))
235 insertions.insert(0, toc_insert) # The first insertion is TOC
236
237 return insertions
238
239
240def _ApplyInsertions(lines, insertions, out_file):
241 assert insertions, "Should be at least one insertion"
242 j = 0
243 n = len(insertions)
244
245 for i, line in enumerate(lines):
246 current_line = i + 1 # 1-based
247
248 if j < n:
249 line_num, s = insertions[j]
250 if current_line == line_num:
251 out_file.write(s)
252 j += 1
253
254 out_file.write(line)
255
256
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
  """Convert Markdown from in_file to HTML on out_file, adding TOC and anchors.

  Args:
    opts: parsed options; code_block_output, toc_tags and toc_pretty_href are
      used.
    meta: dict of metadata; 'default_highlighter' and 'preserve_anchor_case'
      are read.
    in_file: file-like object to read Markdown from
    out_file: file-like object to write HTML to
    use_fastlex: if true, run the oils_doc passes on the HTML
    debug_out: optional list passed to oils_doc.HighlightCode()

  If the HTML has no <div id="toc">, it's written out without insertions.
  """
  if debug_out is None:
    debug_out = []

  # First convert to HTML
  html = md2html(in_file.read())

  # Now process HTML with oils_doc
  if use_fastlex:
    # Note: extract code BEFORE doing the HTML highlighting.
    code_path = opts.code_block_output
    if code_path:
      with open(code_path, 'w') as f:
        f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                code_path)
        oils_doc.ExtractCode(html, f)

    html = oils_doc.RemoveComments(html)

    # Hack for allowing tables without <p> in cells, which CommonMark seems to
    # require?
    for marker in ('<p><pstrip>', '</pstrip></p>'):
      html = html.replace(marker, '')

    # Expand $xref, etc.
    html = oils_doc.ExpandLinks(html)

    # <code> blocks
    # Including class=language-oil-help-topics
    highlighter = meta.get('default_highlighter')
    html = oils_doc.HighlightCode(html, highlighter, debug_out=debug_out)

  # h2 is the title.  h1 is unused.
  toc_tags = opts.toc_tags or ('h3', 'h4')

  parser = TocExtractor()
  parser.feed(html)

  log('')
  log('*** HTML headings:')
  for heading in parser.headings:
    log(heading)

  toc_pos = parser.toc_begin_line
  if toc_pos == -1:  # Not found!
    out_file.write(html)  # Pass through
    return

  preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))
  insertions = _MakeTocAndAnchors(opts, toc_tags, parser.headings, toc_pos,
                                  preserve_anchor_case)

  log('')
  log('*** Text Insertions:')
  for ins in insertions:
    log(ins)

  log('')
  log('*** Output:')

  html_lines = html.splitlines(True)  # keep newlines
  _ApplyInsertions(html_lines, insertions, out_file)
320
321
def Options():
  """Returns an option parser instance."""
  parser = optparse.OptionParser('cmark.py [options]')

  # How the TOC is built
  parser.add_option(
      '--toc-pretty-href',
      action='store_true',
      default=False,
      help='Generate textual hrefs #like-this rather than like #toc10')
  parser.add_option(
      '--toc-tag',
      dest='toc_tags',  # repeated flag accumulates into a list
      action='append',
      default=[],
      help='h tags to include in the TOC, e.g. h2 h3')

  # The dest names of the remaining flags are derived from the flag names.
  parser.add_option(
      '--disable-fastlex',
      action='store_true',
      default=False,
      help='Hack for old blog posts')
  parser.add_option(
      '--code-block-output',
      help='Extract and print code blocks to this file')

  return parser
343
344
# width 40 by default
DEFAULT_META = {
    'body_css_class': 'width40'
}


def main(argv):
  """Command line entry point.

  Args:
    argv: full argument list, including the program name at index 0.

  After option parsing there are two modes:

  - Exactly 3 args left (program name, JSON metadata path, content path):
    Oil documentation.  Writes header, rendered content and footer to stdout.
  - Otherwise: filter mode for the blog and benchmarks.  Reads Markdown from
    stdin and writes HTML to stdout.  A JSON metadata path at argv[1] is
    optional.
  """
  o = Options()
  opts, argv = o.parse_args(argv)
  assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

  meta = dict(DEFAULT_META)  # copy, so the module-level default isn't mutated

  if len(argv) == 3:  # It's Oil documentation
    with open(argv[1]) as f:
      meta.update(json.load(f))

    # Docs have a special header and footer.
    with open(argv[2]) as content_f:
      doc_html.Header(meta, sys.stdout, draft_warning=True)
      Render(opts, meta, content_f, sys.stdout)
      doc_html.Footer(meta, sys.stdout)
  else:
    # Filter for blog and for benchmarks.

    # Metadata is optional here.  Check the length explicitly rather than
    # catching IndexError around the open/parse, which could also hide an
    # unrelated IndexError raised while loading the file.
    if len(argv) > 1:
      with open(argv[1]) as f:
        meta.update(json.load(f))

    # Old style for blog: it's a filter
    Render(opts, meta, sys.stdin, sys.stdout,
           use_fastlex=not opts.disable_fastlex)


if __name__ == '__main__':
  main(sys.argv)
384
385# vim: sw=2