doctools/cmark.py

OILS / doctools / cmark.py View on Github | oilshell.org

385 lines, 216 significant

1	#!/usr/bin/env python2
2	"""
3	Convert markdown to HTML, then parse the HTML, generate and insert a TOC, and
4	insert anchors.
5
6	I started from cmark-0.28.3/wrappers/wrapper.py.
7	"""
8	from __future__ import print_function
9
10	import ctypes
11	import HTMLParser
12	import json
13	import optparse
14	import os
15	import re
16	import sys
17
18	from doctools import html_lib
19	from doctools import doc_html # templates
20	from doctools import oils_doc
21
22	# Geez find_library returns the filename and not the path? Just hardcode it as
23	# a workaround.
24	# https://bugs.python.org/issue21042
25
26	#from ctypes.util import find_library
27	#libname = find_library("cmark")
28	#assert libname, "cmark not found"
29
30	# There's some ongoing discussion about how to deal with the same in Nix.
31	# I think normally you'd just patch/substitute this path during the Nix build.
32	# See note in shell.nix
# Directory of the running script (taken from sys.argv[0]); the in-tree
# libcmark.so candidate is resolved relative to it.
this_dir = os.path.abspath(os.path.dirname(sys.argv[0]))

# Candidate locations for libcmark.so, in priority order:
cmark1 = os.environ.get('_NIX_SHELL_LIBCMARK')  # may be None when unset
cmark2 = os.path.join(this_dir, '../../oil_DEPS/libcmark.so')
cmark3 = '/wedge/oils-for-unix.org/pkg/cmark/0.29.0/lib/libcmark.so'  # a symlink

# The first candidate that is set and exists on disk wins.
for _candidate in (cmark1, cmark2, cmark3):
  if _candidate is not None and os.path.exists(_candidate):
    libname = _candidate
    break
else:
  raise AssertionError("Couldn't find libcmark.so")
47
# Load the shared library found above.
cmark = ctypes.CDLL(libname)

# C prototype (cmark.h):
#   char *cmark_markdown_to_html(const char *text, size_t len, int options);
#
# The argtypes mirror that prototype exactly.  Declaring len/options as
# c_long only works where long, size_t and int happen to be passed the same
# way; c_size_t / c_int are correct on every platform.
#
# NOTE(review): with restype = c_char_p, ctypes copies the result into a
# Python string and the C buffer is never freed.  Presumably acceptable for a
# short-lived command line tool -- confirm before using in a long-running
# process.
markdown = cmark.cmark_markdown_to_html
markdown.restype = ctypes.c_char_p
markdown.argtypes = [ctypes.c_char_p, ctypes.c_size_t, ctypes.c_int]
53
54
def log(msg, *args):
  """
  Debug logger that is silent by default.

  When args are given, msg is %-formatted with them (so a bad format still
  raises).  Nothing is printed unless the line below is uncommented.
  """
  formatted = (msg % args) if args else msg

  # Uncomment to debug
  #print(formatted, file=sys.stderr)
61
62
# cmark 0.29.0 disallowed raw HTML by default!  Passing this option bit to the
# renderer turns it back on.
CMARK_OPT_UNSAFE = 1 << 17
65
def md2html(text):
  """
  Render a Markdown string to HTML with libcmark.

  The text and its len() are handed to cmark_markdown_to_html together with
  CMARK_OPT_UNSAFE, and the library's result is returned unchanged.
  """
  return markdown(text, len(text), CMARK_OPT_UNSAFE)
70
71
def demo():
  """Smoke test: render the Markdown string 'hi' and write the HTML to stdout."""
  rendered = md2html('hi')
  sys.stdout.write(rendered)
74
75
class TocExtractor(HTMLParser.HTMLParser):
  """
  Find the TOC placeholder and collect the headings of an HTML document.

  When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording the
  line number.

  Later, we insert two things:
  - <a name=""> before each heading
  - The TOC after <div id="toc">
  """
  def __init__(self):
    HTMLParser.HTMLParser.__init__(self)

    # make targets for these, regardless of whether the TOC links to them.
    self.h_tags = ['h2', 'h3', 'h4']

    # Nesting counter; only used to indent the debug log output.
    self.indent = 0

    # The TOC will be inserted after this.  -1 means no <div id="toc"> was
    # seen.
    self.toc_begin_line = -1

    # True while we are between a heading's start and end tag.
    self.capturing = False

    # Flat list of (line_num, tag, id, html_parts, text_parts)
    # - html_parts joined is like innerHTML.  There can be <code> annotations
    #   and so forth.
    # - text_parts only receives the character data of the heading.
    # - id is optional -- it can be used for generating headings.
    self.headings = []

  def handle_starttag(self, tag, attrs):
    # The TOC placeholder must be exactly <div id="toc">, with no other
    # attributes.
    if tag == 'div' and attrs == [('id', 'toc')]:
      log('%s> %s %s', self.indent * ' ', tag, attrs)
      self.indent += 1
      self.toc_begin_line, _ = self.getpos()

    # Copy tags nested in a heading into its HTML.
    # Can't have nested <a> tags
    if self.capturing and tag != 'a':
      self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

    if tag in self.h_tags:
      log('%s> %s %s', self.indent * ' ', tag, attrs)
      self.indent += 1
      line_num, _ = self.getpos()

      # The first id="" attribute, if any, is the explicit anchor.
      css_id = None
      for k, v in attrs:
        if k == 'id':
          css_id = v
          break
      self.headings.append((line_num, tag, css_id, [], []))
      self.capturing = True  # record the text inside <h2></h2> etc.

  def handle_endtag(self, tag):
    # Debug print
    if tag == 'div':
      self.indent -= 1
      log('%s< %s', self.indent * ' ', tag)

    if tag in self.h_tags:
      self.indent -= 1
      log('%s< %s', self.indent * ' ', tag)
      self.capturing = False

    # Can't have nested <a> tags
    if self.capturing and tag != 'a':
      self._AppendHtml('</%s>' % tag)

  def handle_entityref(self, data):
    """
    From Python docs:
    This method is called to process a named character reference of the form
    &name; (e.g. &gt;), where name is a general entity reference (e.g. 'gt').
    """
    # BUG FIX: For when we have say &quot; or &lt; in subheadings
    if self.capturing:
      self._AppendHtml('&%s;' % data)

  def handle_charref(self, name):
    """
    Called for numeric character references of the form &#NNN; or &#xNNN;,
    where name is the part between '&#' and ';' (e.g. '39' or 'x27').
    """
    # BUG FIX: the base class ignores these, so a heading containing say &#39;
    # used to lose that character in its TOC entry.  Like named entities, the
    # reference is copied to the heading HTML only, not to its text.
    if self.capturing:
      self._AppendHtml('&#%s;' % name)

  def handle_data(self, data):
    # Debug print
    if self.indent > 0:
      log('%s\| %r', self.indent * ' ', data)

    if self.capturing:
      self._AppendHtml(data)
      self._AppendText(data)

  def _AppendText(self, text):
    """Accumulate text of the last heading."""
    _, _, _, _, text_parts = self.headings[-1]
    text_parts.append(text)

  def _AppendHtml(self, html):
    """Accumulate HTML of the last heading."""
    _, _, _, html_parts, _ = self.headings[-1]
    html_parts.append(html)
167
168
# Heading tag -> CSS class given to its entry in the table of contents.
TAG_TO_CSS = {
    'h2': 'toclevel1',
    'h3': 'toclevel2',
    'h4': 'toclevel3',
}
170
171
def _MakeTocAndAnchors(opts, toc_tags, headings, toc_pos,
                       preserve_anchor_case):
  """
  Given a list of extracted headings and the TOC position, render the HTML
  snippets to insert.

  Args:
    opts: parsed options; only opts.toc_pretty_href is read here.
    toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC.  But we LINK to
      all of them.
    headings: list of (line_num, tag, css_id, html_parts, text_parts).
    toc_pos: line number of the <div id="toc"> tag.
    preserve_anchor_case: passed through to html_lib.PrettyHref().

  Returns:
    A list of (line_num, html) pairs.  The first pair is the TOC itself; the
    rest are the <a name=""> targets, one pair per heading, in heading order.
  """
  # Example TOC entry:
  # <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
  #
  # The TOC is a flat list of <div>s; indentation is done with CSS.

  # TODO: We should just use the damn <h2 id="foo"> attribute!  I didn't know
  # those are valid anchors.  We don't need to add <a name=""> ever.
  anchor_fmt = '<a name="%s"></a>\n'

  toc_parts = ['<div id="toctitle">Table of Contents</div>\n']
  anchor_insertions = []

  for index, heading in enumerate(headings):
    line_num, tag, css_id, html_parts, text_parts = heading
    css_class = TAG_TO_CSS[tag]

    # Both the numeric and the pretty name may become targets, for stability
    # of old links.
    numeric_href = 'toc_%d' % index
    pretty_href = html_lib.PrettyHref(
        ''.join(text_parts), preserve_anchor_case=preserve_anchor_case)

    # A FEW OLD BLOG POSTS USE an explicit CSS ID; the TOC links to that.
    # Otherwise always link to the pretty version.  The old numeric version
    # is still a target, but not in the TOC.
    toc_href = css_id if css_id else pretty_href

    if tag in toc_tags:
      toc_parts.append(' <div class="%s"><a href="#%s">%s</a></div>\n' % (
          css_class, toc_href, ''.join(html_parts)))

    if opts.toc_pretty_href:  # NEW WAY
      anchor_names = [pretty_href]
    elif css_id:  # Old blog explicit
      anchor_names = [css_id, numeric_href]
    else:  # Old blog implicit; include the NEW WAY too
      anchor_names = [pretty_href, numeric_href]

    anchors = ''.join(anchor_fmt % name for name in anchor_names)
    anchor_insertions.append((line_num, anchors))

  # The first insertion is the TOC.  +1 to insert AFTER the <div>
  toc_insertion = (toc_pos + 1, ''.join(toc_parts))
  return [toc_insertion] + anchor_insertions
238
239
240	def _ApplyInsertions(lines, insertions, out_file):
241	assert insertions, "Should be at least one insertion"
242	j = 0
243	n = len(insertions)
244
245	for i, line in enumerate(lines):
246	current_line = i + 1 # 1-based
247
248	if j < n:
249	line_num, s = insertions[j]
250	if current_line == line_num:
251	out_file.write(s)
252	j += 1
253
254	out_file.write(line)
255
256
def Render(opts, meta, in_file, out_file, use_fastlex=True, debug_out=None):
  """
  Convert Markdown from in_file to HTML, add TOC and anchors, write to
  out_file.

  Args:
    opts: parsed options; code_block_output and toc_tags are read here, and
      opts is passed on to _MakeTocAndAnchors().
    meta: dict of document metadata; 'default_highlighter' and
      'preserve_anchor_case' are looked up.
    in_file: file-like object to read the Markdown from.
    out_file: file-like object the final HTML is written to.
    use_fastlex: when false, all of the oils_doc processing is skipped.
    debug_out: optional list handed to oils_doc.HighlightCode().

  If the HTML has no <div id="toc">, it is written out without insertions.
  """
  debug_out = [] if debug_out is None else debug_out

  # First convert to HTML
  html = md2html(in_file.read())

  # Now process HTML with oils_doc
  if use_fastlex:
    code_out_path = opts.code_block_output
    if code_out_path:
      # Note: extract code BEFORE doing the HTML highlighting.
      with open(code_out_path, 'w') as code_f:
        code_f.write('# %s: code blocks extracted from Markdown/HTML\n\n' %
                     code_out_path)
        oils_doc.ExtractCode(html, code_f)

    html = oils_doc.RemoveComments(html)

    # Hack for allowing tables without <p> in cells, which CommonMark seems to
    # require?
    html = html.replace('<p><pstrip>', '').replace('</pstrip></p>', '')

    # Expand $xref, etc.
    html = oils_doc.ExpandLinks(html)

    # <code> blocks
    # Including class=language-oil-help-topics
    html = oils_doc.HighlightCode(html, meta.get('default_highlighter'),
                                  debug_out=debug_out)

  # h2 is the title.  h1 is unused.
  toc_tags = opts.toc_tags or ('h3', 'h4')

  extractor = TocExtractor()
  extractor.feed(html)

  log('')
  log('*** HTML headings:')
  for heading in extractor.headings:
    log(heading)

  toc_line = extractor.toc_begin_line
  if toc_line == -1:  # Not found!
    out_file.write(html)  # Pass through
    return

  preserve_anchor_case = bool(meta.get('preserve_anchor_case', ''))
  insertions = _MakeTocAndAnchors(opts, toc_tags, extractor.headings,
                                  toc_line, preserve_anchor_case)

  log('')
  log('*** Text Insertions:')
  for ins in insertions:
    log(ins)

  log('')
  log('*** Output:')

  html_lines = html.splitlines(True)  # keep newlines
  _ApplyInsertions(html_lines, insertions, out_file)
320
321
def Options():
  """Build and return a fresh optparse.OptionParser for cmark.py."""
  parser = optparse.OptionParser('cmark.py [options]')

  # (flag, keyword arguments) in the order they are registered.
  option_specs = [
      ('--toc-pretty-href', dict(
          action='store_true', default=False,
          help='Generate textual hrefs #like-this rather than like #toc10')),
      ('--toc-tag', dict(
          dest='toc_tags', action='append', default=[],
          help='h tags to include in the TOC, e.g. h2 h3')),
      ('--disable-fastlex', dict(
          dest='disable_fastlex', action='store_true', default=False,
          help='Hack for old blog posts')),
      ('--code-block-output', dict(
          dest='code_block_output', default=None,
          help='Extract and print code blocks to this file')),
  ]
  for flag, kwargs in option_specs:
    parser.add_option(flag, **kwargs)

  return parser
343
344
# Metadata used when the input supplies none: width 40 by default.
DEFAULT_META = dict(body_css_class='width40')
349
350
def main(argv):
  """
  Command line entry point.

  argv is the full argument list including the program name.  After option
  parsing there are two modes:

  - Exactly two positional args (metadata JSON, content file): render a doc
    page to stdout between doc_html.Header() and doc_html.Footer().
  - Otherwise: act as a filter from stdin to stdout, with an optional
    metadata JSON file as the first positional arg.
  """
  parser = Options()
  opts, argv = parser.parse_args(argv)
  assert all(tag.startswith('h') for tag in opts.toc_tags), opts.toc_tags

  meta = dict(DEFAULT_META)

  if len(argv) != 3:
    # Filter for blog and for benchmarks.

    # Metadata is optional here
    try:
      with open(argv[1]) as meta_f:
        meta.update(json.load(meta_f))
    except IndexError:
      pass

    # Old style for blog: it's a filter
    Render(opts, meta, sys.stdin, sys.stdout,
           use_fastlex=not opts.disable_fastlex)
    return

  # It's Oil documentation
  with open(argv[1]) as meta_f:
    meta.update(json.load(meta_f))

  # Docs have a special header and footer.
  with open(argv[2]) as content_f:
    doc_html.Header(meta, sys.stdout, draft_warning=True)
    Render(opts, meta, content_f, sys.stdout)
    doc_html.Footer(meta, sys.stdout)
380
381
# Script entry point: hand the full sys.argv (program name included) to main().
if __name__ == '__main__':
  main(sys.argv)
384
385	# vim: sw=2