forked from oils-for-unix/oils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcmark.py
executable file
·353 lines (265 loc) · 9.38 KB
/
cmark.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#!/usr/bin/env python2
"""
Convert markdown to HTML, then parse the HTML, generate and insert a TOC, and
insert anchors.
I started from cmark-0.28.3/wrappers/wrapper.py.
"""
from __future__ import print_function
import ctypes
import HTMLParser
import optparse
import os
import re
import sys
from doctools import html_lib
from doctools import doc_html # templates
from doctools import oil_doc
# Geez find_library returns the filename and not the path? Just hardcode it as
# a workaround.
# https://bugs.python.org/issue21042
#from ctypes.util import find_library
#libname = find_library("cmark")
#assert libname, "cmark not found"
# There's some ongoing discussion about how to deal with the same in Nix.
# I think normally you'd just patch/substitute this path during the Nix build.
# See note in shell.nix
# Path of the libcmark shared library: the _NIX_SHELL_LIBCMARK environment
# variable when set, otherwise the relative path '_deps/libcmark.so'.
libname = os.environ.get('_NIX_SHELL_LIBCMARK', '_deps/libcmark.so')
# Loaded at import time, so importing this module requires the library.
cmark = ctypes.CDLL(libname)
# The one C entry point this module uses; md2html() calls it.
markdown = cmark.cmark_markdown_to_html
# Declared as a C string result, which ctypes converts to a Python byte string.
markdown.restype = ctypes.c_char_p
# Positional arguments as md2html() passes them: text, its length, option flags.
# NOTE(review): cmark.h is believed to declare these as (const char *, size_t,
# int); confirm whether c_size_t / c_int would be the more precise ctypes here.
markdown.argtypes = [ctypes.c_char_p, ctypes.c_long, ctypes.c_long]
def log(msg, *args):
    """
    Debug logger.  Output is disabled unless the print below is uncommented.

    Args:
      msg: message, or a %-style format string when args are given.
      *args: values interpolated into msg.  Interpolation still happens when
        output is disabled, so a bad format string raises here.
    """
    text = msg % args if args else msg

    # Uncomment to debug
    #print(text, file=sys.stderr)
# Version 0.29.0 disallowed raw HTML by default!
CMARK_OPT_UNSAFE = 1 << 17


def md2html(text):
    """
    Convert markdown to HTML with libcmark.

    Args:
      text: markdown source as a byte string.

    Returns:
      Whatever the bound cmark_markdown_to_html call returns (declared as a C
      string), rendered with the CMARK_OPT_UNSAFE option set.
    """
    return markdown(text, len(text), CMARK_OPT_UNSAFE)
def demo():
    """Render a tiny markdown snippet and write the resulting HTML to stdout."""
    rendered = md2html('*hi*')
    sys.stdout.write(rendered)
def PrettyHref(s):
    """
    Turn arbitrary heading text into a clickable href with no special characters.

    This is modelled after what github does.  It makes everything lower case.
    It is SIMILAR to Github's scheme, not 100% compatible: words that end up
    empty are dropped here.

    Args:
      s: heading text.

    Returns:
      Lower-cased words joined with '-'; '' when nothing survives.
    """
    # Whitespace and hyphens both act as word separators.
    pieces = re.split(r'[\s\-]+', s)

    # Strip everything that is not a word character from each piece.
    cleaned = [re.sub(r'\W+', '', piece) for piece in pieces]

    # Drop the empty pieces, then join and lowercase in one go.
    return '-'.join(filter(None, cleaned)).lower()
class TocExtractor(HTMLParser.HTMLParser):
    """
    Collect heading information while parsing rendered HTML.

    When we hit h_tags (h2, h3, h4, etc.), append to self.headings, recording
    the line number.

    Later, we insert two things:
    - <a name=""> before each heading
    - The TOC after <div id="toc">

    After feed(), read these attributes:
      toc_begin_line: line number of <div id="toc">, or -1 if it was not seen.
      headings: list of (line_num, tag, css_id, html_parts, text_parts).
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

        # make targets for these, regardless of whether the TOC links to them.
        self.h_tags = ['h2', 'h3', 'h4']
        self.indent = 0  # nesting depth; only used by the debug logging

        # The TOC will be inserted after this.
        self.toc_begin_line = -1
        self.capturing = False  # True while inside one of h_tags

        # Flat list of (line_num, tag, id, html_parts, text_parts).
        # html_parts is like innerHTML.  There can be <code> annotations and
        # so forth.  text_parts holds only the character data.
        # id is optional -- it can be used for generating headings.
        self.headings = []

    def handle_starttag(self, tag, attrs):
        """Record the TOC placeholder and the start of each heading."""
        if tag == 'div' and attrs == [('id', 'toc')]:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            self.toc_begin_line, _ = self.getpos()

        # Can't have nested <a> tags
        # This runs BEFORE the heading check below, so the opening <h2> itself
        # is never part of the captured HTML.
        if self.capturing and tag != 'a':
            self._AppendHtml('<%s%s>' % (tag, html_lib.AttrsToString(attrs)))

        if tag in self.h_tags:
            log('%s> %s %s', self.indent * ' ', tag, attrs)
            self.indent += 1
            line_num, _ = self.getpos()

            css_id = None
            for k, v in attrs:
                if k == 'id':
                    css_id = v
                    break
            self.headings.append((line_num, tag, css_id, [], []))
            self.capturing = True  # record the text inside <h2></h2> etc.

    def handle_endtag(self, tag):
        """Stop capturing at the end of a heading; otherwise echo the tag."""
        # Debug print
        # Note: every </div> decrements, although only <div id="toc">
        # incremented.  This affects debug indentation only.
        if tag == 'div':
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)

        if tag in self.h_tags:
            self.indent -= 1
            log('%s< %s', self.indent * ' ', tag)
            self.capturing = False

        # Can't have nested <a> tags
        if self.capturing and tag != 'a':
            self._AppendHtml('</%s>' % tag)

    def handle_entityref(self, data):
        """
        From Python docs:
        This method is called to process a named character reference of the
        form &name; (e.g. &gt;), where name is a general entity reference
        (e.g. 'gt').
        """
        # BUG FIX: For when we have say &quot; or &lt; in subheadings
        if self.capturing:
            self._AppendHtml('&%s;' % data)

    def handle_charref(self, name):
        """
        Called for numeric character references such as &#8212; or &#x41;,
        with name being '8212' or 'x41'.

        Without this override the base class ignores them, so they would
        vanish from the heading HTML shown in the TOC.  Like
        handle_entityref, this only contributes to the HTML, not the text.
        """
        if self.capturing:
            self._AppendHtml('&#%s;' % name)

    def handle_data(self, data):
        """Accumulate character data of the current heading, if any."""
        # Debug print
        if self.indent > 0:
            log('%s| %r', self.indent * ' ', data)

        if self.capturing:
            self._AppendHtml(data)
            self._AppendText(data)

    def _AppendText(self, text):
        """Accumulate text of the last heading."""
        _, _, _, _, text_parts = self.headings[-1]
        text_parts.append(text)

    def _AppendHtml(self, html):
        """Accumulate HTML of the last heading."""
        _, _, _, html_parts, _ = self.headings[-1]
        html_parts.append(html)
TAG_TO_CSS = {'h2': 'toclevel1', 'h3': 'toclevel2', 'h4': 'toclevel3'}
def _MakeTocAndAnchors(opts, toc_tags, headings, toc_pos):
"""
Given a list of extract headings and TOC position, render HTML to insert.
Args:
toc_tags: List of HTML tags ['h2', 'h3'] to SHOW in TOC. But we LINK to
all of them.
"""
# Example:
# <div class="toclevel2"><a href="#_toc_0">Introduction</a></div>
#
# Yeah it's just a flat list, and then indentation is done with CSS. Hm
# that's easy.
toc_lines = ['<div id="toctitle">Table of Contents</div>\n']
insertions = []
i = 0
for line_num, tag, css_id, html_parts, text_parts in headings:
css_class = TAG_TO_CSS[tag]
# Add BOTH href, for stability.
numeric_href = 'toc_%d' % i
# If there was an explicit CSS ID written by the user, use that as the href.
# I used this in the blog a few times.
pretty_href = PrettyHref(''.join(text_parts))
if css_id: # A FEW OLD BLOG POSTS USE an explicit CSS ID
toc_href = css_id
else:
# Always use the pretty version now. The old numeric version is still a
# target, but not in the TOC.
toc_href = pretty_href
line = ' <div class="%s"><a href="#%s">%s</a></div>\n' % (
css_class, toc_href, ''.join(html_parts))
if tag in toc_tags:
toc_lines.append(line)
# TODO: We should just use the damn <h2 id="foo"> attribute! I didn't know
# those are valid anchors. We don't need to add <a name=""> ever.
FMT = '<a name="%s"></a>\n'
targets = []
if opts.toc_pretty_href: # NEW WAY
targets.append(FMT % pretty_href)
elif css_id: # Old blog explicit
targets.append(FMT % css_id)
targets.append(FMT % numeric_href)
else: # Old blog implicit
targets.append(FMT % pretty_href) # Include the NEW WAY too
targets.append(FMT % numeric_href)
insertions.append((line_num, ''.join(targets)))
i += 1
# +1 to insert AFTER the <div>
toc_insert = (toc_pos+1, ''.join(toc_lines))
insertions.insert(0, toc_insert) # The first insertion is TOC
return insertions
def _ApplyInsertions(lines, insertions, out_file):
assert insertions, "Should be at least one insertion"
j = 0
n = len(insertions)
for i, line in enumerate(lines):
current_line = i + 1 # 1-based
if j < n:
line_num, s = insertions[j]
if current_line == line_num:
out_file.write(s)
j += 1
out_file.write(line)
# Render: convert the markdown read from in_file to HTML, post-process it, and
# write the result to out_file.  If the HTML contains <div id="toc">, a table
# of contents and heading anchors are inserted; otherwise the HTML is written
# unchanged.
#
# Args:
#   opts: parsed options; opts.toc_tags is read here and opts is passed on to
#     _MakeTocAndAnchors.
#   in_file: object with read(), yielding the markdown source.
#   out_file: object with write().
#   use_fastlex: when true, oil_doc.RemoveComments is applied.
#
# NOTE(review): indentation was lost in this copy of the file, so it is unclear
# whether only RemoveComments or the whole sequence through HighlightCode is
# conditional on use_fastlex -- confirm against the upstream file.
def Render(opts, in_file, out_file, use_fastlex=True):
html = md2html(in_file.read())
if use_fastlex:
html = oil_doc.RemoveComments(html)
# Hack for allowing tables without <p> in cells, which CommonMark seems to require?
html = html.replace('<p><pstrip>', '')
html = html.replace('</pstrip></p>', '')
# Stages of transformation.
html = oil_doc.ExpandLinks(html)
html = oil_doc.HighlightCode(html)
# h2 is the title. h1 is unused.
# Tags listed in the TOC; defaults to h3 and h4 when no --toc-tag was given.
if opts.toc_tags:
toc_tags = opts.toc_tags
else:
toc_tags = ('h3', 'h4')
# Parse the HTML to find the TOC placeholder and the headings.
parser = TocExtractor()
parser.feed(html)
log('')
log('*** HTML headings:')
for heading in parser.headings:
log(heading)
# -1 is the initial value, meaning no <div id="toc"> was seen.
if parser.toc_begin_line == -1: # Not found!
out_file.write(html) # Pass through
return
insertions = _MakeTocAndAnchors(opts, toc_tags, parser.headings, parser.toc_begin_line)
log('')
log('*** Text Insertions:')
for ins in insertions:
log(ins)
log('')
log('*** Output:')
# Insertions are keyed by line number, so the HTML is processed line by line.
lines = html.splitlines(True) # keep newlines
_ApplyInsertions(lines, insertions, out_file)
def Options():
    """
    Returns an option parser instance.

    Flags: --toc-pretty-href (bool), --toc-tag (repeatable, collected into
    toc_tags), --disable-fastlex (bool).
    """
    parser = optparse.OptionParser('cmark.py [options]')

    # (flag, keyword arguments) for each option, registered in this order.
    flag_specs = [
        ('--toc-pretty-href', dict(
            action='store_true', default=False,
            help='Generate textual hrefs #like-this rather than like #toc10')),
        ('--toc-tag', dict(
            dest='toc_tags', action='append', default=[],
            help='h tags to include in the TOC, e.g. h2 h3')),
        ('--disable-fastlex', dict(
            dest='disable_fastlex', action='store_true', default=False,
            help='Hack for old blog posts')),
    ]
    for flag, kwargs in flag_specs:
        parser.add_option(flag, **kwargs)

    return parser
# width 40 by default
DEFAULT_META = {
    'body_css_class': 'width40'
}


def main(argv):
    """
    Command line entry point.

    Args:
      argv: full argument vector, including the program name at argv[0].

    Two modes, chosen by the number of positional arguments:
    - none: act as a filter, markdown on stdin and HTML on stdout (old style
      for the blog).  Only this mode honors --disable-fastlex.
    - META_JSON CONTENT_FILE: load metadata from the JSON file on top of
      DEFAULT_META, then write header, rendered content and footer to stdout.

    Invalid usage is reported through the option parser's error() method,
    which prints a usage message and exits.
    """
    o = Options()
    opts, argv = o.parse_args(argv)

    # Report bad flag values as usage errors; an assert would be stripped
    # under python -O.
    bad_tags = [tag for tag in opts.toc_tags if not tag.startswith('h')]
    if bad_tags:
        o.error('--toc-tag values must start with "h", got %r' % bad_tags)

    if len(argv) == 1:
        # Old style for blog: it's a filter
        Render(opts, sys.stdin, sys.stdout,
               use_fastlex=not opts.disable_fastlex)
        return

    # Otherwise we expect metadata and content.  Check up front rather than
    # failing with IndexError after the metadata file has already been read.
    if len(argv) < 3:
        o.error('expected META_JSON and CONTENT_FILE arguments')

    meta = dict(DEFAULT_META)

    import json
    with open(argv[1]) as f:
        doc_meta = json.load(f)
    meta.update(doc_meta)

    with open(argv[2]) as content_f:
        doc_html.Header(meta, sys.stdout)
        Render(opts, content_f, sys.stdout)
        doc_html.Footer(meta, sys.stdout)


if __name__ == '__main__':
    main(sys.argv)