forked from scylladb/seastar
-
Notifications
You must be signed in to change notification settings - Fork 0
/
htmlsplit.py
executable file
·171 lines (146 loc) · 5.74 KB
/
htmlsplit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#!/usr/bin/env python3
# This script takes the single-page HTML output from pandoc - tutorial.html -
# and splits it into many pages in split/: one page index.html for the table
# of contents, and an additional page for each chapter. We make sure that
# links from the TOC to each chapter, and also links across chapters,
# continue to work correctly, and also add links from each chapter back to
# the TOC, as well as to the next and previous chapters.
# Copyright (C) 2018 ScyllaDB.
#
# This file is open source software, licensed to you under the terms
# of the Apache License, Version 2.0 (the "License"). See the NOTICE file
# distributed with this work for additional information regarding copyright
# ownership. You may not use this file except in compliance with the License.
#
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from xml.etree import ElementTree
import argparse
import copy
import os
# Maps chapter number (int) to chapter title (str). Filled in by handle_toc()
# and read by add_nav_to_body() to label the previous/next chapter links.
titles = {}
# Maps a link target as it appears in the TOC (normally a '#fragment' href)
# to the number of the chapter that contains it. Filled in by handle_toc()
# and read by fix_links() to redirect links to the right per-chapter file.
sections = {}
def add_elem_to_body(tree, e):
    """Append element `e` as the last child of the <body> of `tree`.

    `tree` is searched with the path './body'; the first match receives the
    new child. StopIteration propagates if there is no <body>.
    """
    bodies = tree.iterfind('./body')
    next(bodies).append(e)
def add_nav_to_body(tree, chap_num):
    """Append a navigation <div> to the <body> of `tree`.

    The div always holds a link back to index.html. A "Previous" link is
    added when chapter `chap_num - 1` is present in the module-level
    `titles` map, and a "Next" link when `chap_num + 1` is. The separator
    text between links is stored in the tail of the preceding <a>.
    """
    nav = ElementTree.SubElement(next(tree.iterfind('./body')), 'div')
    link = ElementTree.SubElement(nav, 'a', href='index.html')
    link.text = 'Back to table of contents'
    link.tail = '.'
    neighbors = (
        (" Previous: ", chap_num - 1),
        (" Next: ", chap_num + 1),
    )
    for label, neighbor in neighbors:
        if neighbor not in titles:
            continue
        # The label is appended to the tail of whichever link came last,
        # so it renders right after that link's closing '.'.
        link.tail += label
        link = ElementTree.SubElement(nav, 'a', href=f'{neighbor}.html')
        link.text = f'{neighbor} {titles[neighbor]}'
        link.tail = '.'
def handle_toc(toc):
    """Record chapter titles and section locations found in the TOC element.

    Every top-level './ul/li' entry of `toc` is treated as a chapter. Its
    first <a href> must contain a <span class="toc-section-number"> whose
    text is the chapter number and whose tail is the chapter title.

    Side effects on module-level maps:
      * titles[number] is set to the stripped chapter title, for use in the
        previous/next navigation links.
      * sections[href] is set to the chapter number for the chapter's own
        link and for every nested TOC link whose href starts with '#', so
        that cross-references can later be pointed at the right file.
    """
    for chapter_item in toc.iterfind('./ul/li'):
        chapter_link = next(chapter_item.iterfind('./a[@href]'))
        number_span = next(chapter_link.iterfind(
            './span[@class="toc-section-number"]'))
        number = int(number_span.text)
        titles[number] = number_span.tail.strip()
        # The chapter's own link is recorded unconditionally.
        sections[chapter_link.get('href')] = number
        # Nested entries (sub-sections at any depth) belong to this chapter;
        # only in-document '#...' links are recorded.
        nested_hrefs = (
            anchor.get('href')
            for anchor in chapter_item.iterfind('.//ul/li/a[@href]')
        )
        sections.update(
            (target, number)
            for target in nested_hrefs
            if target.startswith('#')
        )
def fix_links(e):
    """Rewrite in-document links below `e` to point at per-chapter files.

    For every descendant <a href="#frag"> whose href is a key of the
    module-level `sections` map, the href becomes 'N.html#frag', where N is
    the chapter number recorded for that fragment. Other links, including
    '#...' links not present in `sections`, are left untouched.
    """
    for anchor in e.iterfind('.//a[@href]'):
        target = anchor.get('href')
        if not target.startswith('#') or target not in sections:
            continue
        # After the split, '#some-title' lives in that chapter's own file.
        anchor.set('href', f'{sections[target]}.html{target}')
def remove_ns_prefix(tree):
    """Strip the XHTML namespace from every tag reachable via tree.iter().

    Tags of the form '{http://www.w3.org/1999/xhtml}name' become 'name',
    in place. Tags in other namespaces, or with no namespace, are unchanged.
    """
    xhtml_ns = '{http://www.w3.org/1999/xhtml}'
    cut = len(xhtml_ns)
    for node in tree.iter():
        qualified = node.tag
        if qualified.startswith(xhtml_ns):
            node.tag = qualified[cut:]
def get_chap_num(element):
    """Return the chapter number carried by a chapter heading element.

    The number is taken from the element's 'data-number' attribute when that
    is present and non-empty; otherwise from the text of a direct child
    <span class="header-section-number">.

    Raises:
        AssertionError: if neither location yields a number.
        ValueError: if the text found is not a valid integer.
    """
    # Read from the `element` argument. The body previously referred to a
    # name `e` that is not a parameter, so it only worked when a suitable
    # module-level `e` happened to exist at call time.
    data_num = element.get('data-number')
    if data_num:
        return int(data_num)
    data_num = element.findtext('./span[@class="header-section-number"]')
    if data_num:
        return int(data_num)
    # Raised explicitly (rather than via `assert`) so the check is not
    # stripped when Python runs with -O.
    raise AssertionError("section number not found")
# Command line: both options are mandatory, so a missing one is reported by
# argparse instead of surfacing later as an obscure error on a None path.
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--output-dir', required=True)
args = parser.parse_args()

# Make sure there is somewhere to write the split pages.
os.makedirs(args.output_dir, exist_ok=True)

tree = ElementTree.parse(args.input)
# One pass is enough: remove_ns_prefix() already walks the whole tree it is
# given, so calling it again for every element only repeats the work.
remove_ns_prefix(tree)

# The template is a copy of the whole document with an emptied <body>; every
# output page starts as a deep copy of it.
template = copy.deepcopy(tree.getroot())
template_body = next(template.iterfind('./body'))
template_body.clear()

# iterate through the children elements in body
# body element is composed of
# - header
# - toc
# - h1,h2,p,...
# h1 marks the beginning of a chapter
chap_num = 0
chap_tree = None
for e in next(tree.iterfind('./body')):
    if e.tag == 'header':
        # The header becomes part of the template, so pages created from
        # here on include it.
        template_body.append(e)
    elif e.get('id') == 'TOC':
        # Collect titles/sections first; fix_links() needs `sections`.
        handle_toc(e)
        fix_links(e)
        toc_tree = ElementTree.ElementTree(copy.deepcopy(template))
        add_elem_to_body(toc_tree, e)
        toc_tree.write(os.path.join(args.output_dir, 'index.html'),
                       method='html')
    elif e.tag == 'h1':
        # Chapters are expected only after the TOC has been processed.
        assert titles
        assert sections
        if chap_num > 0:
            # Finish the previous chapter: bottom navigation, then write.
            add_nav_to_body(chap_tree, chap_num)
            chap_tree.write(os.path.join(args.output_dir, f'{chap_num}.html'),
                            method='html')
        chap_num = get_chap_num(e)
        chap_tree = ElementTree.ElementTree(copy.deepcopy(template))
        # Top navigation goes before the chapter heading.
        add_nav_to_body(chap_tree, chap_num)
        add_elem_to_body(chap_tree, e)
    else:
        # Any other element belongs to the chapter currently being built.
        assert chap_tree is not None
        fix_links(e)
        add_elem_to_body(chap_tree, e)

# Flush the last chapter. If the document had no <h1> at all there is no
# chapter page to write.
if chap_tree is not None:
    add_nav_to_body(chap_tree, chap_num)
    chap_tree.write(os.path.join(args.output_dir, f'{chap_num}.html'),
                    method='html')