]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | #!/usr/bin/env python3 |
2 | ||
3 | # This script takes the single-page HTML output from pandoc - tutorial.html - | |
4 | # and splits it into many pages in split/: one page index.html for the table | |
5 | # of contents, and an additional page for each chapter. We make sure that | |
6 | # links from the TOC to each chapter, and also links across chapters, | |
7 | # continue to work correctly, and also add links from each chapter back to | |
8 | # the TOC, as well as to the next and previous chapters. | |
9 | ||
10 | ||
11 | # Copyright (C) 2018 ScyllaDB. | |
12 | # | |
13 | # This file is open source software, licensed to you under the terms | |
14 | # of the Apache License, Version 2.0 (the "License"). See the NOTICE file | |
15 | # distributed with this work for additional information regarding copyright | |
16 | # ownership. You may not use this file except in compliance with the License. | |
17 | # | |
18 | # You may obtain a copy of the License at | |
19 | # | |
20 | # http://www.apache.org/licenses/LICENSE-2.0 | |
21 | # | |
22 | # Unless required by applicable law or agreed to in writing, | |
23 | # software distributed under the License is distributed on an | |
24 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
25 | # KIND, either express or implied. See the License for the | |
26 | # specific language governing permissions and limitations | |
27 | # under the License. | |
28 | ||
f67539c2 TL |
29 | from xml.etree import ElementTree |
30 | import argparse | |
31 | import copy | |
32 | import os | |
33 | ||
# chapter number (int) => chapter title; filled in by handle_toc() and read
# by add_nav_to_body() to label the "Previous"/"Next" links
titles = {}
# href of a TOC entry (e.g. '#some-section') => number of the chapter that
# contains it; filled in by handle_toc() and read by fix_links()
sections = {}
f67539c2 TL |
38 | |
39 | ||
def add_elem_to_body(tree, e):
    """Append element *e* as the last child of the <body> of *tree*.

    The <body> is looked up as a direct child of the tree's root; if there
    is none, the lookup raises StopIteration.
    """
    next(tree.iterfind('./body')).append(e)
43 | ||
44 | ||
def add_nav_to_body(tree, chap_num):
    """Append a navigation <div> to the <body> of *tree*.

    The div always starts with a link back to index.html. A "Previous" link
    and a "Next" link follow when chapter ``chap_num - 1`` / ``chap_num + 1``
    is present in the module-level ``titles`` map.
    """
    page_body = next(tree.iterfind('./body'))
    nav = ElementTree.SubElement(page_body, 'div')

    last_link = ElementTree.SubElement(nav, 'a', href='index.html')
    last_link.text = 'Back to table of contents'
    last_link.tail = '.'

    neighbours = (
        (" Previous: ", chap_num - 1),
        (" Next: ", chap_num + 1),
    )
    for label, index in neighbours:
        if index not in titles:
            continue
        # The label is plain text between two links, so it is stored in the
        # tail of the link that precedes it.
        last_link.tail += label
        last_link = ElementTree.SubElement(nav, 'a', href=f'{index}.html')
        last_link.text = f'{index} {titles[index]}'
        last_link.tail = '.'
69 | ||
70 | ||
def handle_toc(toc):
    """Record chapter titles and section locations from the TOC element.

    Every top-level ``ul/li`` under *toc* is treated as one chapter. For each
    of them this fills two module-level maps:

    * ``titles``: chapter number => chapter title, used later for the links
      to the next and previous chapter;
    * ``sections``: href of the chapter and of every nested entry whose href
      starts with '#' => chapter number, used later to redirect links to the
      page that contains the target.
    """
    for chapter_item in toc.iterfind('./ul/li'):
        chapter_link = next(chapter_item.iterfind('./a[@href]'))
        number_span = next(chapter_link.iterfind(
            './span[@class="toc-section-number"]'))

        # The title is the text that follows the number <span> inside the
        # link, i.e. the span's tail.
        number = int(number_span.text)
        titles[number] = number_span.tail.strip()

        # The chapter's own anchor lives in the chapter's page ...
        sections[chapter_link.get('href')] = number
        # ... and so does every nested (sub)section that is an in-document
        # anchor.
        nested_hrefs = (link.get('href')
                        for link in chapter_item.iterfind('.//ul/li/a[@href]'))
        sections.update((target, number)
                        for target in nested_hrefs
                        if target.startswith('#'))
92 | ||
93 | ||
def fix_links(e):
    """Point in-document links below *e* at the page that holds the target.

    A link such as <a href="#some-title"> whose href is a key of the
    module-level ``sections`` map is rewritten to "N.html#some-title", where
    N is the chapter number recorded for it. Links that do not start with
    '#', or whose target is not in ``sections``, are left untouched.
    """
    for anchor in e.iterfind('.//a[@href]'):
        target = anchor.get('href')
        if not target.startswith('#') or target not in sections:
            continue
        anchor.set('href', f'{sections[target]}.html{target}')
103 | ||
104 | ||
def remove_ns_prefix(tree):
    """Strip the XHTML namespace from the tag of every element in *tree*.

    *tree* is anything with an ``iter()`` method yielding elements (an
    ElementTree or an Element); tags are modified in place. Tags in other
    namespaces are left as they are.
    """
    xhtml_ns = '{http://www.w3.org/1999/xhtml}'
    ns_len = len(xhtml_ns)
    for node in tree.iter():
        tag = node.tag
        if tag.startswith(xhtml_ns):
            node.tag = tag[ns_len:]
110 | ||
111 | ||
def get_chap_num(element):
    """Return the chapter number carried by a chapter heading element.

    Two places are tried, in order:

    1. the ``data-number`` attribute of *element*;
    2. the text of a direct child
       ``<span class="header-section-number">``.

    (Presumably different pandoc versions emit one form or the other --
    TODO confirm.)

    Raises:
        AssertionError: if neither location yields a number.
        ValueError: if the value found is not a valid integer.
    """
    # Read from the parameter, not from a module-level loop variable that
    # merely happens to refer to the same element at the call site.
    data_num = element.get('data-number')
    if data_num:
        return int(data_num)
    data_num = element.findtext('./span[@class="header-section-number"]')
    if data_num:
        return int(data_num)
    # Raised explicitly (rather than via ``assert``) so that the check is
    # not stripped under ``python -O``, which would silently return None.
    raise AssertionError("section number not found")
120 | ||
121 | ||
# Command line: both options are mandatory, so a missing one is reported as
# a usage error instead of failing later on a None path.
parser = argparse.ArgumentParser()
parser.add_argument('--input', required=True)
parser.add_argument('--output-dir', required=True)
args = parser.parse_args()

tree = ElementTree.parse(args.input)
# remove_ns_prefix() already visits every element of the tree it is given,
# so a single call on the whole document is enough.
remove_ns_prefix(tree)

# The template is a copy of the whole document with an emptied <body>; each
# output page starts from a deep copy of it. Note that clear() also drops
# the attributes of <body>, not only its children.
template = copy.deepcopy(tree.getroot())
template_body = next(template.iterfind('./body'))
template_body.clear()
133 | ||
134 | # iterate through the children elements in body | |
135 | # body element is composed of | |
136 | # - header | |
137 | # - toc | |
138 | # - h1,h2,p,... | |
139 | # h1 marks the beginning of a chapter | |
140 | ||
# Walk the direct children of <body> in document order, routing each one to
# the template (header), to index.html (TOC) or to the current chapter page.
chap_num = 0
chap_tree = None
for e in next(tree.iterfind('./body')):
    if e.tag == 'header':
        # The header becomes part of the template, so every page written
        # from here on carries it.
        template_body.append(e)
        continue

    if e.get('id') == 'TOC':
        # Collect titles/sections first: fix_links() needs them.
        handle_toc(e)
        fix_links(e)
        toc_tree = ElementTree.ElementTree(copy.deepcopy(template))
        add_elem_to_body(toc_tree, e)
        toc_path = os.path.join(args.output_dir, 'index.html')
        toc_tree.write(toc_path, method='html')
        continue

    if e.tag != 'h1':
        # Ordinary content: belongs to the chapter currently being built.
        assert chap_tree is not None
        fix_links(e)
        add_elem_to_body(chap_tree, e)
        continue

    # An <h1> starts a new chapter; the TOC must have been seen by now.
    assert titles
    assert sections
    if chap_num > 0:
        # Finish the previous chapter: navigation at the bottom, then save.
        add_nav_to_body(chap_tree, chap_num)
        chap_path = os.path.join(args.output_dir, f'{chap_num}.html')
        chap_tree.write(chap_path, method='html')
    chap_num = get_chap_num(e)
    chap_tree = ElementTree.ElementTree(copy.deepcopy(template))
    # Navigation at the top of the new chapter, followed by its heading.
    add_nav_to_body(chap_tree, chap_num)
    add_elem_to_body(chap_tree, e)

# Finish and save the last chapter.
add_nav_to_body(chap_tree, chap_num)
chap_path = os.path.join(args.output_dir, f'{chap_num}.html')
chap_tree.write(chap_path, method='html')