ceph/src/seastar/doc/htmlsplit.py

   1 #!/usr/bin/env python3
   2
   3 # This script takes the single-page HTML output from pandoc - tutorial.html -
   4 # and splits it into many pages in split/: one page index.html for the table
   5 # of contents, and an additional page for each chapter. We make sure that
   6 # links from the TOC to each chapter, and also links across chapters,
# continue to work correctly, and also add links from each chapter back to
   8 # the TOC, as well as to the next and previous chapters.
   9
  10
  11 # Copyright (C) 2018 ScyllaDB.
  12 #
  13 # This file is open source software, licensed to you under the terms
  14 # of the Apache License, Version 2.0 (the "License").  See the NOTICE file
  15 # distributed with this work for additional information regarding copyright
  16 # ownership.  You may not use this file except in compliance with the License.
  17 #
  18 # You may obtain a copy of the License at
  19 #
  20 #   http://www.apache.org/licenses/LICENSE-2.0
  21 #
  22 # Unless required by applicable law or agreed to in writing,
  23 # software distributed under the License is distributed on an
  24 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  25 # KIND, either express or implied.  See the License for the
  26 # specific language governing permissions and limitations
  27 # under the License.
  28
  29 from xml.etree import ElementTree
  30 import argparse
  31 import copy
  32 import os
  33
# Chapter number (int) => chapter title. Filled in by handle_toc() and read
# by add_nav_to_body() to label the "Previous" and "Next" links.
titles = {}
# Link target exactly as it appears in an href attribute of the table of
# contents (for sub-sections always starting with '#') => number of the
# chapter containing it. Filled in by handle_toc() and read by fix_links().
sections = {}
  38
  39
  40 def add_elem_to_body(tree, e):
  41     body = next(tree.iterfind('./body'))
  42     body.append(e)
  43
  44
  45 def add_nav_to_body(tree, chap_num):
  46     body = next(tree.iterfind('./body'))
  47
  48     nav = ElementTree.SubElement(body, 'div')
  49     e = ElementTree.SubElement(nav, 'a',
  50                                href='index.html')
  51     e.text = 'Back to table of contents'
  52     e.tail = '.'
  53     prev_index = chap_num - 1
  54     if prev_index in titles:
  55         e.tail += " Previous: "
  56         prev_title = titles[prev_index]
  57         e = ElementTree.SubElement(nav, 'a',
  58                                    href=f'{prev_index}.html')
  59         e.text = f'{prev_index} {prev_title}'
  60         e.tail = '.'
  61     next_index = chap_num + 1
  62     if next_index in titles:
  63         e.tail += " Next: "
  64         next_title = titles[next_index]
  65         e = ElementTree.SubElement(nav, 'a',
  66                                    href=f'{next_index}.html')
  67         e.text = f'{next_index} {next_title}'
  68         e.tail = '.'
  69
  70
  71 def handle_toc(toc):
  72     for chap in toc.iterfind('./ul/li'):
  73         chap_href_elem = next(chap.iterfind('./a[@href]'))
  74         chap_num_elem = next(chap_href_elem.iterfind(
  75             './span[@class="toc-section-number"]'))
  76         # For chapters, remember the mapping from number to name in the
  77         # map "titles", so we can use them later in links to next and
  78         # previous chapter
  79         chap_num = int(chap_num_elem.text)
  80         titles[chap_num] = chap_num_elem.tail.strip()
  81
  82         # For all sections, remember the mapping from name-with-dashes
  83         # to the chapter number they are in in "sections". We need this
  84         # to support links to other sections.
  85         href = chap_href_elem.get('href')
  86         sections[href] = chap_num
  87         for section in chap.iterfind('.//ul/li/a[@href]'):
  88             href = section.get('href')
  89             # replace the link to '#section' with number N.M to chapterN#section
  90             if href.startswith('#'):
  91                 sections[href] = chap_num
  92
  93
  94 def fix_links(e):
  95     for link in e.findall('.//a[@href]'):
  96         href = link.get('href')
  97         if href.startswith('#') and href in sections:
  98             # In a chapter we can have a link to a different subsection, which
  99             # looks like <a href="#some-title">Some title</A>. We need to
 100             # replace this to refer to the right file after the split.
 101             chap_num = sections[href]
 102             link.set('href', f'{chap_num}.html{href}')
 103
 104
 105 def remove_ns_prefix(tree):
 106     prefix = '{http://www.w3.org/1999/xhtml}'
 107     for e in tree.iter():
 108         if e.tag.startswith(prefix):
 109             e.tag = e.tag[len(prefix):]
 110
 111
 112 def get_chap_num(element):
 113     data_num = e.get('data-number')
 114     if data_num:
 115         return int(data_num)
 116     data_num = e.findtext('./span[@class="header-section-number"]')
 117     if data_num:
 118         return int(data_num)
 119     assert data_num, "section number not found"
 120
 121
 122 parser = argparse.ArgumentParser()
 123 parser.add_argument('--input')
 124 parser.add_argument('--output-dir')
 125 args = parser.parse_args()
 126
 127 tree = ElementTree.parse(args.input)
 128 for e in tree.iter():
 129     remove_ns_prefix(e)
 130 template = copy.deepcopy(tree.getroot())
 131 template_body = next(template.iterfind('./body'))
 132 template_body.clear()
 133
 134 # iterate through the children elements in body
 135 # body element is composed of
 136 #  - header
 137 #  - toc
 138 #  - h1,h2,p,...
 139 # h1 marks the beginning of a chapter
 140
 141 chap_num = 0
 142 chap_tree = None
 143 for e in next(tree.iterfind('./body')):
 144     if e.tag == 'header':
 145         template_body.append(e)
 146     elif e.get('id') == 'TOC':
 147         handle_toc(e)
 148         fix_links(e)
 149         toc_tree = ElementTree.ElementTree(copy.deepcopy(template))
 150         add_elem_to_body(toc_tree, e)
 151         toc_tree.write(os.path.join(args.output_dir, 'index.html'),
 152                        method='html')
 153     elif e.tag == 'h1':
 154         assert titles
 155         assert sections
 156         if chap_num > 0:
 157             add_nav_to_body(chap_tree, chap_num)
 158             chap_tree.write(os.path.join(args.output_dir, f'{chap_num}.html'),
 159                             method='html')
 160         chap_num = get_chap_num(e)
 161         chap_tree = ElementTree.ElementTree(copy.deepcopy(template))
 162         add_nav_to_body(chap_tree, chap_num)
 163         add_elem_to_body(chap_tree, e)
 164     else:
 165         assert chap_tree is not None
 166         fix_links(e)
 167         add_elem_to_body(chap_tree, e)
 168
 169 add_nav_to_body(chap_tree, chap_num)
 170 chap_tree.write(os.path.join(args.output_dir, f'{chap_num}.html'),
 171                 method='html')