[ceph.git] / ceph / src / seastar / doc / htmlsplit.py

#!/usr/bin/env python3

# This script takes the single-page HTML output from pandoc - tutorial.html -
# and splits it into many pages in split/: one page index.html for the table
# of contents, and an additional page for each chapter. We make sure that
# links from the TOC to each chapter, and also links across chapters,
# continue to work correctly, and also add links from each chapter back to
# the TOC, as well as to the next and previous chapters.


# Copyright (C) 2018 ScyllaDB.
#
# This file is open source software, licensed to you under the terms
# of the Apache License, Version 2.0 (the "License").  See the NOTICE file
# distributed with this work for additional information regarding copyright
# ownership.  You may not use this file except in compliance with the License.
#
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.

from xml.etree import ElementTree
import argparse
import copy
import os

# chapter number to chapter title
titles = {}
# section id => chapter number
sections = {}


def add_elem_to_body(tree, e):
    """Append element ``e`` as the last child of the ``<body>`` of ``tree``.

    ``tree`` is anything offering ``iterfind()`` (an ElementTree or an
    Element whose direct child is ``body``). If no ``./body`` is found,
    ``next()`` raises StopIteration.
    """
    next(tree.iterfind('./body')).append(e)


def add_nav_to_body(tree, chap_num):
    """Append a navigation ``<div>`` to the ``<body>`` of ``tree``.

    The div always holds a link back to ``index.html``. A "Previous" link
    is added when ``chap_num - 1`` is a key of the module-level ``titles``
    map, and a "Next" link when ``chap_num + 1`` is; each links to
    ``<N>.html`` and is labelled ``"<N> <title>"``.
    """
    body = next(tree.iterfind('./body'))
    nav = ElementTree.SubElement(body, 'div')

    link = ElementTree.SubElement(nav, 'a', href='index.html')
    link.text = 'Back to table of contents'
    link.tail = '.'

    neighbours = ((" Previous: ", chap_num - 1), (" Next: ", chap_num + 1))
    for label, index in neighbours:
        if index not in titles:
            continue
        # The label is plain text, so it goes into the tail of the link
        # emitted just before it; then the new link becomes the latest one.
        link.tail += label
        link = ElementTree.SubElement(nav, 'a', href=f'{index}.html')
        link.text = f'{index} {titles[index]}'
        link.tail = '.'


def handle_toc(toc):
    """Collect chapter titles and section anchors from the TOC element.

    For each top-level entry (``./ul/li``) of ``toc`` this fills the two
    module-level maps:

    * ``titles[N]``: the chapter title, i.e. the stripped text that follows
      the ``<span class="toc-section-number">`` inside the chapter link;
    * ``sections[href] = N``: for the chapter link's own href and for every
      nested link below it whose href starts with ``#``.

    ``N`` is the integer parsed from the number span's text.
    """
    for entry in toc.iterfind('./ul/li'):
        chapter_link = next(entry.iterfind('./a[@href]'))
        number_span = next(chapter_link.iterfind(
            './span[@class="toc-section-number"]'))

        # Number -> title, used later to label the links to the previous
        # and next chapter.
        number = int(number_span.text)
        titles[number] = number_span.tail.strip()

        # Anchor -> chapter number, used later to redirect '#anchor' links
        # to the file of the chapter that contains the anchor. The
        # chapter's own href is recorded as-is; nested ones only when they
        # are in-page anchors.
        sections[chapter_link.get('href')] = number
        nested = (a.get('href') for a in entry.iterfind('.//ul/li/a[@href]'))
        sections.update((h, number) for h in nested if h.startswith('#'))


def fix_links(e):
    """Redirect in-page anchor links below ``e`` to the split chapter files.

    Every descendant ``<a href="#name">`` whose href is a key of the
    module-level ``sections`` map is rewritten in place to
    ``<chapter>.html#name``. Other links are left untouched.
    """
    for anchor in e.findall('.//a[@href]'):
        target = anchor.get('href')
        if not target.startswith('#'):
            continue
        if target not in sections:
            continue
        # After the split the anchor lives in another file, so the link
        # has to name that file explicitly.
        anchor.set('href', f'{sections[target]}.html{target}')


def remove_ns_prefix(tree):
    """Strip the XHTML namespace from every tag reachable via ``tree.iter()``.

    ElementTree spells namespaced tags as ``{uri}name``; tags carrying the
    XHTML uri are renamed in place to plain ``name``. Tags in any other
    namespace, or in none, are left as they are.
    """
    xhtml_ns = '{http://www.w3.org/1999/xhtml}'
    cut = len(xhtml_ns)
    for node in tree.iter():
        tag = node.tag
        if tag.startswith(xhtml_ns):
            node.tag = tag[cut:]


def get_chap_num(element):
    """Return the chapter number of a chapter heading ``element`` as an int.

    The number is read from the element's ``data-number`` attribute when
    that is present and non-empty; otherwise from the text of a direct
    child ``<span class="header-section-number">``.

    Raises:
        AssertionError: neither source yields a non-empty value.
        ValueError: the value found is not an integer literal (from int()).
    """
    # Look at the argument itself; do not rely on a same-named global.
    data_num = element.get('data-number')
    if not data_num:
        data_num = element.findtext('./span[@class="header-section-number"]')
    if not data_num:
        # Raised explicitly instead of via ``assert`` so that the check is
        # not stripped (and None silently returned) under ``python -O``.
        raise AssertionError("section number not found")
    return int(data_num)


def _finish_chapter(tree, num, output_dir):
    """Add the bottom navigation bar to a chapter page and write it to
    <output_dir>/<num>.html."""
    add_nav_to_body(tree, num)
    tree.write(os.path.join(output_dir, f'{num}.html'), method='html')


parser = argparse.ArgumentParser()
# Both options are mandatory: without them the script would only fail
# later with an obscure error from ElementTree.parse() / os.path.join().
parser.add_argument('--input', required=True,
                    help='single-page (X)HTML file to split')
parser.add_argument('--output-dir', required=True,
                    help='directory receiving index.html and <N>.html')
args = parser.parse_args()

tree = ElementTree.parse(args.input)
# remove_ns_prefix() walks the whole tree by itself, so one call on the
# root is enough; calling it once per node repeats that walk needlessly.
remove_ns_prefix(tree)

# The template is the input document with an emptied <body>; each output
# page starts out as a deep copy of it.
template = copy.deepcopy(tree.getroot())
template_body = next(template.iterfind('./body'))
template_body.clear()

os.makedirs(args.output_dir, exist_ok=True)

# iterate through the children elements in body
# body element is composed of
#  - header
#  - toc
#  - h1,h2,p,...
# h1 marks the beginning of a chapter

chap_num = 0
chap_tree = None
for e in next(tree.iterfind('./body')):
    if e.tag == 'header':
        # Shared by all pages: goes into the template, so every copy made
        # from here on carries it.
        template_body.append(e)
    elif e.get('id') == 'TOC':
        handle_toc(e)
        fix_links(e)
        toc_tree = ElementTree.ElementTree(copy.deepcopy(template))
        add_elem_to_body(toc_tree, e)
        toc_tree.write(os.path.join(args.output_dir, 'index.html'),
                       method='html')
    elif e.tag == 'h1':
        assert titles, "table of contents must precede the first chapter"
        assert sections, "table of contents must precede the first chapter"
        # A new chapter starts: flush the one collected so far, if any.
        if chap_tree is not None:
            _finish_chapter(chap_tree, chap_num, args.output_dir)
        chap_num = get_chap_num(e)
        chap_tree = ElementTree.ElementTree(copy.deepcopy(template))
        # Navigation bar at the top of the page; _finish_chapter() adds
        # the matching one at the bottom.
        add_nav_to_body(chap_tree, chap_num)
        add_elem_to_body(chap_tree, e)
    else:
        assert chap_tree is not None, "content found before the first <h1>"
        fix_links(e)
        add_elem_to_body(chap_tree, e)

# Flush the last chapter; a document without any <h1> has none to write.
if chap_tree is not None:
    _finish_chapter(chap_tree, chap_num, args.output_dir)
Commit	Line	Data
11fdf7f2 TL	1	#!/usr/bin/env python3
	2
	3	# This script takes the single-page HTML output from pandoc - tutorial.html -
	4	# and splits it into many pages in split/: one page index.html for the table
	5	# of contents, and an additional page for each chapter. We make sure that
	6	# links from the TOC to each chapter, and also links across chapters,
	7	# continue to work correctly, and also had links from each chapter back to
	8	# the TOC, as well as to the next and previous chapters.
	9
	10
	11	# Copyright (C) 2018 ScyllaDB.
	12	#
	13	# This file is open source software, licensed to you under the terms
	14	# of the Apache License, Version 2.0 (the "License"). See the NOTICE file
	15	# distributed with this work for additional information regarding copyright
	16	# ownership. You may not use this file except in compliance with the License.
	17	#
	18	# You may obtain a copy of the License at
	19	#
	20	# http://www.apache.org/licenses/LICENSE-2.0
	21	#
	22	# Unless required by applicable law or agreed to in writing,
	23	# software distributed under the License is distributed on an
	24	# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	25	# KIND, either express or implied. See the License for the
	26	# specific language governing permissions and limitations
	27	# under the License.
	28
f67539c2 TL	29	from xml.etree import ElementTree
	30	import argparse
	31	import copy
	32	import os
	33
	34	# chapter number to chapter title
11fdf7f2	35	titles = {}
f67539c2	36	# section id => chapter number
11fdf7f2	37	sections = {}
f67539c2 TL	38
	39
	40	def add_elem_to_body(tree, e):
	41	body = next(tree.iterfind('./body'))
	42	body.append(e)
	43
	44
	45	def add_nav_to_body(tree, chap_num):
	46	body = next(tree.iterfind('./body'))
	47
	48	nav = ElementTree.SubElement(body, 'div')
	49	e = ElementTree.SubElement(nav, 'a',
	50	href='index.html')
	51	e.text = 'Back to table of contents'
	52	e.tail = '.'
	53	prev_index = chap_num - 1
	54	if prev_index in titles:
	55	e.tail += " Previous: "
	56	prev_title = titles[prev_index]
	57	e = ElementTree.SubElement(nav, 'a',
	58	href=f'{prev_index}.html')
	59	e.text = f'{prev_index} {prev_title}'
	60	e.tail = '.'
	61	next_index = chap_num + 1
	62	if next_index in titles:
	63	e.tail += " Next: "
	64	next_title = titles[next_index]
	65	e = ElementTree.SubElement(nav, 'a',
	66	href=f'{next_index}.html')
	67	e.text = f'{next_index} {next_title}'
	68	e.tail = '.'
	69
	70
	71	def handle_toc(toc):
	72	for chap in toc.iterfind('./ul/li'):
	73	chap_href_elem = next(chap.iterfind('./a[@href]'))
	74	chap_num_elem = next(chap_href_elem.iterfind(
	75	'./span[@class="toc-section-number"]'))
	76	# For chapters, remember the mapping from number to name in the
	77	# map "titles", so we can use them later in links to next and
	78	# previous chapter
	79	chap_num = int(chap_num_elem.text)
	80	titles[chap_num] = chap_num_elem.tail.strip()
	81
	82	# For all sections, remember the mapping from name-with-dashes
	83	# to the chapter number they are in in "sections". We need this
	84	# to support links to other sections.
	85	href = chap_href_elem.get('href')
	86	sections[href] = chap_num
	87	for section in chap.iterfind('.//ul/li/a[@href]'):
	88	href = section.get('href')
11fdf7f2	89	# replace the link to '#section' with number N.M to chapterN#section
f67539c2 TL	90	if href.startswith('#'):
	91	sections[href] = chap_num
	92
	93
	94	def fix_links(e):
	95	for link in e.findall('.//a[@href]'):
	96	href = link.get('href')
	97	if href.startswith('#') and href in sections:
11fdf7f2 TL	98	# In a chapter we can have a link to a different subsection, which
	99	# looks like <a href="#some-title">Some title</A>. We need to
	100	# replace this to refer to the right file after the split.
f67539c2 TL	101	chap_num = sections[href]
	102	link.set('href', f'{chap_num}.html{href}')
	103
	104
	105	def remove_ns_prefix(tree):
	106	prefix = '{http://www.w3.org/1999/xhtml}'
	107	for e in tree.iter():
	108	if e.tag.startswith(prefix):
	109	e.tag = e.tag[len(prefix):]
	110
	111
	112	def get_chap_num(element):
	113	data_num = e.get('data-number')
	114	if data_num:
	115	return int(data_num)
	116	data_num = e.findtext('./span[@class="header-section-number"]')
	117	if data_num:
	118	return int(data_num)
	119	assert data_num, "section number not found"
	120
	121
	122	parser = argparse.ArgumentParser()
	123	parser.add_argument('--input')
	124	parser.add_argument('--output-dir')
	125	args = parser.parse_args()
	126
	127	tree = ElementTree.parse(args.input)
	128	for e in tree.iter():
	129	remove_ns_prefix(e)
	130	template = copy.deepcopy(tree.getroot())
	131	template_body = next(template.iterfind('./body'))
	132	template_body.clear()
	133
	134	# iterate through the children elements in body
	135	# body element is composed of
	136	# - header
	137	# - toc
	138	# - h1,h2,p,...
	139	# h1 marks the beginning of a chapter
	140
	141	chap_num = 0
	142	chap_tree = None
	143	for e in next(tree.iterfind('./body')):
	144	if e.tag == 'header':
	145	template_body.append(e)
	146	elif e.get('id') == 'TOC':
	147	handle_toc(e)
	148	fix_links(e)
	149	toc_tree = ElementTree.ElementTree(copy.deepcopy(template))
	150	add_elem_to_body(toc_tree, e)
	151	toc_tree.write(os.path.join(args.output_dir, 'index.html'),
	152	method='html')
	153	elif e.tag == 'h1':
	154	assert titles
	155	assert sections
	156	if chap_num > 0:
	157	add_nav_to_body(chap_tree, chap_num)
	158	chap_tree.write(os.path.join(args.output_dir, f'{chap_num}.html'),
	159	method='html')
	160	chap_num = get_chap_num(e)
	161	chap_tree = ElementTree.ElementTree(copy.deepcopy(template))
	162	add_nav_to_body(chap_tree, chap_num)
	163	add_elem_to_body(chap_tree, e)
	164	else:
165	assert chap_tree is not None
166	fix_links(e)
167	add_elem_to_body(chap_tree, e)
168
169	add_nav_to_body(chap_tree, chap_num)
170	chap_tree.write(os.path.join(args.output_dir, f'{chap_num}.html'),
171	method='html')