Created
February 16, 2020 20:49
-
-
Save shimizukawa/474ecf0905e57820ebb8d2b3b65433f3 to your computer and use it in GitHub Desktop.
EPUBをSphinxソースに変換する
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import dataclasses
import pathlib
import re
import typing

import ebooklib
from ebooklib import epub
import html2text
import m2r
# Directory that receives the generated .rst files and the extracted images.
OUT_DIR = pathlib.Path('./output/')
# Created when the module is loaded (missing parents included); does nothing
# if the directory already exists.
OUT_DIR.mkdir(exist_ok=True, parents=True)
@dataclasses.dataclass
class Heading:
    """One flattened entry of the EPUB table of contents.

    ``sec`` is the 1-based position path assigned by ``process_toc`` (one
    number per list nesting level) and ``node`` is the TOC object found at
    that position.
    """

    # Variable-length path such as (2,) or (2, 1); its length is the depth.
    sec: typing.Tuple[int, ...]
    # The TOC entry itself; ``main`` reads its ``href`` and ``title``.
    node: typing.Union[epub.Link, epub.Section]
def process_toc(toc, sec=()) -> typing.List[Heading]:
    """Flatten a nested table of contents into a list of ``Heading`` objects.

    Args:
        toc: A TOC node. A ``list`` is a sequence of sibling entries, a
            ``tuple`` is handled as a pair whose two members are both
            processed with the current ``sec``, and anything else is
            treated as a leaf entry.
        sec: Position path accumulated so far (tuple of 1-based indexes).

    Returns:
        The headings in document order, each tagged with its position path.
    """
    if isinstance(toc, tuple):
        # Both halves of the pair keep the current path; only list
        # membership adds a new level.
        return process_toc(toc[0], sec) + process_toc(toc[1], sec)

    if isinstance(toc, list):
        headings = []
        for position, entry in enumerate(toc, start=1):
            headings.extend(process_toc(entry, sec=sec + (position,)))
        return headings

    # Leaf: wrap the node together with the path that led to it.
    return [Heading(sec, toc)]
# Characters replaced when a title is turned into a file name: whitespace
# (``\s`` already covers newline and tab), the characters ``? : *``, and the
# path separators / other characters that are not allowed in file names on
# common platforms (``/ \ < > | "``).
NORMALIZER = re.compile(r'[\s?:*/\\<>|"]')


def normalize(name):
    """Turn a heading title into a string usable as a file base name.

    Every character matched by ``NORMALIZER`` is replaced with ``-`` and
    consecutive ``-`` characters are then collapsed into a single one.

    Args:
        name: The title text to normalize.

    Returns:
        The normalized string (empty input yields an empty string).
    """
    normalized = NORMALIZER.sub('-', name)
    # Adjacent replaced characters (e.g. "?: ") would otherwise leave "---".
    return re.sub(r'-+', '-', normalized)
def main():
    """Convert the EPUB into Sphinx reST sources below ``OUT_DIR``.

    The conversion runs in three steps:

    1. Every TOC entry is converted HTML -> Markdown -> reST. A top-level
       entry starts a new ``<normalized title>.rst`` file; deeper entries
       are appended to the file of the top-level entry that precedes them.
    2. ``index.rst`` is written with a ``toctree`` listing those files.
    3. Image and cover items are copied out of the book; document,
       navigation and style items are skipped.

    Raises:
        NotImplementedError: If the book contains an item whose type is
            none of document, navigation, style, image or cover.
    """
    book = epub.read_epub('EXPERT_PYTHON_PROGRAMMING_THIRD_EDITION-9781789808896.epub')
    heads = process_toc(book.toc)
    names = []
    # Chapter files already written during this run. The first write to a
    # file truncates it so that re-running the script does not duplicate
    # content left over from a previous run; later writes append.
    started = set()

    # writing chapters
    for h in heads:
        item = book.get_item_with_href(h.node.href)
        title = h.node.title
        basename = normalize(title)
        depth = len(h.sec)
        md = html2text.html2text(item.get_content().decode())

        # heading level
        first_line, rest_lines = md.split('\n', 1)
        if depth == 1 and first_line.startswith('# '):
            # Chapter title: turn "title + underline" into
            # "overline + title + underline".
            rst_first_line = m2r.convert(first_line)
            _t, _hr = rst_first_line.strip().split('\n')
            rst_first_line = f'{_hr}\n{_t}\n{_hr}\n'
        else:
            # Prefix one extra '#' per nesting level below 2 before converting.
            first_line = '#' * (depth-2) + first_line
            rst_first_line = m2r.convert(first_line)
        rst = rst_first_line + m2r.convert(rest_lines)

        if depth == 1:
            # A top-level entry switches the output to a new chapter file.
            file = OUT_DIR / f'{basename}.rst'
            names.append(file.relative_to('.').stem)
            print('Writing ...', h.sec[0], file.relative_to('.'))

        mode = 'a' if file in started else 'w'
        started.add(file)
        with file.open(mode, encoding='utf-8') as f:
            f.write(rst)

    # writing index
    # Explicit utf-8, matching the chapter files, so chapter names do not
    # depend on the platform's default encoding.
    with (OUT_DIR / 'index.rst').open('w', encoding='utf-8') as f:
        f.write('Expert Python Programming 3rd\n')
        f.write('=============================\n')
        f.write('.. toctree::\n')
        f.write('\n')
        f.write(''.join([f' {name}\n' for name in names]))

    # writing images
    for item in book.get_items():
        file = (OUT_DIR / item.get_name())
        item_type = item.get_type()
        if item_type == ebooklib.ITEM_DOCUMENT:
            # Documents were already handled by the chapter loop.
            continue
        elif item_type == ebooklib.ITEM_NAVIGATION:
            print('skip navigation file')
            continue
        elif item_type == ebooklib.ITEM_STYLE:
            print('skip style file')
            continue

        file.parent.mkdir(exist_ok=True, parents=True)
        if item_type == ebooklib.ITEM_IMAGE:
            print('Writing ...', file.relative_to('.'))
            file.write_bytes(item.get_content())
        elif item_type == ebooklib.ITEM_COVER:
            # The cover is stored under a fixed name next to its original path.
            file = file.with_name('cover.png')
            print('Writing ...', file.relative_to('.'))
            file.write_bytes(item.get_content())
        else:
            raise NotImplementedError('Unknown Type: %s' % item)
# Run the conversion only when this file is executed as a script.
if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
docutils==0.16
EbookLib==0.17.1
html2text==2020.1.16
lxml==4.5.0
m2r==0.2.1
mistune==0.8.4
six==1.14.0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
EbookLib
html2text
m2r
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment