Created
April 12, 2021 00:22
-
-
Save karlrwjohnson/36db59ab8f4506d770b521eb8f71b233 to your computer and use it in GitHub Desktop.
Script to edit a PDF file's bookmarks using a text editor (uses pdftk-java and Python3)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3 | |
## | |
## PDF Bookmark Patcher script | |
## | |
## This script allows you to use a regular text editor to edit the bookmarks | |
## in a PDF file. It is a wrapper around another tool called PDFtk-java. | |
## | |
## Usage: | |
## 1. < replace somefile.pdf with the name of your file > | |
## 2. python3 ../extract_bookmarks.py somefile.pdf --export-text bookmarks.txt | |
## 3. < edit bookmarks.txt > | |
## 4. python3 ../extract_bookmarks.py somefile.pdf --import-text bookmarks.txt | |
## 5. < somefile.bookmarked.pdf is a copy of the file with updated bookmarks > | |
## | |
## Requires: | |
## - PDFtk-java (https://gitlab.com/pdftk-java/pdftk, https://linuxhint.com/install_pdftk_ubuntu/) | |
## - Make sure that `pdftk` is present in your PATH | |
## - Python 3 | |
## | |
import html | |
import re | |
import sys | |
from argparse import ArgumentParser | |
from bs4 import BeautifulSoup | |
from subprocess import Popen, PIPE | |
from logging import getLogger, DEBUG, basicConfig | |
from textwrap import dedent | |
from typing import List, NamedTuple, Union | |
# Records are rendered as "LEVEL - message" by the root handler.
basicConfig(format='%(levelname)s - %(message)s')

# The whole script logs through the root logger.
logger = getLogger()
class PdfBookmark(NamedTuple):
    """A single bookmark as a flat record (title, nesting level, target page)."""

    # Text of the bookmark.
    title: str
    # Nesting depth of the bookmark; the script treats 1 as top level.
    level: int
    # Page the bookmark points to; the script uses 0 when none was given.
    page_number: int
class BookmarkTreeNode(NamedTuple):
    """A bookmark together with the bookmarks nested underneath it."""

    # Text of the bookmark.
    title: str
    # Nesting depth of this node.
    level: int
    # Page the bookmark points to.
    page_number: int
    # Nested bookmarks; the list is mutated in place while a tree is built.
    children: List['BookmarkTreeNode']
class TextTreeNode(NamedTuple):
    """One line of an indented text file plus the lines nested beneath it."""

    # The line's text with its leading whitespace removed.
    content: str
    # The leading whitespace that was removed from the line.
    indent: str
    # Position of the line in the input, as assigned by the parser.
    line_number: int
    # More deeply indented lines that belong to this one.
    children: List['TextTreeNode']
def main():
    """Parse the command line and run the requested export or import.

    Exactly one of ``--export-text`` / ``--import-text`` must be supplied;
    argparse prints a usage error and exits otherwise, instead of the script
    silently doing nothing.

    Returns:
        0 once the requested action has completed.
    """
    # Only DEBUG is imported at module level; INFO is needed just here.
    from logging import INFO

    parser = ArgumentParser(description='Use pdftk to edit bookmarks on a PDF')
    parser.add_argument('pdf_file', help='PDF file to process')

    action = parser.add_mutually_exclusive_group(required=True)
    action.add_argument('--export-text', help='Text file to write the extracted bookmarks to')
    action.add_argument('--import-text', help='Text file of edited bookmarks to apply to the PDF')

    parser.add_argument('--output-pdf', help='PDF file to output to (used with --import-text)')
    parser.add_argument('--verbose', help='Enable verbose logging', action='store_true')
    args = parser.parse_args()

    # Honour --verbose: INFO keeps the "running command" messages visible,
    # DEBUG additionally shows the internal progress messages.
    logger.setLevel(DEBUG if args.verbose else INFO)

    if args.export_text is not None:
        export_text(args.pdf_file, args.export_text)
    else:
        # The required, mutually exclusive group guarantees --import-text here.
        import_text(args.pdf_file, args.import_text, args.output_pdf)
    return 0
def export_text(pdf_filename: str, bookmark_filename: str):
    """Extract the bookmarks of a PDF into an editable text file.

    Args:
        pdf_filename: PDF file whose bookmarks are read via pdftk.
        bookmark_filename: Text file to create or overwrite with the result.
    """
    data_lines: List[str] = dump_pdf_data(pdf_filename)
    bookmarks: List[PdfBookmark] = parse_bookmarks_from_pdf_data(data_lines)
    tree: List[BookmarkTreeNode] = build_bookmark_tree(bookmarks)
    text: str = export_bookmarks_text(tree, pdf_filename, bookmark_filename)

    # Open the output only after the text has been produced, so a failing
    # pdftk run cannot truncate an existing bookmarks file.
    with open(bookmark_filename, 'w') as outfile:
        outfile.write(text)
def import_text(pdf_filename: str, bookmark_filename: str, output_filename: Union[str, None]):
    """Apply the bookmarks from a text file to a copy of a PDF.

    Args:
        pdf_filename: PDF file to read; it is not modified.
        bookmark_filename: Indented text file describing the bookmarks.
        output_filename: Where to write the patched PDF. When empty or None,
            a name is derived from ``pdf_filename`` by turning a trailing
            ".pdf" (any letter case) into ".bookmarked.pdf", or by appending
            ".bookmarked.pdf" when there is no such suffix.
    """
    if not output_filename:
        # Anchor the substitution at the end of the name so a ".pdf" elsewhere
        # in the path is left alone.
        output_filename = re.sub(r'\.pdf\Z', '.bookmarked.pdf', pdf_filename, flags=re.IGNORECASE)
        if output_filename == pdf_filename:
            # No ".pdf" suffix to replace: never fall back to the input path.
            output_filename = pdf_filename + '.bookmarked.pdf'
        logger.info('Automatically generating output filename %s', output_filename)

    with open(bookmark_filename) as infile:
        file_lines: List[str] = infile.read().split('\n')

    text_tree: List[TextTreeNode] = import_indented_text(file_lines)
    bookmarks: List[PdfBookmark] = build_pdf_bookmark_list(text_tree)
    logger.info(f'Loaded {len(bookmarks)} bookmarks with {sum(1 for x in bookmarks if x.page_number != 0)} page numbers')

    original_data_lines: List[str] = dump_pdf_data(pdf_filename)
    patched_data_lines: List[str] = patch_bookmarks_into_pdf_data(original_data_lines, bookmarks)
    update_pdf_data(pdf_filename, output_filename, patched_data_lines)
def dump_pdf_data(filename: str) -> List[str]:
    """Run ``pdftk <filename> dump_data_utf8`` and return its output lines.

    Args:
        filename: PDF file to inspect.

    Returns:
        The command's standard output, decoded as UTF-8 and split on "\\n".

    Raises:
        RuntimeError: If pdftk exits with a non-zero status.
    """
    command = ['pdftk', filename, 'dump_data_utf8']
    logger.info(f'Running command: {command!r}')

    popen = Popen(command, stdout=PIPE)
    stdout_data, _ = popen.communicate()
    # Explicit check rather than assert: asserts are removed under "python -O".
    if popen.returncode != 0:
        raise RuntimeError(f'{command} failed (exit code {popen.returncode})')

    ret = stdout_data.decode('utf-8').split('\n')
    logger.debug(f'Command returned {len(ret)} lines')
    return ret
def update_pdf_data(input_filename: str, output_filename: str, contents: List[str]) -> None:
    """Write a copy of a PDF whose metadata is replaced via pdftk.

    Runs ``pdftk <input> update_info_utf8 - output <output> verbose``, feeding
    ``contents`` (joined with newlines, UTF-8 encoded) on standard input and
    printing whatever pdftk writes to standard output.

    Args:
        input_filename: PDF file to read.
        output_filename: PDF file to create.
        contents: Lines of pdftk metadata to apply.

    Raises:
        RuntimeError: If pdftk exits with a non-zero status.
    """
    command = ['pdftk', input_filename, 'update_info_utf8', '-', 'output', output_filename, 'verbose']
    content_bytes = '\n'.join(contents).encode('utf-8')
    logger.info(f'Running command with {len(contents)} lines of input: {command!r}')

    popen = Popen(command, stdin=PIPE, stdout=PIPE)
    stdout_data, _ = popen.communicate(content_bytes)
    # Show pdftk's output before checking the status so it is visible on failure too.
    print(stdout_data.decode())

    # Explicit check rather than assert: asserts are removed under "python -O".
    if popen.returncode != 0:
        raise RuntimeError(f'{command} failed (exit code {popen.returncode})')
def parse_bookmarks_from_pdf_data(lines: List[str]) -> List[PdfBookmark]:
    """Collect the bookmark records found in pdftk ``dump_data`` output.

    A record starts at a ``BookmarkBegin`` line and is made of the following
    ``BookmarkTitle``, ``BookmarkLevel`` and ``BookmarkPageNumber`` lines.
    All other lines are ignored. Malformed lines and incomplete records are
    reported with a warning and skipped; parsing always continues.

    Args:
        lines: Lines of pdftk output, without trailing newlines.

    Returns:
        The complete bookmarks, in the order they appear in ``lines``.
    """
    ret: List[PdfBookmark] = []
    line_number = 0
    current_bookmark_fields: dict = {}
    expected_keys = set(PdfBookmark._fields)

    def flush():
        # Snapshot and reset first so every early return leaves a clean slate.
        flushed_state = dict(current_bookmark_fields)
        current_bookmark_fields.clear()

        # The very first BookmarkBegin flushes an empty record; not an error.
        if not flushed_state:
            return

        # Skip incomplete records but keep parsing the remaining ones.
        missing_keys = expected_keys - flushed_state.keys()
        if missing_keys:
            logger.warning('Missing keys %s when flushing bookmark on line %s', missing_keys, line_number)
            return

        ret.append(PdfBookmark(**flushed_state))

    def store_int(field: str, text: str):
        # Shared by the two integer-valued properties.
        try:
            current_bookmark_fields[field] = int(text)
        except ValueError:
            logger.warning('Cannot parse %s as an integer on line %s', text, line_number)

    for line_number, raw_line in enumerate(lines, start=1):
        # Tolerate CRLF output: the caller splits on "\n" only.
        prop, separator, value = raw_line.rstrip('\r').partition(': ')

        if prop == 'BookmarkBegin':
            flush()
            continue
        if prop not in ('BookmarkTitle', 'BookmarkLevel', 'BookmarkPageNumber'):
            continue
        if not separator:
            logger.warning('Missing value on line %s', line_number)
            continue

        if prop == 'BookmarkTitle':
            current_bookmark_fields['title'] = html.unescape(value)
        elif prop == 'BookmarkLevel':
            store_int('level', value)
        else:
            store_int('page_number', value)

    # The last record has no following BookmarkBegin to trigger its flush.
    flush()
    return ret
def patch_bookmarks_into_pdf_data(pdftk_data: List[str], bookmarks: List[PdfBookmark]) -> List[str]:
    """Return ``pdftk_data`` with its bookmark lines replaced by ``bookmarks``.

    Every line starting with ``BookmarkBegin``, ``BookmarkTitle``,
    ``BookmarkLevel`` or ``BookmarkPageNumber`` is dropped. The new bookmark
    lines are inserted where the first ``BookmarkBegin`` line stood, or
    appended at the end when the input had none. Titles are HTML-escaped.

    Args:
        pdftk_data: Lines of pdftk metadata.
        bookmarks: Bookmarks to write, in output order.

    Returns:
        A new list of lines; ``pdftk_data`` is not modified.
    """
    replacement: List[str] = []
    for entry in bookmarks:
        replacement.append('BookmarkBegin')
        replacement.append(f'BookmarkTitle: {html.escape(entry.title)}')
        replacement.append(f'BookmarkLevel: {entry.level}')
        replacement.append(f'BookmarkPageNumber: {entry.page_number}')

    stale_prefixes = ('BookmarkTitle', 'BookmarkLevel', 'BookmarkPageNumber')
    patched: List[str] = []
    inserted = False

    for row in pdftk_data:
        if row.startswith('BookmarkBegin'):
            # The first old record marks the insertion point; later ones just vanish.
            if not inserted:
                patched.extend(replacement)
                inserted = True
        elif not row.startswith(stale_prefixes):
            patched.append(row)

    # No existing bookmark section: put the new one at the end.
    if not inserted:
        patched.extend(replacement)

    return patched
def build_bookmark_tree(bookmarks: List[PdfBookmark]) -> List[BookmarkTreeNode]:
    """Nest a flat bookmark list into a tree using each bookmark's level.

    When a bookmark is more than one level deeper than its predecessor,
    "<placeholder>" nodes (page number 0) are inserted for the skipped levels.

    Args:
        bookmarks: Bookmarks in document order.

    Returns:
        The level-1 nodes; deeper bookmarks hang off their ``children`` lists.

    Raises:
        RuntimeError: If a bookmark has a level below 1.
    """
    logger.debug('converting list of %s bookmarks into a tree based on their "level" properties', len(bookmarks))

    # The value that we'll return -- the list of bookmark nodes with level=1.
    # Elements are added to it through ancestor_child_lists.
    root_node_list: List[BookmarkTreeNode] = []

    # Stack of the current node's ancestors -- or rather, of the ancestors'
    # child lists. The first entry is the list of root nodes (level=1), the
    # second is the child list of the current level=1 node (level=2), and so
    # forth. Its length is therefore the level a newly pushed node would get.
    ancestor_child_lists: List[List[BookmarkTreeNode]] = [root_node_list]

    def get_current_level():
        return len(ancestor_child_lists)

    def push_node(node: BookmarkTreeNode):
        # Add the node to its direct ancestor's list of children...
        ancestor_child_lists[-1].append(node)
        # ...and make its own child list the target of further pushes.
        ancestor_child_lists.append(node.children)

    def pop_node():
        if get_current_level() <= 1:
            raise RuntimeError("Bookmark level should not go below 1")
        ancestor_child_lists.pop()

    for bookmark in bookmarks:
        # A node can only be added to a parent exactly one level above it.
        # If the stack is deeper than this bookmark, peel back to its parent.
        while get_current_level() > bookmark.level:
            pop_node()

        # If the bookmark skips levels (e.g. level 3 straight to level 5),
        # fill the gap with placeholders. Each placeholder records the level
        # it actually occupies, not the level of the bookmark being inserted.
        while get_current_level() < bookmark.level:
            placeholder_node = BookmarkTreeNode(
                title="<placeholder>",
                level=get_current_level(),
                page_number=0,
                children=[],
            )
            push_node(placeholder_node)

        # Add the current node
        node = BookmarkTreeNode(
            title=bookmark.title,
            level=bookmark.level,
            page_number=bookmark.page_number,
            children=[],
        )
        push_node(node)

    logger.debug('finished generating bookmark tree')
    return root_node_list
def export_bookmarks_text(bookmark_tree: List[BookmarkTreeNode], pdf_filename: str, bookmark_filename: str) -> str:
    """Render a bookmark tree as the editable text format.

    The result starts with a "#"-commented instruction header, followed by one
    ``<page number>. <title>`` line per bookmark. Children are indented one
    tab deeper than their parent.

    Args:
        bookmark_tree: Top-level bookmark nodes.
        pdf_filename: Name of the source PDF, quoted in the header.
        bookmark_filename: Name of the text file, quoted in the header.

    Returns:
        The complete file content as a single string.
    """
    logger.debug('generating YML file from tree')

    # The re-import hint must name the flag the argument parser really
    # defines (--import-text), and the example shows children indented.
    instructions = dedent(f"""\
        # Bookmarks extracted from {pdf_filename}
        #
        # Instructions:
        #
        # Using a text editor, update the following list of bookmarks.
        # Each bookmark should appear on its own line, and it should follow the format
        # <page number>. <title>
        # To make some bookmarks appear as children of others, use spaces or tabs to indent.
        #
        # Blank lines and lines starting with a "#" (like this one) are comments and will not be interpreted as a bookmark
        #
        # Then, import them back into the PDF file using the command:
        # python {sys.argv[0]} {pdf_filename} --import-text {bookmark_filename}
        #
        # E.g.:
        #
        # 1. Introduction
        # 3. Episode IV: A New Hope
        #     4. Scene 1: Vader captures Leia
        #     10. Scene 2: Luke on Tattoine
        # 105. Episode V: The Empire Strikes Back
        # 192. Episode VI: Return of the Jedi
        """)

    lines = [instructions, '']

    def print_tree_node(bookmark_tree: List[BookmarkTreeNode], indent: str = ''):
        # Depth-first walk: a node's line is emitted before its children's.
        next_indent = indent + '\t'
        for current in bookmark_tree:
            lines.append(f'{indent}{current.page_number}. {current.title}')
            print_tree_node(current.children, next_indent)

    print_tree_node(bookmark_tree)
    ret = '\n'.join(lines)

    logger.debug('finished generating text file')
    return ret
def import_indented_text(file_lines: List[str]) -> List[TextTreeNode]:
    """Turn indented text lines into a tree of ``TextTreeNode`` objects.

    Blank lines and lines whose first non-blank character is "#" are skipped.
    Each remaining line becomes a child of the closest preceding line whose
    indentation is a strict prefix of its own.

    Args:
        file_lines: Lines of the text file, without trailing newlines.

    Returns:
        The unindented (top-level) nodes. Each node's ``line_number`` is the
        1-based position of its line in ``file_lines``.

    Raises:
        RuntimeError: If a line's indentation neither extends nor is a prefix
            of the enclosing line's indentation (mixed tabs and spaces).
    """
    # The value that we'll return -- the list of unindented file lines.
    # Elements are added to it through ancestor_child_lists.
    root_node_list: List[TextTreeNode] = []

    # Stack of the current node's ancestors -- or rather, of the ancestors'
    # child lists. The first entry is the list of root nodes, the second is
    # the child list of the current root node, and so forth.
    ancestor_child_lists: List[List[TextTreeNode]] = [root_node_list]

    def get_current_level():
        return len(ancestor_child_lists)

    def push_node(node: TextTreeNode):
        # Add the node to its direct ancestor's list of children...
        ancestor_child_lists[-1].append(node)
        # ...and make its own child list the target of further pushes.
        ancestor_child_lists.append(node.children)

    def pop_node():
        if get_current_level() <= 1:
            raise RuntimeError("Bookmark level should not go below 1")
        ancestor_child_lists.pop()

    # Count from 1 so the numbers in messages match what a text editor shows.
    for line_number, line in enumerate(file_lines, start=1):
        # Skip blank lines and comments
        if re.match(r'^\s*#', line) or re.match(r'^\s*$', line):
            continue

        indent, content = re.match(r'^(\s*)(.+)$', line).groups()

        # Find the most-direct ancestor.
        # ancestor_child_lists[-2][-1] is the node that owns the child list on
        # top of the stack. If that node's indent starts with this line's
        # indent, this line is its peer or an aunt/uncle, so pop until a node
        # with a smaller indent is on top.
        while len(ancestor_child_lists) > 1 and ancestor_child_lists[-2][-1].indent.startswith(indent):
            pop_node()

        # The child's indent must extend the ancestor's indent. If it does
        # not, tabs and spaces were mixed and the nesting cannot be determined.
        if len(ancestor_child_lists) > 1 and not indent.startswith(ancestor_child_lists[-2][-1].indent):
            raise RuntimeError(f"Inconsistent indentation whitespace at line {line_number}. Make sure you're using either tabs or spaces, not both!")

        node = TextTreeNode(
            content=content,
            indent=indent,
            line_number=line_number,
            children=[],
        )
        push_node(node)

    return root_node_list
def build_pdf_bookmark_list(text_tree: List[TextTreeNode], level: int = 1) -> List[PdfBookmark]:
    """Flatten a text tree into bookmarks, parents before their children.

    Each node's content is read as ``<page number>. <title>``; the page number
    and the dot are optional. A node without a page number gets page 0 and a
    warning is logged.

    Args:
        text_tree: Sibling nodes to convert.
        level: Bookmark level assigned to ``text_tree``; children get ``level + 1``.

    Returns:
        The bookmarks in depth-first order.
    """
    ret: List[PdfBookmark] = []
    for line in text_tree:
        page_number, title = re.match(r'^\s*(\d+)?(?:\.?)\s*(.+)$', line.content).groups()
        if page_number is None:
            # One placeholder per argument: a mismatch makes logging report a
            # formatting error instead of the message.
            logger.warning(
                'No page number for bookmark named %s on line %s (level %s)',
                title, line.line_number, level,
            )
            page_number = 0
        ret.append(PdfBookmark(title=title, level=level, page_number=int(page_number)))
        ret += build_pdf_bookmark_list(line.children, level=level + 1)
    return ret
if __name__ == '__main__':
    # sys.exit is always available; the bare exit() helper is injected by the
    # site module for interactive use and is absent under "python -S".
    sys.exit(main())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment