This was originally taken from https://superuser.com/a/1012669/894282 and then updated to be compatible with Python 3.x.
Requires PyPDF2 (or its successor pypdf, which the code below imports) to be installed.
This was originally taken from https://superuser.com/a/1012669/894282 and then updated to be compatible with Python 3.x.
Requires PyPDF2 (or its successor pypdf, which the code below imports) to be installed.
#! /usr/bin/env python
# Original author Nicholas Kim, modified by Yan Pashkovsky
# New license - GPL v3
import sys
import time
from contextlib import ExitStack
from pathlib import Path

# from PyPDF2 import PdfReader, PdfWriter
# try:
#     from PyPDF2.utils import PdfReadError
# except ImportError:
#     from PyPDF2._reader import PdfReadError
from pypdf import PdfReader, PdfWriter

try:
    from pypdf.utils import PdfReadError
except ImportError:
    from pypdf._reader import PdfReadError
def eprint(*args, **kwargs) -> None:
    """Print to stderr.

    Accepts the same positional and keyword arguments as the built-in
    ``print``; the output stream is always ``sys.stderr``.

    Taken from https://stackoverflow.com/a/14981125/7564988
    """
    print(*args, file=sys.stderr, **kwargs)
def get_cmdline_arguments(argv=None):
    """Retrieve command line arguments.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When ``None`` (the default) the parser
            falls back to ``sys.argv[1:]``.

    Returns:
        A ``(options, args)`` pair as produced by ``OptionParser.parse_args``:
        ``options`` carries ``output_filename``, ``bookmark_file`` and
        ``verbose``; ``args`` is the list of input file names.

    Exits:
        Prints the help text and exits with status 1 when fewer than two
        input files are given.
    """
    from optparse import OptionParser

    usage_string = "%prog [-v] [-o output_name] [-b bookmarks_file] file1, file2 [, ...]"
    parser = OptionParser(usage_string)
    parser.add_option(
        "-o", "--output",
        dest="output_filename",
        default=time.strftime("output_%Y%m%d_%H%M%S"),
        help="Specify output filename (exclude .pdf extension); default is current date/time stamp",
    )
    parser.add_option(
        "-b", "--bookmarks",
        dest="bookmark_file",
        default=None,
        help="Specify the bookmark names for each file. The file should be new-line delimited and the number of lines must match the number of input files. If not given, the name of each file will be used as the bookmark name.",
    )
    # -v and -q share one destination, so whichever appears last wins.
    parser.add_option(
        "-v", "--verbose",
        action="store_true", dest="verbose", default=True,
        help="Print detailed output (undoes quiet)",
    )
    parser.add_option(
        "-q", "--quiet",
        action="store_false", dest="verbose", default=True,
        help="Do not print detailed output (undoes verbose)",
    )

    options, args = parser.parse_args(argv)

    # Merging needs at least two input files.
    if len(args) < 2:
        parser.print_help()
        sys.exit(1)

    return options, args
def main():
    """Merge the PDF files named on the command line into one bookmarked PDF.

    Reads options via ``get_cmdline_arguments``, appends every page of each
    input file (in the order given) to a single ``PdfWriter``, and adds one
    outline item per input file pointing at that file's first page. Bookmark
    titles come from the ``-b`` file (one title per line) or, when that
    option is absent, from the input file names.

    Exits:
        With status 1, after printing a message to stderr, when the bookmark
        file is missing, when its line count does not match the number of
        input files, or when an input file cannot be opened or parsed.
    """
    options, filenames = get_cmdline_arguments()
    verboseprint = print if options.verbose else lambda *a, **k: None
    output_pdf_name = options.output_filename + ".pdf"

    verboseprint(f"Output filename: {output_pdf_name}")
    verboseprint("Input filenames:")
    for name in filenames:
        verboseprint(f"\t{name}")

    # gather bookmark names
    if options.bookmark_file:
        bookmark_path = Path(options.bookmark_file)
        # Explicit checks instead of assert: asserts vanish under `python -O`.
        if not bookmark_path.exists():
            eprint(f"Bookmark file '{options.bookmark_file}' does not exist.")
            sys.exit(1)
        with bookmark_path.open() as file:
            bookmarks = file.read().splitlines()
        if len(bookmarks) != len(filenames):
            eprint(
                f"Number of bookmarks in '{bookmark_path}' ({len(bookmarks)}) "
                f"does not match the number of files ({len(filenames)})"
            )
            sys.exit(1)
        verboseprint("Bookmark Names:")
        for bookmark in bookmarks:
            verboseprint(f"\t{bookmark}")
    else:
        bookmarks = list(filenames)
        verboseprint("Bookmark Names: Same as filenames")

    # The input streams stay open until the output has been written, then the
    # ExitStack closes them all (also on an early sys.exit).
    with ExitStack() as open_inputs:
        # get PDF files
        files_to_merge = []
        for name in filenames:
            try:
                stream = open_inputs.enter_context(open(name, "rb"))
                reader = PdfReader(stream)
            except PdfReadError:
                eprint("%s is not a valid PDF file." % name)
                sys.exit(1)
            except IOError:
                eprint("%s could not be found." % name)
                sys.exit(1)
            files_to_merge.append(reader)

        # merge page by page
        output_pdf_stream = PdfWriter()
        page_offset = 0  # index in the output of the next page to be added
        for name, bookmark, reader in zip(filenames, bookmarks, files_to_merge):
            verboseprint(f"Adding {name} to output")
            for page_index, page in enumerate(reader.pages):
                output_pdf_stream.add_page(page)
                if page_index == 0:
                    # Bookmark the first page of each input file; it is added
                    # after add_page so the target page already exists.
                    output_pdf_stream.add_outline_item(str(bookmark), page_offset)
                page_offset += 1

        # create output pdf file
        verboseprint("Writing output file...")
        # `with` closes the file and, unlike try/finally around a bare open(),
        # does not raise NameError when the open itself fails.
        with open(output_pdf_name, "wb") as output_pdf_file:
            output_pdf_stream.write(output_pdf_file)

    print("%s successfully created." % output_pdf_name)
# Run the merge only when this file is executed as a script.
if __name__ == "__main__":
    main()
Compatibility with mainline pypdf is as simple as replacing PyPDF2 with pypdf in the import statements; everything else should work.
The latest update adds the ability to customize the bookmark labels and adds more verbose output (with the option of disabling it).
The latest revision works with PyPDF2 version 3.0.1. NOTE: PyPDF2 is deprecated in favor of pypdf, which actually gets regular updates. This still works for now, so I'm not going to fix what isn't broken.