I think I figured it out. It's kind of a Frankenstein of different modified procedures, but it works so far. It finds all the bookmark titles and their page numbers, then searches for the desired bookmark and uses the page numbers to slice out that portion, and puts all of the slices into one PDF. I used the word 'Equipment' as my keyword (the match is case-sensitive), but it could easily be modified to find something else. I'm new to Python and used work from several other people online; I did my best to give credit where it is due, and if there's a more formal way to do it, please let me know. I'm sure this looks clunky to those with experience, but it works, in case anybody is doing something similar. Cheers...
import os
# Credit for the following code, which finds the titles of PDF bookmarks and their corresponding pages, goes to
# Darrel at https://stackoverflow.com/a/1924950. I merely updated it to use PyPDF2 and added a personalized page-number
# finder and appending mechanism. Credit for the generator expression that finds the index of the desired tuple goes
# to Jon Surell at https://stackoverflow.com/a/10865345.
import PyPDF2
class Table_Contents(PyPDF2.PdfFileReader):
    """PDF reader that can report the page each bookmark points to."""

    def get_the_page_numbers(self):
        """Map every bookmark title to the page it targets.

        Returns:
            dict: ``{title: page}`` where ``page`` is the zero-based position
            of the target page in page-tree order, or the string ``'???'``
            when the bookmark's page object id is not found in the page tree.
            Bookmarks sharing a title collapse into a single entry; the one
            visited last supplies the value.
        """
        # (id(destination), title) -> object id of the page it targets.
        # id() keeps same-titled bookmarks apart while collecting.
        target_id_by_bookmark = {}

        def collect_bookmarks(entries):
            # The outline is a list in which a nested list holds the children
            # of the preceding bookmark; walk it depth-first.
            for entry in entries:
                if isinstance(entry, PyPDF2.pdf.Destination):
                    key = (id(entry), entry.title)
                    target_id_by_bookmark[key] = entry.page.idnum
                elif isinstance(entry, list):
                    collect_bookmarks(entry)

        # object id -> number of leaf pages encountered before that object.
        page_number_by_id = {}
        # Single-cell list so the nested function can update the counter.
        leaf_pages_seen = [0]

        def index_page_tree(node):
            kind = node["/Type"]
            if kind == "/Pages":
                for child in node["/Kids"]:
                    # Recorded for intermediate /Pages nodes as well as for
                    # leaf pages, before descending into the child.
                    page_number_by_id[child.idnum] = leaf_pages_seen[0]
                    index_page_tree(child.getObject())
            elif kind == "/Page":
                leaf_pages_seen[0] += 1

        collect_bookmarks(self.getOutlines())
        index_page_tree(
            self.trailer["/Root"].getObject()["/Pages"].getObject()
        )

        return {
            title: page_number_by_id.get(target_id, '???')
            for (_, title), target_id in target_id_by_bookmark.items()
        }
# Folder that is searched (recursively) for PDF files.
path = 'C:\\Users\\********\\Desktop\\PDF Documents'
# Bookmark titles containing this text (case-sensitive) mark the sections to extract.
KEYWORD = 'Equipment'
merger = PyPDF2.PdfFileMerger()


def _keyword_page_ranges(bookmarks, keyword, page_count):
    """Return ``(start, stop)`` page ranges of the bookmarks matching *keyword*.

    Args:
        bookmarks: dict mapping bookmark title to zero-based page number, as
            returned by ``Table_Contents.get_the_page_numbers()``. Entries
            whose page is not an int (the ``'???'`` placeholder) are ignored.
        keyword: text that must occur in a bookmark title (case-sensitive).
        page_count: total number of pages in the document; used as the end of
            a section that has no later bookmark after it.

    Returns:
        list of ``(start, stop)`` tuples, ``stop`` exclusive, ordered by page.
    """
    # Dropping unresolved entries keeps the sort from comparing str with int.
    ordered = sorted(
        (page, title)
        for title, page in bookmarks.items()
        if isinstance(page, int)
    )
    ranges = []
    for position, (start, title) in enumerate(ordered):
        if keyword not in title:
            continue
        # A section ends where the next bookmark on a *later* page begins.
        # Bookmarks on the same page are skipped so the slice is never empty,
        # and the last section runs to the end of the document.
        stop = next(
            (page for page, _ in ordered[position + 1:] if page > start),
            page_count,
        )
        ranges.append((start, stop))
    return ranges


try:
    for root, _dirs, files in os.walk(path):
        for name in files:
            # Only PDF files can be parsed; skip anything else in the tree.
            if not name.lower().endswith('.pdf'):
                continue
            pdf_path = os.path.join(root, name)
            with open(pdf_path, "rb") as source:
                pdf = Table_Contents(source, strict=True)
                ranges = _keyword_page_ranges(
                    pdf.get_the_page_numbers(), KEYWORD, pdf.getNumPages()
                )
            # The merger is given the path rather than the handle above, so
            # that handle can be closed as soon as the bookmarks are read.
            for page_start, page_stop in ranges:
                merger.append(
                    pdf_path,
                    bookmark=None,
                    pages=(page_start, page_stop, 1),
                    import_bookmarks=False,
                )
    merger.write("All Equipment Lists.pdf")
finally:
    # Release any input files the merger opened, even if a PDF failed to parse.
    merger.close()