I have the following code to recast a set of medical records so that the new PDF has bookmarks that reflect the date of medical treatment. The code creates a new PDF (porejemplo.pdf) but when I open it I there are no bookmarks. what am I missing?
import re
import pypdf
# Open the PDF file for reading
pdf_file = open("C:/Users/stand/Downloads/MyClientVisitDates.pdf", 'rb')
pdf_reader = pypdf.PdfReader(pdf_file)
pdf_writer = pypdf.PdfWriter()
# Define the regular expression for finding the bookmark locations
regex = re.compile(r'Visit date: \b\d{2}/\d{2}/\d{4}\b')
# Iterate through the pages of the PDF
for i in range(len(pdf_reader.pages)):
page = pdf_reader.pages[i]
text = page.extract_text()
matches = re.finditer(regex, text)
pdf_writer.add_page(page)
for match in matches:
pdf_writer.add_outline_item( title=matches, page_number=page)
# Write the new PDF file with bookmarks
output_file = open('porejemplo.pdf', 'wb')
pdf_writer.write(output_file)
output_file.close()
pdf_file.close()
(Jan-21-2023, 04:53 PM)standenman Wrote: [ -> ]I have the following code to recast a set of medical records so that the new PDF has bookmarks that reflect the date of medical treatment. The code creates a new PDF (porejemplo.pdf) but when I open it I there are no bookmarks. what am I missing?
import re
import pypdf
# Open the PDF file for reading
pdf_file = open("C:/Users/stand/Downloads/MyClientVisitDates.pdf", 'rb')
pdf_reader = pypdf.PdfReader(pdf_file)
pdf_writer = pypdf.PdfWriter()
# Define the regular expression for finding the bookmark locations
regex = re.compile(r'Visit date: \b\d{2}/\d{2}/\d{4}\b')
# Iterate through the pages of the PDF
for i in range(len(pdf_reader.pages)):
page = pdf_reader.pages[i]
text = page.extract_text()
matches = re.finditer(regex, text)
pdf_writer.add_page(page)
for match in matches:
pdf_writer.add_outline_item( title=matches, page_number=page)
# Write the new PDF file with bookmarks
output_file = open('porejemplo.pdf', 'wb')
pdf_writer.write(output_file)
output_file.close()
pdf_file.close()
The function
re.finditer()
returns a iterator of
Match
objects. You need to pass a
string
to the
pdf_writer.add_outline_item()
function as
title
parameter.
Also, consider using
enumerate
to iterate over the PDF pages:
for page_number, page in enumerate(pdf_reader.pages):
text = page.extract_text()
matches = re.finditer(regex, text)
# You still need to deal with matches here, as you need to pass a string
# to pdf_writer.add_outline_item() as a title.
pdf_writer.add_page(page)
for match in matches:
pdf_writer.add_outline_item(title=matches, page_number=page_number)
Thanks for your response. Still getting a pdf without bookmarks, plus errors in my terminal of the form:
[0, IndirectObject(652, 0, 1775950030224)]
unknown widths :
With this code:
import re
import pypdf
# Open the PDF file for reading
pdf_file = open("C:/Users/stand/Downloads/SusanJonesVisitDate.pdf", 'rb')
pdf_reader = pypdf.PdfReader(pdf_file)
pdf_writer = pypdf.PdfWriter()
# Define the regular expression for finding the bookmark locations
regex = re.compile(r'Visit date: \b\d{2}/\d{2}/\d{4}\b')
for page_number, page in enumerate(pdf_reader.pages):
text = page.extract_text()
matches = re.finditer(regex, text)
# You still need to deal with matches here, as you need to pass a string
# to pdf_writer.add_outline_item() as a title.
pdf_writer.add_page(page)
for match in matches:
pdf_writer.add_outline_item(title=matches, page_number=page_number)
# Write the new PDF file with bookmarks
output_file = open('porejemplo22.pdf', 'wb')
pdf_writer.write(output_file)
output_file.close()
pdf_file.close()