Python Forum

Full Version: how to split pdf under 10mb using python
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
If I have a 70 MB PDF, can I split it into parts that are each under 10 MB and save them in a folder, with the suffixes part 1, part 2, ...?
I think you need to specify the page numbers you require by finding the chapter page numbers and using those page numbers to extract pages from the pdf

First read the whole PDF, then extract the pages you want like this:

from PyPDF2 import PdfReader, PdfWriter

# Extract a user-chosen, inclusive range of pages from one PDF into a new PDF.

# adjust to suit
mypdf = '/home/pedro/pdfs/pdfs/user_manual_ce208.pdf'
savename = '/home/pedro/extracted_pdfs/part_PDF.pdf'

pdf = PdfReader(mypdf)
pages = len(pdf.pages) # in my case 36 pages

startnum = input('what is the starting page number?  ')
endnum = input('what is the last page number?  ')
try:
    first_page = int(startnum)
    last_page = int(endnum)
except ValueError:
    raise SystemExit('page numbers must be whole numbers') from None

# Reject impossible ranges up front: a start of 0 would turn into index -1
# (the last page), an end past the document would raise IndexError, and
# start > end would silently write an empty PDF.
if not 1 <= first_page <= last_page <= pages:
    raise SystemExit(
        f'page range must satisfy 1 <= start <= last <= {pages}, '
        f'got start={first_page}, last={last_page}'
    )

# pdf page numbers start at zero so subtract 1 from the given page number;
# the end stays as given because range() excludes its upper bound
start = first_page - 1
end = last_page

# open a new pdf
pdfWriter = PdfWriter()

# add the required pages to the new pdf
for page_num in range(start, end):
    pdfWriter.add_page(pdf.pages[page_num])

with open(savename, 'wb') as f:
    pdfWriter.write(f)
You can save this as a function, which takes 2 arguments: startpage, endpage, and loop through your whole pdf by using a number in the savename. All you need is a list of tuples (startpage, endpage)!
You can use PyPDF2, as posted above.
Something like this should work; remember to make some effort next time (wink).
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter

def split_pdf_by_size(input_pdf, output_folder, max_size_mb):
    """
    Split a PDF file into consecutive parts, each smaller than a size limit.

    Pages are appended to the current part one at a time. After each page the
    part is serialized in memory; if the result reaches the limit, the part is
    saved without the new page and that page starts the next part. Parts are
    written as ``part_1.pdf``, ``part_2.pdf``, ... inside ``output_folder``.

    A single page that reaches the limit on its own cannot be split further;
    it is written as its own part and a warning is printed.

    Args:
        input_pdf (str or Path): Path to the input PDF file.
        output_folder (str or Path): Folder where the split files will be saved.
        max_size_mb (float): Maximum size of each split PDF in MB.

    Raises:
        ValueError: If ``max_size_mb`` is not greater than zero.
    """
    import io  # local import so the function does not rely on a file-level one

    input_pdf = Path(input_pdf)
    output_folder = Path(output_folder)
    if max_size_mb <= 0:
        raise ValueError("max_size_mb must be greater than 0")
    # Convert size to bytes
    max_size_bytes = max_size_mb * 1024 * 1024
    # Create output folder if it doesn't exist
    output_folder.mkdir(parents=True, exist_ok=True)
    # Read the input PDF
    reader = PdfReader(input_pdf)

    def render(pdf_writer):
        # Serialize in memory so the size is known before anything hits disk.
        buffer = io.BytesIO()
        pdf_writer.write(buffer)
        return buffer.getvalue()

    def save(number, data):
        (output_folder / f"part_{number}.pdf").write_bytes(data)

    part_number = 1
    writer = PdfWriter()
    part_bytes = None  # last serialization of the current part, if any
    for page in reader.pages:
        writer.add_page(page)
        # The size is measured rather than summed per page, because a part's
        # size is not necessarily the sum of its pages' individual sizes.
        candidate = render(writer)
        if len(candidate) >= max_size_bytes and part_bytes is not None:
            # The new page pushed the part to the limit: keep the previous
            # (smaller) serialization and start a fresh part with this page.
            save(part_number, part_bytes)
            part_number += 1
            writer = PdfWriter()
            writer.add_page(page)
            candidate = render(writer)
        if len(candidate) >= max_size_bytes:
            # Only reachable when the part holds exactly one page.
            print(
                f"Warning: part_{part_number}.pdf holds a single page that "
                f"alone reaches the {max_size_mb} MB limit."
            )
        part_bytes = candidate
    # Save the last part (absent only when the input has no pages)
    if part_bytes is not None:
        save(part_number, part_bytes)
    print(f"PDF has been split into parts under {max_size_mb} MB each in {output_folder}.")

if __name__ == '__main__':
    # Demo run: source file and destination folder for the parts
    input_pdf, output_folder = "The Art of Computer Programming 3.pdf", "output_pdfs"
    # Upper bound for each part, expressed in MB
    max_size_mb = 10
    split_pdf_by_size(
        input_pdf=input_pdf,
        output_folder=output_folder,
        max_size_mb=max_size_mb,
    )
(Jan-17-2025, 04:27 PM)snippsat Wrote: [ -> ]You can use PyPDF2, as posted above.
Something like this should work; remember to make some effort next time (wink).
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter

def split_pdf_by_size(input_pdf, output_folder, max_size_mb):
    """
    Split a PDF file into consecutive parts, each smaller than a size limit.

    Pages are appended to the current part one at a time. After each page the
    part is serialized in memory; if the result reaches the limit, the part is
    saved without the new page and that page starts the next part. Parts are
    written as ``part_1.pdf``, ``part_2.pdf``, ... inside ``output_folder``.

    A single page that reaches the limit on its own cannot be split further;
    it is written as its own part and a warning is printed.

    Args:
        input_pdf (str or Path): Path to the input PDF file.
        output_folder (str or Path): Folder where the split files will be saved.
        max_size_mb (float): Maximum size of each split PDF in MB.

    Raises:
        ValueError: If ``max_size_mb`` is not greater than zero.
    """
    import io  # local import so the function does not rely on a file-level one

    input_pdf = Path(input_pdf)
    output_folder = Path(output_folder)
    if max_size_mb <= 0:
        raise ValueError("max_size_mb must be greater than 0")
    # Convert size to bytes
    max_size_bytes = max_size_mb * 1024 * 1024
    # Create output folder if it doesn't exist
    output_folder.mkdir(parents=True, exist_ok=True)
    # Read the input PDF
    reader = PdfReader(input_pdf)

    def render(pdf_writer):
        # Serialize in memory so the size is known before anything hits disk.
        buffer = io.BytesIO()
        pdf_writer.write(buffer)
        return buffer.getvalue()

    def save(number, data):
        (output_folder / f"part_{number}.pdf").write_bytes(data)

    part_number = 1
    writer = PdfWriter()
    part_bytes = None  # last serialization of the current part, if any
    for page in reader.pages:
        writer.add_page(page)
        # The size is measured rather than summed per page, because a part's
        # size is not necessarily the sum of its pages' individual sizes.
        candidate = render(writer)
        if len(candidate) >= max_size_bytes and part_bytes is not None:
            # The new page pushed the part to the limit: keep the previous
            # (smaller) serialization and start a fresh part with this page.
            save(part_number, part_bytes)
            part_number += 1
            writer = PdfWriter()
            writer.add_page(page)
            candidate = render(writer)
        if len(candidate) >= max_size_bytes:
            # Only reachable when the part holds exactly one page.
            print(
                f"Warning: part_{part_number}.pdf holds a single page that "
                f"alone reaches the {max_size_mb} MB limit."
            )
        part_bytes = candidate
    # Save the last part (absent only when the input has no pages)
    if part_bytes is not None:
        save(part_number, part_bytes)
    print(f"PDF has been split into parts under {max_size_mb} MB each in {output_folder}.")

if __name__ == '__main__':
    # Demo run: source file and destination folder for the parts
    input_pdf, output_folder = "The Art of Computer Programming 3.pdf", "output_pdfs"
    # Upper bound for each part, expressed in MB
    max_size_mb = 10
    split_pdf_by_size(
        input_pdf=input_pdf,
        output_folder=output_folder,
        max_size_mb=max_size_mb,
    )
It works great, thank you very much!
(Jan-17-2025, 04:20 PM)Pedroski55 Wrote: [ -> ]I think you need to specify the page numbers you require by finding the chapter page numbers and using those page numbers to extract pages from the pdf

First read the whole PDF, then extract the pages you want like this:

from PyPDF2 import PdfReader, PdfWriter

# Extract a user-chosen, inclusive range of pages from one PDF into a new PDF.

# adjust to suit
mypdf = '/home/pedro/pdfs/pdfs/user_manual_ce208.pdf'
savename = '/home/pedro/extracted_pdfs/part_PDF.pdf'

pdf = PdfReader(mypdf)
pages = len(pdf.pages) # in my case 36 pages

startnum = input('what is the starting page number?  ')
endnum = input('what is the last page number?  ')
try:
    first_page = int(startnum)
    last_page = int(endnum)
except ValueError:
    raise SystemExit('page numbers must be whole numbers') from None

# Reject impossible ranges up front: a start of 0 would turn into index -1
# (the last page), an end past the document would raise IndexError, and
# start > end would silently write an empty PDF.
if not 1 <= first_page <= last_page <= pages:
    raise SystemExit(
        f'page range must satisfy 1 <= start <= last <= {pages}, '
        f'got start={first_page}, last={last_page}'
    )

# pdf page numbers start at zero so subtract 1 from the given page number;
# the end stays as given because range() excludes its upper bound
start = first_page - 1
end = last_page

# open a new pdf
pdfWriter = PdfWriter()

# add the required pages to the new pdf
for page_num in range(start, end):
    pdfWriter.add_page(pdf.pages[page_num])

with open(savename, 'wb') as f:
    pdfWriter.write(f)
You can save this as a function, which takes 2 arguments: startpage, endpage, and loop through your whole pdf by using a number in the savename. All you need is a list of tuples (startpage, endpage)!

Thank you