Here's something that I wrote a while back (I had forgotten about it; I'm in my mid 70's, so that's easy for me to do).
The PDF file used in the example is downloaded if it is not already available.
this expects a starting directory structure of:
PdfSplitter/
make sure requests and pdfrw are installed:
from there:
Edit Jul13, 11:13 PM (UTF)
removed redundant import for pathlib
The PDF file used in the example is downloaded if it is not already available.
this expects a starting directory structure of:
PdfSplitter/
Output:├── __init__.py
├── src
│ ├── __init__.py
It was run from a virtual environment, but that's not necessary.
Make sure requests and pdfrw are installed:
pip install requests
pip install pdfrw
from there:
- cd to .../PdfSplitter/
- add __init__.py to PdfSplitter directory:
src/ __init__.py PdfSplitter.py
- Add an empty __init__.py script to src directory
- add the following module to the src directory and name it pypdfsplit.py:
"""Split selected pages of a PDF into separate files.

Run as a script, this module downloads an example datasheet (if it is not
already on disk) and writes a few of its pages as individual PDF files.
"""
import os
import sys
from pathlib import Path

import requests
from pdfrw import PdfReader, PdfWriter


class Ppaths:
    """Create and expose the data directories used by the splitter.

    Attributes:
        datapath: ``data`` directory, one level above the resolved home path.
        csvpath: ``data/csv``.
        pdfpath: ``data/pdf``.
        pdfsplitspath: ``data/pdf/splilts`` (spelling kept as-is so existing
            output directories keep working).
    """

    def __init__(self, depth=0):
        """Build the directory tree relative to this module's location.

        Args:
            depth: Number of extra ``..`` hops to climb from this module's
                directory before locating the root; the sign is ignored.

        Note:
            This changes the process working directory to the directory
            containing this module, because every path below is relative.
        """
        os.chdir(os.path.abspath(os.path.dirname(__file__)))

        home_path = Path(".")
        for _ in range(abs(depth)):
            home_path = home_path / ".."
        rootpath = home_path / ".."

        self.datapath = rootpath / "data"
        self.datapath.mkdir(exist_ok=True)

        self.csvpath = self.datapath / 'csv'
        self.csvpath.mkdir(exist_ok=True)

        self.pdfpath = self.datapath / 'pdf'
        self.pdfpath.mkdir(exist_ok=True)

        self.pdfsplitspath = self.pdfpath / 'splilts'
        self.pdfsplitspath.mkdir(exist_ok=True)


class pypdfsplit:
    """Write selected pages of a PDF file as separate PDF files."""

    def __init__(self):
        """Set up the output directories and an initial (empty) writer."""
        self.ppath = Ppaths()
        self.pdf_reader = None
        self.pdf_writer = PdfWriter()

    def dispatch(self, pdffile, page_range=None):
        """Open *pdffile* and split out the requested pages.

        Args:
            pdffile: ``pathlib.Path`` of the PDF to read.
            page_range: Iterable of page numbers to extract; defaults to
                ``[1]`` when omitted.
        """
        # A None sentinel avoids sharing one mutable default list across calls.
        if page_range is None:
            page_range = [1]
        self.pdf_reader = PdfReader(pdffile)
        self.split_pdf(pdffile, page_range)

    def split_pdf(self, pdffile, page_range):
        """Write each requested page to ``<stem><pagenum>.pdf``.

        Files are written into ``self.ppath.pdfsplitspath``.

        Args:
            pdffile: ``pathlib.Path`` of the source PDF; only its stem is
                used here, to name the output files.
            page_range: Iterable of page numbers, each passed unchanged to
                ``self.pdf_reader.getPage``.
                NOTE(review): pdfrw appears to index pages from 0, so ``1``
                would select the second page -- confirm before relying on it.
        """
        outbase = pdffile.stem
        for pagenum in page_range:
            page = self.pdf_reader.getPage(pagenum)
            # Use a fresh writer per page: a shared writer keeps every page
            # added so far, so later files would also contain earlier pages.
            self.pdf_writer = PdfWriter()
            self.pdf_writer.addpage(page)
            outfile = self.ppath.pdfsplitspath / f"{outbase}{pagenum}.pdf"
            self.pdf_writer.write(outfile)

    def get_page(self, url, bin=True, timeout=30):
        """Fetch *url* and return its body, or ``None`` on failure.

        Args:
            url: Address to download.
            bin: When true return the raw bytes, otherwise the decoded text.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The response body (``bytes`` or ``str``), or ``None`` if the
            request failed or the status code was not 200.
        """
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Report network failures the same way as a non-200 status.
            return None
        if response.status_code != 200:
            return None
        return response.content if bin else response.text


def main():
    """Download the example datasheet if needed and split out three pages."""
    psp = pypdfsplit()
    mypdffile = psp.ppath.pdfpath / 'l78.pdf'
    if not mypdffile.exists():
        page_url = 'https://www.st.com/resource/en/datasheet/l78.pdf'
        page = psp.get_page(url=page_url, bin=True)
        if page:
            with mypdffile.open('wb') as fp:
                fp.write(page)
        else:
            # The original referenced an undefined name here (NameError).
            print(f"Can't load {page_url}")
            sys.exit(-1)
    myrange = [1, 3, 5]
    psp.dispatch(pdffile=mypdffile, page_range=myrange)


if __name__ == '__main__':
    main()
- run from PdfSplitter directory:
python src/pypdfsplit.py
- when done, directory structure will look like:
Pages 1, 3 and 5 were split from the main PDF and stored in PdfSplitter/data/pdf/splilts.
Output:
PdfSplitter/
├── data
│   ├── csv
│   └── pdf
│       ├── l78.pdf
│       └── splilts
│           ├── l781.pdf
│           ├── l783.pdf
│           └── l785.pdf
├── __init__.py
└── src
    └── pypdfsplit.py
Edit Jul13, 11:13 PM (UTF)
removed redundant import for pathlib