Python Forum

Full Version: Convert From PDf into JPEG Problem
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I'd recently found one issuse from converting PDF into JPG file and It will be stopped at File Number 52 without continuous on.

I'd tried neither issues on PdfReader from causing this problem. Hope anyone could help. Thanks

#!/usr/bin/python
import os
import base64
import warnings
import pandas as pd
import pypdfium2 as pdfium
from pypdf import PdfReader
from PIL import Image

def into_jpg(file, file_loc):
    try:
        
        write_loc = '/home/pi/code/images/train'
        new_file_name = str(base64.b64encode(str.encode(file.split('.')[0]))[:15], 'utf-8')
        new_file_loc = os.path.join(write_loc, new_file_name + '.jpg')
        new_count = 1
        
        pdfs = pdfium.PdfDocument(file_loc)
        n_pages = len(pdfs)
        
        for page_num in range(len(pdfs)):
            page = pdfs.get_page(page_num)
            pil_image = page.render(
                    scale = 1,
                    rotation = 0,
                    crop = (0, 0, 0, 0)
                    #optimise_mode = pdfium.OptimiseMode.NONE
                ).to_pil()
            
            pil_image.save(os.path.join(write_loc, f'{new_file_name}_{page_num+1}.jpg'))
            new_file_loc = os.path.join(write_loc, new_file_name + '_' + str(page_num) + '.jpg')
              
    except Exception as exp:
        print(exp)
        
    
    return

file_dir = '/home/pi/code/ma_1'
file_loc = None
count = 1
for file in os.listdir(file_dir):
    if os.path.isdir(os.path.join(file_dir, file)):
        for file_ in os.listdir(os.path.join(file_dir, file)):
            file_loc = os.path.join(os.path.join(file_dir, file), file_)
            IsProcess = True
            while IsProcess: 
                if os.path.isfile(file_loc):
                    try:
                        PdfReader(file_loc)
                        print('File Location_{0}:'.format(count), file_loc)
                        into_jpg(file_, file_loc)
                        
                        IsProcess = False
                    except Exception as exp:
                        print(exp)
                        
    count = count + 1
           
Problem solved, while loop caused in hang issues.
I often split PDFs to jpgs for OMR. This works well for me.

import pdf2image

# can use glob to get a list of pdfs and loop through the list
pdf = "2_cv外贸助理.pdf"
# for saving various pdfs
path2pdf = '/home/pedro/babystuff/'
# for saving split pdfs as jpg
path2jpg = '/home/pedro/babystuff/pdf2jpg/'

def splitPDF(aPDF, source, destination):
    print(f'Splitting {source + aPDF} to individual jpgs ... ')
    outputName = aPDF.split('.')
    savename = outputName[0]    
    # images is a list
    images = pdf2image.convert_from_path(source + aPDF)
    i=1
    for image in images:
        image.save(destination + savename + str(i) + '.jpg', 'JPEG')
        i+=1
(Sep-04-2023, 08:42 AM)Pedroski55 Wrote: [ -> ]I often split PDFs to jpgs for OMR. This works well for me.

import pdf2image

# can use glob to get a list of pdfs and loop through the list
pdf = "2_cv外贸助理.pdf"
# for saving various pdfs
path2pdf = '/home/pedro/babystuff/'
# for saving split pdfs as jpg
path2jpg = '/home/pedro/babystuff/pdf2jpg/'

def splitPDF(aPDF, source, destination):
    print(f'Splitting {source + aPDF} to individual jpgs ... ')
    outputName = aPDF.split('.')
    savename = outputName[0]    
    # images is a list
    images = pdf2image.convert_from_path(source + aPDF)
    i=1
    for image in images:
        image.save(destination + savename + str(i) + '.jpg', 'JPEG')
        i+=1

hm. Tq.
(Sep-05-2023, 06:19 AM)koklimabc Wrote: [ -> ]hm. Tq.
This may be a cultural problem, but please think that some forum members like me don't understand your abbreviations. Looking for HMTQ online, I find only "Her Majesty The Queen". This is probably not what you meant. Confused