Jul-07-2021, 11:39 PM
Oh, that's a shame; it must be a Windows thing.
I tested it by putting three old exam PDFs in the source directory and letting it run.
I get good results.
Here is the same thing as a standalone program; it runs perfectly for me in a Bash terminal.
I tested it by putting three old exam PDFs in the source directory and letting it run.
I get good results.
Here is the same thing as a standalone program; it runs perfectly for me in a Bash terminal.
#! /usr/bin/python3
"""OCR every PDF found in ``source`` into one text file per PDF.

Each PDF is rendered to one JPEG per page in ``destination_jpg``, every
JPEG is passed to Tesseract (language ``chi_sim``), and the recognised
text is appended to ``<pdf name>.txt`` in ``save_text_path``.  The JPEGs
are deleted again after each PDF.
"""
import os
import re

import pdf2image
from PIL import Image
import pytesseract

# set your paths
source = '/home/pedro/babystuff/pdf2text/'
destination_jpg = '/home/pedro/babystuff/pdf2jpg/'
save_text_path = '/home/pedro/babystuff/ocr_textfiles/'


def _page_order(filename):
    """Return a sort key that compares digit runs in *filename* as integers.

    A plain string sort puts ``exam10.jpg`` before ``exam2.jpg``; with this
    key page 2 comes before page 10.
    """
    pieces = re.split(r'(\d+)', filename)
    # With a capturing group, re.split puts the digit runs at the odd
    # indices, so text is only ever compared with text and numbers with
    # numbers.
    return [int(piece) if index % 2 else piece
            for index, piece in enumerate(pieces)]


# get rid of the jpg files after reading them
def junkjpgs(path):
    """Delete every file directly inside the directory *path*.

    Sub-directories are left alone (``os.remove`` cannot delete them).
    """
    print('Clearing out the folders we use, in case there is anything in there ... ')
    pics = os.listdir(path)
    if not pics:
        print('Nothing in ' + path + '\n\n')
        return
    for file in pics:
        full_path = os.path.join(path, file)
        if os.path.isfile(full_path) or os.path.islink(full_path):
            os.remove(full_path)
    print('ALL files removed from: ' + path + '\n\n')


# crack the PDF open
def splitPDF(aPDF, source, destination):
    """Render each page of a PDF to a JPEG and return the text file name.

    Args:
        aPDF: file name of the PDF inside *source*.
        source: directory that contains the PDF.
        destination: directory that receives ``<stem><page>.jpg`` files,
            with pages numbered from 1.

    Returns:
        ``<stem>.txt``, where ``<stem>`` is *aPDF* without its final
        extension.
    """
    print('Splitting the PDF to individual jpgs ... ')
    # splitext strips only the last extension, so 'exam.2020.pdf' keeps its
    # full stem instead of collapsing to 'exam'.
    savename = os.path.splitext(aPDF)[0]
    # images is a list
    images = pdf2image.convert_from_path(os.path.join(source, aPDF))
    for page_number, image in enumerate(images, start=1):
        jpg_name = savename + str(page_number) + '.jpg'
        image.save(os.path.join(destination, jpg_name), 'JPEG')
    print('PDF split to .jpgs and all saved in: ' + destination + '\n\n')
    savetextname = savename + '.txt'
    return savetextname


def convert2text(name):
    """OCR all images in ``destination_jpg`` and append the text to *name*.

    The images are processed in page order, the text is appended to
    ``save_text_path/name`` as UTF-8, and the images are deleted afterwards.

    Args:
        name: file name of the text file inside ``save_text_path``.
    """
    # get the jpgs, in numeric page order
    jpgFiles = sorted(os.listdir(destination_jpg), key=_page_order)
    # NOTE(review): append mode is kept from the original, so running the
    # script twice on the same PDF duplicates its text - confirm intended.
    # The encoding is explicit because the OCR output is Chinese text and the
    # platform default encoding may not be able to represent it.
    text_path = os.path.join(save_text_path, name)
    with open(text_path, 'a', encoding='utf-8') as this_text:
        for page_number, jpg_name in enumerate(jpgFiles, start=1):
            with Image.open(os.path.join(destination_jpg, jpg_name)) as page:
                chiText1 = pytesseract.image_to_string(page, lang='chi_sim')
            print('Page ' + str(page_number) + ' done')
            this_text.write(chiText1)
            print('Next loop coming up')
    print('removing the jpgs ... ')
    junkjpgs(destination_jpg)
    print('finished this PDF ... ')


def main():
    """Convert every PDF in ``source`` to a text file, one PDF at a time."""
    # make sure the working folders exist before listing or writing to them
    os.makedirs(destination_jpg, exist_ok=True)
    os.makedirs(save_text_path, exist_ok=True)
    # in case there are any old jpgs in the jpg folder
    junkjpgs(destination_jpg)
    # get the pdf files; maybe there are some other files in there
    mypdfs = sorted(f for f in os.listdir(source)
                    if f.lower().endswith('.pdf'))
    # ocr the jpgs
    for f in mypdfs:
        text_name = splitPDF(f, source, destination_jpg)
        convert2text(text_name)


if __name__ == '__main__':
    main()