Jul-06-2021, 11:42 PM
(This post was last modified: Jul-07-2021, 04:15 AM by Pedroski55.)
Hi again. I just had some free time, so I tidied up my PDF-to-text program.
You just need to change the paths. I don't use Windows, so I am not sure about the correct path format there.
Then you can paste this into your IDLE shell and enter myApp()
It works well for me! The girlfriend might need it again someday, and I have to keep her happy.
You just need to change the paths. I don't use Windows, so I am not sure about the correct path format there.
Then you can paste this into your IDLE shell and enter myApp()
It works well for me! The girlfriend might need it again someday, and I have to keep her happy.
def myApp(source='/home/pedro/babystuff/pdf2text/',
          destination_jpg='/home/pedro/babystuff/pdf2jpg/',
          save_text_path='/home/pedro/babystuff/ocr_textfiles/',
          lang='chi_sim'):
    """OCR every PDF found in ``source`` into one text file per PDF.

    Each PDF is rendered to one JPEG per page in ``destination_jpg``, every
    page image is run through Tesseract, and the recognised text is appended
    to ``<pdf name>.txt`` in ``save_text_path``. The JPEG folder is emptied
    before the first PDF and again after each PDF.

    Args:
        source: Folder that holds the input ``.pdf`` files.
        destination_jpg: Scratch folder for the page images. Every regular
            file in it is DELETED, so do not point it at a folder that holds
            anything you want to keep.
        save_text_path: Folder that receives the ``.txt`` results.
        lang: Tesseract language code handed to ``image_to_string``.

    Calling ``myApp()`` with no arguments uses the original hard-coded paths
    and language.
    """
    # Imports stay inside the function so the whole thing can be pasted into
    # an interactive shell as a single definition.
    import os
    import pdf2image
    from PIL import Image
    import pytesseract

    # Make sure the output folders exist before we list or write into them.
    os.makedirs(destination_jpg, exist_ok=True)
    os.makedirs(save_text_path, exist_ok=True)

    # There may be other files in the source folder, so only keep PDFs.
    # The extension check is case-insensitive so 'SCAN.PDF' is picked up too.
    mypdfs = sorted(f for f in os.listdir(source) if f.lower().endswith('.pdf'))

    def junkjpgs(path):
        """Delete every regular file directly inside ``path``."""
        print('Clearing out the folders we use, in case there is anything in there ... ')
        pics = os.listdir(path)
        if not pics:
            print('Nothing in ' + path + '\n\n')
            return
        for file in pics:
            target = os.path.join(path, file)
            # os.remove() raises on directories, so leave those alone.
            if os.path.isfile(target):
                os.remove(target)
        print('ALL files removed from: ' + path + '\n\n')

    def splitPDF(aPDF, source, destination):
        """Render ``aPDF`` to per-page JPEGs; return the matching .txt name."""
        print('Splitting the PDF to individual jpgs ... ')
        # splitext only strips the final extension, so 'report.v2.pdf'
        # becomes 'report.v2' instead of being cut at the first dot.
        savename = os.path.splitext(aPDF)[0]
        # images is a list with one PIL image per page.
        images = pdf2image.convert_from_path(os.path.join(source, aPDF))
        # Zero-pad the page number so that sorting the file names later
        # yields page order (otherwise 'x10.jpg' sorts before 'x2.jpg').
        width = len(str(len(images)))
        for i, image in enumerate(images, start=1):
            jpg_name = savename + str(i).zfill(width) + '.jpg'
            image.save(os.path.join(destination, jpg_name), 'JPEG')
        print('PDF split to .jpgs and all saved in: ' + destination + '\n\n')
        return savename + '.txt'

    def convert2text(name):
        """OCR the page images in page order and append the text to ``name``."""
        # Only look at the .jpg files; anything else in the folder is not ours.
        jpgFiles = sorted(f for f in os.listdir(destination_jpg) if f.endswith('.jpg'))
        # 'a' keeps the original append behaviour. The explicit UTF-8 encoding
        # avoids encode errors for non-ASCII text on platforms whose default
        # encoding is not UTF-8.
        with open(os.path.join(save_text_path, name), 'a', encoding='utf-8') as this_text:
            for page, jpg in enumerate(jpgFiles, start=1):
                with Image.open(os.path.join(destination_jpg, jpg)) as img:
                    chiText1 = pytesseract.image_to_string(img, lang=lang)
                print('Page ' + str(page) + ' done')
                this_text.write(chiText1)
                print('Next loop coming up')
        print('removing the jpgs ... ')
        junkjpgs(destination_jpg)
        print('finished this PDF ... ')

    # In case there are any old jpg files left in the jpg folder.
    junkjpgs(destination_jpg)

    for f in mypdfs:
        text_name = splitPDF(f, source, destination_jpg)
        convert2text(text_name)