Python Forum

Full Version: OCR-Python from Multi TIFF to HOCR getting only Data from 1st Page of multiple TIFF
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hi,

I need your help with the code which can be replaced in the below one. Aim able to run the file with the output to hocr output.
But, my requirement is to parse the data from multiple page TIFF image, where I am getting data only from 1st page



# Python program to extract text from all the images in a folder
# storing the text in corresponding files in a different folder
# This is for hocr output, but there is error of getting only 1 page
from PIL import Image
import pytesseract as pt
import os
pt.pytesseract.tesseract_cmd = r'C:\Users\admin\AppData\Local\Programs\Tesseract-OCR\tesseract.exe'
	
def main():
	# path for the folder for getting the raw images
	path ="D:\\input"
	# path for the folder for getting the output
	tempPath ="D:\\output"

	# iterating the images inside the folder
	for imageName in os.listdir(path):
			
		inputPath = os.path.join(path, imageName)
		img = Image.open(inputPath)

		# applying ocr using pytesseract for python
          
		text = pt.image_to_pdf_or_hocr(img, extension = 'hocr', config = (r'--oem 3 --psm 6'), lang ="eng")
		
		fullTempPath = os.path.join(tempPath, 'time_'+imageName+".hocr")
		print(text)
 
		# saving the text for every image in a separate .hocr file
		file1 = open(fullTempPath, "wb")
		file1.write(text)
		file1.close()
 

if __name__ == '__main__':
	main()
Thank you
Joe