Jun-18-2023, 02:06 PM
hello, I try to convert with OCR some pdf that contains images. I got this error:
Traceback (most recent call last): File "E:\Carte\BB\17 - Site Leadership\alte\Ionel Balauta\Aryeht\Task 1 - Traduce tot site-ul\Doar Google Web\Andreea\Meditatii\2023\OCR.py", line 31, in <module> image = page.extract_images()[0]["obj"] AttributeError: 'PageObject' object has no attribute 'extract_images'this is the code:
import os import PyPDF2 import pytesseract from PIL import Image from pdf2image import convert_from_path # Path to the folder containing PDF files input_folder = "d:/doc/doc" # Path to the folder where text files will be saved output_folder = "d:/doc/doc" # Path to the Tesseract OCR executable (change if necessary) pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" # Get a list of all PDF files in the input folder files = [f for f in os.listdir(input_folder) if f.endswith(".pdf")] # Loop through each PDF file and convert it to text using OCR for file in files: pdf_path = os.path.join(input_folder, file) txt_path = os.path.join(output_folder, os.path.splitext(file)[0] + ".txt") # Extract images from PDF and perform OCR on each image images = [] with open(pdf_path, "rb") as file: pdf_reader = PyPDF2.PdfFileReader(file) for page_num in range(pdf_reader.numPages): page = pdf_reader.getPage(page_num) image = page.extract_images()[0]["obj"] images.append(Image.frombytes("RGB", image.size, image.data)) # Perform OCR on images and extract text text = "" for image in images: text += pytesseract.image_to_string(image) # Save the extracted text to a text file with open(txt_path, "w", encoding="utf-8") as txt_file: txt_file.write(text) print("Conversion complete!")can someone fix my code so it works?