Python Forum

I'm coding a Python script that navigates to a website, screenshots it, and annotates boxes and numbers over every clickable element (buttons, text input, and anything the user can press on or interact with). If working properly, the Python function would return a screenshot that resembles the following:

[attachment=2989]

I tried the code below. At first, the boxes were completely misaligned, but I found that when I multiplied their scale by a factor of two, they more or less fit the screenshot. However, there is still misalignment. I'm trying to fix my code to output a correct image.

The problem is with how I'm drawing squares and numbers over clickable elements:

for i, element in enumerate(clickable_elements, start=1):
        location = element.location
        size = element.size
        left = location['x'] * 2
        top = location['y'] * 2
        right = left + (size['width'] * 2)
        bottom = top + (size['height'] * 2)
        
        # Draw rectangle
        draw.rectangle([left, top, right, bottom], outline="red", width=2)
        
        # Draw number
        draw.text((left, top - 25), str(i), font=font, fill="red")

In particular, the x and y location do not seem to be correct and do not align with the screenshot, although it is improved when I multiply by two -- and the same applies for the width and height. How can I change the positioning and width/height of each box to perfectly align with the screenshot?

Here are the screenshots that my code is currently outputting for Google and Amazon, respectively:

[attachment=2990]

[attachment=2991]

It looks like there's a recurring issue with the text inputs not being overlaid properly.

Here is my code:

import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from PIL import Image, ImageDraw, ImageFont

def setup_headless_browser():
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=chrome_options)
    return driver

def take_screenshot_and_markup(driver, url):
    driver.get(url)
    
    # Take screenshot
    screenshot_path = "screenshot.png"
    driver.save_screenshot(screenshot_path)
    
    # Find clickable elements
    clickable_elements = driver.find_elements(By.XPATH, "//*[self::a or self::button or self::input[@type='submit' or @type='button' or @type='reset']]")
    
    # Open the screenshot image
    img = Image.open(screenshot_path)
    draw = ImageDraw.Draw(img)
    
    # Load a font
    try:
        font = ImageFont.truetype("arial.ttf", 50)    
    except IOError:
        font = ImageFont.load_default()
    
    # Draw squares and numbers around clickable elements
    for i, element in enumerate(clickable_elements, start=1):
        location = element.location
        size = element.size
        left = location['x'] * 2
        top = location['y'] * 2
        right = left + (size['width'] * 2)
        bottom = top + (size['height'] * 2)
        
        # Draw rectangle
        draw.rectangle([left, top, right, bottom], outline="red", width=2)
        
        # Draw number
        draw.text((left, top - 25), str(i), font=font, fill="red")
    
    # Save the marked-up image
    marked_screenshot_path = "marked_screenshot.png"
    img.save(marked_screenshot_path)
    
    return marked_screenshot_path

def main():
    driver = setup_headless_browser()
    url = "https://www.amazon.com"  # Replace with the desired URL
    
    try:
        marked_screenshot = take_screenshot_and_markup(driver, url)
        print(f"Marked-up screenshot saved as: {marked_screenshot}")
    finally:
        driver.quit()

if __name__ == "__main__":
    main()

natew