Python Forum
Help Scraping web site
#1
I'm trying to scrape a website..

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
import time
import json

# Configure the browser options
chrome_options = Options()
chrome_user_data_dir = "C:\\Users\\carto\\AppData\\Local\\Google\\Chrome\\User Data\\"
chrome_options.add_argument(f"user-data-dir={chrome_user_data_dir}")
chrome_options.add_argument("--remote-debugging-port=9222")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Specify the driver path
service = Service('C:\\chromedriver\\chromedriver.exe')

# Start the browser
driver = webdriver.Chrome(service=service, options=chrome_options)

# Set the custom headers
custom_headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
    "Accept-Language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7"
}

# Run the CDP commands to override the User-Agent and extra headers
driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": custom_headers["User-Agent"]})
driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": custom_headers})

# Go to the start page
driver.get('https://www.idealbet.it/betting/index.html#/overview')

# Wait for the page to load
time.sleep(9)

# Navigate to the championship page
driver.get('https://www.idealbet.it/betting/index.html#/sport/66/category/502/championship/2942')

# Wait for the page to load
time.sleep(9)

# Try to click the "Tiri🎯" button
try:
    tiri_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Tiri🎯')]"))
    )
    tiri_button.click()
except Exception as e:
    print("Errore nel cliccare il pulsante:", e)

# Print the page HTML for debugging
print(driver.page_source)

# Wait for the data to load
time.sleep(3)

# Collect the data
event_boxes = driver.find_elements(By.CLASS_NAME, "EventBoxstyled__EventBoxWrapper-sc-ksk2ut-1")
matches_data = []

for event in event_boxes:
    try:
        match_date = event.find_element(By.CLASS_NAME, "EventBoxstyled__DateTime-sc-ksk2ut-8").text
        team1 = event.find_elements(By.CLASS_NAME, "EventBoxCompetitorsstyled__CompetitorNameBase-sc-wpbfyn-0")[0].text
        team2 = event.find_elements(By.CLASS_NAME, "EventBoxCompetitorsstyled__CompetitorNameBase-sc-wpbfyn-0")[1].text
        odds = event.find_elements(By.CLASS_NAME, "OddBoxVariant0styled__OddValue-sc-1ypym0p-6")

        odds_data = {
            "1": odds[0].text,
            "X": odds[1].text,
            "2": odds[2].text
        }

        matches_data.append({
            "date": match_date,
            "team1": team1,
            "team2": team2,
            "odds": odds_data
        })
    except Exception as e:
        print(f"Errore durante l'estrazione dei dati per un evento: {e}")

# Write the data to a JSON file
with open('matches_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(matches_data, json_file, ensure_ascii=False, indent=4)

# Close the driver
driver.quit()
When I inspect the website in Chrome I can see these divs, but from the Python script I can't!
Are they inside iframes?
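
One way to answer the iframe question yourself is to enumerate the frames on the page and count the event boxes inside each of them. A minimal sketch, meant to run right after the navigation and sleeps in the script above (before driver.quit()); it reuses the driver object and the EventBoxstyled__EventBoxWrapper-sc-ksk2ut-1 class name from your code:

from selenium.webdriver.common.by import By

# List every iframe on the current page and count the event boxes inside each one.
frames = driver.find_elements(By.TAG_NAME, "iframe")
print(f"Found {len(frames)} iframe(s)")

for i, frame in enumerate(frames):
    driver.switch_to.frame(frame)
    boxes = driver.find_elements(By.CLASS_NAME, "EventBoxstyled__EventBoxWrapper-sc-ksk2ut-1")
    print(f"iframe {i}: {len(boxes)} event box(es)")
    # Always switch back to the top-level document before inspecting the next frame
    driver.switch_to.default_content()

If every frame reports zero boxes, the divs are probably not in an iframe and the content is being rendered dynamically, so an explicit wait on the event-box class is the next thing to try.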
#2
Finally, by inspecting the network traffic I found the API request. I can download the JSON, but I can't understand how each match is linked to its odds:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import json
import re

# Configure the browser options
chrome_options = Options()
chrome_user_data_dir = "C:\\Users\\carto\\AppData\\Local\\Google\\Chrome\\User Data\\"
chrome_options.add_argument(f"user-data-dir={chrome_user_data_dir}")
chrome_options.add_argument("--remote-debugging-port=9222")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Specify the driver path
service = Service('C:\\chromedriver\\chromedriver.exe')

# Start the browser
driver = webdriver.Chrome(service=service, options=chrome_options)

try:
    # Open the API page
    driver.get('https://sb2frontend-1-altenar2.biahosted.com/api/widget/GetOverviewByMarketTypes?culture=it-IT&timezoneOffset=-120&integration=idealbet&deviceType=1&numFormat=en-GB&countryCode=IT&eventCount=0&sportId=0&champIds=2941&marketTypeIds=17821%2C17822%2C17735%2C17736&marketGroupId=1614')
    
    # Wait for the page to load the data
    time.sleep(10)

    # Extract the page content
    page_source = driver.page_source

    # Use a regex to pull the JSON out of the page source (Chrome typically wraps a raw JSON response in a <pre> tag)
    json_match = re.search(r'\{.*\}', page_source, re.DOTALL)
    if json_match:
        json_str = json_match.group(0)
        data = json.loads(json_str)

        # Print the data in a readable format
        print(json.dumps(data, indent=2))

        # Save the data to a JSON file
        with open('data.json', 'w', encoding='utf-8') as json_file:
            json.dump(data, json_file, ensure_ascii=False, indent=2)
        print("Dati salvati in 'data.json'.")
    else:
        print("Nessun JSON trovato nel contenuto della pagina.")

finally:
    # Close the browser
    driver.quit()
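
To see how the feed ties a match to its odds, it can help to dump the overall shape of the JSON you already saved. A minimal sketch that only reads the data.json file written by the script above; it makes no assumptions about the key names, it just prints whatever structure is actually there:

import json

# Load the saved API response and print its nesting: dict keys, list lengths,
# and the keys of the first element of each list. A shared id field between the
# events part and the odds/markets part (whatever they are called in this feed)
# is usually what links a match to its odds.
with open('data.json', encoding='utf-8') as f:
    data = json.load(f)

def describe(obj, indent=0):
    pad = " " * indent
    if isinstance(obj, dict):
        for key, value in obj.items():
            print(f"{pad}{key}: {type(value).__name__}")
            if isinstance(value, (dict, list)):
                describe(value, indent + 2)
    elif isinstance(obj, list) and obj:
        print(f"{pad}list of {len(obj)} item(s); first item:")
        describe(obj[0], indent + 2)

describe(data)

Feeds like this usually keep the events and the odds in separate lists joined by id fields, but that needs to be confirmed against the output above.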

