Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Help Scraping web site
#1
I'm trying to scrape a website.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
import time
import json
 
# Configure browser options: reuse the existing Chrome profile (keeps cookies
# and logged-in sessions) and expose a remote-debugging port.
chrome_options = Options()
chrome_user_data_dir = "C:\\Users\\carto\\AppData\\Local\\Google\\Chrome\\User Data\\"
chrome_options.add_argument(f"user-data-dir={chrome_user_data_dir}")
chrome_options.add_argument("--remote-debugging-port=9222")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
 
# Path to the chromedriver binary.
service = Service('C:\\chromedriver\\chromedriver.exe')
 
# Start the browser.
driver = webdriver.Chrome(service=service, options=chrome_options)
 
# Wrap the whole session so the browser is always closed, even when a step
# raises (the original leaked the Chrome process on any exception).
try:
    # Spoof a regular desktop browser via the Chrome DevTools Protocol so the
    # site serves its normal markup.
    custom_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36",
        "Accept-Language": "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7"
    }
    driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": custom_headers["User-Agent"]})
    driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", {"headers": custom_headers})

    # NOTE(review): the original post contains no driver.get(...) here — the
    # target URL was apparently stripped out. Navigation to the start page
    # must happen before this wait does anything useful.
    time.sleep(9)

    # NOTE(review): the navigation to the championships page is also missing
    # from the post.
    time.sleep(9)

    # Try to click the "Tiri🎯" tab; on failure, log and continue so the page
    # dump below is still produced for debugging.
    try:
        tiri_button = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Tiri🎯')]"))
        )
        tiri_button.click()
    except Exception as e:
        print("Errore nel cliccare il pulsante:", e)

    # Dump the page HTML for debugging.
    print(driver.page_source)

    # Give the odds widgets time to render.
    time.sleep(3)

    # One wrapper element per match event.
    event_boxes = driver.find_elements(By.CLASS_NAME, "EventBoxstyled__EventBoxWrapper-sc-ksk2ut-1")
    matches_data = []

    for event in event_boxes:
        try:
            match_date = event.find_element(By.CLASS_NAME, "EventBoxstyled__DateTime-sc-ksk2ut-8").text
            # Query the competitor and odds lists once each (the original
            # re-ran the competitor query for every team).
            teams = event.find_elements(By.CLASS_NAME, "EventBoxCompetitorsstyled__CompetitorNameBase-sc-wpbfyn-0")
            odds = event.find_elements(By.CLASS_NAME, "OddBoxVariant0styled__OddValue-sc-1ypym0p-6")

            # Skip partially rendered boxes instead of raising IndexError.
            if len(teams) < 2 or len(odds) < 3:
                print(f"Evento incompleto saltato (squadre={len(teams)}, quote={len(odds)})")
                continue

            matches_data.append({
                "date": match_date,
                "team1": teams[0].text,
                "team2": teams[1].text,
                "odds": {
                    "1": odds[0].text,
                    "X": odds[1].text,
                    "2": odds[2].text
                }
            })
        except Exception as e:
            print(f"Errore durante l'estrazione dei dati per un evento: {e}")

    # Persist results; utf-8 + ensure_ascii=False keeps accented team names
    # readable (and matches the second script in this thread).
    with open('matches_data.json', 'w', encoding='utf-8') as json_file:
        json.dump(matches_data, json_file, ensure_ascii=False, indent=4)
finally:
    # Always shut the browser down.
    driver.quit()
I can see these divs (and the rest of the markup) when inspecting the website in Chrome, but not from the Python script!
Are they inside iframes?
Reply
#2
Finally, by inspecting the network traffic I found the API request. I can download the JSON, but I cannot understand how each match is linked to its odds:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
import time
import json
import re
 
# Browser configuration: reuse the local Chrome profile and open a
# remote-debugging port.
opts = Options()
profile_dir = "C:\\Users\\carto\\AppData\\Local\\Google\\Chrome\\User Data\\"
opts.add_argument(f"user-data-dir={profile_dir}")
opts.add_argument("--remote-debugging-port=9222")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")
 
# Location of the chromedriver executable.
svc = Service('C:\\chromedriver\\chromedriver.exe')
 
# Launch Chrome.
driver = webdriver.Chrome(service=svc, options=opts)
 
try:
    # (the API URL / driver.get call is absent from the original post)
     
    # Give the page time to load its data.
    time.sleep(10)
 
    # Grab the rendered HTML and look for an embedded JSON object:
    # everything from the first '{' to the last '}', across newlines.
    html = driver.page_source
    found = re.search(r'\{.*\}', html, re.DOTALL)

    if not found:
        print("Nessun JSON trovato nel contenuto della pagina.")
    else:
        parsed = json.loads(found.group(0))

        # Pretty-print for inspection.
        print(json.dumps(parsed, indent=2))

        # Persist to disk, keeping non-ASCII characters readable.
        with open('data.json', 'w', encoding='utf-8') as fh:
            json.dump(parsed, fh, ensure_ascii=False, indent=2)
        print("Dati salvati in 'data.json'.")
 
finally:
    # Shut the browser down no matter what happened above.
    driver.quit()
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Easiest way to log into a site for web scraping? ejected 2 3,177 Mar-26-2019, 01:05 AM
Last Post: metulburr

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020