Scraping a JavaScript website with Selenium where pages randomly fail to load

import csv
import re
import time
import random
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# URL Load -- skip blank lines so driver.get('') is never called
with open("URL_List.txt", "r") as urllist:
    urls = [line.strip() for line in urllist if line.strip()]

# Variables
matchArrFinal = []
scrapeDate = time.strftime("%d-%m-%Y")

# Driver Load
driver = webdriver.Chrome(executable_path='chromedriver',
                          service_args=['--ssl-protocol=any', '--ignore-ssl-errors=true'])
driver.maximize_window()

# URL Scraping
for single_url in urls:
    # random delay between page loads to look less bot-like
    time.sleep(random.randint(400, 600) / 100)
    driver.get(single_url)
    htmlSourceRedirectCheck = driver.page_source

    # Redirect Check: dead league URLs land on a 404 page, so skip them
    if '404 - Page not found' in htmlSourceRedirectCheck:
        leagueFinal = re.findall('fotbal/(.*?)/', single_url)
        print(str(leagueFinal) + ' 404 - Page not found')
        continue

    # wait until the collapsible event headers are clickable
    try:
        WebDriverWait(driver, 25).until(EC.element_to_be_clickable(
            (By.XPATH, ".//h3[contains(@data-params, 'hideShowEvents')]")))
    except TimeoutException:
        pass

    # expand every still-collapsed odds section
    unloadedOdds = driver.find_elements_by_xpath(
        ".//h3[contains(@data-params, 'loadExpandEvents')]")
    for clicking in unloadedOdds:
        clicking.click()
        time.sleep(random.randint(50, 100) / 100)

    matchArr = []
    htmlSource = driver.page_source.replace('Dagenham & Redbridge', 'Dagenham')

    # REGEX -- re.findall never raises IndexError, it just returns an
    # empty list when nothing matches, so no try/except is needed here
    leagueFinal = re.findall('fotbal/(.*?)/', single_url)
    print(leagueFinal)
    home = re.findall('"event-details-team-a-name">(.*?)</span>', htmlSource)
    away = re.findall('"event-details-team-b-name">(.*?)</span>', htmlSource)
    date = re.findall('"event-details-date">(.*?)</span>', htmlSource)
    odds = re.findall('bet-odds-value">([0-9]+,[0-9][0-9])</span>', htmlSource)
    oddsFinal = [o.replace(',', '.') for o in odds]

    # Live date fix: live matches have no date element, so pad the
    # front of the date list until it matches the number of matches
    dateFix = len(home) - len(date)
    for fixing in range(dateFix):
        date.insert(0, 'LIVE')

    # Matches: consume the parallel lists one match at a time
    matchNum = len(home)
    for matches in range(matchNum):
        # guard against an empty regex result instead of crashing
        matchArr.append(leagueFinal[0] if leagueFinal else None)
        matchArr.append(home[0])
        matchArr.append(away[0])
        try:
            matchArr.append(date[0])
        except IndexError:
            matchArr.append(None)
        # three odds per match: 1, 0, 2
        for i in range(3):
            try:
                matchArr.append(oddsFinal[i])
            except IndexError:
                matchArr.append(None)
        del home[0]
        del away[0]
        try:
            del date[0]
        except IndexError:
            pass
        del oddsFinal[0:3]

    # split the flat list into 7-field rows
    for matchesFinal in range(matchNum):
        matchArrFinal.append(matchArr[0:7])
        del matchArr[0:7]

driver.close()

# CSV -- raw string for the Windows path; the with-block closes the
# file, so no explicit csvFile.close() is needed
with open(r'D:\Betting\BET Fotbal\Scrapped Odds\Sazkabet ' + scrapeDate + '.csv',
          'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',')
    writer.writerow(["league", "home", "away", "date", "1", "0", "2"])
    writer.writerows(matchArrFinal)

Here is the content of the URL_List.txt file:
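Since the thread is about pages that randomly fail to load, one option is to retry the load instead of trusting a single driver.get(). Below is a minimal sketch, not the original poster's code: get_with_retry, wait_xpath, and MAX_RETRIES are names I made up, and it assumes that the appearance of a known element is a reliable "page finished rendering" signal.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

MAX_RETRIES = 3  # hypothetical retry budget, tune to taste

def get_with_retry(driver, url, wait_xpath, timeout=25, retries=MAX_RETRIES):
    # Load url and wait for wait_xpath to appear; re-issue driver.get()
    # when the page never finishes rendering.
    for attempt in range(1, retries + 1):
        try:
            driver.get(url)
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.XPATH, wait_xpath)))
            return True  # the element we care about rendered
        except (TimeoutException, WebDriverException):
            print(f'attempt {attempt}/{retries} failed for {url}')
    return False  # caller can skip this URL instead of crashing

In the loop above, the plain driver.get(single_url) call could then become something like: if not get_with_retry(driver, single_url, ".//h3[contains(@data-params, 'hideShowEvents')]"): continue.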
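As a side note, the original script imported BeautifulSoup but never used it. Parsing the rendered page with it tends to be less brittle than running regexes over page_source. A sketch, assuming the quoted strings in the regexes above really are CSS class names on <span> elements (I have not verified the site's markup):

from bs4 import BeautifulSoup

def parse_matches(html):
    # Extract the same fields as the regexes above, via the DOM.
    soup = BeautifulSoup(html, 'html.parser')
    home = [s.get_text() for s in soup.find_all('span', class_='event-details-team-a-name')]
    away = [s.get_text() for s in soup.find_all('span', class_='event-details-team-b-name')]
    dates = [s.get_text() for s in soup.find_all('span', class_='event-details-date')]
    odds = [s.get_text().replace(',', '.') for s in soup.find_all('span', class_='bet-odds-value')]
    return home, away, dates, odds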
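Finally, a small design note on the row-building at the end of the loop: the append/del dance can be replaced by plain list slicing, which does the same 7-fields-per-row chunking without mutating matchArr as it goes.

# equivalent to the matchesFinal loop above
rows = [matchArr[i:i + 7] for i in range(0, len(matchArr), 7)]
matchArrFinal.extend(rows)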