Python Forum
Scraping a JavaScript website with Selenium where pages randomly fail to load
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Scraping a JavaScript website with Selenium where pages randomly fail to load
#3
from bs4 import BeautifulSoup
from pprint import pprint
import requests
import csv
import re
import time
import random
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

# URL Load

# One target URL per line. Strip whitespace and drop blank lines so a
# trailing newline in the file does not yield an empty URL (which would
# make driver.get('') fail later in the scrape loop).
with open("URL_List.txt", "r") as urllist:
    urls = [line.strip() for line in urllist if line.strip()]

# Variables

matchArr = []        # per-page scratch list of scraped fields
matchArrFinal = []   # accumulated rows (7 fields per match) for the CSV
scrapeDate = time.strftime("%d-%m-%Y")  # embedded in the output file name

# Driver Load

# NOTE(review): executable_path is the Selenium 3 calling convention; the
# service args ask chromedriver to tolerate SSL/certificate errors.
driver = webdriver.Chrome(executable_path='chromedriver',
                          service_args=['--ssl-protocol=any',
                                        '--ignore-ssl-errors=true'])
driver.maximize_window()

# URL Scrapping

# URL Scraping: visit each page, expand all odds sections, and extract one
# 7-field row per match: [league, home, away, date, odds-1, odds-0, odds-2].

for single_url in urls:

    # Randomized 4.0-6.0 s delay between page loads to avoid hammering the
    # site and to give the previous page time to settle.
    time.sleep(random.randint(400, 600) / 100)
    driver.get(single_url)
    htmlSourceRedirectCheck = driver.page_source

    # Redirect Check: the site serves a soft-404 page (HTTP 200 with this
    # marker text) instead of a real HTTP 404.
    if '404 - Page not found' in htmlSourceRedirectCheck:
        leagueFinal = re.findall('fotbal/(.*?)/', single_url)
        print(str(leagueFinal) + ' ' + '404 - Page not found')
        continue

    # Wait up to 25 s for the odds sections to become clickable; if the
    # page never finishes loading, fall through and scrape what is there.
    try:
        WebDriverWait(driver, 25).until(EC.element_to_be_clickable(
            (By.XPATH, ".//h3[contains(@data-params, 'hideShowEvents')]")))
    except TimeoutException:
        pass

    # Expand every collapsed odds section, pausing 0.5-1.0 s after each
    # click so the lazily loaded content can render.
    for clicking in driver.find_elements_by_xpath(
            ".//h3[contains(@data-params, 'loadExpandEvents')]"):
        clicking.click()
        time.sleep(random.randint(50, 100) / 100)

    # '&' in this team name would break the naive regex extraction below.
    htmlSource = driver.page_source.replace('Dagenham & Redbridge', 'Dagenham')

    # REGEX extraction. Note: re.findall never raises IndexError, so the
    # original try/except wrappers here were dead code and are removed.
    leagueFinal = re.findall('fotbal/(.*?)/', single_url)
    print(leagueFinal)
    home = re.findall('"event-details-team-a-name">(.*?)</span>', htmlSource)
    away = re.findall('"event-details-team-b-name">(.*?)</span>', htmlSource)
    date = re.findall('"event-details-date">(.*?)</span>', htmlSource)
    odds = re.findall('bet-odds-value">([0-9]+,[0-9][0-9])</span>', htmlSource)

    # The site uses a decimal comma; convert to a decimal point.
    oddsFinal = [o.replace(',', '.') for o in odds]

    # Live date fix: live matches have no date element, so pad the front of
    # the date list with 'LIVE' until it lines up with the match count.
    while len(date) < len(home):
        date.insert(0, 'LIVE')

    # Matches: assemble one row per match. Odds come in groups of three
    # (home win / draw / away win); pad with None when a group is short.
    for i in range(len(home)):
        row = [leagueFinal[0], home[i], away[i]]
        row.append(date[i] if i < len(date) else None)
        matchOdds = oddsFinal[i * 3:i * 3 + 3]
        row.extend(matchOdds + [None] * (3 - len(matchOdds)))
        matchArrFinal.append(row)

driver.close()

# CSV: write one row per scraped match, dated with the scrape date.

# Raw string for the Windows path: the original 'D:\Betting\...' literal
# relied on '\B' and '\S' not being recognized escapes, which raises a
# DeprecationWarning in Python 3 and will eventually become an error.
csvPath = r'D:\Betting\BET Fotbal\Scrapped Odds\Sazkabet' + ' ' + scrapeDate + '.csv'
with open(csvPath, 'w', newline='') as csvFile:
    writer = csv.writer(csvFile, delimiter=',')
    # Columns: league, teams, date, then 1/0/2 (home win / draw / away win).
    writer.writerow(["league", "home", "away", "date", "1", "0", "2"])
    writer.writerows(matchArrFinal)
# The 'with' block already closed the file; the redundant close() is removed.
Here is the content of the URL_List.txt file:
Reply


Messages In This Thread
RE: Scrapping javascript website with Selenium where pages randomly fail to load - by JuanJuan - Dec-25-2019, 07:21 PM

Possibly Related Threads…
Thread Author Replies Views Last Post
  Problem with scrapping Website giddyhead 1 1,753 Mar-08-2024, 08:20 AM
Last Post: AhanaSharma
  python web scrapping mg24 1 479 Mar-01-2024, 09:48 PM
Last Post: snippsat
  Scaping pages created by javascript mbizzl 1 1,583 Jul-17-2022, 10:01 PM
Last Post: Larz60+
  How can I ignore empty fields when scrapping never5000 0 1,460 Feb-11-2022, 09:19 AM
Last Post: never5000
  Suggestion request for scrapping html table Vkkindia 3 2,142 Dec-06-2021, 06:09 PM
Last Post: Larz60+
  web scrapping through Python Naheed 2 2,704 May-17-2021, 12:02 PM
Last Post: Naheed
  Website scrapping and download santoshrane 3 4,506 Apr-14-2021, 07:22 AM
Last Post: kashcode
  Using Python request without selenium on html form with javascript onclick submit but eraosa 0 3,265 Jan-09-2021, 06:08 PM
Last Post: eraosa
  Newbie help with lxml scrapping chelsealoa 1 1,937 Jan-08-2021, 09:14 AM
Last Post: Larz60+
  Scrapping Sport score laplacea 1 2,341 Dec-13-2020, 04:09 PM
Last Post: Larz60+

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020