Selenium XPATH - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: Selenium XPATH (/thread-33189.html) |
Selenium XPATH - jimsxxl - Apr-05-2021 Hi guys, Im a beginner when it comes to Python, and wanted to try code a scraper. Im trying to scrape betfair.com for Team-names and Odds. I successfully extracted the Team-names, but having abit of trouble getting the odds. Here is my code: from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By import time # Site URL web = 'https://www.betfair.se/exchange/plus/sv/fotboll-betting-1/today' # chrome driver path path = '/usr/bin/chromedriver' # Load site with URL driver = webdriver.Chrome(path) driver.get(web) # Sleep/wait for site to load time.sleep(15) # Find the 'allow-cookies'-button, click it accept = driver.find_element_by_xpath('//*[@id="onetrust-accept-btn-handler"]') accept.click() #sport_title = driver.find_element_by_xpath('//*[@id="main-wrapper"]/div/div[2]/div/ui-view/ui-view/div/div/div/div/div[1]/div/div[1]/bf-super-coupon/main/ng-include[1]/header/h2').text # Get match-table match = WebDriverWait(driver, 600).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="main-wrapper"]//ul[@class="runners"]/li'))) # Get odds-table one = WebDriverWait(driver, 600).until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="main-wrapper"]//div[@class="coupon-runner ng-scope"]/button'))) # Loop them and print output for teams in match: print(teams.text) for odds in one: print(odds.text) # Quit chromedriver driver.quit()The output i get is: $ python /home/jims/projects/jims-betfair_scraper/jims-betfair_scraper.py Sheffield Wednesday Cardiff Everton Crystal Palace Wolves ... ... ... ... 1.04 303606 kr 1.05 298037 kr 26 3950 kr 28 425 kr ... ... ...As you can see i also getting the "volume"/money extracted along with the odds. How can i only get the Odds outputted ? As i said in the beginning, im totally new to Python, so any comments on how i could improve this code is much appreciated as im still learning ! Thank you in advance ! RE: Selenium XPATH - Larz60+ - Apr-06-2021 I took a different approach, and captured the match names, but couldn't find the odds data, so didn't include that part. In addition, I don't use chrome, but firefox, so used that browser. At any rate, I used cacheing so that I didn't have to keep reloading the page (this to be removed when satisfied with the code. I also wrote a pretty page to help look for data, and added a dictionary to ho;d all data. I see 'Odds' on a separate page, should I look there? Here was my 'stab' at this: from selenium import webdriver from selenium.webdriver.common.by import By from bs4 import BeautifulSoup import time from PrettifyPage import PrettifyPage from CreateDict import CreateDict from pathlib import Path import os class ScrapeFootballScores: def __init__(self): os.chdir(os.path.abspath(os.path.dirname(__file__))) homepath = Path('.') footballpath = homepath / 'football' footballpath.mkdir(exist_ok=True) self.pp = PrettifyPage().prettify self.cd = CreateDict() self.url = 'https://www.betfair.se/exchange/plus/sv/fotboll-betting-1/today' self.cachefile = footballpath / 'mainpage.html' self.prettyfile = footballpath / 'FootballPretty.html' def start_browser(self): caps = webdriver.DesiredCapabilities().FIREFOX caps["marionette"] = True self.browser = webdriver.Firefox(capabilities=caps) def stop_browser(self): self.browser.close() def get_scores(self): pp = self.pp cd = self.cd matchlist = [] if self.cachefile.exists(): with self.cachefile.open() as fp: page = fp.read() soup = BeautifulSoup(page, "lxml") else: self.start_browser() self.browser.get(self.url) time.sleep(2) page = self.browser.page_source with self.cachefile.open('w') as fp: fp.write(page) soup = BeautifulSoup(page, "lxml") with self.prettyfile.open('w') as fp: fp.write(pp(soup, 2)) self.stop_browser() cd.new_dict('Matches') target = soup.find('ul', {'class': "runners"}) for li in target.find_all('li'): MatchName = li.text.strip() # print(MatchName) minfo = cd.add_node(cd.Matches, MatchName) matchlist.append(MatchName) # Display Dictionary print(cd.Matches) if __name__ == '__main__': sfs = ScrapeFootballScores() sfs.get_scores()and the started dictionary: I'll add odds if you can tell me where on the page (not XPath) they are located.Additional code needed: CreateDict.py # Author: Larz60+ Nov 22, 2018 import os class CreateDict: def __init__(self): os.chdir(os.path.abspath(os.path.dirname(__file__))) def new_dict(self, dictname): setattr(self, dictname, {}) def add_node(self, parent, nodename): node = parent[nodename] = {} return node def add_cell(self, nodename, cellname, value): cell = nodename[cellname] = value return cell def display_dict(self, dictname, level=0): indent = " " * (4 * level) for key, value in dictname.items(): if isinstance(value, dict): print(f'\n{indent}{key}') level += 1 self.display_dict(value, level) else: print(f'{indent}{key}: {value}') if level > 0: level -= 1{'Western Sydney Wanderers': {}, 'Central Coast Mariners': {}} def testit(): cd = CreateDict() cd.new_dict('CityList') boston = cd.add_node(cd.CityList, 'Boston') bos_resturants = cd.add_node(boston, 'Resturants') spoke = cd.add_node(bos_resturants, 'Spoke Wine Bar') cd.add_cell(spoke, 'Addr1', '89 Holland St') cd.add_cell(spoke, 'City', 'Sommerville') cd.add_cell(spoke, 'Addr1', '02144') cd.add_cell(spoke, 'Phone', '617-718-9463') highland = cd.add_node(bos_resturants, 'Highland Kitchen') cd.add_cell(highland, 'Addr1', '150 Highland Ave') cd.add_cell(highland, 'City', 'Sommerville') cd.add_cell(highland, 'ZipCode', '02144') cd.add_cell(highland, 'Phone', '617-625-1131') print(f'\nCityList Dictionary') cd.display_dict(cd.CityList) print(f'\nraw data: {cd.CityList}') if __name__ == '__main__': testit()PrettifyPage.py from bs4 import BeautifulSoup import requests import pathlib class PrettifyPage: def __init__(self): pass def prettify(self, soup, indent): pretty_soup = str() previous_indent = 0 for line in soup.prettify().split("\n"): current_indent = str(line).find("<") if current_indent == -1 or current_indent > previous_indent + 2: current_indent = previous_indent + 1 previous_indent = current_indent pretty_soup += self.write_new_line(line, current_indent, indent) return pretty_soup def write_new_line(self, line, current_indent, desired_indent): new_line = "" spaces_to_add = (current_indent * desired_indent) - current_indent if spaces_to_add > 0: for i in range(spaces_to_add): new_line += " " new_line += str(line) + "\n" return new_line if __name__ == '__main__': pp = PrettifyPage() pfilename = pp.bpath.htmlpath / 'BusinessEntityRecordsAA.html' with pfilename.open('rb') as fp: page = fp.read() soup = BeautifulSoup(page, 'lxml') pretty = pp.prettify(soup, indent=2) print(pretty) RE: Selenium XPATH - jimsxxl - Apr-06-2021 Hello Larz60 ! Thank you so much for your reply. I re-wrote almost everything. Its working like i want it now! Very fun to learn new things, even though it might be an easy task for some.. |