Python Forum
Scrap text out of td table from URLS - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html)
+--- Thread: Scrap text out of td table from URLS (/thread-20235.html)



Scrap text out of td table from URLS - Gochix2020 - Aug-01-2019

Hi guys. Can anyone help with this simple thing?
I would like to scrape some text from URLs in a Python script.
There are tables in the URLs which can be found with this line
markets = driver.find_elements_by_class_name('match-on')
because it could use Webdriver (chromedriver)
from selenium import webdriver
driver = webdriver.Chrome('')
infos = m.find_elements_by_tag_name('td')
So maybe there are smart people here who could help me pull the info out of the table at specific URLs, where the class name is 'match-on' and the tag name is td. I would be very thankful. I know I'm not giving a lot of info, but I don't know much about Python; I just know that it will help me fix my problem.
Thank you guys


RE: Scrap text out of td table from URLS - Larz60+ - Aug-02-2019

Could you please tell us the URL? Scraping may be different from site to site.


RE: Scrap text out of td table from URLS - Malt - Aug-02-2019

You can use the BeautifulSoup package which will do the scraping part and if needed you can use the regex to get only the URL you want


RE: Scrap text out of td table from URLS - Gochix2020 - Aug-02-2019

(Aug-02-2019, 01:30 AM)Larz60+ Wrote: Could you please tell us the URL? Scraping may be different from site to site.

It is https://www.oddschecker.com/tennis
If I could just get something to start with, that would be amazing. I've seen people using BeautifulSoup, but I don't know where to start.
And in the url there is simple text for example
Name 1 5.7 1.4
Name 2


So i would like to extract everything in format
Name 1 name 2 5.7 1.4
And every line will be new scrape


RE: Scrap text out of td table from URLS - Larz60+ - Aug-03-2019

you can use beautiful soup in combination with selenium, but you need selenium to run the javascript first
This code will get you started:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import time
import os
import PrettifyPage
import sys


class MatchData:
    """Load the oddschecker tennis page in Firefox and print every cell of each 'match-on' row.

    Instantiating the class runs the whole scrape: ``__init__`` changes the
    working directory to the script's directory, creates a ``PrettifyPage``
    helper and calls :meth:`analyze_page`.
    """

    def __init__(self):
        # anchor directory same as script
        os.chdir(os.path.abspath(os.path.dirname(__file__)))
        self.pp = PrettifyPage.PrettifyPage()
        self.analyze_page()

    def start_browser(self):
        """Start a Firefox WebDriver session and store it on ``self.browser``."""
        # DesiredCapabilities.FIREFOX is a class-level dict shared by every
        # user of selenium in this process; copy it before modifying it.
        caps = webdriver.DesiredCapabilities.FIREFOX.copy()
        caps["marionette"] = True
        self.browser = webdriver.Firefox(capabilities=caps)

    def stop_browser(self):
        """End the WebDriver session created by :meth:`start_browser`."""
        # quit() shuts down the whole driver session; close() would only
        # close the current window.
        self.browser.quit()

    def analyze_page(self):
        """Fetch the page, parse it with BeautifulSoup and print each ``td`` of every 'match-on' row.

        The browser is always shut down, even if loading or parsing fails.
        If the expected table is not in the page source, a message is written
        to stderr and nothing else is printed.
        """
        self.start_browser()
        try:
            self.browser.get('https://www.oddschecker.com/tennis')
            # Give the page's JavaScript time to render the table.
            time.sleep(2)
            # The returned element is not used; the lookup raises if the first
            # match row is missing, so it acts as a check that the table rendered.
            self.browser.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div/div/div/div/div[1]/section[2]/div/div/table/tbody/tr[1]/td[1]/a/div/p[1]')
            page = self.browser.page_source
            soup = BeautifulSoup(page, 'lxml')
            table = soup.find('table', {'class': "at-12 standard-list", 'data-ng-init': "MainController.mainNav.hideCouponRows = {};"})
            if table is None:
                # soup.find() returns None when nothing matches; report that
                # instead of failing on None.find_all below.
                print("No match table found on the page; nothing to print.", file=sys.stderr)
                return
            for tr in table.find_all('tr', {'class': "match-on"}):
                for n1, td in enumerate(tr.find_all('td')):
                    # NOTE(review): ``{tr}`` puts the row's full markup into the
                    # banner (visible in the sample output); a row index was
                    # probably intended. Kept as-is so the output is unchanged.
                    print(f"\n================================= tr_{tr}, td_{n1} ================================= ")
                    print(self.pp.prettify(td, 2))
        finally:
            self.stop_browser()


# Run the scrape when executed as a script: constructing MatchData calls
# analyze_page() from __init__, so no further calls are needed.
if __name__ == '__main__':
    MatchData()
You'll also need this module in the same directory (save it as PrettifyPage.py, because the script above imports it by that name):

PrettifyPage.py
# PrettifyPage.py

from bs4 import BeautifulSoup
import requests
import pathlib


class PrettifyPage:
    """Re-indent the output of a soup object's ``prettify()`` with a wider indent step."""

    def prettify(self, soup, indent):
        """Return ``soup.prettify()`` with each line's indentation scaled by *indent*.

        The column of the first ``<`` on a line is taken as that line's nesting
        level. A line with no ``<`` (plain text), or one whose level jumps more
        than two past the previous line's, is treated as one level deeper than
        the previous line.

        Args:
            soup: any object with a ``prettify()`` method returning a string
                (e.g. a BeautifulSoup document or tag).
            indent: number of spaces wanted per nesting level.

        Returns:
            The re-indented text; every line, including the last, ends in a newline.
        """
        pieces = []
        previous_indent = 0
        for line in soup.prettify().split("\n"):
            current_indent = line.find("<")
            if current_indent == -1 or current_indent > previous_indent + 2:
                current_indent = previous_indent + 1
            previous_indent = current_indent
            pieces.append(self.write_new_line(line, current_indent, indent))
        # Join once at the end instead of growing a string line by line.
        return "".join(pieces)

    def write_new_line(self, line, current_indent, desired_indent):
        """Return *line* with extra leading spaces and a trailing newline.

        *line* keeps whatever indentation it already has; only the difference
        ``current_indent * desired_indent - current_indent`` is prepended.
        A zero or negative difference adds nothing.
        """
        spaces_to_add = (current_indent * desired_indent) - current_indent
        # Multiplying a string by a non-positive int yields "", so no guard is needed.
        return " " * spaces_to_add + str(line) + "\n"

if __name__ == '__main__':
    # Demo: pretty-print a saved HTML file with a two-space indent step.
    import sys

    pp = PrettifyPage()
    # PrettifyPage defines no ``bpath`` attribute, so the path is built here:
    # use the file given on the command line, otherwise look for the sample
    # file next to this script.
    if len(sys.argv) > 1:
        pfilename = pathlib.Path(sys.argv[1])
    else:
        pfilename = pathlib.Path(__file__).resolve().parent / 'BusinessEntityRecordsAA.html'
    with pfilename.open('rb') as fp:
        page = fp.read()
    soup = BeautifulSoup(page, 'lxml')
    pretty = pp.prettify(soup, indent=2)
    print(pretty)
Here's a sample of the output. This just prints the contents of the found data;
you'll have to extract what you need from that.

partial results:
Output:
================================= tr_<tr class="match-on no-top-border" data-day="Today" data-mid="3464826451" data-ng-class="{ 'hide-row' : MainController.mainNav.hideCouponRows['2019-08-0338825577'] }"><td class="time all-odds-click"><div class="time-div beta-caption2 beta-mcaption4"><span class="time-digits beta-caption1 bold betam-caption2">11:30</span></div></td><td class="all-odds-click" colspan="2"><p class="fixtures-bet-name beta-footnote">Gille/Vliegen</p><p class="fixtures-bet-name beta-footnote">Oswald/Polasek</p></td><td class="basket-add" data-best-dig="1.88" data-bid="26247959786" data-track="&amp;lid=card&amp;lpos=basket-add" title="Add Gille/Vliegen to betslip"><p class="participant-name"><span class="odds beta-footnote bold add-to-bet-basket">7/8</span></p></td><td class="basket-add" data-best-dig="2.08" data-bid="26247959787" data-track="&amp;lid=card&amp;lpos=basket-add" title="Add Oswald/Polasek to betslip"><p class="participant-name participant-name-draw"><span class="odds beta-footnote bold add-to-bet-basket">13/12</span></p></td><td class="betting link-right"><a class="beta-callout full-height-link whole-row-link" data-event-name="Gille/Vliegen v Oswald/Polasek" href="/tennis/atp-kitzbuhel/gille-vliegen-v-oswald-polasek/winner" title="View all Gille/Vliegen v Oswald/Polasek odds"><span class="beta-footnote betam-caption2 comp-odds-text">All Odds</span><span class="beta-sprite big-arr right"></span></a></td></tr>, td_0 ================================= <td class="time all-odds-click"> <div class="time-div beta-caption2 beta-mcaption4"> <span class="time-digits beta-caption1 bold betam-caption2"> 11:30 </span> </div> </td> ================================= tr_<tr class="match-on no-top-border" data-day="Today" data-mid="3464826451" data-ng-class="{ 'hide-row' : MainController.mainNav.hideCouponRows['2019-08-0338825577'] }"><td class="time all-odds-click"><div class="time-div beta-caption2 beta-mcaption4"><span class="time-digits beta-caption1 
bold betam-caption2">11:30</span></div></td><td class="all-odds-click" colspan="2"><p class="fixtures-bet-name beta-footnote">Gille/Vliegen</p><p class="fixtures-bet-name beta-footnote">Oswald/Polasek</p></td><td class="basket-add" data-best-dig="1.88" data-bid="26247959786" data-track="&amp;lid=card&amp;lpos=basket-add" title="Add Gille/Vliegen to betslip"><p class="participant-name"><span class="odds beta-footnote bold add-to-bet-basket">7/8</span></p></td><td class="basket-add" data-best-dig="2.08" data-bid="26247959787" data-track="&amp;lid=card&amp;lpos=basket-add" title="Add Oswald/Polasek to betslip"><p class="participant-name participant-name-draw"><span class="odds beta-footnote bold add-to-bet-basket">13/12</span></p></td><td class="betting link-right"><a class="beta-callout full-height-link whole-row-link" data-event-name="Gille/Vliegen v Oswald/Polasek" href="/tennis/atp-kitzbuhel/gille-vliegen-v-oswald-polasek/winner" title="View all Gille/Vliegen v Oswald/Polasek odds"><span class="beta-footnote betam-caption2 comp-odds-text">All Odds</span><span class="beta-sprite big-arr right"></span></a></td></tr>, td_1 ================================= <td class="all-odds-click" colspan="2"> <p class="fixtures-bet-name beta-footnote"> Gille/Vliegen </p> <p class="fixtures-bet-name beta-footnote"> Oswald/Polasek </p> </td> ================================= tr_<tr class="match-on no-top-border" data-day="Today" data-mid="3464826451" data-ng-class="{ 'hide-row' : MainController.mainNav.hideCouponRows['2019-08-0338825577'] }"><td class="time all-odds-click"><div class="time-div beta-caption2 beta-mcaption4"><span class="time-digits beta-caption1 bold betam-caption2">11:30</span></div></td><td class="all-odds-click" colspan="2"><p class="fixtures-bet-name beta-footnote">Gille/Vliegen</p><p class="fixtures-bet-name beta-footnote">Oswald/Polasek</p></td><td class="basket-add" data-best-dig="1.88" data-bid="26247959786" data-track="&amp;lid=card&amp;lpos=basket-add" 
title="Add Gille/Vliegen to betslip"><p class="participant-name"><span class="odds beta-footnote bold add-to-bet-basket">7/8</span></p></td><td class="basket-add" data-best-dig="2.08" data-bid="26247959787" data-track="&amp;lid=card&amp;lpos=basket-add" title="Add Oswald/Polasek to betslip"><p class="participant-name participant-name-draw"><span class="odds beta-footnote bold add-to-bet-basket">13/12</span></p></td><td class="betting link-right"><a class="beta-callout full-height-link whole-row-link" data-event-name="Gille/Vliegen v Oswald/Polasek" href="/tennis/atp-kitzbuhel/gille-vliegen-v-oswald-polasek/winner" title="View all Gille/Vliegen v Oswald/Polasek odds"><span class="beta-footnote betam-caption2 comp-odds-text">All Odds</span><span class="beta-sprite big-arr right"></span></a></td></tr>, td_2 ================================= <td class="basket-add" data-best-dig="1.88" data-bid="26247959786" data-track="&amp;lid=card&amp;lpos=basket-add" title="Add Gille/Vliegen to betslip"> <p class="participant-name"> <span class="odds beta-footnote bold add-to-bet-basket"> 7/8 </span> </p> </td>