Oct-16-2019, 09:38 AM
Hello everyone, I am an infant at web scraping I am trying to get some info from a website and I am at my wits end. Some assistance would really help me out big time.
import requests from lxml import html import csv import pandas as pd #visit the webpage and access the web contents r = requests.get('https://www.basketball-reference.com/boxscores/201810160BOS.html') data = html.fromstring(r.text) #xpath: collect the specific data you're looking for player = data.xpath("//tbody/tr/th[@data-stat='player']/a[@href]/text()") mp = data.xpath("//tbody/tr/td[@data-stat='mp']/a[@href]/text()") fg = data.xpath("//tbody/tr/td[@data-stat='fg']/a[@href]/text()") fga = data.xpath("//tbody/tr/td[@data-stat='fga']/a[@href]/text()") fg_pct = data.xpath("//tbody/tr/td[@data-stat='fg_pct']") fg_pct = [i.text_content() for i in fg_pct] fg3 = data.xpath("//tbody/tr/td[@data-stat='fg3']/a[@href]/text()") fg3a = data.xpath("//tbody/tr/td[@data-stat='fg3a']/a[@href]/text()") fg3_pct = data.xpath("//tbody/tr/td[@data-stat='fg3_pct']") fg3_pct = [i.text_content() for i in fg3_pct] ft = data.xpath("//tbody/tr/td[@data-stat='ft']/a[@href]/text()") fta = data.xpath("//tbody/tr/td[@data-stat='fta']/a[@href]/text()") ft_pct = data.xpath("//tbody/tr/td[@data-stat='ft_pct']") ft_pct = [i.text_content() for i in ft_pct] orb = data.xpath("//tbody/tr/td[@data-stat='orb']/a[@href]/text()") drb = data.xpath("//tbody/tr/td[@data-stat='drb']/a[@href]/text()") trb = data.xpath("//tbody/tr/td[@data-stat='trb']/a[@href]/text()") ast = data.xpath("//tbody/tr/td[@data-stat='ast']/a[@href]/text()") stl = data.xpath("//tbody/tr/td[@data-stat='stl']/a[@href]/text()") blk = data.xpath("//tbody/tr/td[@data-stat='blk']/a[@href]/text()") tov = data.xpath("//tbody/tr/td[@data-stat='tov']/a[@href]/text()") pf = data.xpath("//tbody/tr/td[@data-stat='pf']/a[@href]/text()") pts = data.xpath("//tbody/tr/td[@data-stat='pts']/a[@href]/text()") plus_minus = data.xpath("//tbody/tr/td[@data-stat='plus_minus']/a[@href]/text()") nbaboxstats = zip(player, mp, fg, fga, fg_pct, fg3, fg3a, fg3_pct, ft, fta, ft_pct, orb, drb, trb, ast, stl, blk, tov, pf, pts, plus_minus) #organizing our data structure to a pandas dataframe df = pd.DataFrame(nbaboxstats) df.rename(columns = {0:'starter', 1:'mp', 2:'fg', 3:'fga', 4:'fg_pct', 5:'fg3', 6:'fg3a', 7:'fg3_pct', 8:'ft', 9:'fta', 10:'ft_pct', 11:'orb', 12:'drb', 13:'trb', 14:'ast', 15:'stl', 16:'blk', 17:'tov', 18:'pf', 19:'pts', 20:'plus_minus'}) df.head()