May-03-2018, 05:42 PM
Hi guys — I'm still in the process of learning Python, but I've made some progress using pandas/BeautifulSoup to scrape information from the web. Let's say I have scripts that get data from the web and create an Excel file from that data. Normally the only variations between scripts are the URL being scraped and the name of the Excel file that is created.
Basically I'm just looking for a way to condense the scripts so they are easier to edit if I need to make changes. Below is a sample, and I've marked what changes from script to script with "#". If anyone has advice on how to condense them or make them cleaner, I would really appreciate it.
"""Scrape a FanGraphs leaderboard and merge in crunchtimebaseball player ids.

Only two things vary from run to run — the FanGraphs URL and the output
file name — so both are parameters of scrape_fangraphs().
"""
import requests
from random import choice
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urlparse, parse_qs
from functools import reduce  # NOTE(review): unused, kept from original
from pathlib import Path      # NOTE(review): unused, kept from original
import os.path                # NOTE(review): unused, kept from original
import time                   # NOTE(review): unused, kept from original
import os                     # NOTE(review): unused, kept from original

# Pool of real-browser User-Agent strings; one is chosen at random per
# request so the scraper does not always present the same fingerprint.
desktop_agents = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
]


def random_headers():
    """Return request headers with a randomly chosen desktop User-Agent."""
    return {
        'User-Agent': choice(desktop_agents),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    }


def _parse_leaderboard(soup):
    """Return the leaderboard stats table on the page as a DataFrame.

    The header row (<th> texts) becomes the column labels; every <tbody>
    row becomes one data row of stripped cell texts.
    """
    # The leaderboard is the 12th <table> on the page — TODO confirm this
    # index still holds if FanGraphs changes its page layout.
    table = soup.find_all('table')[11]
    column_headers = [th.text.strip() for th in table.find_all('th')]
    rows = table.find('tbody').find_all('tr')
    data = [[td.text.strip() for td in row.find_all('td')] for row in rows]
    return pd.DataFrame(data, columns=column_headers)


def _parse_player_ids(soup):
    """Return a one-column DataFrame ('fg_id') of FanGraphs player ids.

    Ids are pulled from the per-player links, e.g.
    'statss.aspx?playerid=12345&position=P' -> 12345.
    """
    ids = []
    # The attribute value MUST be quoted: unquoted '?' / '=' make the CSS
    # selector invalid (rejected by current soupsieve releases).
    for tag in soup.select('a[href^="statss.aspx?playerid="]'):
        # Parse only the query string, not the whole href, so we get a
        # clean 'playerid' key instead of 'statss.aspx?playerid'.
        query = parse_qs(urlparse(tag['href']).query)
        ids.append(int(query['playerid'][0]))
    return pd.DataFrame({'fg_id': ids})


def scrape_fangraphs(fg_url, out_csv=None):
    """Scrape a FanGraphs leaderboard and merge in the crunchtime id table.

    Parameters
    ----------
    fg_url : str
        FanGraphs leaderboard URL to scrape.
    out_csv : str, optional
        When given, the merged table is also written to this CSV file.

    Returns
    -------
    pandas.DataFrame
        Leaderboard stats joined (inner, on 'fg_id') with columns 0 and 17
        of the crunchtimebaseball master sheet.
    """
    page = requests.get(fg_url, headers=random_headers())
    soup = BeautifulSoup(page.text, 'lxml')

    stats = _parse_leaderboard(soup)
    if '#' in stats.columns:
        stats = stats.drop('#', axis=1)  # row-number column, not a stat
    df3 = pd.concat([stats.reset_index(drop=True), _parse_player_ids(soup)], axis=1)

    # crunchtimebaseball master sheet: columns 0 and 17 map fg_id to an
    # external id (presumably mlb_id — verify against the sheet header).
    df4 = pd.read_csv('http://crunchtimebaseball.com/master.csv',
                      usecols=[17, 0], encoding='ISO-8859-1')
    # 'sa'-prefixed ids (minor leaguers) are mapped to a numeric sentinel,
    # and missing ids to 0, so the column can be cast to int for the merge.
    df4['fg_id'] = df4['fg_id'].str.replace('sa', '1000')
    df4['fg_id'] = df4['fg_id'].fillna(0).astype(int)

    merged = pd.merge(left=df3, right=df4, on='fg_id')
    if out_csv is not None:
        merged.to_csv(out_csv)
    return merged


if __name__ == '__main__':
    # Only these two values change from script to script.
    fg_url = ('https://www.fangraphs.com/leaders.aspx?pos=all&stats=pit&lg=all'
              '&qual=0&type=c,14,62,120,121&season=2018&month=0&season1=2016'
              '&ind=0&team=&rost=0&age=&filter=&players=0&page=1_100000')
    scrape_fangraphs(fg_url, 'PitchingGA3Year.csv')