Sep-16-2021, 05:53 PM
Actually, I refactored the code and it works better and is more readable. Beautiful Soup doesn't really give you a way to create an empty object then set the parameters via the object instance. So, I just did it this way instead. It's similar to your code.
Forget about the timing. Just testing the time between using a session and not.
Yes, I could have made an object alias 'as bs'. The entire point of this was to split everything up into different functions.
I am pretty good with comprehension but I just want to know what the
Forget about the timing. Just testing the time between using a session and not.
Yes, I could have made an object alias 'as bs'

I am pretty good with comprehensions, but I just want to know what the `author for author` part does exactly. Is the first occurrence holding a value?
"""Scrape unique author names from quotes.toscrape.com and print them sorted.

Timing note from the original experiment (kept for reference):
    no session:   0:00:02.776860
    with session: 0:00:00.997129
"""
import requests
import bs4
import datetime

# Browser-like headers so the request is less likely to be rejected as a bot.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://google.com',
    'DNT': '1',
}

# URL template; '{}' is filled with the 1-based page number.
BASE_URL = 'https://quotes.toscrape.com/page/{}/'


def get_html(page_url, current_page, ses):
    """Fetch one listing page.

    page_url: URL template containing a '{}' placeholder for the page number.
              (Renamed from BASE_URL to stop shadowing the module constant.)
    current_page: 1-based page number to request.
    ses: a requests.Session, reused across pages for connection pooling.

    Returns the requests.Response; the caller checks status_code.
    """
    return ses.get(page_url.format(current_page), headers=HEADERS)


def get_soup(soup, set_authors):
    """Add every author name found on the page to set_authors.

    soup: a bs4.BeautifulSoup of one listing page.
    set_authors: the accumulating set (a set, so duplicates collapse).

    Returns set_authors (mutated in place as well).
    """
    for name in soup.select('.author'):
        set_authors.add(name.text)
    return set_authors


def save_csv(set_authors):
    """Print the collected authors in alphabetical order.

    NOTE(review): despite the name, no CSV file is written yet — the
    original had only a '#save to CSV Code' placeholder here.
    """
    for author in sorted(set_authors):
        print(author)


def parse():
    """Crawl every page, collect unique authors, time the run, print results."""
    # One Session for all requests: reusing the TCP connection was ~3x
    # faster than per-request connections in the timing experiment above.
    ses = requests.Session()
    set_authors = set()
    current_page = 1
    start = datetime.datetime.now()
    while True:
        res = get_html(BASE_URL, current_page, ses)
        if res.status_code != 200:
            print('error')
            break
        soup = bs4.BeautifulSoup(res.text, 'lxml')
        set_authors = get_soup(soup, set_authors)
        # The 'Next' button (li.next) disappears on the last page.
        if not soup.select_one('li.next'):
            break
        current_page += 1
    # Bug fix: original wrote `finish = start = now() - start`, which
    # clobbered `start` with the timedelta for no reason.
    finish = datetime.datetime.now() - start
    print(finish)
    save_csv(set_authors)


# Bug fix: the original called parse() here AND inside the guard below,
# running the whole scrape twice. Keep only the guarded entry point.
if __name__ == '__main__':
    parse()