Sep-15-2021, 06:02 PM
Greetings,
I am writing a small program that will save authors to a CSV file. However, in the function get_soup I had to return a boolean because I had no way to break out of the while loop otherwise. If I could have, I would have just returned the set_authors set.
Is what I did optimal or is there a better way?
Keep in mind that there could be any number of pages. So I checked for the "next" button; when it was missing, I knew I had reached the last page.
I suppose I could have searched the html from the response for the last page with regex. I tried making a variable for:
bool_break = True
while bool_break:
Then I tried to change it from the get_soup function, but that didn't work.
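For what it's worth, the reason the bool_break attempt can't work is that assignment inside a function rebinds a *local* name, so nothing get_soup assigns will ever change the caller's bool_break. A minimal illustration (the function names here are made up for the demo):

```python
def stop_loop():
    # Assigning here creates a LOCAL name; the caller's bool_break is untouched.
    bool_break = False  # deliberately unused

def stop_loop_returned():
    # Returning the new value and reassigning at the call site does work.
    return False

bool_break = True
stop_loop()
print(bool_break)   # still True: the function only changed its own local

bool_break = stop_loop_returned()
print(bool_break)   # False: the caller reassigned from the return value
```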
import requests
import bs4

HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://google.com',
    'DNT': '1',
}

BASE_URL = 'https://quotes.toscrape.com/page/{}/'


def get_html(base_url, current_page, ses):
    res = ses.get(base_url.format(current_page), headers=HEADERS)
    return res


def get_soup(res_text, set_authors):
    soup = bs4.BeautifulSoup(res_text, 'lxml')
    # Search for all of the authors on this page.
    for name in soup.select('.author'):
        # Add each author's link text to a set to remove duplicates.
        set_authors.add(name.text)
    if not soup.select('li.next'):
        # Found the last page.
        return True  # Need to break out of the outer while loop, or I would have just returned set_authors.


def parse():
    ses = requests.Session()
    set_authors = set()
    current_page = 1
    while True:
        res = get_html(BASE_URL, current_page, ses)
        if res.status_code == 200:
            if get_soup(res.text, set_authors):
                break
            current_page += 1
        else:
            print('error')
            break
    for author in set_authors:
        print(author)


parse()

# list_sort = list(set_authors)
# list_sort.sort()
# for author in list_sort:
#     print(author)
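One way to avoid returning a boolean sentinel from get_soup is to split the responsibilities: have the per-page function return both the author names and a has_next flag, and let the loop in the caller decide when to stop. That way nothing needs to "break the outer loop from inside another function". A rough sketch of the idea (collect_authors and authors_on_page are my names, not from the original; I've used html.parser and dropped the headers for brevity, so adjust as needed):

```python
import bs4
import requests

BASE_URL = 'https://quotes.toscrape.com/page/{}/'

def authors_on_page(html):
    """Return (author names on this page, whether a 'next' button exists)."""
    soup = bs4.BeautifulSoup(html, 'html.parser')
    names = {tag.text for tag in soup.select('.author')}
    has_next = bool(soup.select('li.next'))
    return names, has_next

def collect_authors(ses):
    """Walk the pages and return the full set of authors."""
    authors = set()
    page = 1
    while True:
        res = ses.get(BASE_URL.format(page))
        if res.status_code != 200:
            break
        names, has_next = authors_on_page(res.text)
        authors.update(names)
        if not has_next:
            break  # last page reached: no 'next' button
        page += 1
    return authors

# Usage (hits the network, so commented out):
# for author in sorted(collect_authors(requests.Session())):
#     print(author)
```

Since the page-parsing function no longer touches the loop, it is also much easier to test on a canned HTML string.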