Python Forum

Full Version: Unable to fetch product url using BeautifulSoup with Python3.6
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hi Expert,

I have fetched data from html using below code-
def get_soup(url):
    """Fetch *url* and parse the response into a BeautifulSoup tree.

    Returns None when the request fails or the server does not answer
    with HTTP 200, so callers must check the result before using it.
    (This matches the later revision of this helper in the thread.)
    """
    soup = None
    try:
        # A timeout keeps the scraper from hanging forever on a dead host.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
    except requests.RequestException as exc:
        print("error::" + url + str(exc))
    return soup
And I have fetched the category URLs with-
def get_category_urls(url):
    """Collect category link hrefs from the page's 'menu_oc' div.

    Returns a (possibly empty) list of href strings; errors are logged
    rather than raised so one bad page does not stop the crawl.
    """
    cat_urls = []
    try:
        soup = get_soup(url)
        categories = soup.find('div', attrs={'id': 'menu_oc'})
        if categories is not None:
            for c in categories.findAll('a'):
                # Tag.get() returns None for <a> tags without an href,
                # where c['href'] would raise KeyError.
                href = c.get('href')
                if href is not None:
                    cat_urls.append(href)
    except Exception as exc:
        print("error::" + url + str(exc))
    # Return OUTSIDE finally: `return` in a finally block silently
    # swallows every in-flight exception, including KeyboardInterrupt.
    return cat_urls
Now I am trying to fetch product urls with below code-
def get_product_urls(url):
    """Collect product link hrefs for one category URL, following pagination.

    Categories without a 'pagination' div are treated as a single page,
    so both paginated and unpaginated categories yield their products
    (the original skipped unpaginated categories entirely).
    """
    prod_urls = []
    try:
        soup = get_soup(url)
        # Default to a single page; only override when pagination exists.
        pages = 1
        if soup.find('div', attrs={'class': 'pagination'}):
            page_text = soup.find('div', attrs={'class': 'page'}).text
            parts = page_text.split("of ", 1)
            # Guard the split: "of " may be absent from the page label.
            if len(parts) > 1:
                pages = int(parts[1].replace(' (1 Pages)', ''))
        for page in range(1, pages + 1):
            soup_with_page = get_soup(url + "&page={}".format(page))
            if soup_with_page is None:
                continue
            product_urls_soup = soup_with_page.find(
                'div', attrs={'id': 'carousel-featured-0'})
            if product_urls_soup is not None:
                for row in product_urls_soup.findAll('a'):
                    # .get() avoids KeyError for <a> tags without href.
                    href = row.get('href')
                    if href is not None:
                        prod_urls.append(href)
    except Exception as exc:
        # Report the failing URL. The original concatenated the *list*
        # prod_urls to a str, which raised TypeError inside the handler.
        print("error:: " + url + ": " + str(exc))
    return prod_urls
if __name__ == '__main__':
    # NOTE(review): `Pool` (presumably multiprocessing.Pool) and
    # `category_urls` must be defined earlier in the full script --
    # neither is visible in this snippet; confirm before running.
    with Pool(2) as p:
        # One list of product urls per category, fetched with 2 workers.
        product_urls = p.map(get_product_urls, category_urls)
    # Drop categories that yielded no products (empty lists are falsy).
    product_urls = list(filter(None, product_urls))
    # Flatten the list-of-lists and de-duplicate via a set.
    product_urls_flat = list(set([y for x in product_urls for y in x]))
I am getting product_urls_soup as None here — what am I doing wrong? Please find below the sample HTML data-

html data

How do I handle pagination here, since some categories have pagination and some do not?

Finally I got the issue.
I was not checking pagination for all categories and that's why getting problem.
Now I am able to solve the issue by putting a check for pagination.
what is the original url?
Hi All,

I am trying to scrape data from a site and able to fetch category urls with below loc-

def get_soup(url):
    """Fetch *url* and return a parsed BeautifulSoup tree, or None.

    None is returned on any network error or non-200 response, so
    callers must check the result before calling methods on it.
    """
    soup = None
    try:
        # Timeout prevents the crawl from hanging on an unresponsive host.
        response = requests.get(url, timeout=10)
        if response.status_code == 200:
            html = response.content
            soup = BeautifulSoup(html, "html.parser")
    except Exception as exc:
        print("error::", str(exc))
    # Return outside finally: a `return` in finally would silently
    # swallow any exception not caught above (e.g. KeyboardInterrupt).
    return soup

def get_category_urls(url):
    """Collect category link hrefs from the page's 'menu_oc' div.

    Returns a (possibly empty) list of href strings; errors are
    printed rather than raised so the crawl can continue.
    """
    cat_urls = []
    try:
        soup = get_soup(url)
        categories = soup.find('div', attrs={'id': 'menu_oc'})
        if categories is not None:
            for c in categories.findAll('a'):
                # Tag.get() returns None for <a> tags without an href,
                # where c['href'] would raise KeyError.
                href = c.get('href')
                if href is not None:
                    cat_urls.append(href)
    except Exception as exc:
        print("error..", str(exc))
    print("category urls::", cat_urls)
    # Return OUTSIDE finally: `return` in finally silently swallows
    # in-flight exceptions such as KeyboardInterrupt.
    return cat_urls
Now issue is with fetching the product urls because I have to fetch all product urls from each category (pagination+without pagination) and thus I am not able to proceed.

Can anyone please help me to write a function to get the product urls?
Please post enough code to run; posting code that won't run without extra work forces us to improvise, perhaps causing different results:

My attempt:
from bs4 import BeautifulSoup
import requests


def get_soup(url, headers=None, timeout=10):
    """Fetch *url* and return a parsed BeautifulSoup tree, or None.

    headers/timeout were previously read from undefined module globals,
    which raised the NameError reported in this thread
    ("name 'headers' is not defined"). They are now optional keyword
    parameters with safe defaults, keeping existing callers working.
    """
    soup = None
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            html = response.content
            soup = BeautifulSoup(html, "html.parser")
    except Exception as exc:
        print("error::", str(exc))
    # Return outside finally so unexpected exceptions are not swallowed.
    return soup
 
def get_category_urls(url):
    """Collect category link hrefs from the page's 'menu_oc' div.

    Returns a (possibly empty) list of href strings; failures are
    printed so one bad page does not abort the run.
    """
    cat_urls = []
    try:
        soup = get_soup(url)
        categories = soup.find('div', attrs={'id': 'menu_oc'})
        if categories is not None:
            for c in categories.findAll('a'):
                # .get() avoids KeyError for <a> tags lacking an href.
                href = c.get('href')
                if href is not None:
                    cat_urls.append(href)
    except Exception as exc:
        print("error..", str(exc))
    print("category urls::", cat_urls)
    # Returning outside finally preserves any exception not caught above.
    return cat_urls

def main():
    """Entry point: fetch and parse the shop's landing page."""
    shop_url = 'http://www.infantree.net/shop/'
    landing_soup = get_soup(shop_url)

if __name__ == '__main__':
    main()
Error:
error:: name 'headers' is not defined
I have edited the code, please recheck!
Turn off JavaScript in the browser and see how many product URLs you see.
The same as i just posted in this thread
issue is resolved now.