Python Forum
Getting 'list index out of range' while fetching product details using BeautifulSoup?
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Getting 'list index out of range' while fetching product details using BeautifulSoup?
#1
Hi All,

I have written following functions in Python3.6 for fetching the product related details from the site-

def get_soup(url):
    soup = None
    try:
        response = requests.get(url)
        if response.status_code == 200:
            html = response.content
            soup = BeautifulSoup(html, "html.parser")
    except Exception as exc:
        print("Unable to fecth data due to..", str(exc))
    finally:
        return soup
def get_product_details(url):
    soup = get_soup(url)
    sleep(1)
    try:
        product_shop = soup.find('div', attrs={"class": "buy"})
        if product_shop is not None:
            available_product_shop = soup.findAll('div')[2].find('span').text == "In Stock"
            if available_product_shop is not None:
                prod_details = dict()
                merchant_product_id = soup.find('div', attrs={'class': 'description'}).findAll('span')[3].text
                if merchant_product_id is not None:
                    prod_details['merchant_product_id'] = merchant_product_id
                    check_title = soup.find('header', attrs={'class': 'product-name'}).find('h1')
                    if check_title is not None:
                        prod_details['title'] = check_title.text
                    check_description = soup.find('div', attrs={'id': 'tab-description'})
                    if check_description is not None:
                        prod_details['description'] = clean_description(check_description)
                    check_brand = soup.find('div', attrs={'class': 'description'}).findAll('span')[2].find('a')
                    if check_brand is not None:
                        prod_details['brand'] = check_brand.text
                    prod_details['google_product_category'] = CATEGORY_ID
                    prod_details['web_url'] = url
                    prod_details['merchant_image_urls'] = ",".join(list(filter(None, map(lambda x: x['href'].replace(",", "%2C"),
                                                                                         soup.find('div', attrs={
                                                                                             'class': 'left'}).findAll(
                                                                                             'a')))))
                    check_price = soup.find('span', attrs={"class": "price-old"})
                    if check_price is not None:
                        prod_details['price'] = check_price.text.replace("SGD $", "")
                    check_sale_price = soup.find('span', attrs={"class": "price-new"})
                    if check_sale_price is not None:
                        prod_details['sale_price'] = check_sale_price.text.replace("SGD $", "")
                    return prod_details
    except Exception as exc:
        print("Error..", str(exc))
def get_all_products(url):
    prod_urls = []
    soup = get_soup(url)
    prod_urls.append(get_product_urls(soup))

    links = get_pagination(soup)
    if not links:
        return prod_urls

    for link in links:
        soup = get_soup(link)
        prod_urls.append(get_product_urls(soup))

    print("Found following product urls:", prod_urls)
    return prod_urls

def get_product_urls(soup):
    links = soup.select('div.product-list .span .name a')
    if links is not None:
        return [link['href'] for link in links]

def get_pagination(soup):
    pages = soup.select('div.pagination div.links a')
    if pages is not None:
        return [link['href'] for link in pages if link.string.isdecimal()]

def get_category_urls(url):
    soup = get_soup(url)
    cat_urls = []
    try:
        categories = soup.find('div', attrs={'id': 'menu_oc'})
        if categories is not None:
            for c in categories.findAll('a'):
                if c['href'] is not None:
                    cat_urls.append(c['href'])
    except Exception as exc:
        print("Unable to fetch category urls due to..", str(exc))
    finally:
        print("Found following category urls::", cat_urls)
        return cat_urls
def flatten(items):
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

if __name__ == '__main__':
    category_urls = get_category_urls(URL)

    with Pool(20) as p:
        product_urls = p.map(get_all_products, category_urls)
    #product_urls = list(filter(None, product_urls))
    product_urls_flat =list(flatten(product_urls))

    with Pool(20) as p:
         products = p.map(get_product_details, product_urls_flat)
    products = list(filter(None, products))
    products_df = pd.DataFrame(products)
    print(products_df.head())
When I run the above code I got following issues-
1. 'list index out of range' in get_product_details() function.
2. Not getting correct value of Brand, Image urls, product id

Can anyone please run my code and share me the correct version of it?
Reply


Messages In This Thread
Getting 'list index out of range' while fetching product details using BeautifulSoup? - by PrateekG - Jun-06-2018, 04:53 AM

Possibly Related Threads…
Thread Author Replies Views Last Post
  Fetching Images from DB in Django Dexty 2 1,705 Mar-15-2024, 08:43 AM
Last Post: firn100
  All product links to products on a website MarionStorm 0 1,085 Jun-02-2022, 11:17 PM
Last Post: MarionStorm
  IndexError: list index out of range" & "TypeError: The view function f: Flask Web App joelbeater992 5 3,507 Aug-31-2021, 08:08 PM
Last Post: joelbeater992
  Python BeautifulSoup IndexError: list index out of range rhat398 1 6,230 May-28-2021, 09:09 PM
Last Post: Daring_T
  fetching, parsing data from Wikipedia apollo 2 3,538 May-06-2021, 08:08 PM
Last Post: snippsat
  How to make data coming from a database clickable giving more details newbie1 8 3,739 May-29-2020, 11:19 PM
Last Post: newbie1
  IndexError: tuple index out of range ? JohnnyCoffee 4 3,391 Jan-22-2020, 06:54 AM
Last Post: JohnnyCoffee
  Fetching and Parsing XML Data FalseFact 3 3,250 Apr-01-2019, 10:21 AM
Last Post: Larz60+
  My Django 2.0.6 logging is not working while product merging PrateekG 0 2,154 Jul-26-2018, 02:24 PM
Last Post: PrateekG
  from List to BeautifulSoup , Homework RPC 6 7,026 Jul-03-2018, 12:17 AM
Last Post: snippsat

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020