Jun-06-2018, 04:53 AM
Hi All,
I have written following functions in Python3.6 for fetching the product related details from the site-
1. 'list index out of range' in get_product_details() function.
2. Not getting correct value of Brand, Image urls, product id
Can anyone please run my code and share me the correct version of it?
I have written following functions in Python3.6 for fetching the product related details from the site-
def get_soup(url): soup = None try: response = requests.get(url) if response.status_code == 200: html = response.content soup = BeautifulSoup(html, "html.parser") except Exception as exc: print("Unable to fecth data due to..", str(exc)) finally: return soup def get_product_details(url): soup = get_soup(url) sleep(1) try: product_shop = soup.find('div', attrs={"class": "buy"}) if product_shop is not None: available_product_shop = soup.findAll('div')[2].find('span').text == "In Stock" if available_product_shop is not None: prod_details = dict() merchant_product_id = soup.find('div', attrs={'class': 'description'}).findAll('span')[3].text if merchant_product_id is not None: prod_details['merchant_product_id'] = merchant_product_id check_title = soup.find('header', attrs={'class': 'product-name'}).find('h1') if check_title is not None: prod_details['title'] = check_title.text check_description = soup.find('div', attrs={'id': 'tab-description'}) if check_description is not None: prod_details['description'] = clean_description(check_description) check_brand = soup.find('div', attrs={'class': 'description'}).findAll('span')[2].find('a') if check_brand is not None: prod_details['brand'] = check_brand.text prod_details['google_product_category'] = CATEGORY_ID prod_details['web_url'] = url prod_details['merchant_image_urls'] = ",".join(list(filter(None, map(lambda x: x['href'].replace(",", "%2C"), soup.find('div', attrs={ 'class': 'left'}).findAll( 'a'))))) check_price = soup.find('span', attrs={"class": "price-old"}) if check_price is not None: prod_details['price'] = check_price.text.replace("SGD $", "") check_sale_price = soup.find('span', attrs={"class": "price-new"}) if check_sale_price is not None: prod_details['sale_price'] = check_sale_price.text.replace("SGD $", "") return prod_details except Exception as exc: print("Error..", str(exc))
def get_all_products(url): prod_urls = [] soup = get_soup(url) prod_urls.append(get_product_urls(soup)) links = get_pagination(soup) if not links: return prod_urls for link in links: soup = get_soup(link) prod_urls.append(get_product_urls(soup)) print("Found following product urls:", prod_urls) return prod_urls def get_product_urls(soup): links = soup.select('div.product-list .span .name a') if links is not None: return [link['href'] for link in links] def get_pagination(soup): pages = soup.select('div.pagination div.links a') if pages is not None: return [link['href'] for link in pages if link.string.isdecimal()] def get_category_urls(url): soup = get_soup(url) cat_urls = [] try: categories = soup.find('div', attrs={'id': 'menu_oc'}) if categories is not None: for c in categories.findAll('a'): if c['href'] is not None: cat_urls.append(c['href']) except Exception as exc: print("Unable to fetch category urls due to..", str(exc)) finally: print("Found following category urls::", cat_urls) return cat_urls
def flatten(items): for x in items: if isinstance(x, Iterable) and not isinstance(x, (str, bytes)): yield from flatten(x) else: yield x if __name__ == '__main__': category_urls = get_category_urls(URL) with Pool(20) as p: product_urls = p.map(get_all_products, category_urls) #product_urls = list(filter(None, product_urls)) product_urls_flat =list(flatten(product_urls)) with Pool(20) as p: products = p.map(get_product_details, product_urls_flat) products = list(filter(None, products)) products_df = pd.DataFrame(products) print(products_df.head())When I run the above code I got following issues-
1. 'list index out of range' in get_product_details() function.
2. Not getting correct value of Brand, Image urls, product id
Can anyone please run my code and share me the correct version of it?