May-28-2021, 11:30 AM
import requests from bs4 import BeautifulSoup import csv import time class LightupScraper: results = [] def fetch(self, url): print(f'HTTP GET request to URL: {url}', end='') res = requests.get(url) print(f' | Status Code: {res.status_code}') return res def save_response(self, res): with open('res.html', 'w') as html_file: html_file.write(res) def load_response(self): html = '' with open('res.html', 'r') as html_file: for line in html_file: html += line return html def parse(self, html): content = BeautifulSoup(html, 'lxml') titles = [title.text.strip() for title in content.find_all('h4', {'class': 'card-title ols-card-title'})] links = [link.find('a')['href'] for link in content.find_all('h4', {'class': 'card-title ols-card-title'})] skus = [sku.text for sku in content.find_all('span', {'class': 'productView-info-value ols-card-text--sku'})] mpn = [mpn.text.split(':')[-1].strip() for mpn in content.find_all('span', {'class': 'productView-info-name mpn-label ols-card-text--mpn'})] details = [ul.find_all('li') for ul in content.find_all('ul', {'class': 'ols-card-text__list'})] brand = [''.join([brand.text for brand in detail if 'Brand:' in brand.text]).split(':')[-1].strip() for detail in details] base = [''.join([base.text for base in detail if 'Base Type:' in base.text]).split(':')[-1].strip() for detail in details] life_hours = [''.join([life_hour.text for life_hour in detail if 'Life Hours:' in life_hour.text]).split(':')[-1].strip() for detail in details] lumens = [''.join([lumen.text for lumen in detail if 'Lumens:' in lumen.text]).split(':')[-1].strip() for detail in details] warrantys = [''.join([warranty.text for warranty in detail if 'Warranty:' in warranty.text]).split(':')[-1].strip() for detail in details] wattages = [''.join([wattage.text for wattage in detail if 'Wattage:' in wattage.text]).split(':')[-1].strip() for detail in details] features = [feature.text.split() for feature in content.find_all('span', {'class': 'ols-card-text__list--features'})] prices = [price.text for price in content.find_all('span', {'class': 'price price--withoutTax'})] #print(prices) for feature in features: feat = feature for item in range(0, len(titles)): self.results.append({ 'titles': titles[item], 'skus': skus[item], 'mpn': mpn[item], 'brand': brand[item], 'base': base[item], 'life_hours': life_hours[item], 'lumens': lumens[item], 'warrantys': warrantys[item], 'wattages': wattages[item], 'feature': feat[item], 'links': links[item], 'price': prices[item] }) def to_csv(self): with open('lightup.csv', 'w', newline='') as csv_file: writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys()) writer.writeheader() for row in self.results: writer.writerow(row) print('Exported results to lightup.csv') def run(self): page_num = 3 for page in range(1, page_num + 1): base_url = 'https://www.lightup.com/standard-household-lighting.html?p=' base_url += str(page) res = self.fetch(base_url) self.parse(res.text) #time.sleep(30) self.to_csv() # html = self.load_response() # self.parse(html) #self.save_response(html.text) if __name__ == '__main__': scraper = LightupScraper() scraper.run()Error:
Error:File "lightup_scraper.py", line 66, in parse
'price': prices[item]
IndexError: list index out of range
I tried to scrape the prices but i am getting list index out of range error because the tag which is responsible for price is returning 14 elements and the other tags returning 16 this is because some price tags are different e.g for price per case tag is price price--withoutTax price-per--case and for single product price price--withoutTax.I tried try except block but no luck it gives me whole another list not individual prices i can't get my head around this problem may be someone can give me some pointers to actually make this work.