Sep-06-2022, 05:24 AM
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 |
from scrapy import Spider
from scrapy.http import Request


class TesterSpider(Spider):
    """Spider that walks the book listing pages and scrapes each book page.

    ``parse`` handles listing pages: it schedules one request per book link
    and then follows the pagination link.  ``parse_book`` handles a single
    book page and yields one dict of extracted fields.
    """

    name = 'tester'
    allowed_domains = ['books.toscrape.com']
    # NOTE(review): the original class declared neither ``start_urls`` nor
    # ``start_requests``, so Scrapy's default start logic had nothing to
    # schedule.  Seeded with the root of the allowed domain -- confirm this
    # is the intended entry point.
    start_urls = ['http://books.toscrape.com/']

    def parse(self, response):
        """Schedule every book linked from a listing page, then the next page.

        Args:
            response: The downloaded listing page.

        Yields:
            A ``Request`` per book link (handled by ``parse_book``), followed
            by a ``Request`` for the next listing page when a link whose text
            is ``next`` is present.
        """
        for book in response.xpath("//h3/a/@href").extract():
            # hrefs may be relative, so resolve them against the page URL.
            yield Request(response.urljoin(book), callback=self.parse_book)

        # process next page
        next_page_url = response.xpath("//a[text()='next']/@href").extract_first()
        # When no 'next' link matches, extract_first() returns None and
        # urljoin(None) resolves to the current page; skip the request
        # instead of re-queuing the page we are already on.
        if next_page_url:
            yield Request(response.urljoin(next_page_url))

    def parse_book(self, response):
        """Extract the fields of a single book page.

        Args:
            response: The downloaded book page.

        Yields:
            One dict with the keys ``Title``, ``Price``, ``Location``,
            ``Rating``, ``Description``, ``UPC``, ``Product Type``,
            ``Availability`` and ``Reviews``.  A value is ``None`` when its
            XPath matches nothing.
        """
        title = response.xpath("//h1/text()").extract_first()
        price = response.xpath("//*[@class='price_color']/text()").extract_first()

        # First <img> on the page; the src is yielded exactly as it appears
        # in the markup (not resolved to an absolute URL).
        img_url = response.xpath("//img/@src").extract_first()

        rating = response.xpath(
            "//p[starts-with(@class,'star-rating')]/@class"
        ).extract_first()
        # Guard against a missing rating element: calling .replace() on None
        # would raise AttributeError and lose the whole item.
        if rating is not None:
            rating = rating.replace('star-rating ', '')

        desc = response.xpath(
            "//div[(@id='product_description')]/following-sibling::p/text()"
        ).extract_first()

        # Product Description
        upc = product_desc(response, 'UPC')
        product_type = product_desc(response, 'Product Type')
        availability = product_desc(response, 'Availability')
        number_of_reviews = product_desc(response, 'Number of reviews')

        yield {
            'Title': title,
            'Price': price,
            'Location': img_url,
            'Rating': rating,
            'Description': desc,
            'UPC': upc,
            'Product Type': product_type,
            'Availability': availability,
            'Reviews': number_of_reviews,
        }


def product_desc(response, lookup):
    """Return the text of the ``<td>`` that follows the ``<th>`` labelled *lookup*.

    Args:
        response: Response whose document is searched.
        lookup: Exact text of the ``<th>`` cell to find.

    Returns:
        The first matching cell text, or ``None`` when nothing matches.
    """
    # Pass the label as an XPath variable rather than splicing it into the
    # expression, so a label containing a quote cannot break the query.
    return response.xpath(
        "//th[text()=$label]/following-sibling::td/text()",
        label=lookup,
    ).extract_first()
Thank you