Nov-12-2018, 12:39 PM
(This post was last modified: Nov-12-2018, 02:33 PM by Baggelhsk95.)
I was testing the following code to see the results, and while debugging I saw that scraped_items looked like 4.777,... which wasn't the result I wanted to get. Second, I wanted to scrape each def function to a different file, and finally to scrape all the functions, not only the first and second functions... :(
Thank you very much!!! :D
Here is my actual code:
Thank you very much!!! :D
Here is my actual code:
# -*- coding: utf-8 -*-
import scrapy


class SccbotSpider(scrapy.Spider):
    """Spider that walks four levels of pages, yielding a dict per matched node.

    Callback chain: ``parse`` -> ``parse_details`` -> ``parse_items`` ->
    ``parse_ims``. Each callback yields one dict per matched element and then
    schedules a request for every link it selected, handled by the next
    callback in the chain.

    NOTE(review): the statement nesting below was reconstructed from a paste
    whose whitespace had been collapsed; confirm that each ``yield`` of an
    item belongs inside its loop and that link-following runs after the loop.
    """

    name = 'SccBot'
    start_urls = ['https://spurverbreiterung.de/index.php?cat=c182_Radbefestigungsteile.html']

    def parse(self, response):
        """Yield one dict per centered table cell inside ``#tab1``, then follow its links.

        Every followed link is handled by ``parse_details``.
        """
        table = response.css('#tab1')
        page_url = response.url

        for cell in table.css('tr > td[align="center"]'):
            texts = cell.css('a::text').extract()
            hrefs = cell.css('a::attr(href)').extract()
            yield {
                'TextBox' : texts,
                'LinkBox' : hrefs,
                'CurrentUrl' : page_url
            }

        # Links are relative to the current page, so resolve them before requesting.
        for href in table.css('tr > td[align="center"] > a::attr(href)').extract():
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_details)

    def parse_details(self, response):
        """Yield one dict per centered table cell on the page, then follow its links.

        Unlike ``parse``, the selection is not restricted to ``#tab1``.
        Every followed link is handled by ``parse_items``.
        """
        page_url = response.url

        for cell in response.css('tr > td[align="center"]'):
            texts = cell.css('a::text').extract()
            hrefs = cell.css('a::attr(href)').extract()
            yield {
                'TextBox' : texts,
                'LinkBox' : hrefs,
                'CurrentUrl' : page_url
            }

        for href in response.css('tr > td[align="center"] > a::attr(href)').extract():
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_items)

    def parse_items(self, response):
        """Yield one dict per ``a.product_link`` under ``div.inhalt``, then follow each link.

        ``Category`` and ``CategoryType`` are selected from the whole response
        for every entry; the remaining fields are selected relative to the
        product link node. Every followed link is handled by ``parse_ims``.
        """
        for link_node in response.css('div.inhalt > a.product_link'):
            category = response.css('#main_content > h1::text').extract()
            category_type = response.css('div.content_boxes > div.rad_header::text').extract()
            yield {
                'Category' : category,
                'CategoryType' : category_type,
                'ProductName' : link_node.css('div.prod-name::text').extract(),
                'ProductNumber' : link_node.css('div.art-nr > span::text').extract(),
                # No ``::text`` here: this extracts the serialized element, not just its text.
                'Price' : link_node.css('div.preis').extract(),
                'AvaibilityIcon' : link_node.css('div.ampel > img::attr(src)').extract(),
                'ProductLink' : link_node.css('a.product_link::attr(href)').extract(),
                'CurrentURL' : response.url
            }

        for href in response.css('div.inhalt > a.product_link::attr(href)').extract():
            yield scrapy.Request(url=response.urljoin(href), callback=self.parse_ims)

    def parse_ims(self, response):
        """Yield one dict per ``div.wrapper`` element on the page.

        This is the last callback in the chain; it schedules no further requests.
        """
        for wrapper in response.css('div.wrapper'):
            detail = {
                'Title' : wrapper.css('#product_info > h1::text').extract(),
                'Price' : wrapper.css('div.productsinfo_price > span::text').extract(),
                # The next three selectors have no ``::text``/``::attr`` suffix,
                # so they extract serialized elements.
                'ProductDetails' : wrapper.css('div.product_details.clear > table').extract(),
                'ProductInfo' : wrapper.css('div.productsinfo_right').extract(),
                'ProductImg' : wrapper.css('div.productsinfo_img > ul > img::attr(src)').extract(),
                'MoreDetails' : wrapper.css('div.textf_rechts').extract(),
                'CurrentURL' : response.url,
            }
            yield detail