Aug-16-2020, 06:54 AM
Actually, I need to do that with Scrapy. In my program, when the Scrapy spider crawls a web page it stores some data about the page (title, description, url, ...) in MongoDB. So when the spider crawls a page I also need to take a screenshot of that page and store it in MongoDB, and then later retrieve it from MongoDB (rough sketches of what I have in mind are below, after the spider). This is my spider:
import scrapy
from scrapy.selector import Selector
from search.models import *
import lxml.html
import lxml.etree
from lxml.html.clean import Cleaner
import re
from urllib.parse import urlparse
import json


class DSSpider(scrapy.Spider):
    name = "ds_spider"

    def __init__(self, recrawl='no'):
        if app.config['SPIDER_ALLOWED_DOMAINS'] is not None:
            self.allowed_domains = app.config['SPIDER_ALLOWED_DOMAINS']
        self.start_urls = ['https://www.imdb.com']
        is_crawled = recrawl.lower() in ['y', 'yes', 't', 'true', '1']
        crawl_list = Crawllist \
            .objects(is_crawled=is_crawled) \
            .limit(app.config['CLOSESPIDER_PAGECOUNT']) \
            .order_by('updated_at')
        for link in crawl_list:
            self.start_urls.append(link.url)

    def parse(self, response):
        schemas = response.xpath('//script[@type="application/ld+json"]//text()').extract()
        # default in case the page has no ld+json schema
        page_markup = None
        for schema in schemas:
            data = json.loads(schema)
            page_markup = data.get('@type')
        selector = Selector(response)
        # get page title
        page_title = selector.xpath('//title/text()').extract()[0]
        # get page content
        cleaner = Cleaner()
        cleaner.javascript = True
        cleaner.style = True
        page_html = selector.xpath('//body').extract()[0]
        # remove js and css code
        page_html = cleaner.clean_html(page_html)
        # extract text
        html_doc = lxml.html.document_fromstring(page_html)
        page_content = ' '.join(lxml.etree.XPath("//text()")(html_doc))
        page_content += ' ' + page_title
        # remove line breaks, tabs and extra spaces
        page_content = re.sub('\n', ' ', page_content)
        page_content = re.sub('\r', ' ', page_content)
        page_content = re.sub('\t', ' ', page_content)
        page_content = re.sub(' +', ' ', page_content)
        page_content = page_content.strip()
        # get page links
        page_hrefs = response.xpath('//a/@href').extract()
        page_urls = []
        # filter links with unallowed domains
        for link in page_hrefs:
            # convert links to absolute urls
            url = response.urljoin(link)
            # extract domain from url
            parsed_url = urlparse(url)
            url_domain = parsed_url.netloc
            if url_domain in self.allowed_domains:
                page_urls.append(url)
        # log out some info
        self.log('Page: %s (%s)' % (response.url, page_title))
        # save the page
        if Page.objects(url=response.url).count() == 0:
            page = Page(url=response.url, title=page_title,
                        content=page_content, markup=page_markup).save()
            for url in page_urls:
                page.update(add_to_set__links=Pagelink(url=url).save())
                # add url to crawl list
                if Crawllist.objects(url=url).count() == 0:
                    Crawllist(url=url).save()
            # update crawl list
            Crawllist.objects(url=response.url).update(is_crawled=True)
        else:
            page = Page.objects.get(url=response.url)
            page.update(title=page_title, content=page_content, markup=page_markup)
        for next_pages in response.css('a::attr(href)'):
            next_page = next_pages.extract()
            print(next_page)
            if next_page is not None:
                yield response.follow(next_page, callback=self.parse)
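For storing and fetching the screenshot bytes themselves, this is roughly what I have in mind (just a minimal sketch assuming pymongo with GridFS; the connection string, database name and helper names are placeholders, not my real project code):

# Sketch: store / retrieve a PNG screenshot in MongoDB via GridFS.
# Connection string and database name are placeholders.
from pymongo import MongoClient
import gridfs

client = MongoClient('mongodb://localhost:27017')
db = client['search']
fs = gridfs.GridFS(db)

def save_screenshot(url, png_bytes):
    # store the raw PNG bytes, keyed by the page url
    return fs.put(png_bytes, filename=url, content_type='image/png')

def load_screenshot(url):
    # fetch the screenshot stored for this url (None if there isn't one)
    grid_out = fs.find_one({'filename': url})
    return grid_out.read() if grid_out else None

If GridFS is overkill for images of this size, I guess the bytes could also go into a binary field on the Page document itself, but I'm not sure which is better.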
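And for actually taking the screenshot during the crawl, the direction I was considering is requesting each page through Splash's render.png endpoint with scrapy-splash, so the response body is already the PNG and can be handed to the save_screenshot helper above. Again this is only a sketch: it assumes a Splash instance is running and scrapy-splash is enabled in settings.py (SPLASH_URL plus the downloader middlewares), and parse_screenshot is just a name I made up:

# Sketch: extra methods for DSSpider that fetch a PNG screenshot via Splash.
import scrapy
from scrapy_splash import SplashRequest

class DSSpider(scrapy.Spider):
    # ... __init__ and parse as in the spider above ...

    def start_requests(self):
        for url in self.start_urls:
            # render.png makes Splash return the rendered page as PNG bytes
            yield SplashRequest(url, callback=self.parse_screenshot,
                                endpoint='render.png',
                                args={'wait': 1, 'width': 1024})

    def parse_screenshot(self, response):
        # response.body is the raw PNG image for this page
        save_screenshot(response.url, response.body)

What I can't decide is whether to fire a second Splash request per page just for the PNG, or to switch parse() over to the render.json endpoint with png=1 so a single request returns both the HTML and a base64 screenshot. Any advice on that part (and on getting the image back out of MongoDB afterwards) would be great.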