Python Forum
Store Screenshot Selenium + MongoDB
Thread Rating:
  • 3 Vote(s) - 4.67 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Store Screenshot Selenium + MongoDB
#3
Actually, I need to do that with Scrapy. In my program, the Scrapy spider crawls a web page and stores some data about it (title, description, URL, ...) in MongoDB. So when the spider crawls a web page, I need to take a screenshot of that page and store it in MongoDB. After that, I need to retrieve the screenshot from MongoDB. This is my spider:
import scrapy
from scrapy.selector import Selector
from search.models import *
import lxml
from lxml.html.clean import Cleaner
import re
from urllib.parse import urlparse
import json


class DSSpider(scrapy.Spider):
    """Spider that stores crawled pages in the database and follows their links.

    For every response it extracts the JSON-LD ``@type``, the page title, the
    visible body text and the outgoing links, saves them through the
    ``Page`` / ``Pagelink`` / ``Crawllist`` models, and then schedules every
    link found on the page for crawling.

    NOTE(review): ``app``, ``Page``, ``Pagelink`` and ``Crawllist`` are
    presumably provided by ``from search.models import *`` -- confirm.
    """

    name = "ds_spider"

    def __init__(self, recrawl='no', *args, **kwargs):
        """Build the list of start URLs from the stored crawl list.

        Args:
            recrawl: Truthy strings ('y', 'yes', 't', 'true', '1', any case)
                select crawl-list entries that were already crawled; anything
                else selects entries that have not been crawled yet.
            *args, **kwargs: Forwarded to ``scrapy.Spider.__init__`` so that
                additional spider arguments keep working.
        """
        # Let Scrapy perform its own initialisation before we add our state.
        super().__init__(*args, **kwargs)

        if app.config['SPIDER_ALLOWED_DOMAINS'] is not None:
            self.allowed_domains = app.config['SPIDER_ALLOWED_DOMAINS']

        self.start_urls = ['https://www.imdb.com']
        is_crawled = recrawl.lower() in ['y', 'yes', 't', 'true', '1']
        crawl_list = Crawllist \
            .objects(is_crawled=is_crawled) \
            .limit(app.config['CLOSESPIDER_PAGECOUNT']) \
            .order_by('updated_at')
        self.start_urls.extend(link.url for link in crawl_list)

    def parse(self, response):
        """Store the page behind ``response`` and yield requests for its links.

        Args:
            response: The Scrapy response for the crawled page.

        Yields:
            One request per followable ``<a href>`` on the page, each handled
            by this same callback.
        """
        page_markup = self._extract_markup(response)
        # A page without <title> yields an empty title instead of IndexError.
        page_title = response.xpath('//title/text()').extract_first(default='')
        page_content = self._extract_content(response, page_title)
        page_hrefs = response.xpath('//a/@href').extract()
        page_urls = self._allowed_urls(response, page_hrefs)

        # log out some info
        self.log('Page: %s (%s)' % (response.url, page_title))

        self._store_page(response.url, page_title, page_content,
                         page_markup, page_urls)

        for href in page_hrefs:
            try:
                request = response.follow(href, callback=self.parse)
            except ValueError:
                # Scheme-less targets such as "mailto:" cannot become
                # requests; skip them so the remaining links are still
                # followed.
                self.log('Skipping link that cannot be followed: %s' % href)
                continue
            yield request

    def _extract_markup(self, response):
        """Return the ``@type`` of the last JSON-LD object on the page, or None."""
        page_markup = None
        schemas = response.xpath(
            '//script[@type="application/ld+json"]//text()').extract()
        for schema in schemas:
            try:
                data = json.loads(schema)
            except ValueError:
                # Malformed JSON-LD must not abort processing of the page.
                self.log('Ignoring invalid JSON-LD on %s' % response.url)
                continue
            # Only a JSON object has an '@type' key to read; arrays and
            # scalars are skipped.
            if isinstance(data, dict):
                page_markup = data.get('@type')
        return page_markup

    def _extract_content(self, response, page_title):
        """Return the body text plus title with whitespace runs collapsed."""
        text_parts = []
        body_html = response.xpath('//body').extract_first()
        if body_html:
            # remove js and css code before extracting the text nodes
            cleaner = Cleaner(javascript=True, style=True)
            html_doc = lxml.html.document_fromstring(
                cleaner.clean_html(body_html))
            text_parts.extend(html_doc.xpath('//text()'))
        page_content = ' '.join(text_parts) + ' ' + page_title
        # Collapse line breaks, tabs and repeated spaces into single spaces.
        return re.sub(r'[\n\r\t ]+', ' ', page_content).strip()

    def _allowed_urls(self, response, hrefs):
        """Return absolute URLs of ``hrefs`` whose domain is allowed."""
        # ``allowed_domains`` is only set when it is configured; without it
        # no domain restriction applies.
        allowed_domains = getattr(self, 'allowed_domains', None)
        page_urls = []
        for href in hrefs:
            # convert links to absolute urls
            url = response.urljoin(href)
            if allowed_domains is None or urlparse(url).netloc in allowed_domains:
                page_urls.append(url)
        return page_urls

    def _store_page(self, page_url, page_title, page_content, page_markup,
                    page_urls):
        """Create or update the stored page and register its links."""
        if Page.objects(url=page_url).count() == 0:
            page = Page(url=page_url, title=page_title, content=page_content,
                        markup=page_markup).save()
            for url in page_urls:
                page.update(add_to_set__links=Pagelink(url=url).save())
                # add url to crawl list
                if Crawllist.objects(url=url).count() == 0:
                    Crawllist(url=url).save()
            # update crawl list
            Crawllist.objects(url=page_url).update(is_crawled=True)
        else:
            page = Page.objects.get(url=page_url)
            page.update(title=page_title, content=page_content,
                        markup=page_markup)
Reply


Messages In This Thread
Store Screenshot Selenium + MongoDB - by Nuwan16 - Aug-15-2020, 09:44 PM
RE: Store Screenshot Selenium + MongoDB - by Nuwan16 - Aug-16-2020, 06:54 AM

Possibly Related Threads…
Thread Author Replies Views Last Post
  Taking screenshot ConsistentlyInconsistent 1 1,848 Sep-10-2023, 11:20 PM
Last Post: Larz60+
  Retrieve images base64 encoded MongoDB and Flask Nuwan16 2 4,390 Oct-13-2020, 06:25 PM
Last Post: Nuwan16
Question Selenium Screenshots store in Database Nuwan16 3 2,830 Oct-05-2020, 02:55 PM
Last Post: Nuwan16
  Flask Can't Save Screenshot to Postgres Db firebird 3 3,454 Sep-21-2020, 09:22 PM
Last Post: firebird
  filtering by category flask+mongodb Leon79 3 13,745 Jul-19-2020, 04:25 AM
Last Post: ndc85430
  Screenshot web page ! ABVSVL 3 4,174 Jul-11-2020, 01:39 PM
Last Post: snippsat
  error when trying to update mongodb damian0612 6 4,782 Jul-04-2020, 07:25 PM
Last Post: damian0612
  screenshot arezoo 3 3,084 Apr-11-2020, 10:22 AM
Last Post: buran
  Extract json-ld schema markup data and store in MongoDB Nuwan16 0 3,068 Apr-05-2020, 04:06 PM
Last Post: Nuwan16
  storing images in mongodb using python richa828 2 10,343 Jun-06-2018, 08:08 AM
Last Post: richa828

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020