Apr-05-2020, 04:06 PM
I'am creating a spider to crawl webpage' json-ld schema markup and store data in mongodb. actually I want to scrape json-ld schema markup and extract the data type("@type" : "_____") from schema markup and store this @type in mongodb. My spiders crawl well whole schema markup code. But I want to know that How to extract @type from that json-ld schema markup and store it in mongodb.
This is my spider files
apple_spider.py
items.py
pipelines.py
This is my spider files
apple_spider.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 |
import scrapy from pprint import pprint from extruct.jsonld import JsonLdExtractor from ..items import ApplespiderItem class AppleSpider(scrapy.Spider): name = 'apple' allowed_domains = [ 'apple.com' ] start_urls = ( ) def parse( self , response): extractor = JsonLdExtractor() items = extractor.extract(response.body_as_unicode(), response.url) pprint(items) for item in items: if item.get( 'properties' , {}).get( 'name' ): properties = item[ 'properties' ] yield { 'name' : properties[ 'name' ], 'price' : properties[ 'offers' ][ 'properties' ][ 'price' ], 'url' : properties[ 'url' ] } |
1 2 3 4 5 6 7 8 |
import scrapy class ApplespiderItem(scrapy.Item): # define the fields for your item here like: name = scrapy.Field() price = scrapy.Field() url = scrapy.Field() |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 |
import pymongo class ApplespiderPipeline( object ): def __init__( self ): self .conn = pymongo.MongoClient( 'localhost' , 27017 ) db = self .conn[ 'newdb' ] self .collection = db[ 'app_tb' ] def process_item( self , item, spider): self .collection.insert( dict (item)) return item |