Python Forum

Full Version: Extract json-ld schema markup data and store in MongoDB
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
I'm creating a spider to crawl a webpage's JSON-LD schema markup and store the data in MongoDB. Specifically, I want to scrape the JSON-LD schema markup, extract the data type ("@type" : "_____") from it, and store that @type in MongoDB. My spider already crawls the whole schema markup correctly, but I want to know how to extract the @type from the JSON-LD schema markup and store it in MongoDB.
These are my spider files:

import scrapy
from pprint import pprint
from extruct.jsonld import JsonLdExtractor
from ..items import ApplespiderItem

class AppleSpider(scrapy.Spider):
    """Spider that extracts JSON-LD structured data from crawled pages.

    Yields one dict per JSON-LD item that has a product name, including the
    item's ``@type`` (what the author wants persisted in MongoDB).
    """

    name = 'apple'
    # NOTE(review): an empty-string domain filter is almost certainly wrong;
    # fill in the real domain(s), e.g. ['example.com'].
    allowed_domains = ['']
    # NOTE(review): the original paste left this tuple unclosed (syntax
    # error); add the real start URL(s) here.
    start_urls = ()

    def parse(self, response):
        """Extract JSON-LD items from *response* and yield product dicts."""
        extractor = JsonLdExtractor()

        # response.text replaces the deprecated response.body_as_unicode().
        items = extractor.extract(response.text, response.url)

        for item in items:
            # Only yield items that actually carry a product name.
            if item.get('properties', {}).get('name'):
                properties = item['properties']

                # The original paste left this dict unclosed; it is closed
                # here, and '@type' is added (backward-compatible new key).
                yield {
                    'type': item.get('@type'),
                    'name': properties['name'],
                    'price': properties['offers']['properties']['price'],
                    'url': properties['url'],
                }

import scrapy

class ApplespiderItem(scrapy.Item):
    """Container for one scraped product: its name, price, and page URL."""

    # Declaration order of scrapy fields carries no meaning; each field is
    # just a key registered on the Item class.
    url = scrapy.Field()
    price = scrapy.Field()
    name = scrapy.Field()

import pymongo

class ApplespiderPipeline(object):
    """Scrapy item pipeline that persists scraped items into MongoDB."""

    def __init__(self):
        # NOTE(review): the original paste left this call unclosed (syntax
        # error); default host/port assumed — confirm connection settings.
        self.conn = pymongo.MongoClient('localhost', 27017)
        db = self.conn['newdb']
        self.collection = db['app_tb']

    def process_item(self, item, spider):
        # The original returned the item without storing anything, so the
        # pipeline never wrote to MongoDB. Insert a plain-dict copy so both
        # scrapy Items and bare dicts yielded by the spider are accepted.
        self.collection.insert_one(dict(item))
        return item