Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Python Scrapy
#1
For some reason I am getting the output for "date" returns one character at a time instead of the full date for each row. I have been committed to learning Python for some time now but I am far from being an expert. Any help is appreciated!

Here is my code:

import scrapy
import datetime
import re
from datetime import timedelta



class Tennis_ExplorerSpider(scrapy.Spider):
    name = 'tennis_explorer'
    allowed_domains = ['tennisexplorer.com']

#    start_urls = 'https://www.tennisexplorer.com/results/?type=atp-single&year=2020&month=07&day=27'

    def daterange(start_date, end_date):
        for n in range(int((end_date - start_date).days)):
            yield start_date + timedelta(n)
    
    start_date = datetime.datetime.today() #- datetime.timedelta(days=1)
    end_date = datetime.datetime.today() + datetime.timedelta(days=1)    
    start_urls = []
    start_url='https://www.tennisexplorer.com/matches/?type=all&year='
    for single_date in daterange(start_date, end_date):
        start_urls.append(single_date.strftime(start_url+"%Y&month=%m&day=%d&timezone=-6"))

    
    def parse(self, response):  
            #Extracting the content using xpath            
            self.logger.debug('callback "parse": got response %r' % response)
            data = response.xpath('//table[@class="result"]//a[contains(@href,"match-detail")]/@href').extract()
            match_id =[re.sub('^.+=','',el) for el in data]


            data2 = response.xpath('//span[@class="tab"]/text()').get().replace(". ", "-")
            date = datetime.datetime.strptime(data2, "%d-%m-%Y").strftime('%Y-%m-%d')
            
            time = response.xpath('//tr/td[1][@class="first time"]/text()').extract()
            #event_name = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract()              
            #event_id = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract()              
            #player1_id = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
            player1 = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
            #player1_odds = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
            #player2_id = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
            player2 = response.xpath('//tr[not(contains(@class, "head"))]/td[1][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
            #player2_odds = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract()
            player1_sets = response.xpath('//tr/td[3][@class="result"]/text()').get()
            player1_set1 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[4][@class="score"]/text()').get()
            player1_set2 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[5][@class="score"]/text()').get()
            player1_set3 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[6][@class="score"]/text()').get()
            player1_set4 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[7][@class="score"]/text()').get()
            player1_set5 = response.xpath('//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[8][@class="score"]/text()').get()
            player2_sets = response.xpath('//tr/td[2][@class="result"]/text()').get()
            player2_set1 = response.xpath('//tr[contains(@id, "b")]/td[3][@class="score"]/text()').get()
            player2_set2 = response.xpath('//tr[contains(@id, "b")]/td[4][@class="score"]/text()').get()
            player2_set3 = response.xpath('//tr[contains(@id, "b")]/td[5][@class="score"]/text()').get()
            player2_set4 = response.xpath('//tr[contains(@id, "b")]/td[6][@class="score"]/text()').get()
            player2_set5 = response.xpath('//tr[contains(@id, "b")]/td[7][@class="score"]/text()').get()

            
            #Give the extracted content row wise
            for item in zip(match_id, date, time, player1, player2, player1_sets, player1_set1, player1_set2, player1_set3, player1_set4, player1_set5, player2_sets, player2_set1, player2_set2, player2_set3, player2_set4, player2_set5):
                #create a dictionary to store the scraped info
                scraped_info = {
                    'match_id' : item[0],
                    'date' : item[1],
                    'time' : item[2],                    
                    #'event_name' : item[0],                    
                    'player1' : item[3],
                    'player2' : item[4],                    
                    'player1_sets' : item[5],
                    'player1_set1' : item[6],
                    'player1_set2' : item[7],
                    'player1_set3' : item[8],
                    'player1_set4' : item[9],
                    'player1_set5' : item[10],
                    'player2_sets' : item[11],
                    'player2_set1' : item[12],
                    'player2_set2' : item[13],
                    'player2_set3' : item[14],
                    'player2_set4' : item[15],
                    'player2_set5' : item[16],
                }

    
                #yield or give the scraped info to scrapy
                yield scraped_info
Reply
#2
How is this called? I don't really see what part of this is supposed to print the date. When I run it, there is no output. Could you trim it to a small script that just shows the problem you're having?
Reply
#3
(Aug-04-2020, 03:34 AM)bowlofred Wrote: How is this called? I don't really see what part of this is supposed to print the date. When I run it, there is no output. Could you trim it to a small script that just shows the problem you're having?

The two lines below are what I am stuck on. It is important to note everything works as expected when "date" is removed.

            data2 = response.xpath('//span[@class="tab"]/text()').get().replace(". ", "-")
            date = datetime.datetime.strptime(data2, "%d-%m-%Y").strftime('%Y-%m-%d')
Here is the pipeline:
import psycopg2


class TennisexplorerPipeline(object):

    def open_spider(self, spider):
        hostname = '35.225.206.101'
        username = 'tcb'
        password = '7UmgPbQKdZeTgvPS' # your password
        database = 'tcb'
        self.connection = psycopg2.connect(host=hostname, user=username, password=password, dbname=database)
        self.cur = self.connection.cursor()

    def close_spider(self, spider):
        self.cur.close()
        self.connection.close()

    def process_item(self, item, spider):
        self.cur.execute("insert into tennis_explorer(match_id, date, time, player1, player2, player1_sets, player1_set1, player1_set2, player1_set3, player1_set4, player1_set5, player2_sets, player2_set1, player2_set2, player2_set3, player2_set4, player2_set5, update_timestamp) values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,current_timestamp)",
                             (item['match_id'],item['date'],item['time'],item['player1'],item['player2'],item['player1_sets'],item['player1_set1'],item['player1_set2'],item['player1_set3'],item['player1_set4'],item['player1_set5'],item['player2_sets'],item['player2_set1'],item['player2_set2'],item['player2_set3'],item['player2_set4'],item['player2_set5']))     
                
        self.connection.commit()
        return item
Here is Items.py
import scrapy


class TennisexplorerItem(scrapy.Item):
    # define the fields for your item here like:
    match_id = scrapy.Field()
    date = scrapy.Field()
    time = scrapy.Field()             
    player1 = scrapy.Field()
    player2 = scrapy.Field()
    player1_sets = scrapy.Field()
    player1_set1 = scrapy.Field()
    player1_set2 = scrapy.Field()
    player1_set3 = scrapy.Field()
    player1_set4 = scrapy.Field()
    player1_set5 = scrapy.Field()
    player2_sets = scrapy.Field()
    player2_set1 = scrapy.Field()
    player2_set2 = scrapy.Field()
    player2_set3 = scrapy.Field()
    player2_set4 = scrapy.Field()
    player2_set5 = scrapy.Field()   
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Python Scrapy Date Extraction Issue tr8585 1 3,236 Aug-05-2020, 04:32 AM
Last Post: tr8585
  Python - Scrapy Baggelhsk95 0 2,262 Apr-24-2019, 01:07 PM
Last Post: Baggelhsk95
  Python Scrapy ebay API Baggelhsk95 0 3,173 Nov-21-2018, 11:22 AM
Last Post: Baggelhsk95
  Python scrapy scraped_items Baggelhsk95 2 2,822 Nov-13-2018, 08:30 AM
Last Post: Baggelhsk95
  Python - Scrapy - CSS selector Baggelhsk95 1 5,512 Nov-07-2018, 04:45 PM
Last Post: stranac
  Python - Scrapy - Contains Baggelhsk95 3 4,478 Oct-27-2018, 03:42 PM
Last Post: stranac
  Python - Scrapy Login in Baggelhsk95 3 4,761 Oct-23-2018, 04:24 PM
Last Post: stranac
  Python - Scrapy Ebay Test Baggelhsk95 4 4,279 Oct-16-2018, 12:37 PM
Last Post: snippsat
  Python - Scrapy Login form Baggelhsk95 4 10,697 Oct-16-2018, 08:01 AM
Last Post: Baggelhsk95
  Python - Scrapy Javascript Pagination (next_page) Baggelhsk95 3 9,919 Oct-08-2018, 01:20 PM
Last Post: stranac

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020