Aug-04-2020, 02:27 AM
For some reason I am getting the output for "date" returns one character at a time instead of the full date for each row. I have been committed to learning Python for some time now but I am far from being an expert. Any help is appreciated!
Here is my code:
Here is my code:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 |
import scrapy import datetime import re from datetime import timedelta class Tennis_ExplorerSpider(scrapy.Spider): name = 'tennis_explorer' allowed_domains = [ 'tennisexplorer.com' ] def daterange(start_date, end_date): for n in range ( int ((end_date - start_date).days)): yield start_date + timedelta(n) start_date = datetime.datetime.today() #- datetime.timedelta(days=1) end_date = datetime.datetime.today() + datetime.timedelta(days = 1 ) start_urls = [] for single_date in daterange(start_date, end_date): start_urls.append(single_date.strftime(start_url + "%Y&month=%m&day=%d&timezone=-6" )) def parse( self , response): #Extracting the content using xpath self .logger.debug( 'callback "parse": got response %r' % response) data = response.xpath( '//table[@class="result"]//a[contains(@href,"match-detail")]/@href' ).extract() match_id = [re.sub( '^.+=' ,'',el) for el in data] data2 = response.xpath( '//span[@class="tab"]/text()' ).get().replace( ". " , "-" ) date = datetime.datetime.strptime(data2, "%d-%m-%Y" ).strftime( '%Y-%m-%d' ) time = response.xpath( '//tr/td[1][@class="first time"]/text()' ).extract() #event_name = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract() #event_id = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract() #player1_id = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract() player1 = response.xpath( '//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()' ).extract() #player1_odds = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract() #player2_id = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract() player2 = response.xpath( '//tr[not(contains(@class, "head"))]/td[1][@class="t-name"]/a[contains(@href, "/player/")]/text()' ).extract() #player2_odds = response.xpath('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()').extract() player1_sets = response.xpath( '//tr/td[3][@class="result"]/text()' ).get() player1_set1 = response.xpath( '//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[4][@class="score"]/text()' ).get() player1_set2 = response.xpath( '//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[5][@class="score"]/text()' ).get() player1_set3 = response.xpath( '//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[6][@class="score"]/text()' ).get() player1_set4 = response.xpath( '//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[7][@class="score"]/text()' ).get() player1_set5 = response.xpath( '//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[8][@class="score"]/text()' ).get() player2_sets = response.xpath( '//tr/td[2][@class="result"]/text()' ).get() player2_set1 = response.xpath( '//tr[contains(@id, "b")]/td[3][@class="score"]/text()' ).get() player2_set2 = response.xpath( '//tr[contains(@id, "b")]/td[4][@class="score"]/text()' ).get() player2_set3 = response.xpath( '//tr[contains(@id, "b")]/td[5][@class="score"]/text()' ).get() player2_set4 = response.xpath( '//tr[contains(@id, "b")]/td[6][@class="score"]/text()' ).get() player2_set5 = response.xpath( '//tr[contains(@id, "b")]/td[7][@class="score"]/text()' ).get() #Give the extracted content row wise for item in zip (match_id, date, time, player1, player2, player1_sets, player1_set1, player1_set2, player1_set3, player1_set4, player1_set5, player2_sets, player2_set1, player2_set2, player2_set3, player2_set4, player2_set5): #create a dictionary to store the scraped info scraped_info = { 'match_id' : item[ 0 ], 'date' : item[ 1 ], 'time' : item[ 2 ], #'event_name' : item[0], 'player1' : item[ 3 ], 'player2' : item[ 4 ], 'player1_sets' : item[ 5 ], 'player1_set1' : item[ 6 ], 'player1_set2' : item[ 7 ], 'player1_set3' : item[ 8 ], 'player1_set4' : item[ 9 ], 'player1_set5' : item[ 10 ], 'player2_sets' : item[ 11 ], 'player2_set1' : item[ 12 ], 'player2_set2' : item[ 13 ], 'player2_set3' : item[ 14 ], 'player2_set4' : item[ 15 ], 'player2_set5' : item[ 16 ], } #yield or give the scraped info to scrapy yield scraped_info |