Python Scrapy - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: Python Scrapy (/thread-28801.html) |
# Python Scrapy - tr8585 - Aug-04-2020
#
# For some reason the output for "date" is returned one character at a time
# instead of the full date for each row. I have been committed to learning
# Python for some time now but I am far from being an expert. Any help is
# appreciated! Here is my code:

# --------------------------------------------------------------------------
# Spider module
# --------------------------------------------------------------------------
import scrapy
import datetime
import itertools
import re
from datetime import timedelta


class Tennis_ExplorerSpider(scrapy.Spider):
    """Scrape the tennisexplorer.com match list for a range of days.

    One start URL is generated per day in ``[start_date, end_date)``; each
    response is turned into one dict per match row by ``parse``.
    """

    name = 'tennis_explorer'
    allowed_domains = ['tennisexplorer.com']
    # start_urls = 'https://www.tennisexplorer.com/results/?type=atp-single&year=2020&month=07&day=27'

    # Output keys, in the order the columns are assembled in ``parse``.
    FIELDS = (
        'match_id', 'date', 'time', 'player1', 'player2',
        'player1_sets', 'player1_set1', 'player1_set2', 'player1_set3',
        'player1_set4', 'player1_set5',
        'player2_sets', 'player2_set1', 'player2_set2', 'player2_set3',
        'player2_set4', 'player2_set5',
    )

    def daterange(start_date, end_date):
        """Yield each day from ``start_date`` up to, excluding, ``end_date``.

        Called as a plain function while the class body executes, which is
        why it takes no ``self``.
        """
        for n in range(int((end_date - start_date).days)):
            yield start_date + timedelta(n)

    start_date = datetime.datetime.today()  # - datetime.timedelta(days=1)
    end_date = datetime.datetime.today() + datetime.timedelta(days=1)
    start_urls = []
    start_url = 'https://www.tennisexplorer.com/matches/?type=all&year='
    for single_date in daterange(start_date, end_date):
        # strftime fills in the %Y/%m/%d placeholders of the query string.
        start_urls.append(single_date.strftime(start_url + "%Y&month=%m&day=%d&timezone=-6"))

    def parse(self, response):
        """Yield one dict per match row found in ``response``.

        The page is read column by column and the columns are zipped back
        into rows, so every column passed to ``zip`` must be a list with one
        entry per row. The page date is a single string; passing it to
        ``zip`` directly makes ``zip`` iterate over its characters (the
        "one character at a time" symptom), so it is repeated instead.
        """
        self.logger.debug('callback "parse": got response %r', response)

        def column(xpath):
            # Always a list (possibly empty), never a bare str or None.
            return response.xpath(xpath).getall()

        hrefs = column('//table[@class="result"]//a[contains(@href,"match-detail")]/@href')
        # Strip everything up to and including the last "=" of the link.
        match_id = [re.sub(r'^.+=', '', href) for href in hrefs]

        raw_date = response.xpath('//span[@class="tab"]/text()').get()
        if raw_date is None:
            # Without the date tab the rows cannot be dated; skip the page
            # instead of failing with AttributeError on None.
            self.logger.warning('no date tab found on %s', response.url)
            return
        date = datetime.datetime.strptime(
            raw_date.replace(". ", "-"), "%d-%m-%Y"
        ).strftime('%Y-%m-%d')

        # Fields not scraped yet (kept from the original draft):
        # event_name = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract()
        # event_id = response.selector.xpath('//*[@id="center"]/ul/div[2]/div/div/ul/li[1]/span').extract()
        # player1_id, player1_odds, player2_id, player2_odds: placeholder
        # copies of the player1 name XPath in the original draft.

        player1_score = '//tr[not(contains(@id, "b"))][contains(@id, "r")]/td[%d][@class="score"]/text()'
        player2_score = '//tr[contains(@id, "b")]/td[%d][@class="score"]/text()'

        # NOTE(review): zipping whole-page columns assumes every row has a
        # text node in every column; a row with an empty cell would shift
        # the values below it - confirm against the page markup, or switch
        # to row-relative XPaths.
        columns = [
            match_id,
            itertools.repeat(date),  # same date for every row on the page
            column('//tr/td[1][@class="first time"]/text()'),
            column('//tr[not(contains(@class, "head"))]/td[2][@class="t-name"]/a[contains(@href, "/player/")]/text()'),
            column('//tr[not(contains(@class, "head"))]/td[1][@class="t-name"]/a[contains(@href, "/player/")]/text()'),
            column('//tr/td[3][@class="result"]/text()'),
            *[column(player1_score % td) for td in range(4, 9)],
            column('//tr/td[2][@class="result"]/text()'),
            *[column(player2_score % td) for td in range(3, 8)],
        ]

        # Give the extracted content row wise; zip stops at the shortest
        # finite column, so the endless date column never extends the output.
        for row in zip(*columns):
            yield dict(zip(self.FIELDS, row))


# RE: Python Scrapy - bowlofred - Aug-04-2020
#
# How is this called? I don't really see what part of this is supposed to
# print the date. When I run it, there is no output. Could you trim it to a
# small script that just shows the problem you're having?
#
# RE: Python Scrapy - tr8585 - Aug-04-2020
#
# (Aug-04-2020, 03:34 AM)bowlofred Wrote: How is this called? I don't really
# see what part of this is supposed to print the date. When I run it, there
# is no output. Could you trim it to a small script that just shows the
# problem you're having?
#
# The two lines below are what I am stuck on. It is important to note
# everything works as expected when "date" is removed.
#
#     data2 = response.xpath('//span[@class="tab"]/text()').get().replace(". ", "-")
#     date = datetime.datetime.strptime(data2, "%d-%m-%Y").strftime('%Y-%m-%d')
#
# Here is the pipeline:

# --------------------------------------------------------------------------
# Pipeline module
# --------------------------------------------------------------------------
import psycopg2


class TennisexplorerPipeline(object):
    """Insert every scraped item into the ``tennis_explorer`` table."""

    # Item keys, in the order of the INSERT column list.
    _COLUMNS = (
        'match_id', 'date', 'time', 'player1', 'player2',
        'player1_sets', 'player1_set1', 'player1_set2', 'player1_set3',
        'player1_set4', 'player1_set5',
        'player2_sets', 'player2_set1', 'player2_set2', 'player2_set3',
        'player2_set4', 'player2_set5',
    )
    # Parameterized statement: one %s placeholder per item column, with the
    # timestamp filled in by the database.
    _INSERT_SQL = (
        "insert into tennis_explorer({cols}, update_timestamp) "
        "values({marks},current_timestamp)"
    ).format(cols=", ".join(_COLUMNS), marks=",".join(["%s"] * len(_COLUMNS)))

    def open_spider(self, spider):
        """Open the database connection and cursor used for the whole run."""
        # NOTE(review): credentials are hard-coded in source; move them to
        # the project settings or the environment before sharing this file.
        hostname = '35.225.206.101'
        username = 'tcb'
        password = '7UmgPbQKdZeTgvPS'  # your password
        database = 'tcb'
        self.connection = psycopg2.connect(host=hostname, user=username, password=password, dbname=database)
        self.cur = self.connection.cursor()

    def close_spider(self, spider):
        """Close the cursor and the connection opened in ``open_spider``."""
        self.cur.close()
        self.connection.close()

    def process_item(self, item, spider):
        """Insert ``item`` as one row, commit, and return the item.

        Raises:
            psycopg2.Error: if the insert or commit fails. The transaction
                is rolled back first so that later items are not rejected
                because of an aborted transaction.
        """
        params = tuple(item[col] for col in self._COLUMNS)
        try:
            self.cur.execute(self._INSERT_SQL, params)
            self.connection.commit()
        except psycopg2.Error:
            self.connection.rollback()
            raise
        return item


# Here is Items.py

# --------------------------------------------------------------------------
# Items module
# --------------------------------------------------------------------------
import scrapy


class TennisexplorerItem(scrapy.Item):
    """Item declaring one field per column written by the pipeline."""

    # define the fields for your item here like:
    match_id = scrapy.Field()
    date = scrapy.Field()
    time = scrapy.Field()
    player1 = scrapy.Field()
    player2 = scrapy.Field()
    player1_sets = scrapy.Field()
    player1_set1 = scrapy.Field()
    player1_set2 = scrapy.Field()
    player1_set3 = scrapy.Field()
    player1_set4 = scrapy.Field()
    player1_set5 = scrapy.Field()
    player2_sets = scrapy.Field()
    player2_set1 = scrapy.Field()
    player2_set2 = scrapy.Field()
    player2_set3 = scrapy.Field()
    player2_set4 = scrapy.Field()
    player2_set5 = scrapy.Field()