Scrape script when deployed on heroku not giving expected outcomes - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: Scrape script when deployed on heroku not giving expected outcomes (/thread-21891.html) |
Scrape script when deployed on heroku not giving expected outcomes - alok001 - Oct-19-2019 Hello all, I built a Python Selenium script to scrape flights from a website. The code works fine when executed from my local machine. I deployed the code to Heroku, ensuring that the Chrome driver is installed and its path is set properly. The deployment succeeds, but when I execute the code on Heroku I do not get the same output as I get on my local machine. I did my best to investigate but couldn't find the reason. Below are the logs and the code. from urllib.request import Request,urlopen from bs4 import BeautifulSoup as soup import smtplib from sendgrid import SendGridAPIClient from sendgrid.helpers.mail import * from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By import time import pandas as pd import numpy as np from datetime import date import os def scrollDown(driver, value): #print('loading.......value') driver.execute_script("window.scrollBy(0,"+str(value)+")") # Scroll down the page def scrollDownAllTheWay(driver): old_page = driver.page_source while True: print("Scrolling loop") for i in range(3): scrollDown(driver, 600) time.sleep(5) new_page = driver.page_source if new_page != old_page: old_page = new_page else: break return True def scrollFiveTime(driver): print('scroller is to run') for i in range(5): scrollDown(driver, 900) time.sleep(5) def sendEmail(details,subjectData): print('inside sendgrid') message = { 'personalizations': [ { 'to': [ { 'email': '[email protected]' }, { 'email': '[email protected]' } ], 'subject': subjectData } ], 'from': { 'email': '[email protected]' }, 'content': [ { 'type': 'text/html', 'value': details } ] } try: sg = SendGridAPIClient(api_key='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') response = sg.send(message) 
print(response.status_code) print(response.body) print(response.headers) except Exception as e: print(str(e)) def scapeFlights(): goibibo = 'https://www.goibibo.com/flights/air-RPR-BOM-20200113--1-0-0-E-D/' today = date.today() #CHROMEDRIVER_PATH = / app /.chromedriver / bin / chromedriver #GOOGLE_CHROME_BIN = / app /.apt / usr / bin / google - chrome CHROMEDRIVER_PATH = '/app/.chromedriver/bin/chromedriver' chrome_bin = os.environ.get("GOOGLE_CHROME_BIN") options = webdriver.ChromeOptions() options.add_argument('--disable-gpu') options.add_argument('--no-sandbox') options.add_argument('--headless') options.binary_location = chrome_bin #options.add_argument("start-maximized") #options.add_argument("disable-infobars") #options.add_argument("--disable-extensions") driver = webdriver.Chrome(chrome_options=options,executable_path=CHROMEDRIVER_PATH) #driver = webdriver.Chrome(chrome_options=options,executable_path = r"D:\NewDriver\chromedriver_win32\chromedriver.exe") #driver.implicitly_wait(30) driver.get(goibibo) print('.......') scrollDownAllTheWay(driver) time.sleep(5) # //*[@id="content"]/div/div[2]/div/div[2]/div/div[2]/div[2]/div[2]/div[2]/div/div/div[1] airlines = [] # # try: # elem = WebDriverWait(driver, 5).until(EC.presence_of_element_located((By.CLASS_NAME, 'clr'))) # print ('Page is ready!') # except: # print('loading timeout!') #airlines = driver.find_elements_by_class_name("clr") #print(len(airlines)) airline_names=driver.find_elements_by_xpath("//span[@class='greyLt ico13 padR10 padL5']") #print(airline_names) airline_names_list=[value.text for value in airline_names] dept_from=driver.find_elements_by_xpath("//span[@class='ico11 greyLt padL5']") dept_from_list=[value.text.split(',')[0] for value in dept_from] dept_time = driver.find_elements_by_xpath("//span[@class='fb ico18 padT5 quicks']") dept_time_list = [value.text for value in dept_time] #ico15 fb txtCenter quicks padT5 duration = driver.find_elements_by_xpath("//div[@class='ico15 fb txtCenter 
quicks padT5']") duration_list = [value.text for value in duration] #greyLt ico11 padL5 arr_to = driver.find_elements_by_xpath("//span[@class='greyLt ico11 padL5']") arr_to_list = [value.text.split(',')[0] for value in arr_to] #fb dF alignItemsCenter ico18 padT5 quicks arr_time = driver.find_elements_by_xpath("//span[@class='fb dF alignItemsCenter ico18 padT5 quicks']/span") arr_time_list = [value.text for value in arr_time] #ico20 fb quicks price=driver.find_elements_by_xpath("//span[@class='ico20 fb quicks']") price_list=[value.text for value in price] print(len(dept_from_list),'-',len(dept_time_list),'-',len(duration_list),'-',len(arr_to_list),'-',len(price_list)) #,'-',arr_to_list[i],'-',arr_to_list[i] dataSet=pd.DataFrame() for i in range(len(dept_from_list)): #print(airline_names_list[i],'-',dept_from_list[i],'-',dept_time_list[i],'-',duration_list[i],'-',arr_to_list[i],'-',arr_time_list[i],'-',price_list[i]) dataSet.loc[i+1, 'Airline'] = airline_names_list[i] dataSet.loc[i+1, 'Price'] = price_list[i] dataSet.loc[i+1, 'From'] = dept_from_list[i] dataSet.loc[i+1, 'To'] = arr_to_list[i] dataSet.loc[i+1, 'Duration'] = duration_list[i] dataSet.loc[i+1, 'Departure Time'] = dept_time_list[i] dataSet.loc[i+1, 'Arrival Time'] = arr_time_list[i] dataSet.loc[i + 1, 'Current Date'] = today.strftime("%d/%m/%Y") dataSet.loc[i + 1, 'Travel Date']='13/01/2020' print('...............best flight...............') html = dataSet.to_html() print(dataSet) #print(html) subject = "Latest flight Heroku results from Raipur to Mumbai" body="<html><head>Super Flights</head><body>"+html+"</body></html>" print("called made ") #sendEmail(body, subject) def startScapper(): scapeFlights() startScapper() local machine when executed from heroku
RE: Scrape script when deployed on heroku not giving expected outcomes - alok001 - Oct-19-2019 Well, I did some debugging and found that when the script is executed on Heroku, the page is not getting scrolled. I added a new method to scroll, shown below. Can somebody please suggest a fix? def scrollDownUp(driver,elm): last_height = driver.execute_script("return document.body.scrollHeight") while True: # Scroll down to almost the bottom of the page driver.execute_script("window.scrollTo(0, (document.body.scrollHeight-600));") # Time Taken to Load the page time.sleep(7) # Scrolling Up & Down to load more Data elm.send_keys(Keys.HOME) time.sleep(4) elm.send_keys(Keys.END) time.sleep(3) print('last height ', last_height) # Calculate the new scrolling height and then compare it to old height new_height = driver.execute_script("return document.body.scrollHeight") print('new height',new_height) if new_height == last_height: break last_height = new_height local machine from heroku console
|