Python Forum
Scrape script when deployed on heroku not giving expected outcomes
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Scrape script when deployed on heroku not giving expected outcomes
#1
Hello All

I built a Python Selenium script to scrape flights from a website. The code works fine when executed on my local machine. I deployed it to Heroku, making sure ChromeDriver is installed and its path is set properly. The deployment succeeds, but when I run the script on Heroku I do not get the same output as on my local machine. I did my best to investigate but couldn't find the cause. Below are the code and the logs from both environments.

from urllib.request import  Request,urlopen
from bs4 import  BeautifulSoup as soup
import smtplib
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import *
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import pandas as pd
import numpy as np
from datetime import date
import os

def scrollDown(driver, value):
    """Scroll the current page vertically by *value* pixels.

    The scroll is performed by running a ``window.scrollBy`` JavaScript
    snippet through ``driver.execute_script``.
    """
    script = f"window.scrollBy(0,{value!s})"
    driver.execute_script(script)

# Keep scrolling until the page source stops changing
def scrollDownAllTheWay(driver):
    """Scroll in rounds until a round leaves ``driver.page_source`` unchanged.

    Each round scrolls down 600 pixels three times, sleeping 5 seconds
    after every scroll, and then compares the page source with the one
    captured before the round. Always returns ``True``.
    """
    previous_source = driver.page_source
    page_changed = True
    while page_changed:
        print("Scrolling loop")
        for _ in range(3):
            scrollDown(driver, 600)
            time.sleep(5)
        current_source = driver.page_source
        page_changed = current_source != previous_source
        previous_source = current_source
    return True


def scrollFiveTime(driver):
    """Scroll down 900 pixels five times, pausing 5 seconds after each scroll."""
    print('scroller is to run')
    remaining = 5
    while remaining > 0:
        scrollDown(driver, 900)
        time.sleep(5)
        remaining -= 1

def sendEmail(details,subjectData):
    """Send an HTML email through SendGrid to the hard-coded recipients.

    Args:
        details: HTML string used as the ``text/html`` body of the message.
        subjectData: Subject line of the message.

    Returns:
        None. The response status code, body and headers are printed on
        success. Any exception raised while sending is printed rather than
        propagated, so a failed email never aborts the caller.

    The API key is read from the ``SENDGRID_API_KEY`` environment variable
    instead of being embedded in the source, so the secret is not exposed
    whenever the code is shared or committed. If the variable is missing,
    a message is printed and nothing is sent.
    """
    print('inside sendgrid')

    api_key = os.environ.get('SENDGRID_API_KEY')
    if not api_key:
        print('SENDGRID_API_KEY is not set; email not sent')
        return

    message = {
        'personalizations': [
            {
                'to': [
                    {
                        'email': '[email protected]'
                    },
                    {
                        'email': '[email protected]'
                    }
                ],
                'subject': subjectData
            }
        ],
        'from': {
            'email': '[email protected]'
        },
        'content': [
            {
                'type': 'text/html',
                'value': details
            }
        ]
    }
    try:
        sg = SendGridAPIClient(api_key=api_key)
        response = sg.send(message)
        print(response.status_code)
        print(response.body)
        print(response.headers)
    except Exception as e:
        # Deliberately best-effort: report the failure and carry on.
        print(str(e))

def _element_texts(driver, xpath):
    """Return the ``.text`` of every element matching *xpath*, in page order."""
    return [element.text for element in driver.find_elements_by_xpath(xpath)]


def scapeFlights():
    """Scrape the goibibo RPR-BOM listing for 13/01/2020 and print it as a table.

    Starts headless Chrome, loads the search page, scrolls until the page
    source stops changing, reads each flight attribute by XPath, and
    assembles the results into a pandas DataFrame indexed from 1. The
    DataFrame is printed; an HTML email body is also built, but the call
    that would send it is currently commented out.

    Returns:
        None.
    """
    goibibo = 'https://www.goibibo.com/flights/air-RPR-BOM-20200113--1-0-0-E-D/'
    today = date.today()

    CHROMEDRIVER_PATH = '/app/.chromedriver/bin/chromedriver'
    chrome_bin = os.environ.get("GOOGLE_CHROME_BIN")

    options = webdriver.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    # NOTE(review): no window size was set before, so headless Chrome used
    # its small default viewport. A desktop-sized window is requested here
    # so the page lays out as in a normal browser; confirm this resolves the
    # case where every scraped list comes back empty.
    options.add_argument('--window-size=1920,1080')
    # Only override the browser binary when the environment provides one,
    # so an unset GOOGLE_CHROME_BIN does not assign None.
    if chrome_bin:
        options.binary_location = chrome_bin

    driver = webdriver.Chrome(chrome_options=options,executable_path=CHROMEDRIVER_PATH)
    try:
        driver.get(goibibo)
        print('.......')
        scrollDownAllTheWay(driver)
        time.sleep(5)

        # All element text must be read before the driver is shut down.
        airline_names_list = _element_texts(driver, "//span[@class='greyLt ico13 padR10 padL5']")
        dept_from_list = [
            text.split(',')[0]
            for text in _element_texts(driver, "//span[@class='ico11 greyLt padL5']")
        ]
        dept_time_list = _element_texts(driver, "//span[@class='fb ico18 padT5 quicks']")
        duration_list = _element_texts(driver, "//div[@class='ico15 fb txtCenter quicks padT5']")
        arr_to_list = [
            text.split(',')[0]
            for text in _element_texts(driver, "//span[@class='greyLt ico11 padL5']")
        ]
        arr_time_list = _element_texts(
            driver, "//span[@class='fb dF alignItemsCenter ico18 padT5 quicks']/span")
        price_list = _element_texts(driver, "//span[@class='ico20 fb quicks']")
    finally:
        # Always release the browser process, even if scraping fails.
        driver.quit()

    print(len(dept_from_list),'-',len(dept_time_list),'-',len(duration_list),'-',len(arr_to_list),'-',len(price_list))

    scraped = {
        'Airline': airline_names_list,
        'Price': price_list,
        'From': dept_from_list,
        'To': arr_to_list,
        'Duration': duration_list,
        'Departure Time': dept_time_list,
        'Arrival Time': arr_time_list,
    }
    lengths = {len(values) for values in scraped.values()}
    row_count = min(lengths)
    if len(lengths) > 1:
        # Indexing every list by the departure count would raise IndexError
        # when a selector matched fewer elements; keep only complete rows.
        print('warning: scraped columns differ in length; keeping first', row_count, 'rows')

    if row_count:
        table = {name: values[:row_count] for name, values in scraped.items()}
        table['Current Date'] = [today.strftime("%d/%m/%Y")] * row_count
        table['Travel Date'] = ['13/01/2020'] * row_count
        dataSet = pd.DataFrame(
            table,
            index=range(1, row_count + 1),
            columns=['Airline', 'Price', 'From', 'To', 'Duration',
                     'Departure Time', 'Arrival Time', 'Current Date', 'Travel Date'],
        )
    else:
        # Nothing scraped: keep the completely empty frame (no columns).
        dataSet = pd.DataFrame()

    print('...............best flight...............')
    html = dataSet.to_html()
    print(dataSet)
    subject = "Latest flight Heroku results from Raipur to Mumbai"
    body="<html><head>Super Flights</head><body>"+html+"</body></html>"
    print("called made ")
    # Email dispatch is currently disabled.
    #sendEmail(body, subject)



def startScapper():
    """Run one flight-scraping pass by delegating to ``scapeFlights``."""
    scapeFlights()


# Launch the scraper only when this file is executed as a script, so that
# importing the module does not start a browser as a side effect.
if __name__ == "__main__":
    startScapper()
local machine
Output:
C:\Python36\python.exe D:/Scapper/webscapper/FlightScapper.py ....... Scrolling loop Scrolling loop Scrolling loop Scrolling loop 38 - 38 - 38 - 38 - 38 ...............best flight............... Airline Price From ... Arrival Time Current Date Travel Date 1 IndiGo 3,500 Raipur ... 18:45 19/10/2019 13/01/2020 2 IndiGo 3,599 Raipur ... 20:45 19/10/2019 13/01/2020 3 Air India 3,650 Raipur ... 16:25 19/10/2019 13/01/2020 4 IndiGo 3,760 Raipur ... 16:20 19/10/2019 13/01/2020 5 IndiGo 3,762 Raipur ... 16:30 19/10/2019 13/01/2020 6 IndiGo 3,762 Raipur ... 21:25 19/10/2019 13/01/2020 7 IndiGo 3,762 Raipur ... 00:05 19/10/2019 13/01/2020 8 IndiGo 3,969 Raipur ... +1D 19/10/2019 13/01/2020 9 IndiGo 4,231 Raipur ... 13:45 19/10/2019 13/01/2020 10 Air India 5,824 Raipur ... 13:45 19/10/2019 13/01/2020 11 Air India 6,296 Raipur ... 09:20 19/10/2019 13/01/2020 12 Air India 6,559 Raipur ... +1D 19/10/2019 13/01/2020 13 Air India 6,559 Raipur ... 23:05 19/10/2019 13/01/2020 14 Air India 6,559 Raipur ... 01:10 19/10/2019 13/01/2020 15 Air India 6,559 Raipur ... +1D 19/10/2019 13/01/2020 16 Air India 6,559 Raipur ... 10:15 19/10/2019 13/01/2020 17 Air India 6,559 Raipur ... +1D 19/10/2019 13/01/2020 18 Air India 6,559 Raipur ... 15:10 19/10/2019 13/01/2020 19 Air India 6,559 Raipur ... +1D 19/10/2019 13/01/2020 20 Air India 6,769 Raipur ... 19:10 19/10/2019 13/01/2020 21 Air India 6,769 Raipur ... +1D 19/10/2019 13/01/2020 22 Air India 6,769 Raipur ... 20:15 19/10/2019 13/01/2020 23 Air India 6,769 Raipur ... +1D 19/10/2019 13/01/2020 24 Air India 6,769 Raipur ... 21:15 19/10/2019 13/01/2020 25 Air India 6,769 Raipur ... +1D 19/10/2019 13/01/2020 26 Air India 6,769 Raipur ... 22:10 19/10/2019 13/01/2020 27 Air India 6,769 Raipur ... +1D 19/10/2019 13/01/2020 28 Air India 6,769 Raipur ... 21:35 19/10/2019 13/01/2020 29 Air India 7,294 Raipur ... +1D 19/10/2019 13/01/2020 30 Air India 7,294 Raipur ... 15:10 19/10/2019 13/01/2020 31 Air India 7,399 Raipur ... 
19:10 19/10/2019 13/01/2020 32 Air India 7,733 Raipur ... 20:15 19/10/2019 13/01/2020 33 Air India 8,007 Raipur ... 21:15 19/10/2019 13/01/2020 34 Air India 8,136 Raipur ... 23:10 19/10/2019 13/01/2020 35 Air India 8,344 Raipur ... 01:10 19/10/2019 13/01/2020 36 Air India 8,427 Raipur ... +1D 19/10/2019 13/01/2020 37 Air India 11,210 Raipur ... 21:35 19/10/2019 13/01/2020 38 Air India 11,210 Raipur ... 11:10 19/10/2019 13/01/2020 [38 rows x 9 columns] called made Process finished with exit code 0
when executed from heroku

Output:
D:\Scapper\webscapper>heroku run python FlightScapper.py Running python FlightScapper.py on ⬢ webscapper... up, run.5768 (Free) ....... Scrolling loop Scrolling loop 0 - 0 - 0 - 0 - 0 ...............best flight............... Empty DataFrame Columns: [] Index: [] called made D:\Scapper\webscapper>
Reply
#2
Well, I did some debugging and found that when executing on Heroku, the page is not getting scrolled. I added a new method to scroll, shown below with its output in both environments. Can somebody please suggest a fix?

def scrollDownUp(driver,elm):
    """Scroll repeatedly until ``document.body.scrollHeight`` stops changing.

    Each round scrolls to 600 pixels above the bottom of the page, waits,
    then sends HOME and END key presses to *elm* (with a pause after each)
    before re-reading the scroll height. The previous and new heights are
    printed every round; the loop ends when they are equal.
    """
    height_script = "return document.body.scrollHeight"
    previous_height = driver.execute_script(height_script)

    height_settled = False
    while not height_settled:
        # Jump to just above the bottom of the page and give it time to load.
        driver.execute_script("window.scrollTo(0, (document.body.scrollHeight-600));")
        time.sleep(7)

        # Go to the top and back to the bottom, pausing after each key press.
        for key, pause in ((Keys.HOME, 4), (Keys.END, 3)):
            elm.send_keys(key)
            time.sleep(pause)

        print('last height ', previous_height)
        current_height = driver.execute_script(height_script)
        print('new height',current_height)

        height_settled = current_height == previous_height
        previous_height = current_height
local machine
Output:
last height 3531 new height 6601 last height 6601 new height 7825 last height 7825 new height 7825 38 - 38 - 38 - 38 - 38
from heroku console
Output:
D:\Scapper\webscapper>heroku run python FlightScapper.py Running python FlightScapper.py on ⬢ webscapper... up, run.3818 (Free) ....... last height 2606 new height 2606 0 - 0 - 0 - 0 - 0 ...............best flight............... Empty DataFrame Columns: [] Index: [] called made
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
Star Leapcell: The Python-Friendly Alternative to Heroku + Airtable Hybrid IssacChan 1 392 Feb-01-2024, 06:00 AM
Last Post: Athi
  Deployed Spider on Heroku: How do I email downloaded files? JaneTan 2 1,518 Mar-24-2022, 08:31 AM
Last Post: JaneTan
  Heroku Error H10 jamesaarr 1 1,976 Oct-21-2021, 03:43 PM
Last Post: jamesaarr
  Importing Postgres Heroku from AWS S3 Drone4four 0 1,774 May-27-2021, 01:09 PM
Last Post: Drone4four
  Django project deployed to Heroku: Postgres security Drone4four 0 1,908 Mar-26-2021, 10:17 AM
Last Post: Drone4four
  scrape data 1 go to next page scrape data 2 and so on alkaline3 6 5,087 Mar-13-2020, 07:59 PM
Last Post: alkaline3
  flask app to save images locally when deployed on heroku not working Prince_Bhatia 1 5,229 Feb-20-2019, 11:59 PM
Last Post: snippsat
  Deploy flask app on Heroku Prince_Bhatia 0 4,009 Feb-20-2019, 09:05 AM
Last Post: Prince_Bhatia
  Hosting statistic tool on heroku with flask secure? Zoja 1 2,836 Oct-29-2018, 10:07 AM
Last Post: Zoja
  Scrape java script web site PythonHunger 6 4,039 Oct-25-2018, 05:59 AM
Last Post: PythonHunger

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020