Scrape script when deployed on heroku not giving expected outcomes

alok001 · Oct-19-2019, 10:17 AM

Hello All

I built a Python Selenium script to scrape flights from a website. The code works fine when executed on my local machine. I deployed it to Heroku, making sure ChromeDriver is installed and its path is set properly. The deployment succeeds, but when I execute the script there I do not get the same output as on my local machine. I did my best to investigate but couldn't find the cause. Below are the code and the logs.

from urllib.request import  Request,urlopen
from bs4 import  BeautifulSoup as soup
import smtplib
from sendgrid import SendGridAPIClient
from sendgrid.helpers.mail import *
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
import pandas as pd
import numpy as np
from datetime import date
import os

def scrollDown(driver, value):
    """Scroll the current page vertically by ``value`` via JavaScript.

    ``driver`` only needs an ``execute_script`` method; ``value`` is
    rendered with ``str()`` into a ``window.scrollBy(0, value)`` call.
    """
    script = "window.scrollBy(0,%s)" % (value,)
    driver.execute_script(script)

# Scroll down the page
def scrollDownAllTheWay(driver):
    """Keep scrolling until the page source stops changing, then return True.

    Each pass scrolls down three times by 600 (pausing 5 seconds after each
    scroll) and compares ``driver.page_source`` with the snapshot taken
    before the pass. The loop ends on the first pass that leaves the source
    unchanged.
    """
    snapshot = driver.page_source
    page_changed = True
    while page_changed:
        print("Scrolling loop")
        for _ in range(3):
            scrollDown(driver, 600)
            time.sleep(5)
        latest = driver.page_source
        page_changed = latest != snapshot
        snapshot = latest
    return True


def scrollFiveTime(driver):
    """Scroll down by 900 five times, sleeping 5 seconds after each scroll."""
    print('scroller is to run')
    remaining = 5
    while remaining > 0:
        scrollDown(driver, 900)
        time.sleep(5)
        remaining -= 1

def sendEmail(details,subjectData):
    """Send an HTML e-mail through SendGrid (best effort).

    Args:
        details: HTML string used as the ``text/html`` content of the mail.
        subjectData: Subject line of the mail.

    The API key is read from the ``SENDGRID_API_KEY`` environment variable
    rather than being embedded in the source, so the secret never ends up
    in version control or in pasted code. If the variable is not set, a
    message is printed and nothing is sent.

    Any exception raised while sending is printed, not propagated, so a
    mail failure does not abort the scrape.
    """
    print('inside sendgrid')

    # Never hard-code credentials: take the key from the environment
    # (e.g. a Heroku config var).
    api_key = os.environ.get('SENDGRID_API_KEY')
    if not api_key:
        print('SENDGRID_API_KEY is not set; e-mail not sent')
        return

    message = {
        'personalizations': [
            {
                'to': [
                    {
                        'email': '[email protected]'
                    },
                    {
                        'email': '[email protected]'
                    }
                ],
                'subject': subjectData
            }
        ],
        'from': {
            'email': '[email protected]'
        },
        'content': [
            {
                'type': 'text/html',
                'value': details
            }
        ]
    }
    try:
        sg = SendGridAPIClient(api_key=api_key)
        response = sg.send(message)
        print(response.status_code)
        print(response.body)
        print(response.headers)
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and carry on.
        print(str(e))

def scapeFlights():
    """Scrape the goibibo RPR -> BOM results for 13 Jan 2020 and print them.

    Opens the results page in headless Chrome, scrolls until the page stops
    changing, collects the text of the airline / price / origin /
    destination / duration / departure / arrival elements, and prints the
    resulting table as a pandas DataFrame (index starting at 1).

    The e-mail step is currently disabled; ``subject`` and ``body`` are
    still built so it can be re-enabled by uncommenting the last line.
    """
    goibibo = 'https://www.goibibo.com/flights/air-RPR-BOM-20200113--1-0-0-E-D/'
    today = date.today()

    # Presumably both variables are config vars set for the Heroku app
    # (verify); fall back to the previously hard-coded driver path.
    chromedriver_path = os.environ.get(
        'CHROMEDRIVER_PATH', '/app/.chromedriver/bin/chromedriver')
    chrome_bin = os.environ.get("GOOGLE_CHROME_BIN")

    options = webdriver.ChromeOptions()
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    # Headless Chrome starts with a small default window; many sites then
    # serve a different layout or do not lazy-load results on scroll.
    # NOTE(review): likely cause of the empty result on Heroku - confirm.
    options.add_argument('--window-size=1920,1080')
    if chrome_bin:
        # Only override the binary when the environment provides one, so the
        # script also runs where Chrome is found on the default path.
        options.binary_location = chrome_bin

    driver = webdriver.Chrome(chrome_options=options, executable_path=chromedriver_path)

    def _texts(xpath):
        # Text of every element matching ``xpath`` on the current page.
        return [element.text for element in driver.find_elements_by_xpath(xpath)]

    try:
        driver.get(goibibo)
        print('.......')
        scrollDownAllTheWay(driver)
        time.sleep(5)

        airline_names_list = _texts("//span[@class='greyLt ico13 padR10 padL5']")
        dept_from_list = [
            text.split(',')[0]
            for text in _texts("//span[@class='ico11 greyLt padL5']")
        ]
        dept_time_list = _texts("//span[@class='fb ico18 padT5 quicks']")
        duration_list = _texts("//div[@class='ico15 fb txtCenter quicks padT5']")
        arr_to_list = [
            text.split(',')[0]
            for text in _texts("//span[@class='greyLt ico11 padL5']")
        ]
        # The inner <span> match also picks up day-offset markers such as
        # "+1D", which shifted later arrival times onto the wrong rows.
        # Keep only the time values so the column stays aligned.
        arr_time_list = [
            text
            for text in _texts("//span[@class='fb dF alignItemsCenter ico18 padT5 quicks']/span")
            if not text.startswith('+')
        ]
        price_list = _texts("//span[@class='ico20 fb quicks']")
    finally:
        # Always shut the browser down, even when scraping fails; otherwise
        # the Chrome process is left running.
        driver.quit()

    print(len(dept_from_list),'-',len(dept_time_list),'-',len(duration_list),'-',len(arr_to_list),'-',len(price_list))

    columns = ['Airline', 'Price', 'From', 'To', 'Duration',
               'Departure Time', 'Arrival Time', 'Current Date', 'Travel Date']
    scraped = (airline_names_list, price_list, dept_from_list, arr_to_list,
               duration_list, dept_time_list, arr_time_list)
    if len({len(values) for values in scraped}) > 1:
        # zip() below stops at the shortest list instead of raising
        # IndexError; make the truncation visible.
        print('warning: scraped columns differ in length:',
              [len(values) for values in scraped])

    current_date = today.strftime("%d/%m/%Y")
    rows = [flight + (current_date, '13/01/2020') for flight in zip(*scraped)]
    # Index starts at 1; with no rows this stays a completely empty frame.
    dataSet = pd.DataFrame(rows,
                           columns=columns if rows else None,
                           index=range(1, len(rows) + 1))

    print('...............best flight...............')
    html = dataSet.to_html()
    print(dataSet)
    subject = "Latest flight Heroku results from Raipur to Mumbai"
    body="<html><head>Super Flights</head><body>"+html+"</body></html>"
    print("called made ")
    #sendEmail(body, subject)



def startScapper():
    """Entry point: run the flight scrape once by calling scapeFlights()."""
    scapeFlights()

# Module-level call: the scrape starts as soon as this file is executed.
startScapper()

local machine

Output:C:\Python36\python.exe D:/Scapper/webscapper/FlightScapper.py
.......
Scrolling loop
Scrolling loop
Scrolling loop
Scrolling loop
38 - 38 - 38 - 38 - 38
...............best flight...............
      Airline   Price    From  ... Arrival Time Current Date Travel Date
1      IndiGo   3,500  Raipur  ...        18:45   19/10/2019  13/01/2020
2      IndiGo   3,599  Raipur  ...        20:45   19/10/2019  13/01/2020
3   Air India   3,650  Raipur  ...        16:25   19/10/2019  13/01/2020
4      IndiGo   3,760  Raipur  ...        16:20   19/10/2019  13/01/2020
5      IndiGo   3,762  Raipur  ...        16:30   19/10/2019  13/01/2020
6      IndiGo   3,762  Raipur  ...        21:25   19/10/2019  13/01/2020
7      IndiGo   3,762  Raipur  ...        00:05   19/10/2019  13/01/2020
8      IndiGo   3,969  Raipur  ...          +1D   19/10/2019  13/01/2020
9      IndiGo   4,231  Raipur  ...        13:45   19/10/2019  13/01/2020
10  Air India   5,824  Raipur  ...        13:45   19/10/2019  13/01/2020
11  Air India   6,296  Raipur  ...        09:20   19/10/2019  13/01/2020
12  Air India   6,559  Raipur  ...          +1D   19/10/2019  13/01/2020
13  Air India   6,559  Raipur  ...        23:05   19/10/2019  13/01/2020
14  Air India   6,559  Raipur  ...        01:10   19/10/2019  13/01/2020
15  Air India   6,559  Raipur  ...          +1D   19/10/2019  13/01/2020
16  Air India   6,559  Raipur  ...        10:15   19/10/2019  13/01/2020
17  Air India   6,559  Raipur  ...          +1D   19/10/2019  13/01/2020
18  Air India   6,559  Raipur  ...        15:10   19/10/2019  13/01/2020
19  Air India   6,559  Raipur  ...          +1D   19/10/2019  13/01/2020
20  Air India   6,769  Raipur  ...        19:10   19/10/2019  13/01/2020
21  Air India   6,769  Raipur  ...          +1D   19/10/2019  13/01/2020
22  Air India   6,769  Raipur  ...        20:15   19/10/2019  13/01/2020
23  Air India   6,769  Raipur  ...          +1D   19/10/2019  13/01/2020
24  Air India   6,769  Raipur  ...        21:15   19/10/2019  13/01/2020
25  Air India   6,769  Raipur  ...          +1D   19/10/2019  13/01/2020
26  Air India   6,769  Raipur  ...        22:10   19/10/2019  13/01/2020
27  Air India   6,769  Raipur  ...          +1D   19/10/2019  13/01/2020
28  Air India   6,769  Raipur  ...        21:35   19/10/2019  13/01/2020
29  Air India   7,294  Raipur  ...          +1D   19/10/2019  13/01/2020
30  Air India   7,294  Raipur  ...        15:10   19/10/2019  13/01/2020
31  Air India   7,399  Raipur  ...        19:10   19/10/2019  13/01/2020
32  Air India   7,733  Raipur  ...        20:15   19/10/2019  13/01/2020
33  Air India   8,007  Raipur  ...        21:15   19/10/2019  13/01/2020
34  Air India   8,136  Raipur  ...        23:10   19/10/2019  13/01/2020
35  Air India   8,344  Raipur  ...        01:10   19/10/2019  13/01/2020
36  Air India   8,427  Raipur  ...          +1D   19/10/2019  13/01/2020
37  Air India  11,210  Raipur  ...        21:35   19/10/2019  13/01/2020
38  Air India  11,210  Raipur  ...        11:10   19/10/2019  13/01/2020

[38 rows x 9 columns]
called made 

Process finished with exit code 0

when executed from heroku

Output:D:\Scapper\webscapper>heroku run python FlightScapper.py
Running python FlightScapper.py on ⬢ webscapper... up, run.5768 (Free)
.......
Scrolling loop
Scrolling loop
0 - 0 - 0 - 0 - 0
...............best flight...............
Empty DataFrame
Columns: []
Index: []
called made

D:\Scapper\webscapper>

alok001 · Oct-19-2019, 12:39 PM

Well, I did some debugging and found that when the script is executed on Heroku, the page is not getting scrolled. I added a new method to scroll (below), but the result on Heroku is the same. Can somebody please suggest a fix?

def scrollDownUp(driver,elm):
    """Scroll down and up repeatedly until the document height stops growing.

    Each pass scrolls to 600 below-the-bottom offset via JavaScript, waits,
    sends HOME and then END to ``elm`` (presumably a focusable page element
    such as <body> - verify against the caller), and compares
    ``document.body.scrollHeight`` with the value from the previous pass.
    """
    height_script = "return document.body.scrollHeight"
    known_height = driver.execute_script(height_script)

    height_settled = False
    while not height_settled:
        # Jump to just above the bottom of the page, then give it time to load.
        driver.execute_script("window.scrollTo(0, (document.body.scrollHeight-600));")
        time.sleep(7)

        # Bounce to the top and back to the bottom to trigger more loading.
        for key, pause in ((Keys.HOME, 4), (Keys.END, 3)):
            elm.send_keys(key)
            time.sleep(pause)

        print('last height ', known_height)
        measured_height = driver.execute_script(height_script)
        print('new height',measured_height)

        # Stop once a full pass no longer changes the document height.
        height_settled = measured_height == known_height
        known_height = measured_height

local machine

Output:last height  3531
new height 6601
last height  6601
new height 7825
last height  7825
new height 7825
38 - 38 - 38 - 38 - 38

from heroku console

Output:D:\Scapper\webscapper>heroku run python FlightScapper.py
Running python FlightScapper.py on ⬢ webscapper... up, run.3818 (Free)
.......
last height  2606
new height 2606
0 - 0 - 0 - 0 - 0
...............best flight...............
Empty DataFrame
Columns: []
Index: []
called made

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Leapcell: The Python-Friendly Alternative to Heroku + Airtable Hybrid	IssacChan	1	1,624	Feb-01-2024, 06:00 AM Last Post: Athi
	Deployed Spider on Heroku: How do I email downloaded files?	JaneTan	2	2,446	Mar-24-2022, 08:31 AM Last Post: JaneTan
	Heroku Error H10	jamesaarr	1	2,865	Oct-21-2021, 03:43 PM Last Post: jamesaarr
	Importing Postgres Heroku from AWS S3	Drone4four	0	2,279	May-27-2021, 01:09 PM Last Post: Drone4four
	Django project deployed to Heroku: Postgres security	Drone4four	0	2,477	Mar-26-2021, 10:17 AM Last Post: Drone4four
	scrape data 1 go to next page scrape data 2 and so on	alkaline3	6	8,719	Mar-13-2020, 07:59 PM Last Post: alkaline3
	flask app to save images locally when deployed on heroku not working	Prince_Bhatia	1	6,151	Feb-20-2019, 11:59 PM Last Post: snippsat
	Deploy flask app on Heroku	Prince_Bhatia	0	4,621	Feb-20-2019, 09:05 AM Last Post: Prince_Bhatia
	Hosting statistic tool on heroku with flask secure?	Zoja	1	3,633	Oct-29-2018, 10:07 AM Last Post: Zoja
	Scrape java script web site	PythonHunger	6	5,501	Oct-25-2018, 05:59 AM Last Post: PythonHunger

Scrape script when deployed on heroku not giving expected outcomes

User Panel Messages

Announcements