Webscraping news articles by using selenium

cate16 · Aug-23-2023, 09:21 AM

Hello, I have to do webscraping of some articles from a website (pressreader).

My code is the following:

from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.chrome.service import Service



import pyautogui
import os.path

import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



def starttoend(start, end, year, month, day):
    s_year = start[0:4]
    s_mon = start[4:6]
    s_day = start[6:8]
    e_year = end[0:4]
    e_mon = end[4:6]
    e_day = end[6:8]
    ret = []
    for i in range(year.index(s_year), year.index(e_year) + 1):
        for j in range(month.index(s_mon), month.index(e_mon) + 1):
            if i == year.index(s_year) and j == month.index(s_mon):
                for k in range(day.index(s_day), 31):
                    ret.append(year[i] + month[j] + day[k])
            elif i == year.index(e_year) and j == month.index(e_mon):
                for k in range(0, day.index(e_day) + 1):
                    ret.append(year[i] + month[j] + day[k])
            else:
                for k in range(31):
                    ret.append(year[i] + month[j] + day[k])
    return ret


# name of papers to find
papernames = ["libero"]
start = "20080101"
end = "20230821"
cont_fail: int = 0
dates = []

year = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020", "2021",
        "2022", "2023"]
months = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"]
days = ["01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12", "13", "14", "15", "16", "17", "18",
        "19", "20", "21", "22", "23", "24", "25", "26", "27", "28", "29", "30", "31"]

date_tul = starttoend(start, end, year, months, days)

dates.append(date_tul)

index = list(range(25))

# set up to save print as PDF file
settings = {
    "appState": {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local"
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2
    }
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}

service = Service(executable_path=r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#service_obj = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#service = Service(r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

#driver = webdriver.Chrome(service = service)


# change chrome printing option to minimize work.
chrome_options: Options = webdriver.ChromeOptions()
#chrome_options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
driver = webdriver.Chrome(service = service,        
                          options=chrome_options)

# traverse through all papers
for i in range(len(papernames)):
    # traverse through dates
    for j in dates[i]:
        count = 1
        dobreak = False
        for k in index:
            if (dobreak):
                break

            try:

                driver.get("https://www.pressreader.com/ita/" + papernames[i] + "/" + j + "/page/1/textview")
                actions1 = webdriver.common.action_chains.ActionChains(driver)
                actions2 = webdriver.common.action_chains.ActionChains(driver)

                WebDriverWait(driver, 60).until(
                    EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))

                bottom_button = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottom_0"]/a')

                bottom_button.click()

                time.sleep(2)

                all_bottom = driver.find_element_by_xpath('//*[@id="thumbsToolbarBottomPreview_0"]')
                all_news = all_bottom.find_elements_by_xpath('//a[@page-number="1"]')

                news = all_news[k]
                first = True

                article_id = news.get_attribute("article-id")
                print(article_id)
                actions1.move_to_element(news).perform()
                news.click()

                WebDriverWait(driver, 20).until(
                    EC.presence_of_element_located((By.XPATH, '//article[@aid="' + str(article_id) + '"]')))
                time.sleep(2)
                arti = driver.find_element_by_xpath('//article[@aid="' + str(article_id) + '"]')
                head = arti.find_element_by_tag_name("hgroup")
                time.sleep(1)
                actions2.move_to_element(head).perform()
                time.sleep(1)
                actions2.context_click(head).perform()

                time.sleep(2)
                printbutton = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[7]/a')
                printbutton.click()

                time.sleep(1)

                printtext = driver.find_element_by_xpath('/html/body/div[12]/div/section/div/div/ul/li[1]/a')
                printtext.click()

                time.sleep(4)
                name = ""
                if (count < 10):
                    name = papernames[i] + "_" + j + "_" + "0" + str(count)
                    pyautogui.typewrite(papernames[i] + "_" + j + "_" + "0" + str(count))
                else:
                    name = papernames[i] + "_" + j + "_" + str(count)
                    pyautogui.typewrite(papernames[i] + "_" + j + "_" + str(count))

                time.sleep(1)
                pyautogui.press('enter')
                print("saved" + name)

                time.sleep(10)

                count += 1
                cont_fail = 0
                if k == len(all_news) - 1:
                    driver.quit()
                    dobreak = True
                    break
                driver.quit()
                time.sleep(1)


            except:

                cont_fail += 1

                print("failed on" + papernames[i] + j + str(k))

                driver.quit()

            if cont_fail > 5:
                break

            continue

I keep getting this error:

C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 96, in <module>
    driver = webdriver.Chrome(service = service,        
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = DriverFinder.get_path(self.service, options)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 44, in get_path
    raise NoSuchDriverException(f"Unable to locate or obtain driver for {options.capabilities['browserName']}")
selenium.common.exceptions.NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


Process finished with exit code 1

I have installed the chrome driver path in the system, already tried the old version of the selenium package (since I think that the original code (link: https://github.com/asui1/Webautomation/b...%20test.py) might be using an older version of selenium) but it still doesn't work.
I am new to python and I still have a lot to learn, can someone help me? Thank you in advance

***snippsat*** · (This post was last modified: Aug-23-2023, 12:57 PM by snippsat.)

The code your to try run is really messy written,and will not work now without a rewrite.
For the error you get first so most chromedriver.exe be in OS environment path.
In cmd echo %PATH:;=&echo.% so in one those folders that this command list.
Also now in Downloads most find link to chromedriver in JSON endpoints.
So for Chrome now is version 116:

https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip

cate16 · Aug-24-2023, 08:53 AM

(Aug-23-2023, 12:56 PM)snippsat Wrote: The code your to try run is really messy written,and will not work now without a rewrite.
For the error you get first so most chromedriver.exe be in OS environment path.
In cmd echo %PATH:;=&echo.% so in one those folders that this command list.
Also now in Downloads most find link to chromedriver in JSON endpoints.
So for Chrome now is version 116:
https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip

Thank so much for your suggestion.
I have put the folder of the final version of chrome driver in the OS path as you said, but I keep getting the same error...

Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 95, in <module>
    driver = webdriver.Chrome(service = service,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = DriverFinder.get_path(self.service, options)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 44, in get_path
    raise NoSuchDriverException(f"Unable to locate or obtain driver for {options.capabilities['browserName']}")
selenium.common.exceptions.NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


Process finished with exit code 1

Regarding this:

(Aug-23-2023, 12:56 PM)snippsat Wrote: In cmd echo %PATH:;=&echo.% so in one those folders that this command list.

Since I am very new to Python, I haven't understand what you mean. Do I have to put this in what part of the code? And how am I supposed to edit it for my needs?

Thank you again for your help.

***snippsat*** · (This post was last modified: Aug-24-2023, 01:04 PM by snippsat.)

(Aug-24-2023, 08:53 AM)cate16 Wrote: Since I am very new to Python, I haven't understand what you mean. Do I have to put this in what part of the code?

Now you shall open cmd and type echo %PATH:;=&echo.% with <enter> then will get a list of folders.
So in one those folder you most place chromedriver.exe,these are Environment Variables Path

Here a working example.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Setup
#https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\cmder\bin\chromedriver.exe")
browser = webdriver.Chrome(service=ser, options=options)
# Parse or automation
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
print(lotto_number.text)

Output:
0106091523

So i have C:\cmder\bin in my OS Path.
It i use cmd,see that folder is there eg also folder C:\WINDOWS would work to place chromedriver.exe in.

Microsoft Windows [Version 10.0.19045.3324]
(c) Microsoft Corporation. Med enerett.

C:\>%PATH:;=&echo.%
.....
C:\cmder\bin
C:\WINDOWS\system32
C:\WINDOWS
....

C:\>

cate16 · Aug-25-2023, 08:04 AM

(Aug-24-2023, 01:04 PM)snippsat Wrote:
(Aug-24-2023, 08:53 AM)cate16 Wrote: Since I am very new to Python, I haven't understand what you mean. Do I have to put this in what part of the code?
Now you shall open cmd and type echo %PATH:;=&echo.% with <enter> then will get a list of folders.
So in one those folder you most place chromedriver.exe,these are Environment Variables Path

Here a working example.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Setup
#https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\cmder\bin\chromedriver.exe")
browser = webdriver.Chrome(service=ser, options=options)
# Parse or automation
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
print(lotto_number.text)
Output:
0106091523
So i have C:\cmder\bin in my OS Path.
It i use cmd,see that folder is there eg also folder C:\WINDOWS would work to place chromedriver.exe in.
Microsoft Windows [Version 10.0.19045.3324]
(c) Microsoft Corporation. Med enerett.

C:\>%PATH:;=&echo.%
.....
C:\cmder\bin
C:\WINDOWS\system32
C:\WINDOWS
....

C:\>

Thank you for the explanation.
I did as you told me to do, by putting the folder in which chromedriver.exe is situated among the paths:
[Image: Capture-d-cran-2023-08-25-101601.png]

[Image: Capture-d-cran-2023-08-25-101601.png]

However, if I run the code I keep receiving the same kind of error:

C:\Users\cmosca\AppData\Local\Programs\Python\Python311\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 99, in <module>
    driver = webdriver.Chrome(service = service,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = get_path(self.service, options)
                        ^^^^^^^^
NameError: name 'get_path' is not defined

Process finished with exit code 1

Do you have an idea of what could be another solution? Thank you in advance.

***snippsat*** · Aug-25-2023, 05:48 PM

(Aug-25-2023, 08:04 AM)cate16 Wrote: However, if I run the code I keep receiving the same kind of error:

Don't run the first messy code,it's not updates and will not work at all.
Run my test code,so for you it will be this this(you could have chooen a shother path).
First in cmd do:

pip install selenium --upgrade

Code to test.

# sel_test.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
 
# Setup
#https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe")
browser = webdriver.Chrome(service=ser, options=options)
# Parse or automation
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
print(lotto_number.text)

Output:
0508142227

cate16 · Aug-28-2023, 07:19 AM

(Aug-25-2023, 05:48 PM)snippsat Wrote:

(Aug-25-2023, 08:04 AM)cate16 Wrote: However, if I run the code I keep receiving the same kind of error:

Don't run the first messy code,it's not updates and will not work at all.
Run my test code,so for you it will be this this(you could have chooen a shother path).
First in cmd do:

pip install selenium --upgrade

Code to test.

# sel_test.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
 
# Setup
#https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe")
browser = webdriver.Chrome(service=ser, options=options)
# Parse or automation
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
print(lotto_number.text)

Output:
0508142227

Thank you for your reply. I have done as you said.
This part worked:

pip install selenium --upgrade

This didn't:

# sel_test.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time
 
# Setup
#https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe")
browser = webdriver.Chrome(service=ser, options=options)
# Parse or automation
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
print(lotto_number.text)

Output:
0508142227

I got the following error:

C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:\Users\cmosca\PycharmProjects\pythonProject1\main.py 
Traceback (most recent call last):
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\main.py", line 17, in <module>
    lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 739, in find_element
    return self.execute(Command.FIND_ELEMENT, {"using": by, "value": value})["value"]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 345, in execute
    self.error_handler.check_response(response)
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 229, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.details"}
  (Session info: chrome=116.0.5845.111); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7024A5282+57250]
	(No symbol) [0x00007FF70241CB92]
	(No symbol) [0x00007FF7022EDEAB]
	(No symbol) [0x00007FF70232739E]
	(No symbol) [0x00007FF70232748C]
	(No symbol) [0x00007FF7023600C7]
	(No symbol) [0x00007FF70234665F]
	(No symbol) [0x00007FF70235E172]
	(No symbol) [0x00007FF7023463F3]
	(No symbol) [0x00007FF70231C991]
	(No symbol) [0x00007FF70231DB74]
	GetHandleVerifier [0x00007FF7027550A2+2874818]
	GetHandleVerifier [0x00007FF7027A6C74+3209620]
	GetHandleVerifier [0x00007FF70279FAAF+3180495]
	GetHandleVerifier [0x00007FF7025378E6+656902]
	(No symbol) [0x00007FF702428228]
	(No symbol) [0x00007FF702424374]
	(No symbol) [0x00007FF7024244A6]
	(No symbol) [0x00007FF702414873]
	BaseThreadInitThunk [0x00007FFDF06426AD+29]
	RtlUserThreadStart [0x00007FFDF130AA68+40]


Process finished with exit code 1

***snippsat*** · (This post was last modified: Aug-28-2023, 09:58 AM by snippsat.)

(Aug-28-2023, 07:19 AM)cate16 Wrote: I got the following error:

Now it try to parse.can be a problem with site in your country or speed of search.
Try set time.sleep(3) on line 17,to make it wait a bit before looking for tag.
Sometime the network search can be to fast,time sleep is just a quick first test,Selenium has own Waits .

Here site that works for all,to make sure your setup works.

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time

# Setup
#https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\cmder\bin\chromedriver.exe")
browser = webdriver.Chrome(service=ser, options=options)
# Parse or automation
url = 'https://www.python.org/'
browser.get(url)
python_about = browser.find_element(By.CSS_SELECTOR, '#touchnav-wrapper > header > div > div.introduction > p')
print(python_about.text)

Output:Python is a programming language that lets you work quickly
and integrate systems more effectively. Learn More

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Webscraping with beautifulsoup	cormanstan	3	1,981	Aug-24-2023, 11:57 AM Last Post: snippsat
	Webscraping returning empty table	Buuuwq	0	1,402	Dec-09-2022, 10:41 AM Last Post: Buuuwq
	WebScraping using Selenium library	Korgik	0	1,046	Dec-09-2022, 09:51 AM Last Post: Korgik
	How to get rid of numerical tokens in output (webscraping issue)?	jps2020	0	1,952	Oct-26-2020, 05:37 PM Last Post: jps2020
	Python Webscraping with a Login Website	warriordazza	0	2,609	Jun-07-2020, 07:04 AM Last Post: warriordazza
	Help with basic webscraping	Captain_Snuggle	2	3,938	Nov-07-2019, 08:07 PM Last Post: kozaizsvemira
	Can't Resolve Webscraping AttributeError	Hass	1	2,315	Jan-15-2019, 09:36 PM Last Post: nilamo
	How to exclude certain links while webscraping basis on keywords	Prince_Bhatia	0	3,247	Oct-31-2018, 07:00 AM Last Post: Prince_Bhatia
	Webscraping homework	Ghigo1995	1	2,651	Sep-23-2018, 07:36 PM Last Post: nilamo
	Intro to WebScraping	d1rjr03	2	3,455	Aug-15-2018, 12:05 AM Last Post: metulburr

Webscraping news articles by using selenium

User Panel Messages

Announcements