Python Forum
Webscraping news articles by using selenium
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Webscraping news articles by using selenium
#1
Hello, I have to do webscraping of some articles from a website (pressreader).

My code is the following:

from selenium import webdriver
import pandas as pd
import time
import json
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
import clipboard
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.chrome.service import Service



import pyautogui
import os.path

import selenium
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC



def starttoend(start, end, year, month, day):
    """Return every calendar date from ``start`` to ``end``, inclusive.

    The range is walked one real calendar day at a time, so months that lie
    outside the start/end month window, year boundaries and short months are
    all handled correctly and impossible dates such as ``"20080231"`` are
    never produced.

    Args:
        start: First date of the range as a ``"YYYYMMDD"`` string.
        end: Last date of the range as a ``"YYYYMMDD"`` string.
        year: Allowed four-digit year strings, e.g. ``["2008", "2009"]``.
        month: Allowed two-digit month strings, e.g. ``["01", ..., "12"]``.
        day: Allowed two-digit day strings, e.g. ``["01", ..., "31"]``.

    Returns:
        A chronologically ordered list of ``"YYYYMMDD"`` strings.  Dates whose
        year, month or day component is missing from the corresponding list
        are left out.  The list is empty when ``end`` precedes ``start``.

    Raises:
        ValueError: If ``start`` or ``end`` is not a valid calendar date.
    """
    # Local import keeps this function self-contained.
    from datetime import date, timedelta

    first = date(int(start[0:4]), int(start[4:6]), int(start[6:8]))
    last = date(int(end[0:4]), int(end[4:6]), int(end[6:8]))

    # Sets give O(1) membership tests for the per-day filter below.
    allowed_years = set(year)
    allowed_months = set(month)
    allowed_days = set(day)

    ret = []
    one_day = timedelta(days=1)
    current = first
    while current <= last:
        y_str = f"{current.year:04d}"
        m_str = f"{current.month:02d}"
        d_str = f"{current.day:02d}"
        if y_str in allowed_years and m_str in allowed_months and d_str in allowed_days:
            ret.append(y_str + m_str + d_str)
        current += one_day
    return ret


# Newspapers to crawl, named as they appear in the pressreader URL.
papernames = ["libero"]
# Inclusive crawl window, formatted YYYYMMDD.
start = "20080101"
end = "20230821"
# Number of consecutive failed article downloads; updated by the crawl loop.
cont_fail: int = 0

# Year, month and day strings handed to starttoend().
year = [str(y) for y in range(2008, 2024)]
months = [f"{m:02d}" for m in range(1, 13)]
days = [f"{d:02d}" for d in range(1, 32)]

date_tul = starttoend(start, end, year, months, days)

# The crawl loop reads dates[i] for papernames[i], so there must be one entry
# per paper; a single append would raise IndexError for a second paper.
dates = [date_tul for _ in papernames]

# Thumbnail positions to try for each date (at most 25 articles per issue).
index = list(range(25))

# Pre-select "Save as PDF" as the print destination in Chrome's print settings.
settings = {
    "appState": {
        "recentDestinations": [{
            "id": "Save as PDF",
            "origin": "local"
        }],
        "selectedDestinationId": "Save as PDF",
        "version": 2
    }
}
prefs = {'printing.print_preview_sticky_settings': json.dumps(settings)}

# Full path to the chromedriver executable.
# NOTE(review): Selenium raises NoSuchDriverException when it cannot locate a
# usable driver - confirm this file exists and matches the installed Chrome.
service = Service(executable_path=r'C:\Users\cmosca\Desktop\python\packages\chromedriver_32\chromedriver.exe')

# Chrome printing options chosen to minimize manual work when saving articles.
chrome_options: Options = webdriver.ChromeOptions()
chrome_options.add_experimental_option('prefs', prefs)
chrome_options.add_argument('--kiosk-printing')
driver = webdriver.Chrome(service=service, options=chrome_options)

def _list_front_page_articles(driver, paper, date_str):
    """Open page 1 of one issue in text view and return its article thumbnails."""
    driver.get("https://www.pressreader.com/ita/" + paper + "/" + date_str + "/page/1/textview")

    # until() returns the located element, so no second lookup is needed.
    bottom_button = WebDriverWait(driver, 60).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="thumbsToolbarBottom_0"]/a')))
    bottom_button.click()

    time.sleep(2)

    all_bottom = driver.find_element(By.XPATH, '//*[@id="thumbsToolbarBottomPreview_0"]')
    # NOTE(review): an XPath starting with "//" searches the whole document,
    # not just all_bottom; './/a[@page-number="1"]' would scope it to the
    # preview bar. Kept as in the original - confirm against the live page.
    return all_bottom.find_elements(By.XPATH, '//a[@page-number="1"]')


def _save_article_as_pdf(driver, news, name):
    """Open the article behind thumbnail ``news`` and print it to ``name``."""
    article_id = news.get_attribute("article-id")
    print(article_id)
    ActionChains(driver).move_to_element(news).perform()
    news.click()

    article_xpath = '//article[@aid="' + str(article_id) + '"]'
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, article_xpath)))
    time.sleep(2)
    arti = driver.find_element(By.XPATH, article_xpath)
    head = arti.find_element(By.TAG_NAME, "hgroup")
    time.sleep(1)

    # Right-click the headline to open the site's context menu.
    headline_actions = ActionChains(driver)
    headline_actions.move_to_element(head).perform()
    time.sleep(1)
    headline_actions.context_click(head).perform()

    time.sleep(2)
    # Absolute paths to the "print" entry and its "text" sub-entry.
    # NOTE(review): these depend on the exact page layout - verify if they break.
    driver.find_element(By.XPATH, '/html/body/div[12]/div/section/div/div/ul/li[7]/a').click()
    time.sleep(1)
    driver.find_element(By.XPATH, '/html/body/div[12]/div/section/div/div/ul/li[1]/a').click()

    time.sleep(4)
    # Presumably the OS "Save as" dialog has keyboard focus at this point.
    pyautogui.typewrite(name)
    time.sleep(1)
    pyautogui.press('enter')
    print("saved" + name)

    time.sleep(10)


# Traverse all papers and dates, saving each front-page article as a PDF.
# The browser session is kept open for the whole crawl and closed once at the
# end: quitting after every article left no usable session for the next one.
try:
    for i, paper in enumerate(papernames):
        for j in dates[i]:
            count = 1
            for k in index:
                try:
                    all_news = _list_front_page_articles(driver, paper, j)
                    if k >= len(all_news):
                        # No thumbnail at this position: nothing left to save.
                        break

                    # Zero-padded running number, e.g. libero_20080101_01.
                    name = f"{paper}_{j}_{count:02d}"
                    _save_article_as_pdf(driver, all_news[k], name)

                    count += 1
                    cont_fail = 0
                    if k == len(all_news) - 1:
                        # That was the last article of this issue.
                        break
                except Exception as exc:
                    # Best effort: report the failure and try the next article.
                    cont_fail += 1
                    print("failed on" + paper + j + str(k))
                    print(repr(exc))

                if cont_fail > 5:
                    break
finally:
    driver.quit()
I keep getting this error:
C:\Users\cmosca\PycharmProjects\pythonProject\venv\Scripts\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 96, in <module>
    driver = webdriver.Chrome(service = service,        
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = DriverFinder.get_path(self.service, options)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 44, in get_path
    raise NoSuchDriverException(f"Unable to locate or obtain driver for {options.capabilities['browserName']}")
selenium.common.exceptions.NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


Process finished with exit code 1
I have installed the chrome driver path in the system, already tried the old version of the selenium package (since I think that the original code (link: https://github.com/asui1/Webautomation/b...%20test.py) might be using an older version of selenium) but it still doesn't work.
I am new to python and I still have a lot to learn, can someone help me? Thank you in advance
Reply
#2
The code you are trying to run is very messily written, and it will not work now without a rewrite.
As for the first error you get: chromedriver.exe must be in a folder that is on the OS environment Path.
In cmd, run echo %PATH:;=&echo.% and put chromedriver.exe in one of the folders that this command lists.
Also, in Downloads you now have to find the chromedriver link under the JSON endpoints.
The current Chrome version is 116, so the link is:
https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
Reply
#3
(Aug-23-2023, 12:56 PM)snippsat Wrote: The code your to try run is really messy written,and will not work now without a rewrite.
For the error you get first so most chromedriver.exe be in OS environment path.
In cmd echo %PATH:;=&echo.% so in one those folders that this command list.
Also now in Downloads most find link to chromedriver in JSON endpoints.
So for Chrome now is version 116:
https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip

Thank so much for your suggestion.
I have put the folder of the final version of chrome driver in the OS path as you said, but I keep getting the same error...

Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 95, in <module>
    driver = webdriver.Chrome(service = service,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = DriverFinder.get_path(self.service, options)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject\venv\Lib\site-packages\selenium\webdriver\common\driver_finder.py", line 44, in get_path
    raise NoSuchDriverException(f"Unable to locate or obtain driver for {options.capabilities['browserName']}")
selenium.common.exceptions.NoSuchDriverException: Message: Unable to locate or obtain driver for chrome; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors/driver_location


Process finished with exit code 1
Regarding this:
(Aug-23-2023, 12:56 PM)snippsat Wrote: In cmd echo %PATH:;=&echo.% so in one those folders that this command list.

Since I am very new to Python, I haven't understood what you mean. In which part of the code do I have to put this? And how am I supposed to edit it for my needs?

Thank you again for your help.
Reply
#4
(Aug-24-2023, 08:53 AM)cate16 Wrote: Since I am very new to Python, I haven't understand what you mean. Do I have to put this in what part of the code?
Open cmd, type echo %PATH:;=&echo.% and press <enter>; you will then get a list of folders.
You must place chromedriver.exe in one of those folders; they are the folders in the Path environment variable.

Here a working example.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time  # unused in this snippet

# Setup: headless Chrome started through a local chromedriver.exe.
# chromedriver for Chrome 116: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\cmder\bin\chromedriver.exe")  # full path to the driver executable
browser = webdriver.Chrome(service=ser, options=options)
# Load the page and print the text of the element matching div.details.
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
print(lotto_number.text)
Output:
0106091523
So I have C:\cmder\bin in my OS Path.
If I use cmd, I can see that the folder is listed there; the folder C:\WINDOWS, for example, would also work as a place to put chromedriver.exe.
Microsoft Windows [Version 10.0.19045.3324]
(c) Microsoft Corporation. Med enerett.

C:\>echo %PATH:;=&echo.%
.....
C:\cmder\bin
C:\WINDOWS\system32
C:\WINDOWS
....

C:\>
Reply
#5
(Aug-24-2023, 01:04 PM)snippsat Wrote:
(Aug-24-2023, 08:53 AM)cate16 Wrote: Since I am very new to Python, I haven't understand what you mean. Do I have to put this in what part of the code?
Now you shall open cmd and type echo %PATH:;=&echo.% with <enter> then will get a list of folders.
So in one those folder you most place chromedriver.exe,these are Environment Variables Path

Here a working example.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time  # unused in this snippet

# Setup: headless Chrome started through a local chromedriver.exe.
# chromedriver for Chrome 116: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\cmder\bin\chromedriver.exe")  # full path to the driver executable
browser = webdriver.Chrome(service=ser, options=options)
# Load the page and print the text of the element matching div.details.
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
print(lotto_number.text)
Output:
0106091523
So i have C:\cmder\bin in my OS Path.
It i use cmd,see that folder is there eg also folder C:\WINDOWS would work to place chromedriver.exe in.
Microsoft Windows [Version 10.0.19045.3324]
(c) Microsoft Corporation. Med enerett.

C:\>%PATH:;=&echo.%
.....
C:\cmder\bin
C:\WINDOWS\system32
C:\WINDOWS
....

C:\>

Thank you for the explanation.
I did as you told me to do, by putting the folder in which chromedriver.exe is situated among the paths:
[Image: Capture-d-cran-2023-08-25-101601.png]

However, if I run the code I keep receiving the same kind of error:
C:\Users\cmosca\AppData\Local\Programs\Python\Python311\python.exe "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py" 
Traceback (most recent call last):
  File "C:\Users\cmosca\Desktop\python\Webautomation-master\crawling test.py", line 99, in <module>
    driver = webdriver.Chrome(service = service,
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chrome\webdriver.py", line 45, in __init__
    super().__init__(
  File "C:\Users\cmosca\AppData\Local\Programs\Python\Python311\Lib\site-packages\selenium\webdriver\chromium\webdriver.py", line 51, in __init__
    self.service.path = get_path(self.service, options)
                        ^^^^^^^^
NameError: name 'get_path' is not defined

Process finished with exit code 1
Do you have an idea of what could be another solution? Thank you in advance.
Reply
#6
(Aug-25-2023, 08:04 AM)cate16 Wrote: However, if I run the code I keep receiving the same kind of error:
Don't run the first messy code; it has not been updated and will not work at all.
Run my test code instead; for you it will be the version below (you could have chosen a shorter path).
First in cmd do:
pip install selenium --upgrade
Code to test.
# sel_test.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time  # unused in this snippet
 
# Setup: headless Chrome started through a local chromedriver.exe.
# chromedriver for Chrome 116: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe")  # full path to the driver executable
browser = webdriver.Chrome(service=ser, options=options)
# Load the page and print the text of the element matching div.details.
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')  # no wait: raises NoSuchElementException if the element is not found
print(lotto_number.text)
Output:
0508142227
Reply
#7
(Aug-25-2023, 05:48 PM)snippsat Wrote:
(Aug-25-2023, 08:04 AM)cate16 Wrote: However, if I run the code I keep receiving the same kind of error:
Don't run the first messy code,it's not updates and will not work at all.
Run my test code,so for you it will be this this(you could have chooen a shother path).
First in cmd do:
pip install selenium --upgrade
Code to test.
# sel_test.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time  # unused in this snippet
 
# Setup: headless Chrome started through a local chromedriver.exe.
# chromedriver for Chrome 116: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe")  # full path to the driver executable
browser = webdriver.Chrome(service=ser, options=options)
# Load the page and print the text of the element matching div.details.
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')  # no wait: raises NoSuchElementException if the element is not found
print(lotto_number.text)
Output:
0508142227


Thank you for your reply. I have done as you said.
This part worked:
pip install selenium --upgrade
This didn't:
# sel_test.py
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
import time  # unused in this snippet
 
# Setup: headless Chrome started through a local chromedriver.exe.
# chromedriver for Chrome 116: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\Users\cmosca\AppData\Local\Programs\Python\Python311\chromedriver-win64\chromedriver.exe")  # full path to the driver executable
browser = webdriver.Chrome(service=ser, options=options)
# Load the page and print the text of the element matching div.details.
url = 'https://www.palottery.state.pa.us/Draw-Games/Treasure-Hunt.aspx'
browser.get(url)
lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')  # no wait: raises NoSuchElementException if the element is not found
print(lotto_number.text)
Output:
0508142227
I got the following error:

C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Scripts\python.exe C:\Users\cmosca\PycharmProjects\pythonProject1\main.py 
Traceback (most recent call last):
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\main.py", line 17, in <module>
    lotto_number = browser.find_element(By.CSS_SELECTOR, 'div.details')
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 739, in find_element
    return self.execute(Command.FIND_ELEMENT, {"using": by, "value": value})["value"]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Lib\site-packages\selenium\webdriver\remote\webdriver.py", line 345, in execute
    self.error_handler.check_response(response)
  File "C:\Users\cmosca\PycharmProjects\pythonProject1\venv\Lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 229, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.NoSuchElementException: Message: no such element: Unable to locate element: {"method":"css selector","selector":"div.details"}
  (Session info: chrome=116.0.5845.111); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00007FF7024A5282+57250]
	(No symbol) [0x00007FF70241CB92]
	(No symbol) [0x00007FF7022EDEAB]
	(No symbol) [0x00007FF70232739E]
	(No symbol) [0x00007FF70232748C]
	(No symbol) [0x00007FF7023600C7]
	(No symbol) [0x00007FF70234665F]
	(No symbol) [0x00007FF70235E172]
	(No symbol) [0x00007FF7023463F3]
	(No symbol) [0x00007FF70231C991]
	(No symbol) [0x00007FF70231DB74]
	GetHandleVerifier [0x00007FF7027550A2+2874818]
	GetHandleVerifier [0x00007FF7027A6C74+3209620]
	GetHandleVerifier [0x00007FF70279FAAF+3180495]
	GetHandleVerifier [0x00007FF7025378E6+656902]
	(No symbol) [0x00007FF702428228]
	(No symbol) [0x00007FF702424374]
	(No symbol) [0x00007FF7024244A6]
	(No symbol) [0x00007FF702414873]
	BaseThreadInitThunk [0x00007FFDF06426AD+29]
	RtlUserThreadStart [0x00007FFDF130AA68+40]


Process finished with exit code 1
Reply
#8
(Aug-28-2023, 07:19 AM)cate16 Wrote: I got the following error:
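Now it gets as far as trying to parse the page, so the driver itself starts. The problem can be with the site in your country, or with how quickly the element is searched for.
Try putting time.sleep(3) just before line 17, to make it wait a bit before looking for the tag.
Sometimes the search runs before the page has finished loading; time.sleep is just a quick first test, as Selenium has its own Waits.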

Here is a site that works for everyone, so you can make sure your setup works.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

# Setup: headless Chrome started through a local chromedriver.exe.
# chromedriver for Chrome 116: https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5795.0/win64/chromedriver-win64.zip
options = Options()
options.add_argument("--headless=new")
ser = Service(r"C:\cmder\bin\chromedriver.exe")
browser = webdriver.Chrome(service=ser, options=options)
try:
    # Parse or automation
    url = 'https://www.python.org/'
    browser.get(url)
    # Wait up to 10 seconds for the element instead of looking it up at once,
    # which raises NoSuchElementException when the page is not ready yet.
    python_about = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#touchnav-wrapper > header > div > div.introduction > p')))
    print(python_about.text)
finally:
    # Always shut the browser and driver process down, even on errors.
    browser.quit()
Output:
Python is a programming language that lets you work quickly and integrate systems more effectively. Learn More
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Webscraping with beautifulsoup cormanstan 3 1,981 Aug-24-2023, 11:57 AM
Last Post: snippsat
  Webscraping returning empty table Buuuwq 0 1,402 Dec-09-2022, 10:41 AM
Last Post: Buuuwq
  WebScraping using Selenium library Korgik 0 1,046 Dec-09-2022, 09:51 AM
Last Post: Korgik
  How to get rid of numerical tokens in output (webscraping issue)? jps2020 0 1,952 Oct-26-2020, 05:37 PM
Last Post: jps2020
  Python Webscraping with a Login Website warriordazza 0 2,609 Jun-07-2020, 07:04 AM
Last Post: warriordazza
  Help with basic webscraping Captain_Snuggle 2 3,938 Nov-07-2019, 08:07 PM
Last Post: kozaizsvemira
  Can't Resolve Webscraping AttributeError Hass 1 2,315 Jan-15-2019, 09:36 PM
Last Post: nilamo
  How to exclude certain links while webscraping basis on keywords Prince_Bhatia 0 3,247 Oct-31-2018, 07:00 AM
Last Post: Prince_Bhatia
  Webscraping homework Ghigo1995 1 2,651 Sep-23-2018, 07:36 PM
Last Post: nilamo
  Intro to WebScraping d1rjr03 2 3,455 Aug-15-2018, 12:05 AM
Last Post: metulburr

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020