I attempted to load this with Selenium, because the next page is created dynamically with JavaScript; if fetched any other way, it just redirects to the same page.
this will work with login and password, and click on button (with selenium)
That's more work than I'm willing to do, but you can use the code below as a starting point.
There's also an Event listener involved which I haven't worked with before, but I found this link:
https://stackoverflow.com/questions/3588...he-webpage
Code so far:
You will need this helper first: PrettifyPage.py. It creates an easy-on-the-eyes copy of the page fetched by Selenium, saved as LinkedinPage1.html in the script directory — useful for inspecting the JavaScript-rendered page.
# PrettifyPage.py
from bs4 import BeautifulSoup
import requests
import pathlib
class PrettifyPage:
    """Re-indent BeautifulSoup's prettify() output to a configurable width.

    soup.prettify() indents by a single space per nesting level; this class
    rescales that to ``indent`` spaces per level, clamping implausible jumps
    in depth so text nodes and malformed lines stay readable.
    """

    def __init__(self):
        pass

    def prettify(self, soup, indent):
        """Return ``soup.prettify()`` re-indented to ``indent`` spaces per level.

        soup:   any object with a ``prettify()`` method returning
                newline-separated markup (normally a BeautifulSoup instance).
        indent: desired number of spaces per indentation level.
        """
        # Collect lines and join once — avoids quadratic str += in a loop.
        pieces = []
        previous_indent = 0
        for line in soup.prettify().split("\n"):
            # prettify() emits one space per level, so the column of the
            # first '<' is the nesting depth of this line.
            current_indent = str(line).find("<")
            # Text nodes (no '<' found) and jumps of more than two levels
            # are clamped to one level deeper than the previous line.
            if current_indent == -1 or current_indent > previous_indent + 2:
                current_indent = previous_indent + 1
            previous_indent = current_indent
            pieces.append(self.write_new_line(line, current_indent, indent))
        return "".join(pieces)

    def write_new_line(self, line, current_indent, desired_indent):
        """Return ``line`` plus a newline, left-padded so its total leading
        space becomes ``current_indent * desired_indent`` columns (the line
        already carries ``current_indent`` leading spaces from prettify())."""
        spaces_to_add = (current_indent * desired_indent) - current_indent
        # max() guards against negative padding when desired_indent < 1.
        return " " * max(spaces_to_add, 0) + str(line) + "\n"
Start of the Selenium scraper:
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from pathlib import Path
import os
import time
import PrettifyPage
class GetLinkedinJobs:
    """Fetch a LinkedIn job-search results page with Selenium (Firefox) and
    save a prettified copy of the JS-rendered HTML next to this script."""

    def __init__(self):
        self.pp = PrettifyPage.PrettifyPage()
        # Anchor relative paths to the script's own directory so the output
        # file lands next to the script regardless of the caller's CWD.
        os.chdir(os.path.abspath(os.path.dirname(__file__)))
        self.homepath = Path('.')

    def start_browser(self):
        """Launch Firefox via geckodriver; keep the handle on self.browser."""
        # NOTE(review): DesiredCapabilities is deprecated in Selenium 4 —
        # consider webdriver.FirefoxOptions() when upgrading.
        caps = webdriver.DesiredCapabilities().FIREFOX
        caps["marionette"] = True
        self.browser = webdriver.Firefox(capabilities=caps)

    def stop_browser(self):
        """Shut down the browser session.

        Uses quit() rather than close(): close() only closes the current
        window and leaves the geckodriver process running (resource leak).
        """
        self.browser.quit()

    def save_pretty_page(self, soup):
        """Write a prettified copy of ``soup`` to LinkedinPage1.html."""
        save_pretty_filename = self.homepath / 'LinkedinPage1.html'
        print(f'self.save_pretty_filename: {save_pretty_filename.resolve()}')
        # Explicit UTF-8: the page contains non-ASCII text, and the platform
        # default encoding (e.g. cp1252 on Windows) could raise on write.
        with save_pretty_filename.open('w', encoding='utf-8') as fp:
            fp.write(self.pp.prettify(soup, 2))

    def get_page_info(self):
        """Load the job-search URL, wait for the JS render, save the page."""
        self.start_browser()
        url = 'https://www.linkedin.com/jobs/search?keywords=Data%20Science&location=United%20Kingdom&redirect=false&position=1&pageNum=0'
        self.browser.get(url)
        time.sleep(2)  # crude wait for the JavaScript-rendered content
        src = self.browser.page_source
        soup = BeautifulSoup(src, "lxml")
        self.save_pretty_page(soup)
        self.stop_browser()
if __name__ == '__main__':
    # Script entry point: run the scraper once and save the rendered page.
    GetLinkedinJobs().get_page_info()
Good luck