Sep-19-2022, 01:48 PM
Output:
DevTools listening on ws://127.0.0.1:56769/devtools/browser/34ec545d-5d6c-434d-86c6-fc77922dbfe6
DevTools listening on ws://127.0.0.1:56770/devtools/browser/a0b62188-71dd-4d82-865d-cbec7cf42dc5
Error loading jobs from LinkedIn: 'WebDriver' object has no attribute 'find_element_by_css_selector'
Error loading jobs from LinkedIn: 'WebDriver' object has no attribute 'find_element_by_css_selector'
Total duplicates dropped: 0
Total duplicates dropped: 0
# -- main.py --
"""Run one job search per JSON config file and store the results in Excel.

Every ``*.json`` file found in ``./Search Configs`` is handled in its own
thread: the config is loaded, Indeed and LinkedIn are scraped, and the
combined listings are merged into ``<config name>.xlsx``.
"""
import json
import os
import sys
import threading

import pandas as pd
from selenium.webdriver.support.ui import WebDriverWait

from IndeedScraper import IndeedScraper
from LinkedInScraper import LinkedInScraper
from ScraperUtil import ScraperUtil

# Directory that is scanned for search configuration files.
SEARCH_CONFIG_DIR = "./Search Configs"

# Experience levels that load_json() accepts without printing a warning.
VALID_EXPERIENCE_LEVELS = ("junior", "mid", "senior")


def run_search(json_file_name):
    """Collect job listings matching the parameters in the given config file.

    Args:
        json_file_name: Name of a file inside ``SEARCH_CONFIG_DIR``
            (file name only, not a path).

    If the config cannot be loaded, the reason is printed and the search is
    skipped; nothing is raised.
    """
    try:
        search_keywords, location, ignore_keywords, experience = load_json(
            SEARCH_CONFIG_DIR + "/" + json_file_name)
    except ValueError as e:
        # load_json() reports every failure as ValueError. Print it so the
        # user can see why this search produced no output.
        print(e)
        return

    # Strip only the final extension. Splitting on the first '.' would map
    # "dev.junior.json" and "dev.senior.json" to the same Excel file.
    file_name = os.path.splitext(json_file_name)[0]

    all_dataFrames = [
        scrape_indeed(search_keywords, location, ignore_keywords, experience),
        scrape_linkedin(search_keywords, location, ignore_keywords, experience),
    ]

    # Stores the search results within an Excel file.
    store_in_excel_file(file_name, all_dataFrames)


def load_json(json_file_name):
    """Load the search parameters from a JSON config file.

    Args:
        json_file_name: Path of the config file to read.

    Returns:
        A tuple ``(search_keywords, location, ignore_keywords, experience)``
        where ``experience`` is a lower-cased string.

    Raises:
        ValueError: If the file cannot be opened or parsed, or one of the
            required keys is missing.
    """
    try:
        with open(json_file_name, "r") as jsonfile:
            config = json.load(jsonfile)

        # Save all search parameters as variables.
        search_keywords = config["search_keywords"]
        location = config["location"]
        ignore_keywords = config["ignore_keywords"]
        # Convert to str before lower-casing so a non-string value does not
        # fail with an AttributeError.
        experience = str(config["experience"]).lower()
    except Exception as e:
        raise ValueError(
            f"Error, could not load {json_file_name}: {e}") from e

    # Warn the user if they haven't provided a valid experience parameter.
    if experience not in VALID_EXPERIENCE_LEVELS:
        print(
            "Warning: Experience value in", json_file_name,
            " is invalid. please choose either 'Junior', 'Mid', "
            "or 'Senior'. Jobs of all experience levels will be included in this search.")

    # Print a summary of the search parameters.
    print("Read config successfully.")
    print("search_keywords=", search_keywords)
    print("location=", location)
    print("ignore_keywords=", ignore_keywords)
    print("experience=", experience)
    return search_keywords, location, ignore_keywords, experience


def scrape_indeed(search_keywords, location, ignore_keywords, experience):
    """Run an IndeedScraper search and return the resulting dataFrame.

    Returns the scraper's ``data`` attribute on success. If scraping raises,
    the error is printed and an empty dataFrame built by
    ``ScraperUtil.construct_dataframe([])`` is returned instead.
    """
    indeed = IndeedScraper()
    try:
        indeed.scrape(search_keywords, location, ignore_keywords, experience)
        print(indeed.data.shape[0], "jobs loaded from Indeed.")
        return indeed.data
    except Exception as e:
        print("Error loading jobs from Indeed: " + str(e))
        return ScraperUtil.construct_dataframe([])  # Return an empty dataFrame.


def scrape_linkedin(search_keywords, location, ignore_keywords, experience):
    """Run a LinkedInScraper search and return the resulting dataFrame.

    Returns the scraper's ``data`` attribute on success. If scraping raises,
    the error is printed and an empty dataFrame built by
    ``ScraperUtil.construct_dataframe([])`` is returned instead.
    """
    linkedin = LinkedInScraper()
    try:
        linkedin.scrape(search_keywords, location, ignore_keywords, experience)
        print(linkedin.data.shape[0], "jobs loaded from LinkedIn.")
        return linkedin.data
    except Exception as e:
        print("Error loading jobs from LinkedIn: " + str(e))
        return ScraperUtil.construct_dataframe([])  # Return an empty dataFrame.


def store_in_excel_file(file_name, all_dataFrames):
    """Store all job listings in ``<file_name>.xlsx``.

    If the file exists, the new listings are merged with its contents;
    otherwise a new Excel file is created. Rows that share Title, Company,
    Source and Date Posted are treated as duplicates and only the last
    occurrence is kept.

    Args:
        file_name: Output path without the ``.xlsx`` extension.
        all_dataFrames: List of dataFrames holding newly scraped listings.
            The list is not modified.
    """
    excel_path = file_name + '.xlsx'
    master_dataFrame = ScraperUtil.construct_dataframe([])
    try:
        master_dataFrame = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(file_name + ".xlsx doesn't exist yet. Creating new file.")
    except Exception as e:
        # Still best-effort, as before, but report the real cause instead of
        # claiming the file is missing.
        print("Could not read " + excel_path + ": " + str(e))

    # The existing rows go last so that keep='last' prefers them over newly
    # scraped duplicates. Build a new list rather than appending to the
    # caller's.
    new_dataFrame = pd.concat([*all_dataFrames, master_dataFrame])
    length_before = new_dataFrame.shape[0]
    new_dataFrame.drop_duplicates(
        keep='last',
        subset=['Title', 'Company', 'Source', 'Date Posted'],
        inplace=True)
    length_after = new_dataFrame.shape[0]
    total_duplicates = length_before - length_after
    print("Total duplicates dropped:", total_duplicates)
    new_dataFrame.to_excel(excel_path, index=False)


if __name__ == "__main__":
    # One thread per config file. splitext() avoids the IndexError that
    # split('.')[1] raised for entries without a dot, and matches on the
    # final extension only.
    with os.scandir(path=SEARCH_CONFIG_DIR) as entries:
        all_threads = [
            threading.Thread(target=run_search, args=(entry.name,))
            for entry in entries
            if entry.is_file() and os.path.splitext(entry.name)[1] == ".json"
        ]

    if not all_threads:
        print("No json files found in 'Search Configs' directory. No search will be made.")
    else:
        for thread in all_threads:
            thread.start()
        for thread in all_threads:
            thread.join()