Oct-31-2018, 07:00 AM
Hi,
I am trying to webscrape google search results. My job is to find and scrape google search results basis on keywords have been provided to me in a csv file.
what my codes do are they find those keywords in google and get first three links. But it also scrapes some links which are not required.
I have another csv file which is negative list which has certain keywords now if while scraping codes find any google search results whose text is also available on negative list it should avoid it and do not add it into database.
for example if if finds www.justdial.com and negative list has justdial, it should not add it into database
Till now i received no success.
below are my codes:
I am trying to webscrape google search results. My job is to find and scrape google search results basis on keywords have been provided to me in a csv file.
what my codes do are they find those keywords in google and get first three links. But it also scrapes some links which are not required.
I have another csv file which is negative list which has certain keywords now if while scraping codes find any google search results whose text is also available on negative list it should avoid it and do not add it into database.
for example if if finds www.justdial.com and negative list has justdial, it should not add it into database
Till now i received no success.
below are my codes:
from selenium import webdriver from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import TimeoutException from selenium.webdriver.chrome.options import Options import csv import time from itertools import groupby,chain from operator import itemgetter import sqlite3 final_data = [] def getresults(): global final_data conn = sqlite3.connect("Jobs_data.db") conn.execute("""CREATE TABLE IF NOT EXISTS naukri(id INTEGER PRIMARY KEY, KEYWORD text, LINK text, CONSTRAINT number_unique UNIQUE (KEYWORD,LINK)) """) cur = conn.cursor() #chrome_options = Options() #chrome_options.add_argument("--headless") #chrome_options.binary_location = '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary' driver = webdriver.Chrome("./chromedriver") with open("./"+"terms12.csv", "r") as csvfile: reader = csv.reader(csvfile) next(reader) for row in reader: keywords = row[0] try: url = "https://www.google.co.in/search?num=10&q=" + keywords driver.get(url) time.sleep(5) count = 0 links = driver.find_elements_by_class_name("g")[:3] for i in links: data = i.find_elements_by_class_name("iUh30") dm = negativelist("junk.csv") for i in data: sublist = [] data = i.text if data.find("justdial")!=-1: continue print("I am in exception") sublist.append(keywords) sublist.append(data) print(sublist) final_data.append(sublist) cur.execute("INSERT OR IGNORE INTO naukri VALUES (NULL,?,?)",(keywords,data)) except Exception as e: print("I am outside exception") conn.commit() return final_data def negativelist(file): sublist = [] with open("./"+file,"r") as csvfile: reader = csv.reader(csvfile) for row in reader: _data = row[0] sublist.append(_data) return sublist def readfile(alldata, filename): with open ("./"+ filename, "w",encoding="utf-8") as csvfile: csvfile = csv.writer(csvfile, delimiter=",") csvfile.writerow("") for i in range(0, len(alldata)): csvfile.writerow(alldata[i]) def main(): getresults() readfile([[k, *chain.from_iterable(r for _, *r in g)] for k, g in groupby(final_data, key=itemgetter(0))], "Naukri.csv") main()