Sep-18-2018, 08:42 AM
hi,
I am trying to scrape a website that has text and links
i am creating a web scraper that will scrape the data using beautilfulsoup and requests and links using selenium.
everything is working fine in requests part but not in selenium part.
In the selenium part it is required to click on the link and link will open and then get page url and then move to main page and then start same procedures for another links but when i run the code it gets only first links and then throw error
below are my codes:
I am trying to scrape a website that has text and links
i am creating a web scraper that will scrape the data using beautilfulsoup and requests and links using selenium.
everything is working fine in requests part but not in selenium part.
In the selenium part it is required to click on the link and link will open and then get page url and then move to main page and then start same procedures for another links but when i run the code it gets only first links and then throw error
Error:Traceback (most recent call last):
File "C:\Users\prince.bhatia\Desktop\Bihar_rera\Bihar_Rera.py", line 90, in <module>
main()
File "C:\Users\prince.bhatia\Desktop\Bihar_rera\Bihar_Rera.py", line 89, in main
parsedata()
File "C:\Users\prince.bhatia\Desktop\Bihar_rera\Bihar_Rera.py", line 80, in parsedata
geta = i.find_elements_by_tag_name("a")[1]
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 237, in find_elements_by_tag_name
return self.find_elements(by=By.TAG_NAME, value=name)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 527, in find_elements
{"using": by, "value": value})['value']
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 493, in _execute
return self._parent.execute(command, params)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 256, in execute
self.error_handler.check_response(response)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\errorhandler.py", line 194, in check_response
raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.StaleElementReferenceException: Message: stale element reference: element is not attached to the page document
(Session info: chrome=67.0.3396.99)
(Driver info: chromedriver=2.39.562718 (9a2698cba08cf5a471a29d30c8b3e12becabb0e9),platform=Windows NT 6.1.7601 SP1 x86_64)
below are my codes:
from bs4 import BeautifulSoup import requests import csv from selenium import webdriver from selenium.webdriver.common import keys from selenium.webdriver.support.ui import Select import time import functools from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions url = "https://nagarseva.bihar.gov.in/rerabihar/ReraGetProjectStatus.aspx" final_data = [] def writefiles(alldata, filename): with open ("./"+ filename, "w") as csvfile: csvfile = csv.writer(csvfile, delimiter=",") csvfile.writerow("") for i in range(0, len(alldata)): csvfile.writerow(alldata[i]) def getbyGet(url, values): res = requests.get(url, data=values) text = res.text return text def parsedata(): payload = {} global url, final_data data = getbyGet(url, {}) soup = BeautifulSoup(data, "html.parser") #EVENTTARGET = soup.select("#__EVENTTARGET")[0]['value'] EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value'] #print(EVENTVALIDATION) VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value'] #print(VIEWSTATE) #VIEWSTATEGENERATOR = soup.select("#__VIEWSTATEGENERATOR")[0]["value"] headers= {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Content-Type':'application/x-www-form-urlencoded', 'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'} formfields = {"__EVENTARGUMENT":"PrintIndicator$0", '__EVENTTARGET':"ctl00$ContentPlaceHolder1$GV_Building", '__EVENTVALIDATION':EVENTVALIDATION, #'__EVENTTARGET':EVENTTARGET, '__VIEWSTATE':VIEWSTATE, "__VIEWSTATEENCRYPTED":"", "__VIEWSTATEGENERATOR":"CE676888", } s = requests.session() res = s.post(url, data=formfields, headers=headers).text soup = BeautifulSoup(res, "html.parser") getdata = soup.find_all("div", {"class":"col-lg-8 col-md-8 text-left"}) for i in getdata: datas = i.find_all("h4") for getspan in datas: Buildername = getspan.find_all("span")[2].text projectname = getspan.find_all("span")[3].text getp = i.find_all("p") for data in getp: address = data.find_all("span")[2].text area = data.find_all("span")[5].text district = data.find_all("span")[8].text stardate = data.find_all("span")[11].text enddate = data.find_all("span")[12].text status = data.find_all("span")[13].text driver = webdriver.Chrome("./chromedriver") driver.get('https://nagarseva.bihar.gov.in/rerabihar/ReraGetProjectStatus.aspx') d = driver.find_element_by_xpath('/html/body/form/div[3]/div[2]/table/tbody/tr/td/table/tbody/tr[1]/td[1]/div/table/tbody/tr[2]/td[3]/input') d.click() getclass = driver.find_elements_by_css_selector(".col-lg-3.col-md-3") for i in getclass: sublist = [] time.sleep(2) geta = i.find_elements_by_tag_name("a")[1] geta.click() window_before = driver.window_handles[0] driver.switch_to_window(driver.window_handles[-1]) d = driver.current_url print(d) sublist.append(d) driver.switch_to_window(window_before) def main(): parsedata() main()Please help on this one