Jul-06-2017, 06:40 AM
Hi i have created a web scraper using in python using selenium.
Website i am scraping
Please remove the #
(to access website please select promoter , in division select konkan and and in district select mumbai city, and select projects you will see list of projects)
this is for 1 district and 1 division.
what my scraper do it it loops through every district and every division and select project and get details of those projects.
But nowadays it scraps only 1 project name and stops it doesn't loop through list of projects and this is the error i receive:
Python 3.6 i am using.
It should loop through every district and every division and scrape every project name with details.
Website i am scraping
Please remove the #
(to access website please select promoter , in division select konkan and and in district select mumbai city, and select projects you will see list of projects)
this is for 1 district and 1 division.
what my scraper do it it loops through every district and every division and select project and get details of those projects.
But nowadays it scraps only 1 project name and stops it doesn't loop through list of projects and this is the error i receive:
Error:Traceback (most recent call last):
File "C:\Users\prince.bhatia\Desktop\Version\Maha Rera.py", line 64, in <module>
while len(selectProject.options) == 1:
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\support\select.py", line 47, in options
return self._el.find_elements(By.TAG_NAME, 'option')
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 527, in find_elements
{"using": by, "value": value})['value']
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webelement.py", line 493, in _execute
return self._parent.execute(command, params)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\webdriver.py", line 254, in execute
response = self.command_executor.execute(driver_command, params)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 464, in execute
return self._request(command_info[0], url, body=data)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\site-packages\selenium\webdriver\remote\remote_connection.py", line 487, in _request
self._conn.request(method, parsed_url.path, body, headers)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1239, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1285, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1234, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 1026, in _send_output
self.send(msg)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 964, in send
self.connect()
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\http\client.py", line 936, in connect
(self.host,self.port), self.timeout, self.source_address)
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\socket.py", line 722, in create_connection
raise err
File "C:\Users\prince.bhatia\AppData\Local\Programs\Python\Python36\lib\socket.py", line 713, in create_connection
sock.connect(sa)
OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted
from selenium import webdriver from selenium.webdriver.support.ui import Select from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.common.exceptions import NoSuchElementException import os import time import csv driver = webdriver.Chrome("./chromedriver") driver.get('https://maharerait.mahaonline.gov.in/searchlist/searchlist') # try: # element = WebDriverWait(driver, 100).until( # EC.presence_of_element_located((By.ID, "Promoter")) # ) # finally: # print('0000000000000000000000') # driver.quit() time.sleep(1) driver.find_element_by_id('Promoter').click() divisionLength = len(Select(driver.find_element_by_id('Division')).options) print('*********{}'.format(divisionLength)) firstRow = 0 titleRow = [] contentRows = [] for divisionElement in range(1,divisionLength): selectDivision = Select(driver.find_element_by_id('Division')) selectDivision.options selectDivision.select_by_index(divisionElement) time.sleep(1) districtLength = len(Select(driver.find_element_by_id('District')).options) while districtLength == 1: pass print(districtLength) for districtElement in range(1,districtLength): selectDistrict = Select(driver.find_element_by_id('District')) selectDistrict.options selectDistrict.select_by_index(districtElement) time.sleep(2) projectLength = len(Select(driver.find_element_by_id('Project')).options) print('/------------------------------/') print('/-----project number: {}-------/'.format(projectLength)) print('/------------------------------/') if projectLength == 1: continue for projectElement in range(1,projectLength): selectProject = Select(driver.find_element_by_id('Project')) selectProject.options while len(selectProject.options) == 1: pass # c = len(select.options) # print('---------------{}'.format(c)) # titleRow = [] # contentRows = [] # firstRow = 0 # for i in range(1,c): # select = Select(driver.find_element_by_id('Project')) # while len(select.options) == 1: # pass time.sleep(1) selectProject.select_by_index(projectElement) driver.find_element_by_id('btnSearch').click() tableRows = driver.find_element_by_class_name('table').find_elements_by_tag_name('tr') if firstRow == 0: headRow = tableRows[0].find_elements_by_tag_name('th') for headRowData in range(0,len(headRow)): text = headRow[headRowData].find_element_by_tag_name('span').text titleRow.append(text) firstRow = firstRow + 1 for dataRowsNumbers in range(1,len(tableRows)): dataRow = tableRows[dataRowsNumbers].find_elements_by_tag_name('td') tempList = [] for dataRowContents in range(0,len(dataRow)): try: a_link = dataRow[dataRowContents].find_element_by_tag_name('a').get_attribute('href') tempList.append(str(a_link)) except NoSuchElementException: tempList.append(str(dataRow[dataRowContents].text)) # if dataRow[dataRowContents].text == 'View': # a_link = dataRow[dataRowContents].find_element_by_tag_name('a').get_attribute('href') # tempList.append(str(a_link)) # else: # tempList.append(str(dataRow[dataRowContents].text)) print(dataRow[dataRowContents].text) print(tempList) contentRows.append(tempList) # print('Automated check is over') # print('Stored data in programs is as below:') # print(contentRows) with open("./data.csv",'w') as csvfile: csvfile = csv.writer(csvfile, delimiter=',') csvfile.writerow(titleRow) csvfile.writerow("") for i in range(0,len(contentRows)): csvfile.writerow(contentRows[i]) driver.close()is someone can tell me what exact should i do to make it run or someone can make it run.
Python 3.6 i am using.
It should loop through every district and every division and scrape every project name with details.