Dec-19-2017, 05:36 AM
hi,
i am trying to webscrape this webpage:
https://maharerait.mahaonline.gov.in//Pr...JDSA%3d%3d
and the value i am trying to webscrape is the text PARAM DEVELOPERS
the code i have written in my original code is
i have attached the txt file even to run this code
i am trying to webscrape this webpage:
https://maharerait.mahaonline.gov.in//Pr...JDSA%3d%3d
and the value i am trying to webscrape is the text PARAM DEVELOPERS
the code i have written in my original code is
import urllib.request import urllib import json import xml.etree.ElementTree as ET import csv from bs4 import BeautifulSoup link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist' talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka" distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict" prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName" links = [] certificatedata = [] def parseJson(data): parsed = json.loads(data) return parsed def writedata(alldata1, filename): print(" >>>> FINAL PRINTING DATA >>>> ") #import pdb; pdb.set_trace() with open("./"+filename,'w') as csvfile: csvfile = csv.writer(csvfile, delimiter=',') #csvfile.writerow(titleRow) csvfile.writerow("") for i in range(0, len( alldata1 )): #print(alldata1[i]) csvfile.writerow( alldata1[i] ) def processlinksforcert(): global links, certificatedata print(">> Came in fetching certificates data >>> " ) for sublist in links: certno = sublist[0] link = sublist[1] htmldata = getData(link, {}) soup = BeautifulSoup(htmldata, "html.parser") divs = soup.find_all("div") #if len(certificatedata)>20: # break for div in divs: attr = div.attrs if "id" in attr.keys() and "DivExp" in attr['id']: table = div.find_all(class_="grid-wrap") for more in table: text = more.find_all("tr")[1:]#if header has any TH for tds in text: td = tds.find_all("td")[1] rnumber = "" for num in td: rnumber = num sublist = [] sublist.append(certno) sublist.append(rnumber) td1 = tds.find_all("td")[2] project = "" for prj in td1: project = prj sublist.append(project) td2 = tds.find_all("td")[3] others = "" for oth in td2: others = oth sublist.append(others) td3 = tds.find_all("td")[4] area = "" for ara in td3: area = ara sublist.append(area) td4 = tds.find_all("td")[5] add = "" for address in td4: add = address sublist.append(add) td5 = tds.find_all("td")[6] cts = "" for ctsn in td5: cts = ctsn sublist.append(cts) td6 = tds.find_all("td")[7] buildings = "" for build in td6: buildings = build sublist.append(buildings) td7 = tds.find_all("td")[8] apartments = "" for apart in td7: apartments = apart sublist.append(apartments) td8 = tds.find_all("td")[9] original = "" for date in td8: original = date sublist.append(original) td9 = tds.find_all("td")[10] actual = "" for adate in td9: actual = adate sublist.append(actual) certificatedata.append(sublist) org = div.find_all(class_="col-md-3 col-sm-3")[4] count = 0 val = len(certificatedata) sublist1 = certificatedata[val -1] for div1 in org: sublist1.append(div1.get_text) certificatedata[val-1] = sublist1 count +=1 def formattext(text): while text.find("\r\n")>=0: text = text.replace("\r\n","") while text.find(" ")>=0: text = text.replace(" ","") return text def readlinksdata(): global links f = open("./jsondata.txt", "r") txt = f.read() f.close() links = json.loads(txt) def main(): global alldata, certificatedata #data = getData(url, {}) #getProjectsList() #print("Before write the projects data to the file. Count >> "+str(len(alldata))) #writedata(alldata, "data.csv") readlinksdata() data = processlinksforcert() print("Before write the certificates data to the file. Count >> "+str(len(data))) writedata( data, "certificate2.csv" ) def getData(url, values): import requests #import pdb; pdb.set_trace() #req = requests.get(link) print("url >> "+url) req = requests.post(url, data=values, timeout=10) text = req.text req.close() #print("hello world"+str(values)) return text #getDataByReq() main()and the error i am getting is this :
Error:Traceback (most recent call last):
File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 156, in <module>
main()
File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 137, in main
data = processlinksforcert()
File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 108, in processlinksforcert
sublist1 = certificatedata[val -1]
IndexError: list index out of range
can anyone tell me what i am doing wrong?i have attached the txt file even to run this code
Attached Files