![]() |
List index out of range - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: List index out of range (/thread-7043.html) |
List index out of range - Prince_Bhatia - Dec-19-2017 hi, i am trying to webscrape this webpage: https://maharerait.mahaonline.gov.in//PrintPreview/PrintPreview/UHJvamVjdElEPTUwODMmRGl2aXNpb249NiZVc2VySUQ9MTUxNTEmUm9sZUlEPTEmQXBwSUQ9NTUzNzYmQWN0aW9uPVNFQVJDSA%3d%3d and the value i am trying to webscrape is the text PARAM DEVELOPERS the code i have written in my original code is import urllib.request import urllib import json import xml.etree.ElementTree as ET import csv from bs4 import BeautifulSoup link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist' talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka" distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict" prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName" links = [] certificatedata = [] def parseJson(data): parsed = json.loads(data) return parsed def writedata(alldata1, filename): print(" >>>> FINAL PRINTING DATA >>>> ") #import pdb; pdb.set_trace() with open("./"+filename,'w') as csvfile: csvfile = csv.writer(csvfile, delimiter=',') #csvfile.writerow(titleRow) csvfile.writerow("") for i in range(0, len( alldata1 )): #print(alldata1[i]) csvfile.writerow( alldata1[i] ) def processlinksforcert(): global links, certificatedata print(">> Came in fetching certificates data >>> " ) for sublist in links: certno = sublist[0] link = sublist[1] htmldata = getData(link, {}) soup = BeautifulSoup(htmldata, "html.parser") divs = soup.find_all("div") #if len(certificatedata)>20: # break for div in divs: attr = div.attrs if "id" in attr.keys() and "DivExp" in attr['id']: table = div.find_all(class_="grid-wrap") for more in table: text = more.find_all("tr")[1:]#if header has any TH for tds in text: td = tds.find_all("td")[1] rnumber = "" for num in td: rnumber = num sublist = [] sublist.append(certno) sublist.append(rnumber) td1 = tds.find_all("td")[2] project = "" for prj in td1: project = prj sublist.append(project) td2 = tds.find_all("td")[3] others = 
"" for oth in td2: others = oth sublist.append(others) td3 = tds.find_all("td")[4] area = "" for ara in td3: area = ara sublist.append(area) td4 = tds.find_all("td")[5] add = "" for address in td4: add = address sublist.append(add) td5 = tds.find_all("td")[6] cts = "" for ctsn in td5: cts = ctsn sublist.append(cts) td6 = tds.find_all("td")[7] buildings = "" for build in td6: buildings = build sublist.append(buildings) td7 = tds.find_all("td")[8] apartments = "" for apart in td7: apartments = apart sublist.append(apartments) td8 = tds.find_all("td")[9] original = "" for date in td8: original = date sublist.append(original) td9 = tds.find_all("td")[10] actual = "" for adate in td9: actual = adate sublist.append(actual) certificatedata.append(sublist) org = div.find_all(class_="col-md-3 col-sm-3")[4] count = 0 val = len(certificatedata) sublist1 = certificatedata[val -1] for div1 in org: sublist1.append(div1.get_text) certificatedata[val-1] = sublist1 count +=1 def formattext(text): while text.find("\r\n")>=0: text = text.replace("\r\n","") while text.find(" ")>=0: text = text.replace(" ","") return text def readlinksdata(): global links f = open("./jsondata.txt", "r") txt = f.read() f.close() links = json.loads(txt) def main(): global alldata, certificatedata #data = getData(url, {}) #getProjectsList() #print("Before write the projects data to the file. Count >> "+str(len(alldata))) #writedata(alldata, "data.csv") readlinksdata() data = processlinksforcert() print("Before write the certificates data to the file. 
Count >> "+str(len(data))) writedata( data, "certificate2.csv" ) def getData(url, values): import requests #import pdb; pdb.set_trace() #req = requests.get(link) print("url >> "+url) req = requests.post(url, data=values, timeout=10) text = req.text req.close() #print("hello world"+str(values)) return text #getDataByReq() main() And the error I am getting is this: [error output not preserved in this printable version; per the thread title it is presumably "IndexError: list index out of range"]. Can anyone tell me what I am doing wrong? I have also attached the txt file needed to run this code. RE: List index out of range - Larz60+ - Dec-19-2017 Add the following statement after line 107: print('val: {}'.format(val)) and show the results. RE: List index out of range - Prince_Bhatia - Dec-19-2017 Hi, I received val = 0
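RE: List index out of range - Larz60+ - Dec-19-2017 OK, so what is val - 1? With val = 0 it is -1, and certificatedata[-1] cannot be read from a list that has no elements, which is what raises the "list index out of range" error. This shows that 'certificatedata' is empty. |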