Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
List index out of range
#1
hi,

i am trying to webscrape this webpage:

https://maharerait.mahaonline.gov.in//Pr...JDSA%3d%3d

and the value i am trying to webscrape is the text PARAM DEVELOPERS

the code i have written in my original code is

import urllib.request
import urllib
import json
import xml.etree.ElementTree as ET
import csv
from bs4 import BeautifulSoup

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"


links = []

certificatedata = []

def parseJson(data):
    parsed = json.loads(data)
    return parsed

def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    with open("./"+filename,'w') as csvfile:
        csvfile = csv.writer(csvfile, delimiter=',')
        #csvfile.writerow(titleRow)
        csvfile.writerow("")
        for i in range(0, len( alldata1 )):
            #print(alldata1[i])
            csvfile.writerow( alldata1[i]  )

def processlinksforcert():
    global links, certificatedata
    print(">> Came in fetching certificates data >>> " )
    for sublist in links:
        certno = sublist[0]
        link = sublist[1]
        htmldata = getData(link, {})
        soup = BeautifulSoup(htmldata, "html.parser") 
        divs = soup.find_all("div")
        #if len(certificatedata)>20:
        #    break
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivExp" in attr['id']:
                table = div.find_all(class_="grid-wrap")
                for more in table:
                    text = more.find_all("tr")[1:]#if header has any TH
                    for tds in text:
                        td = tds.find_all("td")[1]
                        rnumber = ""
                        for num in td:
                            rnumber = num
                            sublist = []
                            sublist.append(certno)
                            sublist.append(rnumber)
                        td1 = tds.find_all("td")[2]
                        project = ""
                        for prj in td1:
                            project = prj
                            sublist.append(project)
                        td2 = tds.find_all("td")[3]
                        others = ""
                        for oth in td2:
                            others = oth
                            sublist.append(others)
                        td3 = tds.find_all("td")[4]
                        area = ""
                        for ara in td3:
                            area = ara
                            sublist.append(area)
                        td4 = tds.find_all("td")[5]
                        add = ""
                        for address in td4:
                            add = address
                            sublist.append(add)
                        td5 = tds.find_all("td")[6]
                        cts = ""
                        for ctsn in td5:
                            cts = ctsn
                            sublist.append(cts)
                        td6 = tds.find_all("td")[7]
                        buildings = ""
                        for build in td6:
                            buildings = build
                            sublist.append(buildings)
                        td7 = tds.find_all("td")[8]
                        apartments = ""
                        for apart in td7:
                            apartments = apart
                            sublist.append(apartments)
                        td8 = tds.find_all("td")[9]
                        original = ""
                        for date in td8:
                            original = date
                            sublist.append(original)
                        td9 = tds.find_all("td")[10]
                        actual = ""
                        for adate in td9:
                            actual = adate
                            sublist.append(actual)
                        
                        certificatedata.append(sublist)
            org = div.find_all(class_="col-md-3 col-sm-3")[4]
            count = 0
            val = len(certificatedata)
            sublist1 = certificatedata[val -1]
            for div1 in org:
                sublist1.append(div1.get_text)
            certificatedata[val-1] = sublist1
            count +=1

def formattext(text):
    while text.find("\r\n")>=0:
        text = text.replace("\r\n","")

    while text.find("   ")>=0:
        text = text.replace("   ","")
    return text

def readlinksdata():
    global links
    f = open("./jsondata.txt", "r")
    txt = f.read()
    f.close()
    links = json.loads(txt)
    

def main():
    global alldata, certificatedata
    #data = getData(url, {})
    #getProjectsList()
    #print("Before write the projects data to the file. Count >> "+str(len(alldata)))
    #writedata(alldata, "data.csv")
    readlinksdata()
    data = processlinksforcert()
    print("Before write the certificates data to the file. Count >> "+str(len(data)))
    writedata( data, "certificate2.csv" )

def getData(url, values):
    import requests
    #import pdb; pdb.set_trace()     
    #req = requests.get(link)
    print("url >> "+url)
    req = requests.post(url, data=values, timeout=10)
    text = req.text
    req.close()
    #print("hello world"+str(values))
    return text


#getDataByReq()


main()
and the error i am getting is this :

Error:
Traceback (most recent call last): File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 156, in <module> main() File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 137, in main data = processlinksforcert() File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 108, in processlinksforcert sublist1 = certificatedata[val -1] IndexError: list index out of range
can anyone tell me what i am doing wrong?

i have attached the txt file even to run this code

Attached Files

.txt   jsondata.txt (Size: 183 bytes / Downloads: 472)
Reply
#2
add the following statement after line 107
print('val: {}'.format(val))
and show results
Reply
#3
hi,

i received val =0

Error:
val: 0 Traceback (most recent call last): File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 157, in <module> main() File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 138, in main data = processlinksforcert() File "C:\Users\prince.bhatia\Desktop\maharera\Past_Details.py", line 109, in processlinksforcert sublist1 = certificatedata[val -1] IndexError: list index out of range
Reply
#4
OK, so what is val - 1?
the 1st cell of sublist1  can't be -1.
This shows that 'certificatedata' is empty
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  IndexError: list index out of range" & "TypeError: The view function f: Flask Web App joelbeater992 5 3,455 Aug-31-2021, 08:08 PM
Last Post: joelbeater992
  Python BeautifulSoup IndexError: list index out of range rhat398 1 6,163 May-28-2021, 09:09 PM
Last Post: Daring_T
  IndexError: tuple index out of range ? JohnnyCoffee 4 3,355 Jan-22-2020, 06:54 AM
Last Post: JohnnyCoffee
  Getting 'list index out of range' while fetching product details using BeautifulSoup? PrateekG 8 8,043 Jun-06-2018, 12:15 PM
Last Post: snippsat

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020