Python Forum
Beautiful soup opens python command line and nothing happens
#1
I am new to programming. I have created a web scraper in Python using Beautiful Soup, but when I run the program it opens the Python command line, the cursor just blinks, and nothing happens. (Please don't mind the indentation.) Can someone tell me what is happening here? I am using Python 3.6.


import urllib.request
import urllib.parse
import json
import csv
from bs4 import BeautifulSoup

link = 'https://maharerait.mahaonline.gov.in/searchlist/searchlist'
talukaLink = "https://maharerait.mahaonline.gov.in/SearchList/GetTaluka"
distlink = "https://maharerait.mahaonline.gov.in/SearchList/GetDistrict"
prjLink = "https://maharerait.mahaonline.gov.in/SearchList/GetProjectName"

alldata = []

links = {}
certificatedata = []

def getData(url, values):
    # POST the form values to the given URL and return the body as text
    data = urllib.parse.urlencode(values).encode('utf-8')
    req = urllib.request.Request(url, data)
    response = urllib.request.urlopen(req)
    return response.read().decode("utf-8")


def getDivision():
    # for now we take divisions 1-6; update this when the data changes
    return range(1, 7)

def getDistrict(divId):
    global distlink
    values = {'DivID': divId}
    data = getData(distlink, values)
    return data

def parseJson(data):
    parsed = json.loads(data)
    return parsed

def getTaluka(disId):
    global talukaLink
    values= {'DisID': disId}
    data = getData(talukaLink, values)
    return data

def getProjects(divId, disId):
    global prjLink
    values= {'DisID': disId, 'DivID': divId}
    #print(values)
    data = getData( prjLink, values)
    if len(data) < 10:
        return "[]"  # treat a near-empty response as "no projects"
    return data

def getProjectsList():
    divList = getDivision()
    flag = 0
    for divId in divList:
        disData = getDistrict(divId)
        disList = parseJson(disData)
        for disObj in disList:
            disId = disObj["ID"]
            prjData = getProjects(divId, disId)
            #print(" >>>> "+str(disId)+" >> "+str(divId))
            #print(prjData)
            prjJson = parseJson(prjData)
            for prjObj in prjJson:
                flag += 1
                prjId = prjObj["ID"]
                values = {'ID': 0, 'pageTraverse': 1, 'Division': divId,
                          'hdnDistrict': '', 'hdnProject': '', 'District': disId,
                          'Taluka': '', 'Village': '', 'Project': prjId,
                          'CertiNo': '', 'btnSearch': 'Search'}
                finalPrjData = getData(link, values)
                parseXMLData(finalPrjData)
                #if len(alldata)>100:
                #    break

def parseXMLData(htmldata):
    # despite the name, this parses the HTML of the search-results page
    global alldata, links
    soup = BeautifulSoup(htmldata, "html.parser")
    tables = soup.find_all("table")
    for table in tables:
        attr = table.attrs
        if "table" in attr['class']:
            tbody = table.find_all("tbody")
            if len(tbody)>0:
                tbody = tbody[0]
                tr_lst = tbody.find_all("tr")
                for tr in tr_lst:
                    sublist = []
                    td_lst = tr.find_all("td")
                    if len(td_lst)>6:
                        prjname = td_lst[1].text
                        proname = td_lst[2].text
                        certNo = td_lst[3].text
                        sublist.append(prjname)
                        sublist.append(proname)
                        sublist.append(certNo)
                        td = td_lst[4]
                        a_lst = td.find_all("a")
                        if len(a_lst)>0:
                            a = a_lst[0]
                            href = a.attrs['href']
                            link = "https://maharerait.mahaonline.gov.in/"+href
                            links[certNo] = link
                            sublist.append(link)
                    if len(sublist)>0:
                        alldata.append(sublist)
    return alldata


def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    # newline='' stops the csv module writing blank rows on Windows
    with open("./" + filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        for row in alldata1:
            writer.writerow(row)


def processlinksforcert():
    global links, certificatedata
    print(">> Came in fetching certificates data >>> " )
    for certno in links.keys():
        link = links[certno]
        htmldata = getData(link, {})
        soup = BeautifulSoup(htmldata, "html.parser") 
        divs = soup.find_all("div")
        for div in divs:
            attr = div.attrs
            if "id" in attr.keys() and "DivProfessional" in attr['id']:
                table = div.find_all("table")
                if len(table)<=0:
                    continue
                t_attr = table[0].attrs
                if "table" in t_attr["class"]:
                    table = table[0]
                    tr_lst = table.find_all("tr")
                    # skip the header row, then walk the data rows
                    for tr in tr_lst[1:]:
                        sublist = []
                        td_lst = tr.find_all("td")
                        if len(td_lst)>2:
                            sublist.append(certno)
                            pername = formattext( td_lst[0].text)
                            cerno = formattext( td_lst[1].text )
                            proftype = formattext( td_lst[2].text )
                            sublist.append(pername)
                            sublist.append(cerno)
                            sublist.append(proftype)
                            certificatedata.append(sublist)
    return certificatedata

def formattext(text):
    # strip embedded line breaks and runs of spaces from scraped cell text
    while text.find("\r\n")>=0:
        text = text.replace("\r\n","")

    while text.find("   ")>=0:
        text = text.replace("   ","")
    return text

def main():
    global alldata, certificatedata
    getProjectsList()
    writedata(alldata, "data.csv")
    data = processlinksforcert()
    writedata(data, "certificates.csv")


if __name__ == "__main__":
    main()
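
One likely explanation for the "nothing happens" symptom: the script issues hundreds of sequential HTTP requests and prints nothing until the very end, so it can look hung while it is actually working. A tiny self-contained illustration (time.sleep stands in for one slow network call; the counts are arbitrary):

import time

# a long silent loop looks "hung"; periodic flushed prints prove it is alive
for i in range(1, 31):
    time.sleep(0.1)  # stands in for one slow HTTP request
    if i % 10 == 0:
        print("processed", i, "requests", flush=True)  # flush shows output immediately

Adding a similar print inside getProjectsList() would confirm whether the scraper is progressing or truly stuck.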
#2
Suggest you read the following. You should use requests rather than urllib; these explain its use with BeautifulSoup best:
web scraping part1
web scraping part2
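
As a rough illustration, the getData() helper above could be rewritten with requests along these lines (a minimal sketch; the form-field handling mirrors the original code, and the timeout value is an arbitrary choice):

import requests

def get_data(url, values, timeout=30):
    # POST the form fields and return the response body as text
    response = requests.post(url, data=values, timeout=timeout)
    response.raise_for_status()  # fail loudly on HTTP 4xx/5xx
    return response.text

requests handles the URL encoding and byte decoding for you, so the urlencode/encode/decode steps disappear.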
#3
(Aug-01-2017, 11:20 AM)Larz60+ Wrote: Suggest you read the following. You should use requests rather than urllib; these explain its use with BeautifulSoup best:
web scraping part1
web scraping part2

Now I receive these errors:
TimeoutError: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond

ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
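
Both errors point at the server being slow to respond or dropping connections, rather than at the parsing code. One possible mitigation, sketched with requests plus automatic retries (the retry count, backoff factor, and timeout are arbitrary assumptions, not values from this thread):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# session that retries failed requests with increasing delays between attempts
session = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount("https://", HTTPAdapter(max_retries=retries))

def get_data(url, values):
    response = session.post(url, data=values, timeout=30)  # cap the wait per request
    response.raise_for_status()
    return response.text

Spacing out the requests (for example with time.sleep between calls) can also help when a site throttles rapid scraping.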
#4
You should do the tutorials.
#5
Maybe I should give import requests a try. Thank you so much for your help.

