Jul-31-2018, 02:46 AM
Hi sir, I have a crawler that will help you get the dataset. The code below should do it.
import csv
import sqlite3

import requests
from bs4 import BeautifulSoup
from datetime import date

url = "http://up-rera.in/projects"
final_data = []


def writefiles(alldata, filename):
    # Write the scraped rows to a CSV file, one project per row.
    with open("./" + filename, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow(["Registration_Number", "Project_Name", "Promoter_Name",
                         "District", "Project_Type", "Rera_URL"])
        for row in alldata:
            writer.writerow(row)


def getbyGet(url, values):
    res = requests.get(url, params=values)
    return res.text


def readHeaders():
    html = getbyGet(url, {})
    soup = BeautifulSoup(html, "html.parser")
    # up-rera.in is an ASP.NET WebForms page, so the hidden state fields from
    # the GET response have to be posted back with the search form.
    EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]["value"]
    VIEWSTATE = soup.select("#__VIEWSTATE")[0]["value"]
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Content-Type": "application/x-www-form-urlencoded",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) "
                      "Gecko/20100101 Firefox/59.0"}
    formfields = {
        "__EVENTTARGET": "",  # no postback control is triggered by the search
        "__EVENTARGUMENT": "",
        "__EVENTVALIDATION": EVENTVALIDATION,
        "__VIEWSTATE": VIEWSTATE,
        "__VIEWSTATEGENERATOR": "4F1A7E70",
        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
        # this is where your city (district) value changes in each iteration
        "ctl00$ContentPlaceHolder1$DdlprojectDistrict": 0,
        "ctl00$ContentPlaceHolder1$txt_regid": "",
        "ctl00$ContentPlaceHolder1$txtProject": ""}
    s = requests.session()
    conn = sqlite3.connect("99_data_increment.db", timeout=10)
    fdate = date.today()
    conn.execute("""CREATE TABLE IF NOT EXISTS crawled
                    (id INTEGER PRIMARY KEY, State text, XID text, Project_Name text,
                     City text, Main_City text, Registration_Number text,
                     Promoter_Name text, Rera_URL text, PDF text, Crawled_Date text,
                     Status text, Names text, Transaction_Date text, Comments text,
                     Call_Contact_Number text, Creation_Type text, Builder_Website text,
                     CONSTRAINT number_unique UNIQUE (Registration_Number))""")
    cur = conn.cursor()
    res = s.post(url, data=formfields, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    get_details = soup.find_all(id="ctl00_ContentPlaceHolder1_GridView1")
    for details in get_details:
        # Skip the header row of the results grid.
        gettr = details.find_all("tr")[1:]
        for tds in gettr:
            cells = tds.find_all("td")
            blank = ""
            # Column 1: registration number.
            rnumber = ""
            data1 = []
            for num in cells[1].find_all("span"):
                rnumber = num.text
                data1.append(rnumber)
            sublist = [rnumber]
            # Column 2: project name.
            prj = ""
            for prjname in cells[2].find_all("span"):
                prj = prjname.text
            sublist.append(prj)
            # Column 3: promoter name.
            prom = ""
            for promname in cells[3].find_all("span"):
                prom = promname.text
            sublist.append(prom)
            # Column 4: district.
            district_name = ""
            for districtname in cells[4].find_all("span"):
                district_name = districtname.text
            sublist.append(district_name)
            # Column 5: project type.
            projectype = ""
            for prjtype in cells[5].find_all("span"):
                projectype = prjtype.text
            sublist.append(projectype)
            # Build the detail-page URL from the numeric part of the
            # registration number (e.g. "UPRERAPRJ12345" -> "12345").
            link = "http://up-rera.in/View_Registration_Details.aspx?binid="
            links = ""  # initialised so the INSERT below never sees an unbound name
            for i in data1:
                a = i.split("J")
                links = link + a[1] + "&hfFlag=edit&ddlPRJ=0&txtPRJ="
                sublist.append(links)
            final_data.append(sublist)
            cur.execute("INSERT OR IGNORE INTO crawled VALUES "
                        "(NULL,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)",
                        ("Uttar Pradesh", blank, prj, district_name, blank, rnumber,
                         prom, links, blank, str(fdate), blank, blank, blank, blank,
                         blank, blank, blank))
    conn.commit()
    conn.close()
    return final_data


def main():
    datas = readHeaders()
    writefiles(datas, "Up-new.csv")


if __name__ == "__main__":
    main()
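The comment on ctl00$ContentPlaceHolder1$DdlprojectDistrict points at the obvious next step: change that dropdown value on each iteration to crawl district by district. Below is a minimal sketch of how you could enumerate the districts first. It assumes the district <select> element carries the client id ctl00_ContentPlaceHolder1_DdlprojectDistrict (derived from the form field name; I have not verified the exact id in the markup), and get_district_options is a hypothetical helper, not part of the site's API.

import requests
from bs4 import BeautifulSoup

SEARCH_URL = "http://up-rera.in/projects"

def get_district_options():
    # Read the district dropdown from the search page. The element id is an
    # assumption based on the form field name used in the crawler above.
    html = requests.get(SEARCH_URL).text
    soup = BeautifulSoup(html, "html.parser")
    select = soup.find("select", id="ctl00_ContentPlaceHolder1_DdlprojectDistrict")
    if select is None:
        return []
    return [(opt.get("value"), opt.text.strip()) for opt in select.find_all("option")]

for value, name in get_district_options():
    if value in (None, "0"):
        continue  # skip the "all districts" placeholder entry
    print(name, value)
    # here you would fetch fresh __VIEWSTATE/__EVENTVALIDATION tokens and POST
    # the form with DdlprojectDistrict set to `value`, as in readHeaders()

Note that ASP.NET validates the hidden tokens, so fetch a fresh __VIEWSTATE and __EVENTVALIDATION before each POST rather than reusing old ones. To sanity-check a run, you can query the database directly, e.g. sqlite3 99_data_increment.db "SELECT COUNT(*) FROM crawled;".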