Jun-08-2018, 06:59 AM
Hi,
I am scraping a website using BeautifulSoup, requests and sqlite3, and saving its data into a database. Due to a heavy inflow of data, the scraper now writes far too many duplicate rows into the database. I tried the INSERT OR REPLACE command so that only unique data would be kept, but it didn't work. How can I save only unique data into the database?
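To show the problem in isolation, here is a stripped-down version of the insert I am doing (table simplified to two columns, registration number made up). Even with INSERT OR REPLACE, running the identical insert twice leaves me with two rows:

import sqlite3

conn = sqlite3.connect(":memory:")
# a cut-down version of my table: id is the INTEGER PRIMARY KEY
conn.execute("CREATE TABLE crawled (id INTEGER PRIMARY KEY, Rera_Number text)")

# the exact same record inserted twice
for _ in range(2):
    conn.execute("INSERT OR REPLACE INTO crawled VALUES (NULL, ?)",
                 ("UPRERAPRJ1234",))
conn.commit()

print(conn.execute("SELECT COUNT(*) FROM crawled").fetchone()[0])  # prints 2 -- I expected 1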
Below is my full code:
import csv
import sqlite3

import requests
from bs4 import BeautifulSoup

url = "http://up-rera.in/projects"
final_data = []


def writefiles(alldata, filename):
    # newline="" stops the csv module from adding blank rows on Windows
    with open("./" + filename, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        for row in alldata:
            writer.writerow(row)


def getbyGet(url, values):
    # query parameters belong in params=, not data=, for a GET request
    res = requests.get(url, params=values)
    return res.text


def readHeaders():
    html = getbyGet(url, {})
    soup = BeautifulSoup(html, "html.parser")
    # hidden ASP.NET fields that must be echoed back in the POST
    VIEWSTATE = soup.select("#__VIEWSTATE")[0]["value"]
    EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]["value"]
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Content-Type": "application/x-www-form-urlencoded",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0",
    }
    formfields = {
        # the search is triggered by the button field below, so __EVENTTARGET
        # stays empty (it was previously filled with the __VIEWSTATE value)
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__EVENTVALIDATION": EVENTVALIDATION,
        "__VIEWSTATE": VIEWSTATE,
        "__VIEWSTATEGENERATOR": "4F1A7E70",
        "ctl00$ContentPlaceHolder1$btnSearch": "Search",
        # this is where the district changes in each iteration
        "ctl00$ContentPlaceHolder1$DdlprojectDistrict": 0,
        "ctl00$ContentPlaceHolder1$txt_regid": "",
        "ctl00$ContentPlaceHolder1$txtProject": "",
    }
    s = requests.session()
    conn = sqlite3.connect("99_data_increment.db")
    conn.execute(
        "CREATE TABLE IF NOT EXISTS crawled (id INTEGER PRIMARY KEY, "
        "Rera_Number text, Project_Name text, Promoter_Name text, "
        "City text, ResComm text, Links text)"
    )
    cur = conn.cursor()
    res = s.post(url, data=formfields, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    get_details = soup.find_all(id="ctl00_ContentPlaceHolder1_GridView1")
    for details in get_details:
        for row in details.find_all("tr")[1:]:  # skip the header row
            cells = row.find_all("td")
            # each cell keeps its text inside <span> tags
            rnumber = "".join(span.text for span in cells[1].find_all("span"))
            prj = "".join(span.text for span in cells[2].find_all("span"))
            prom = "".join(span.text for span in cells[3].find_all("span"))
            district_name = "".join(span.text for span in cells[4].find_all("span"))
            projectype = "".join(span.text for span in cells[5].find_all("span"))
            final_data.append([rnumber, prj, prom, district_name, projectype])
            # NULL lets the INTEGER PRIMARY KEY autoincrement; no link is
            # scraped yet, so projectype is repeated into the Links column.
            # This is the insert that still produces duplicate rows.
            cur.execute(
                "INSERT OR REPLACE INTO crawled VALUES (NULL,?,?,?,?,?,?)",
                (rnumber, prj, prom, district_name, projectype, projectype),
            )
            conn.commit()
    return final_data


def main():
    datas = readHeaders()
    writefiles(datas, "Up-new.csv")


main()
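From what I have read so far, I suspect INSERT OR REPLACE only detects a duplicate when some column carries a UNIQUE constraint, since my id primary key autoincrements on every NULL. Would something along these lines be the right direction? (A minimal sketch, assuming Rera_Number alone identifies a project; columns shortened.)

import sqlite3

conn = sqlite3.connect(":memory:")
# hypothetical simplified schema: UNIQUE on Rera_Number gives the
# conflict target that INSERT OR REPLACE needs
conn.execute("CREATE TABLE crawled (id INTEGER PRIMARY KEY, "
             "Rera_Number text UNIQUE, Project_Name text)")

rows = [("UPRERAPRJ1234", "Project A"),
        ("UPRERAPRJ1234", "Project A"),  # same project seen again on a re-crawl
        ("UPRERAPRJ5678", "Project B")]
for rera, name in rows:
    conn.execute("INSERT OR REPLACE INTO crawled VALUES (NULL, ?, ?)",
                 (rera, name))
conn.commit()

print(conn.execute("SELECT COUNT(*) FROM crawled").fetchone()[0])  # 2 rows, not 3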