Python Forum
How to scrape only unique values and save it into database
Thread Rating:
  • 1 Vote(s) - 1 Average
  • 1
  • 2
  • 3
  • 4
  • 5
How to scrape only unique values and save it into database
#1
Hi,


I am scraping a website using beautifulsoup, requests and sqlite3. Right now i am scraping website and saving it's data into database. But now due to heavy inflow of data, scraping increases too much duplicate data into database. I tried to use command INSERT OR REPLACE to get only unique data but it didnt work. How can i save only unique data into database.

Below are my codes:

import csv
from bs4 import BeautifulSoup
import requests
import time
import pdb
import sqlite3

url = "http://up-rera.in/projects"
url1 = "http://up-rera.in"
final_data = []
dct = {}

def writefiles(alldata, filename):
    with open ("./"+ filename, "w") as csvfile:
        csvfile = csv.writer(csvfile, delimiter=",")
        csvfile.writerow("")
        for i in range(0, len(alldata)):
            csvfile.writerow(alldata[i])

def getbyGet(url, values):
    res = requests.get(url, data=values)
    text = res.text
    return text

def readHeaders():
    global url, url1
    html = getbyGet(url, {})
    soup  = BeautifulSoup(html, "html.parser")
    EVENTTARGET = soup.select("#__VIEWSTATE")[0]['value']
    EVENTVALIDATION = soup.select("#__EVENTVALIDATION")[0]['value']
    VIEWSTATE = soup.select("#__VIEWSTATE")[0]['value']
    #VIEWSTATEGENERATOR = soup.select("#__VIEWSTATEGENERATOR")[0]["value"]
    headers= {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
              'Content-Type':'application/x-www-form-urlencoded',
              'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:59.0) Gecko/20100101 Firefox/59.0'}

    formfields =  {'__EVENTARGUMENT':'',
                  '__EVENTVALIDATION':EVENTVALIDATION,
                  '__EVENTTARGET':EVENTTARGET,
                  '__VIEWSTATE':VIEWSTATE,
                  "__VIEWSTATEGENERATOR": "4F1A7E70",
                  'ctl00$ContentPlaceHolder1$btnSearch':'Search',
                  'ctl00$ContentPlaceHolder1$DdlprojectDistrict':0, #this is where your city name changes in each iteration
                  'ctl00$ContentPlaceHolder1$txt_regid':'',
                  'ctl00$ContentPlaceHolder1$txtProject':''}
    s = requests.session()
    conn = sqlite3.connect("99_data_increment.db")
    #cur = conn.cursor()
    conn.execute("CREATE TABLE IF NOT EXISTS crawled (id INTEGER PRIMARY KEY, Rera_Number text, Project_Name text, Promoter_Name text, City text, ResComm text, Links text) ")
    cur = conn.cursor()
    res = s.post(url, data=formfields, headers=headers).text
    soup = BeautifulSoup(res, "html.parser")
    get_details = soup.find_all(id="ctl00_ContentPlaceHolder1_GridView1")
    for details in get_details:
        gettr = details.find_all("tr")[1:]
        for tds in gettr:
            td = tds.find_all("td")[1]
            rera = td.find_all("span")
            rnumber = ""
            for num in rera:
                rnumber = num.text
                sublist = []
                sublist.append(rnumber)
            name = tds.find_all("td")[2]
            prj_name = name.find_all("span")
            prj = ""
            for prjname in prj_name:
                prj = prjname.text
                sublist.append(prj)
            promoter_name = tds.find_all("td")[3]
            promoter = promoter_name.find_all("span")
            prom = ""
            for promname in promoter:
                prom = promname.text
                sublist.append(prom)
            district = tds.find_all("td")[4]
            dist = district.find_all("span")
            district_name = ""
            for districtname in dist:
                district_name = districtname.text
                sublist.append(district_name)
            protype = tds.find_all("td")[5]
            project = protype.find_all("span")
            projectype = ""
            for prjtype in project:
                projectype = prjtype.text
                sublist.append(projectype)
            final_data.append(sublist)
            cur.execute("INSERT OR REPLACE INTO crawled VALUES (NULL,?,?,?,?,?,?)",(rnumber, prj, prom , district_name , projectype, projectype))
            conn.commit()
            #print(final_data)
    return final_data

def main():
    datas = readHeaders()
    writefiles(datas, "Up-new.csv")
main()
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
Question wkhtmltoimage generated jpeg save to Database using mongoengine Madoo 2 343 Aug-18-2020, 03:42 PM
Last Post: Madoo
  scrape data 1 go to next page scrape data 2 and so on alkaline3 6 1,058 Mar-13-2020, 07:59 PM
Last Post: alkaline3
  creating a range of unique & sequential ID's in a database while handling concurency jertel 0 425 Sep-19-2019, 02:41 PM
Last Post: jertel
  how retrieve database save multiple data in web Django 2.1 taomihiranga 0 827 Jul-30-2019, 04:58 PM
Last Post: taomihiranga
  Read Save RadioButtons from Database in Python Flask Webpage Gary8877 0 3,065 Apr-11-2019, 12:33 AM
Last Post: Gary8877
  Django- Remove leading zeros in values from database ntuttle 1 1,170 Mar-07-2019, 07:30 PM
Last Post: nilamo
  how i save the html form to flask database mebaysan 1 3,499 Feb-07-2019, 12:56 AM
Last Post: snippsat
  How to save uploaded image url in database from Django? PrateekG 14 7,775 Jul-04-2018, 05:18 PM
Last Post: PrateekG
  Execute using Html, Save data into Database and Download in CSV in Django --Part 1 Prince_Bhatia 0 2,164 Jan-19-2018, 06:05 AM
Last Post: Prince_Bhatia

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020