Python Forum

Full Version: Read url from CSV and Scrape website
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
hi,

i have written a code to read data from CSV file and scrape data, but whenever i am running this code i receive HTTP error 400

I have url like 16k but in this i am posting only 5 url.

my csv file has two columns, 1 st one is ID and second one is URL.

whenever i am running this code it prints invalid url 400 http error.


these are my codes:
import csv
from bs4 import BeautifulSoup
import requests
import time
import os

data_obj = {}
final_data = []

def readfile():
    global data_obj
    file = "BOOK.CSV"
    f = open("./"+ file, "r")
    for row in f.readlines():
        lst = row.split(",")
        data_obj[lst[0]] = lst[1]#here reading dictionary
        #print(data_obj)
        f.close()

def writedata(alldata1, filename):
    print(" >>>> FINAL PRINTING DATA >>>> ")
    #import pdb; pdb.set_trace()
    with open("./"+filename,'w') as csvfile:
        csvfile = csv.writer(csvfile, delimiter=',')
        #csvfile.writerow(titleRow)
        csvfile.writerow("")
        for i in range(0, len( alldata1 )):
            #print(alldata1[i])
            csvfile.writerow( alldata1[i]  )

def parsedata():
    global data_obj, final_data
    for sublist in data_obj.keys():
        url = data_obj[sublist]
        #print(url)
        data = getdata(url,{})
        soup = BeautifulSoup(data, "html.parser")
        print(soup)
        
def getdata(url, values):
    r = requests.post(url, data=values, timeout=10)
    text = r.text
    r.close()
    return text


def main():
    readfile()
    parsedata()

main()
this is the error i received:
Error:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN""http://www.w3.org/TR/html4/strict.dtd"> <html><head><title>Bad Request</title> <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/></head> <body><h2>Bad Request - Invalid URL</h2> <hr/><p>HTTP Error 400. The request URL is invalid.</p> </body></html>
i am attaching the file, can someone please tell what should i do?
def getdata(url, values=None):
    r = requests.post(url, data=values, timeout=10)
    text = r.text
    r.close()
    return text
Your code is overcomplicated.

import csv
from bs4 import BeautifulSoup
import requests

def get(urls):
    for url in urls:
        yield requests.get(url).content.decode('utf-8')

with open('BOOK.csv') as csv_:
    reader = csv.reader(csv_)
    urls = [line[1] for line in urls if line]
    
    webpages = list(get(urls))
    
    for html in webpages:
        soup = BeautifulSoup(html, 'lxml')
        print(soup.pretify)
You will be able to put together the rest.
Thank you so much sir, it is great learning here.
urls not defined i get this error while running this code