Read url from CSV and Scrape website - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: Read url from CSV and Scrape website (/thread-7104.html) |
Read url from CSV and Scrape website - Prince_Bhatia - Dec-21-2017 Hi, I have written code to read URLs from a CSV file and scrape data from them, but whenever I run it I receive an HTTP 400 error. I have about 16k URLs, but I am posting only 5 of them here. My CSV file has two columns: the first is the ID and the second is the URL. Whenever I run this code it prints an "invalid URL" HTTP 400 error. This is my code: import csv from bs4 import BeautifulSoup import requests import time import os data_obj = {} final_data = [] def readfile(): global data_obj file = "BOOK.CSV" f = open("./"+ file, "r") for row in f.readlines(): lst = row.split(",") data_obj[lst[0]] = lst[1]#here reading dictionary #print(data_obj) f.close() def writedata(alldata1, filename): print(" >>>> FINAL PRINTING DATA >>>> ") #import pdb; pdb.set_trace() with open("./"+filename,'w') as csvfile: csvfile = csv.writer(csvfile, delimiter=',') #csvfile.writerow(titleRow) csvfile.writerow("") for i in range(0, len( alldata1 )): #print(alldata1[i]) csvfile.writerow( alldata1[i] ) def parsedata(): global data_obj, final_data for sublist in data_obj.keys(): url = data_obj[sublist] #print(url) data = getdata(url,{}) soup = BeautifulSoup(data, "html.parser") print(soup) def getdata(url, values): r = requests.post(url, data=values, timeout=10) text = r.text r.close() return text def main(): readfile() parsedata() main() This is the error I received: I am attaching the file; can someone please tell me what I should do?
RE: Read url from CSV and Scrape website - wavic - Dec-21-2017 def getdata(url, values=None): r = requests.post(url, data=values, timeout=10) text = r.text r.close() return text Your code is overcomplicated. import csv from bs4 import BeautifulSoup import requests def get(urls): for url in urls: yield requests.get(url).content.decode('utf-8') with open('BOOK.csv') as csv_: reader = csv.reader(csv_) urls = [line[1] for line in urls if line] webpages = list(get(urls)) for html in webpages: soup = BeautifulSoup(html, 'lxml') print(soup.pretify) You will be able to put together the rest. RE: Read url from CSV and Scrape website - Prince_Bhatia - Dec-22-2017 Thank you so much, sir; it is great learning here. RE: Read url from CSV and Scrape website - binaryanimal - Jan-08-2020 I get a "urls not defined" error while running this code. [Editor's note: the error comes from the list comprehension in wavic's snippet, which iterates over `urls` before that name has been assigned; it should iterate over `reader` instead. Likewise, `soup.pretify` is a misspelling of the method call `soup.prettify()`.] |