Dec-19-2018, 10:45 PM
"""Download every same-site <img> referenced by the pythonscraping.com front page.

Each image is stored under ``downloadDirectory``, mirroring the path it has on
the site (e.g. ``downloaded/img/logo.jpg``).
"""
import os

downloadDirectory = "downloaded"
baseUrl = "http://pythonscraping.com"


def getAbsoluteURL(baseUrl, source):
    """Normalize an ``src`` value to an absolute URL on ``baseUrl``.

    Args:
        baseUrl: Site root without a trailing slash or ``www.`` prefix,
            e.g. ``"http://pythonscraping.com"``.
        source: Raw ``src`` attribute value; may be absolute, ``www.``-prefixed
            or relative to the site root.

    Returns:
        The absolute URL with any leading ``www.`` removed, or ``None`` when
        the result does not live under ``baseUrl`` (an external resource).
    """
    if source.startswith("http://www."):
        url = "http://" + source[11:]
    elif source.startswith("http://"):
        url = source
    elif source.startswith("www."):
        # Drop the "www." and add the scheme. The previous code assigned
        # source[4:] and then overwrote it with "http://" + source, so the
        # prefix was never removed and the link was rejected below.
        url = "http://" + source[4:]
    else:
        # Relative link: strip leading slashes so "/img/a.png" does not turn
        # into "<baseUrl>//img/a.png".
        url = baseUrl + "/" + source.lstrip("/")
    # Prefix test rather than substring test: an external URL that merely
    # embeds baseUrl (e.g. in its query string) must not count as same-site.
    if not url.startswith(baseUrl):
        return None
    return url


def getDownloadPath(baseUrl, absoluteUrl, downloadDirectory):
    """Map an absolute URL to a local file path and create its parent folder.

    Args:
        baseUrl: Site root that is removed from ``absoluteUrl``.
        absoluteUrl: URL of the resource being saved.
        downloadDirectory: Local folder that receives the mirrored tree.

    Returns:
        ``downloadDirectory`` followed by the part of the URL after ``baseUrl``.
    """
    path = absoluteUrl.replace("www.", "")
    path = path.replace(baseUrl, "")
    path = downloadDirectory + path
    directory = os.path.dirname(path)
    if directory:
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(directory, exist_ok=True)
    return path


def main():
    """Fetch the front page and save each same-site image under downloadDirectory."""
    # Imported here so the URL/path helpers above stay importable (and
    # testable) without the third-party scraping packages installed.
    import requests
    from bs4 import BeautifulSoup

    html = requests.get("http://www.pythonscraping.com", timeout=30)
    bsObj = BeautifulSoup(html.content, "html.parser")
    # src=True skips <img> tags without a src attribute, which would
    # otherwise raise KeyError on download["src"].
    for download in bsObj.find_all("img", src=True):
        fileUrl = getAbsoluteURL(baseUrl, download["src"])
        if fileUrl is None:
            continue
        print(fileUrl)
        r = requests.get(fileUrl, allow_redirects=True, timeout=30)
        if not r.ok:
            # Do not store an HTTP error page as if it were the image.
            print("Skipping {}: HTTP {}".format(fileUrl, r.status_code))
            continue
        # Save inside downloadDirectory, mirroring the site's folder layout,
        # instead of writing the bare file name into the working directory.
        with open(getDownloadPath(baseUrl, fileUrl, downloadDirectory), "wb") as out_file:
            out_file.write(r.content)


if __name__ == "__main__":
    main()

# Original poster's note: "I made some corrections in the last 10 lines, but the
# problem now is that it completely omits the 'downloaded' folder and the
# getDownloadPath function."
# Resolved above: main() writes every file to the path that getDownloadPath()
# returns, so downloads land under downloadDirectory.