You can try this; I looked at downloading all the .zip files for all planes.
I let it run for about 5 minutes and had no errors.
So if this is a one-time operation, it may not be worth looking into concurrent.futures as I showed before.
Take a break for a couple of hours, and see if you have gotten all the zip files.
from bs4 import BeautifulSoup
import requests


def all_planes():
    '''Generate url links for all planes'''
    url = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/acfiles.php?cat=6'
    url_get = requests.get(url)
    soup = BeautifulSoup(url_get.content, 'lxml')
    # Each plane sits in a <td width="50%"> cell; pull the link out of each
    td = soup.find_all('td', width="50%")
    plain_link = [link.find('a').get('href') for link in td]
    for ref in plain_link:
        url_file_id = 'http://web.archive.org/web/20041114195147/http://www.projectai.com:80/libraries/{}'.format(ref)
        yield url_file_id


def download(all_planes):
    '''Download the .zip files for one plane; fed all the urls it downloads the .zip files for every plane'''
    # A_300 = next(all_planes())  # Test with first link only
    for plane_url in all_planes():
        url_get = requests.get(plane_url)
        soup = BeautifulSoup(url_get.content, 'lxml')
        td = soup.find_all('td', class_="text", colspan="2")
        zip_url = 'http://web.archive.org/web/20041108022719/http://www.projectai.com:80/libraries/download.php?fileid={}'
        for item in td:
            zip_name = item.text
            # The fileid is the last part of the href, after the '='
            zip_number = item.find('a').get('href').split('=')[-1]
            with open(zip_name, 'wb') as f_out:
                down_url = requests.get(zip_url.format(zip_number))
                f_out.write(down_url.content)


if __name__ == '__main__':
    download(all_planes)
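If you do end up rerunning the whole download and the sequential version feels too slow, here is a minimal sketch of the concurrent.futures approach I mentioned. It reuses the all_planes() generator from the script above; fetch_zip and all_zip_links are names I made up for this sketch, and max_workers=5 is just a guess at a polite level of concurrency for the Wayback Machine.

from concurrent.futures import ThreadPoolExecutor

import requests
from bs4 import BeautifulSoup

ZIP_URL = 'http://web.archive.org/web/20041108022719/http://www.projectai.com:80/libraries/download.php?fileid={}'


def fetch_zip(zip_name, zip_number):
    '''Download a single .zip file given its display name and fileid'''
    down_url = requests.get(ZIP_URL.format(zip_number))
    with open(zip_name, 'wb') as f_out:
        f_out.write(down_url.content)


def all_zip_links():
    '''Yield (zip_name, zip_number) pairs scraped from every plane page'''
    for plane_url in all_planes():  # all_planes() from the script above
        url_get = requests.get(plane_url)
        soup = BeautifulSoup(url_get.content, 'lxml')
        for item in soup.find_all('td', class_="text", colspan="2"):
            yield item.text, item.find('a').get('href').split('=')[-1]


if __name__ == '__main__':
    # submit() queues each download on the thread pool; the with block
    # waits for all queued downloads to finish before exiting
    with ThreadPoolExecutor(max_workers=5) as executor:
        for zip_name, zip_number in all_zip_links():
            executor.submit(fetch_zip, zip_name, zip_number)

Threads work well here because the job is I/O-bound (waiting on the server), but keep the worker count low or the archive may start throttling you.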