May-22-2018, 04:51 PM
Also, with my code you can take it in steps; there is no need to download all 72 planes in one go.
Because of the `islice` method applied to the `yield` generator, you can start wherever you want.
The code below takes the first 10 planes.
import os
from itertools import islice

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm, trange

# Seconds to wait on any single HTTP request before giving up, so one
# stalled connection cannot hang the whole run.
REQUEST_TIMEOUT = 30


def all_planes():
    """Generate the page URL of every plane listed on the category index.

    The index page is fetched once; the links found in its ``width="50%"``
    table cells are then yielded lazily, one absolute URL at a time, wrapped
    in a tqdm progress bar.  Because this is a generator, callers can take
    any slice of it with ``itertools.islice``.

    Yields:
        str: URL of one plane page.

    Raises:
        requests.HTTPError: If the index page answers with an error status.
    """
    url = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/acfiles.php?cat=6'
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Without the index there is nothing to iterate, so fail loudly here.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')

    plane_links = []
    for cell in soup.find_all('td', width="50%"):
        anchor = cell.find('a')
        # Cells without a usable link would otherwise raise AttributeError.
        if anchor is not None and anchor.get('href'):
            plane_links.append(anchor.get('href'))

    page_url = 'http://web.archive.org/web/20041114195147/http://www.projectai.com:80/libraries/{}'
    for ref in tqdm(plane_links):
        yield page_url.format(ref)


def download(all_planes, start=0, stop=10):
    """Download the zip files of a slice of the planes.

    Each selected plane page is fetched, and every ``td.text`` cell with
    ``colspan="2"`` that contains a link is saved to the current directory,
    using the cell text as the file name.

    Args:
        all_planes: Zero-argument callable returning an iterable of plane
            page URLs (for example the ``all_planes`` generator function).
        start: Index of the first plane to process.  Defaults to 0.
        stop: Index one past the last plane to process; ``None`` runs to
            the end.  Defaults to 10, i.e. the first 10 planes.
    """
    zip_url = 'http://web.archive.org/web/20041108022719/http://www.projectai.com:80/libraries/download.php?fileid={}'
    # Tip: ``next(all_planes())`` gives just the first URL for a quick test.
    for plane_url in islice(all_planes(), start, stop):
        page = requests.get(plane_url, timeout=REQUEST_TIMEOUT)
        if not page.ok:
            # Skip this plane but keep going with the rest of the batch.
            tqdm.write('Skipping {}: HTTP {}'.format(plane_url, page.status_code))
            continue
        soup = BeautifulSoup(page.content, 'lxml')

        for item in tqdm(soup.find_all('td', class_="text", colspan="2")):
            anchor = item.find('a')
            if anchor is None or not anchor.get('href'):
                continue
            # The name comes from a scraped page: trim whitespace and drop
            # any directory part so the file always lands in the cwd.
            zip_name = os.path.basename(item.text.strip())
            if not zip_name:
                continue
            zip_number = anchor.get('href').split('=')[-1]

            # Fetch before opening the file, so a failed request neither
            # leaves an empty file nor stores an error page as a ".zip".
            archive = requests.get(zip_url.format(zip_number), timeout=REQUEST_TIMEOUT)
            if not archive.ok:
                tqdm.write('Skipping {}: HTTP {}'.format(zip_name, archive.status_code))
                continue
            with open(zip_name, 'wb') as f_out:
                f_out.write(archive.content)


if __name__ == '__main__':
    download(all_planes)

# As an example, the next 20 planes:
# islice's stop index is exclusive: 10..30 selects exactly the next 20 planes
# (indices 10 through 29); a stop of 31 would yield 21.
how_many_planes = islice(all_planes(), 10, 30)