May-20-2018, 11:31 PM
from bs4 import BeautifulSoup
import requests


def get_zips(link_root, zips_suffix):
    """Download every zip file linked from one repaints listing page.

    link_root   -- base URL the relative download links are joined onto
    zips_suffix -- page path + query string appended to link_root to form
                   the listing-page URL

    Side effects: writes each downloaded zip to the current directory,
    named after the link's visible text.
    """
    # Example listing page:
    # 'http://web.archive.org/web/20050315112710/http://www.projectai.com:80/libraries/repaints.php?ac=89&cat=6'
    zips_page = link_root + zips_suffix
    zips_source = requests.get(zips_page).text
    zip_soup = BeautifulSoup(zips_source, "html.parser")
    # Attribute values containing '?' / '=' / '.' must be quoted in a CSS
    # selector; unquoted, soupsieve rejects it with SelectorSyntaxError.
    for zip_file in zip_soup.select('a[href*="download.php?fileid="]'):
        zip_url = link_root + zip_file['href']
        print('downloading', zip_file.text, '...',)
        r = requests.get(zip_url)
        # NOTE(review): the link text is used verbatim as the local filename;
        # assumes it is a plain, safe filename (no path separators) — confirm.
        with open(zip_file.text, 'wb') as zip_out:
            zip_out.write(r.content)


def download_links(root, cat):
    """Fetch one category page and download zips from every repaints sub-page.

    root -- base URL of the archived library site
    cat  -- category page path + query string appended to root
    """
    url = ''.join([root, cat])
    plain_text = requests.get(url).text
    soup = BeautifulSoup(plain_text, "html.parser")
    # Each matching anchor points at a repaints listing page; quoted selector
    # for the same reason as in get_zips.
    for zips_suffix in soup.select('a[href*="repaints.php?ac="]'):
        get_zips(root, zips_suffix['href'])


link_root = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/'
# Example category; TODO: read all categories from the first page into a list
# and iterate over them.
category = 'acfiles.php?cat=6'
download_links(link_root, category)