Posts: 218
Threads: 27
Joined: May 2018
May-20-2018, 01:04 PM
(This post was last modified: May-20-2018, 01:04 PM by eddywinch82.)
I need help finishing the following code. It is not all my own work; I am not that good at programming, but I had help with it yesterday:
from bs4 import BeautifulSoup
import requests, wget, re, zipfile, io

def get_zips(link_root, zips_suffix):
    # e.g. 'http://web.archive.org/web/20050315112710/http://www.projectai.com:80/libraries/repaints.php?ac=89&cat=6'
    zips_page = link_root + zips_suffix
    # print(zips_page)
    zips_source = requests.get(zips_page).text
    zip_soup = BeautifulSoup(zips_source, "html.parser")
    for zip_file in zip_soup.select('a[href*="download.php?fileid="]'):
        zip_url = link_root + zip_file['href']
        print('downloading', zip_file.text, '...')
        r = requests.get(zip_url)
        with open(zip_file.text, 'wb') as zipFile:
            zipFile.write(r.content)

def download_links(root, cat):
    url = ''.join([root, cat])
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    for zips_suffix in soup.select('a[href*="repaints.php?ac="]'):
        get_zips(root, zips_suffix['href'])

link_root = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/'
# Example category; need to read all categories from the first page into a list and iterate over them
category = 'acfiles.php?cat=6'
download_links(link_root, category)

This is the path for one of the aircraft categories: 'http://web.archive.org/web/20050315112710/http://www.projectai.com:80/libraries/repaints.php?ac=89&cat=6'
But there are several, where the last part of the path always has the form /repaints.php?ac=<two-digit number>&cat=6.
What do I need to type to download all the .zip files from http://web.archive.org/web/2005031511271...c=89&cat=6, without having to type each different ac=<two-digit number>&cat=6 path ending each time? Any help would be much appreciated.
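A minimal sketch of the idea, based on the pages quoted in this thread: first scrape every repaints.php?ac= link from the category page, then pass each one to get_zips, so no ac= number has to be typed by hand. The page layout is assumed from the snippets above.

from bs4 import BeautifulSoup
import requests

link_root = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/'
category = 'acfiles.php?cat=6'

# Collect every repaint-page link (repaints.php?ac=NN&cat=6) from the category page.
soup = BeautifulSoup(requests.get(link_root + category).text, 'html.parser')
repaint_pages = [a['href'] for a in soup.select('a[href*="repaints.php?ac="]')]
print(repaint_pages)  # one entry per aircraft, e.g. 'repaints.php?ac=89&cat=6'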
Posts: 7,268
Threads: 122
Joined: Sep 2016
Here are some hints.
from bs4 import BeautifulSoup
import requests

url = 'http://web.archive.org/web/20041114195147/http://www.projectai.com:80/libraries/repaints.php?ac=89&cat=6'
url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
td = soup.find_all('td', class_="text", colspan="2")

Look at td:
>>> td
[<td bgcolor="#FFFF99" class="text" colspan="2"><a href="download.php?fileid=6082">Texture.ABR-Air Contractors A30B_PW CONTRACT.zip</a> </td>,
<td bgcolor="#FFCC99" class="text" colspan="2"><a href="download.php?fileid=6177">Texture.AHK-Air Hong Kong A300B AIR HONG KONG.zip</a> </td>,
<td bgcolor="#FFFF99" class="text" colspan="2"><a href="download.php?fileid=6084">Texture.FPO-Europe Airpost A30B_GE FRENCH POST.zip</a> </td>,
<td bgcolor="#FFCC99" class="text" colspan="2"><a href="download.php?fileid=7223">Texture.HDA-Dragonair Cargo A30BGE DRAGONAIR.zip</a> </td>]
>>> for h in td:
...     h.a.get('href')
...
'download.php?fileid=6082'
'download.php?fileid=6177'
'download.php?fileid=6084'
'download.php?fileid=7223'

So now we have all the fileids for download. The rest of the URL stays the same, so fileid 6082 is one .zip file, and changing it to 6177 gives another .zip file.
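To turn those hrefs into actual downloads, a minimal sketch (the link root comes from the repaints URL above; saving under the link text follows the earlier code in this thread):

link_root = 'http://web.archive.org/web/20041114195147/http://www.projectai.com:80/libraries/'

for h in td:
    href = h.a.get('href')              # e.g. 'download.php?fileid=6082'
    name = h.a.text.strip()             # the link text is the original file name
    r = requests.get(link_root + href)  # fetch the archived .zip
    with open(name, 'wb') as f:
        f.write(r.content)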
Posts: 218
Threads: 27
Joined: May 2018
I have adapted the working code for a different flight-sim website link, but when I run the code there are no traceback errors, yet on investigation no .zip files appear to be downloading, unless they are? Have I gone wrong somewhere?
from bs4 import BeautifulSoup
import requests, wget, re, zipfile, io

def get_zips(link_root, zips_suffix):
    # e.g. 'http://web.archive.org/web/20050315112710/http://www.projectai.com:80/libraries/repaints.php?ac=89&cat=6'
    zips_page = link_root + zips_suffix
    # print(zips_page)
    zips_source = requests.get(zips_page).text
    zip_soup = BeautifulSoup(zips_source, "html.parser")
    for zip_file in zip_soup.select('a[href*="download_model.php?fileid="]'):
        zip_url = link_root + zip_file['href']
        print('downloading', zip_file.text, '...')
        r = requests.get(zip_url)
        with open(zip_file.text, 'wb') as zipFile:
            zipFile.write(r.content)

def download_links(root, cat):
    url = ''.join([root, cat])
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    td = soup.find_all('td', class_="text", colspan="2", bgcolour="#FFFF99", href="download_model.php?fileid=")
    for h in td:
        h.a.get('href')

link_root = 'http://web.archive.org/web/20050308033321/http://www.projectai.com:80/packages/fde.php'
Posts: 7,268
Threads: 122
Joined: Sep 2016
May-20-2018, 08:28 PM
(This post was last modified: May-20-2018, 08:29 PM by snippsat.)
You don't call the functions, so nothing will happen.
The download_links function is never called with its root and cat arguments, and it doesn't return anything.
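A minimal sketch of the missing calls at the bottom of the script (splitting fde.php off into its own cat argument is an assumption, following the earlier working version):

link_root = 'http://web.archive.org/web/20050308033321/http://www.projectai.com:80/packages/'
category = 'fde.php'
download_links(link_root, category)  # without this call, nothing runs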
Posts: 218
Threads: 27
Joined: May 2018
I see what you mean. I tried category = 'fde.php' instead of category = 'acfiles.php?cat=6', but as it wasn't a category it didn't download the .zip files. If cat stands for category, what word should be used for .php page names?
Also, I have installed the Axel download accelerator into Python, but I am not sure what to type to make it speed up all the .zip file downloads when I run the modules. I found the following on a website:

from axel import axel

# Download http://someurl/file.zip with 500 parallel connections
file_path = axel('http://someurl/file.zip', num_connections=500)
Posts: 218
Threads: 27
Joined: May 2018
from bs4 import BeautifulSoup
import requests, wget, re, zipfile, io

def get_zips(link_root, zips_suffix):
    # e.g. 'http://web.archive.org/web/20050315112710/http://www.projectai.com:80/libraries/repaints.php?ac=89&cat=6'
    zips_page = link_root + zips_suffix
    # print(zips_page)
    zips_source = requests.get(zips_page).text
    zip_soup = BeautifulSoup(zips_source, "html.parser")
    for zip_file in zip_soup.select('a[href*="download.php?fileid="]'):
        zip_url = link_root + zip_file['href']
        print('downloading', zip_file.text, '...')
        r = requests.get(zip_url)
        with open(zip_file.text, 'wb') as zipFile:
            zipFile.write(r.content)

def download_links(root, cat):
    url = ''.join([root, cat])
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")
    td = soup.find_all('td', class_="text", colspan="2", bgcolour="#FFFF99", href="download.php?fileid=")
    for h in td:
        h.a.get('href')
    for zips_suffix in soup.select('a[href*="repaints.php?ac="]'):
        get_zips(root, zips_suffix['href'])

link_root = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/'
# Example category; need to read all categories from the first page into a list and iterate over them
category = 'acfiles.php?cat=6'
download_links(link_root, category)
Posts: 7,268
Threads: 122
Joined: Sep 2016
(May-20-2018, 11:31 PM)eddywinch82 Wrote: need to read all categories from first page into a list and iterate categories

Example:
from bs4 import BeautifulSoup
import requests

url = 'http://web.archive.org/web/20041225023002/http://www.projectai.com:80/libraries/acfiles.php?cat=6'
url_get = requests.get(url)
soup = BeautifulSoup(url_get.content, 'lxml')
td = soup.find_all('td', width="50%")
all_plains = [link.find('a').get('href') for link in td]
print(all_plains)

eddywinch82 Wrote: Also I have installed the Axel Download accelerator into Python, but I am not sure what to type to make it speed up all the .zip file downloads, when I run the modules?

I have not heard of Axel.
I write my own code for this, but it's a more advanced topic.
I can show an example with concurrent.futures, which I like to use for this.
import requests
import concurrent.futures

def download(number_id):
    a_zip = 'http://web.archive.org/web/20041205075703/http://www.projectai.com:80/packages/download_model.php?eula=1&fileid={}'.format(number_id)
    with open('{}.zip'.format(number_id), 'wb') as f:
        f.write(requests.get(a_zip).content)

if __name__ == '__main__':
    file_id = list(range(1, 50))
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        for number_id in file_id:
            executor.submit(download, number_id)

Without it, the download takes 4-5 minutes; now it takes about 30 seconds.
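Since downloading is I/O-bound rather than CPU-bound, a thread pool usually gives the same speed-up without the multiprocessing machinery. A minimal variant of the same script, assuming the same archived URL pattern:

import requests
import concurrent.futures

def download(number_id):
    a_zip = ('http://web.archive.org/web/20041205075703/'
             'http://www.projectai.com:80/packages/download_model.php?eula=1&fileid={}'.format(number_id))
    with open('{}.zip'.format(number_id), 'wb') as f:
        f.write(requests.get(a_zip).content)

if __name__ == '__main__':
    # Threads, not processes: each download just waits on the network,
    # so ThreadPoolExecutor avoids process-queue and pickling issues.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        executor.map(download, range(1, 50))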
Posts: 218
Threads: 27
Joined: May 2018
May-21-2018, 12:34 PM
(This post was last modified: May-21-2018, 12:37 PM by eddywinch82.)
Error: Exception in thread Thread-1:
Traceback (most recent call last):
File "C:\Python34\Lib\threading.py", line 920, in _bootstrap_inner
self.run()
File "C:\Python34\Lib\threading.py", line 868, in run
self._target(*self._args, **self._kwargs)
File "C:\Python34\Lib\concurrent\futures\process.py", line 251, in _queue_management_worker
shutdown_worker()
File "C:\Python34\Lib\concurrent\futures\process.py", line 209, in shutdown_worker
call_queue.put_nowait(None)
File "C:\Python34\Lib\multiprocessing\queues.py", line 131, in put_nowait
return self.put(obj, False)
File "C:\Python34\Lib\multiprocessing\queues.py", line 82, in put
raise Full
queue.Full
I get that traceback, and then the .zip files start downloading at the normal speed, not quicker. What does the traceback text mean? Eddie
Also, for one of the website links I have the following code:

from bs4 import BeautifulSoup
import requests, zipfile, io, concurrent.futures

def download(number_id):
    a_zip = 'http://web.archive.org/web/20050301025710//http://www.projectai.com:80/packages/download_model.php?eula=1&fileid={}'.format(number_id)
    with open('{}.zip'.format(number_id), 'wb') as f:
        f.write(requests.get(a_zip).content)

if __name__ == '__main__':
    file_id = list(range(1, 50))
    with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        for number_id in file_id:
            executor.submit(download, number_id)

def get_zips(link_root, zips_suffix):
    # e.g. 'http://web.archive.org/web/20050315112710/http://www.projectai.com:80/libraries/repaints.php?ac=89&cat=6'
    zips_page = link_root + zips_suffix
    # print(zips_page)
    zips_source = requests.get(zips_page).text
    zip_soup = BeautifulSoup(zips_source, "html.parser")
    for zip_file in zip_soup.select('a[href*="download.php?fileid="]'):
        zip_url = link_root + zip_file['href']
        print('downloading', zip_file.text, '...')
        r = requests.get(zip_url)
        with open(zip_file.text, 'wb') as zipFile:
            zipFile.write(r.content)

def download_links(root, cat):
    url = ''.join([root, cat])
    source_code = requests.get(url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text, "html.parser")

link_root = 'http://web.archive.org/web/20050301025710/http://www.projectai.com:80/packages/'
category = 'fde.php'
download_links(link_root, category)

But the .zip files are not being saved with the proper .zip file names; they are being saved as 49.zip, 50.zip, 51.zip, etc., and they say 0 bytes. Or is that because they haven't finished downloading? Eddie
Posts: 8,113
Threads: 158
Joined: Sep 2016
(May-21-2018, 12:34 PM)eddywinch82 Wrote: but .zip files are not being saved with the proper .zip file name, they are being saved as 49.zip, 50.zip, 51.zip etc. and they say 0 bytes. Or is that because they haven't finished downloading? Eddie

No, it's because that is how you construct the file name in the download function: the open('{}.zip'.format(number_id), 'wb') call names each file after its fileid number.
Posts: 218
Threads: 27
Joined: May 2018
What do I need to type so that the files download with their proper .zip file names?
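A hedged sketch of one way to do it: instead of generating fileid numbers blindly, scrape the download links from the listing page and save each file under its link text, which (per the td listing earlier in this thread) is the original file name. That the fde.php page layout matches the libraries pages is an assumption; adjust the selector if it differs.

from bs4 import BeautifulSoup
import requests

link_root = 'http://web.archive.org/web/20050301025710/http://www.projectai.com:80/packages/'
page = requests.get(link_root + 'fde.php').text
soup = BeautifulSoup(page, 'html.parser')

for a in soup.select('a[href*="download_model.php?fileid="]'):
    name = a.text.strip()  # assumed: the link text is the original .zip file name
    r = requests.get(link_root + a['href'])
    with open(name, 'wb') as f:
        f.write(r.content)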