Posts: 14
Threads: 3
Joined: Oct 2023
from urllib.request import urlopen
from pathlib import Path

with open(r'D:\Desktop\sites.txt', 'r', encoding='UTF-8') as file:
    while line := file.readline():
        myurl = line.rstrip()
        myfold = myurl[8:10]
        myfn = myurl[8:12]
        myfilen = myfn + 'txt'
        Path("D:/Desktop/" + myfold).mkdir(parents=True, exist_ok=True)
        with urlopen( myurl ) as webpage:
            content = webpage.read().decode()
        with open("D:/Desktop/" + myfold + "/" + myfilen , "w" ) as output:
            output.write( content )

I have a list of urls named sites.txt
Each URL has the same length.
I am trying to loop through each site and
1. Take part of the url name and create a folder on the Desktop if it doesn't exist.
2. Take part of the url and create a filename.
3. Save the page source of each url as a text file within the corresponding folder
This seems to run in Python 3 without any errors, but if it's creating the txt files, they are somewhere else. It is not even creating the folders.
Sorry I am new to Python.
Thank you.
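For reference, a minimal sketch of those three steps (not the poster's final code: it assumes every URL looks like https://abc.com and derives the names from the hostname via urllib.parse instead of fixed slice offsets; the D:\Desktop destination and sites.txt path are taken from the post above):

from urllib.request import urlopen
from urllib.parse import urlparse
from pathlib import Path

base = Path(r'D:\Desktop')  # destination used in the post above

with open(r'D:\Desktop\sites.txt', encoding='UTF-8') as file:
    for line in file:
        myurl = line.strip()
        if not myurl:
            continue  # skip blank lines in sites.txt
        # 1. derive a folder name from the URL's hostname,
        #    e.g. 'abc' from 'https://abc.com'
        name = urlparse(myurl).hostname.split('.')[0]
        folder = base / name
        folder.mkdir(parents=True, exist_ok=True)
        # 2./3. fetch the page source and save it as <name>.txt in that folder
        with urlopen(myurl) as webpage:
            content = webpage.read().decode()
        (folder / (name + '.txt')).write_text(content, encoding='UTF-8')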
Posts: 74
Threads: 23
Joined: Mar 2024
Are you sure that:
D:\Desktop
is correct and it's not
C:\Desktop
You could be using the D drive, but seeing as how the C and D keys are right next to each other this may be a case of the typo that you just don't see... and would definitely explain why you have no idea where the files are going!
Posts: 14
Threads: 3
Joined: Oct 2023
(Jun-05-2024, 10:28 PM)sawtooth500 Wrote: Are you sure that:
D:\Desktop
is correct and it's not
C:\Desktop
You could be using the D drive, but seeing as how the C and D keys are right next to each other this may be a case of the typo that you just don't see... and would definitely explain why you have no idea where the files are going!
Of this I am positive. I set my Desktop up on my hard drive D: when I got my SSD, which only holds my OS and is set up as C:.
Posts: 14
Threads: 3
Joined: Oct 2023
When I run this, it actually will create the results for the first URL but not for the rest of them.
I made some very minor fixes to the code, but the behavior did not change.
from urllib.request import urlopen
from pathlib import Path

with open(r'D:\Desktop\sites.txt', 'r', encoding='UTF-8') as file:
    while line := file.readline():
        myurl = line.rstrip()
        myfold = myurl[9:11]
        myfn = myurl[9:13]
        myfilen = myfn + '.txt'
        Path("D:/Desktop/" + myfold).mkdir(parents=True, exist_ok=True)
        with urlopen(myurl) as webpage:
            content = webpage.read().decode()
        with open("D:/Desktop/" + myfold + "/" + myfilen , "w" ) as output:
            output.write(content)
Posts: 14
Threads: 3
Joined: Oct 2023
These are the errors I get. Since my code ends at line 15, I do not understand them.
C:\Users\luger\AppData\Local\Programs\Python\Python313\python.exe D:\Desktop\getsites.py
Traceback (most recent call last):
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 1318, in do_open
h.request(req.get_method(), req.selector, req.data, headers,
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\http\client.py", line 1319, in request
self._send_request(method, url, body, headers, encode_chunked)
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\http\client.py", line 1365, in _send_request
self.endheaders(body, encode_chunked=encode_chunked)
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\http\client.py", line 1314, in endheaders
self._send_output(message_body, encode_chunked=encode_chunked)
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\http\client.py", line 1074, in _send_output
self.send(msg)
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\http\client.py", line 1018, in send
self.connect()
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\http\client.py", line 1453, in connect
super().connect()
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\http\client.py", line 984, in connect
self.sock = self._create_connection(
^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\socket.py", line 828, in create_connection
for res in getaddrinfo(host, port, 0, SOCK_STREAM):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\socket.py", line 963, in getaddrinfo
for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
socket.gaierror: [Errno 11001] getaddrinfo failed
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "D:\Desktop\getsites.py", line 11, in <module>
with urlopen(myurl) as webpage:
^^^^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 189, in urlopen
return opener.open(url, data, timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 489, in open
response = self._open(req, data)
^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 506, in _open
result = self._call_chain(self.handle_open, protocol, protocol +
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 466, in _call_chain
result = func(*args)
^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 1366, in https_open
return self.do_open(http.client.HTTPSConnection, req,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 1321, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
Process finished with exit code 1
Posts: 6,779
Threads: 20
Joined: Feb 2020
Jun-06-2024, 01:30 PM
(This post was last modified: Jun-06-2024, 01:30 PM by deanhystad.)
Your program getsites.py has an error on line 11 that causes urlopen to fail. The error is 11001, getaddrinfo failed.
Error: Traceback (most recent call last):
File "D:\Desktop\getsites.py", line 11, in <module>
with urlopen(myurl) as webpage:
. . .
File "C:\Users\luger\AppData\Local\Programs\Python\Python313\Lib\urllib\request.py", line 1321, in do_open
raise URLError(err)
urllib.error.URLError: <urlopen error [Errno 11001] getaddrinfo failed>
Your url is invalid. getaddrinfo failed means the hostname in the URL could not be resolved to an address. That is not the same as the address not existing on the server (that is a 404 error); here the name lookup itself failed, usually because the URL is malformed or the host does not exist. Does your url start with "http://"? Are there invalid characters (control characters) in the url?
How did you make D:\Desktop\sites.txt?
From the urllib.request documentation:
Quote:urllib.request.urlopen(url, data=None, [timeout, ]*, cafile=None, capath=None, cadefault=False, context=None)
Open url, which can be either a string containing a valid, properly encoded URL, or a Request object.
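One way to find the bad line (a sketch, not from the thread: it reuses the sites.txt path from the posts above) is to catch URLError per URL and print the offending line's repr(), which makes hidden control characters visible and lets the loop continue past failures:

from urllib.request import urlopen
from urllib.error import URLError

with open(r'D:\Desktop\sites.txt', encoding='UTF-8') as file:
    for line in file:
        myurl = line.rstrip()
        try:
            with urlopen(myurl) as webpage:
                content = webpage.read().decode()
        except URLError as err:
            # repr() exposes stray control characters in the line
            print(f"Failed on {myurl!r}: {err}")
            continue
        print(f"Fetched {len(content)} characters from {myurl}")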
Posts: 14
Threads: 3
Joined: Oct 2023
I made them all https://
I am guessing I should change them to http://
Thank you.
Posts: 14
Threads: 3
Joined: Oct 2023
My sites.txt is just a list of urls like:
https://abc.com
https://def.com
https://ghi.com
etc.
and some of them do redirect to other URLs; I hope that's OK. I am just experimenting with Python right now.
Thanks.
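For what it's worth, redirects are fine: urlopen follows them automatically, and you can check where a request ended up. A small sketch, using one of the example URLs above:

from urllib.request import urlopen

# urlopen follows HTTP redirects by default; geturl() reports
# the final address after any redirects were followed.
with urlopen('https://abc.com') as webpage:
    print(webpage.geturl())  # final URL after redirects
    print(webpage.status)    # HTTP status code of the final response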
Posts: 7,313
Threads: 123
Joined: Sep 2016
Jun-06-2024, 07:24 PM
(This post was last modified: Jun-06-2024, 07:24 PM by snippsat.)
Some advice: you should use Requests and not urllib.
Also, saving all the HTML source for a site is usually not so useful; have a look at this recent thread, where BeautifulSoup is also used.
Something like this can do it; the code also uses pathlib's stem to avoid stuff like myfold = myurl[9:11].
>>> import pathlib
>>>
>>> url = 'https://abc.com'
>>> folder_name = pathlib.Path(url).stem
>>> folder_name
'abc'
>>> url = 'https://python-forum.io/'
>>> folder_name = pathlib.Path(url).stem
>>> folder_name
'python-forum'

import requests
from bs4 import BeautifulSoup
from pathlib import Path

class URLSiteSaver:
    def __init__(self, file_path, save_path):
        self.file_path = Path(file_path)
        self.save_path = Path(save_path)
        # Ensure the base save directory exists
        self.save_path.mkdir(parents=True, exist_ok=True)

    def fetch_save(self, url):
        try:
            response = requests.get(url)
            # Create directories from site names
            folder_name = Path(url).stem
            site_folder = self.save_path / folder_name
            site_folder.mkdir(parents=True, exist_ok=True)
            html_file_path = site_folder / 'index.html'
            # Save the HTML content to a file in the site's directory
            with html_file_path.open('wb') as html_file:
                html_file.write(response.content)
            print(f"Saved content to {site_folder}")
        except requests.exceptions.RequestException as e:
            print(f"Error fetching {url}: {e}")

    def process_urls(self):
        with self.file_path.open() as fp:
            for url in fp:
                url = url.strip()
                print(f"Processing URL: {url}")
                self.fetch_save(url)

if __name__ == '__main__':
    file_path = r'G:\div_code\myurls.txt'
    save_path = r'G:\div_code\urlsites'
    url_saver = URLSiteSaver(file_path, save_path)
    url_saver.process_urls()

Output: Processing URL: https://python-forum.io/
Saved content to G:\div_code\urlsites\python-forum
Processing URL: https://books.toscrape.com/
Saved content to G:\div_code\urlsites\books.toscrape
Posts: 1,090
Threads: 143
Joined: Jul 2017
@snippsat I always like your answers and save them often, so I can learn something.
Some web addresses lead down several folders.
I have a few webpage addresses from another recent question here, saved in a text file, but mixed with text.
I can get the web addresses easily.
An example is:
url = 'https://chipsandcheese.com/2024/06/03/intels-lion-cove-architecture-preview/'
folder_name = Path(url).stem

Which gives:
Output: folder_name
'intels-lion-cove-architecture-preview'
Problem?
I did not actually retrieve any text from the webpages, I think that is fairly easy with requests and BeautifulSoup.
I just looked at creating the folders and some text.
import re
from pathlib import Path

# these are urls mixed with text from a recent question here
urltext = Path(r'/home/pedro/tmp/some_urls.txt')  # urls mixed with text
with open(urltext, 'r') as infile:
    text = infile.read()
# the thing about urls, they can't/shouldn't contain spaces
# \S finds anything that is not whitespace
# ? causes the resulting RE to match 0 or 1 repetitions of the preceding RE. ab? will match either 'a' or 'ab'.
e = re.compile(r'(https?://\S+)')  # finds http or https followed by :// and anything not whitespace (- is not \w)
res = e.findall(text)  # returns a list of urls
# f finds // then non-whitespace up to a dot and the domain ending, optionally followed by / and the rest of the path
f = re.compile(r'//(\S+\.\w+)/?(\S+)?')  # <re.Match object; span=(6, 35), match='//github.com/Jana-Marie/ligra'>
savepath = Path(r'/home/pedro/temp/')
for r in res:
    myurl = r.rstrip()
    print(f'myurl = {myurl}')
    result = f.search(myurl)
    print(f'result.group(1) = {result.group(1)}')
    # some web addresses have multiple / which will cause problems when saving
    mylist = result.group(1).split('/')
    print(f'mylist = {mylist}')
    myfolder = mylist[0]
    print(f'myfolder = {myfolder}')
    if result.group(2):
        print(f'result.group(2) = {result.group(2)}')
        # get rid of forward slashes in result.group(2)
        # they will be mistaken for subfolders when saving
        clean_result = result.group(2).replace('/', '_')
        myfile_name = clean_result + '.txt'
        p = savepath / myfolder
        p.mkdir(parents=True, exist_ok=True)
        with open(p / myfile_name, 'w') as output:
            output.write(myurl)
    # if there is only a result.group(1)
    else:
        myfile_name = myfolder + '.txt'
        print(f'myfile_name = {myfile_name}')
        p = savepath / myfolder
        p.mkdir(parents=True, exist_ok=True)
        with open(p / myfile_name, 'w') as output:
            output.write(myurl)

That results in folders like this, each with a text file in them:
Output: /home/pedro/temp/phys.org
/home/pedro/temp/toaster.llc
/home/pedro/temp/arxiv.org
/home/pedro/temp/samcurry.net
Since github.com is in my list of webpages 4 times, the folder github.com contains 4 text files, 1 for each different web address
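For what it's worth, the standard library's urllib.parse can split the host and path without a hand-rolled regex; a small sketch under the same folder/file naming scheme (the sample URL is taken from the post above):

from urllib.parse import urlparse

url = 'https://chipsandcheese.com/2024/06/03/intels-lion-cove-architecture-preview/'
parts = urlparse(url)
myfolder = parts.netloc               # 'chipsandcheese.com'
path = parts.path.strip('/')          # drop leading/trailing slashes
# replace any remaining slashes so the path becomes a flat file name;
# fall back to the host when the URL has no path at all
myfile_name = (path.replace('/', '_') or myfolder) + '.txt'
print(myfolder, myfile_name)
# chipsandcheese.com 2024_06_03_intels-lion-cove-architecture-preview.txt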