Some advice: you should use Requests rather than urllib.
Also, saving all the HTML source for a site is usually not very useful; take a look at this recent thread, which also uses BeautifulSoup.
Something like the code below can do it. The code also uses pathlib's `stem`
to avoid slicing URLs by hand, like `myfold = myurl[9:11]`.
>>> import pathlib
>>> url = 'https://abc.com'
>>> folder_name = pathlib.Path(url).stem
>>> folder_name
'abc'
>>> url = 'https://python-forum.io/'
>>> folder_name = pathlib.Path(url).stem
>>> folder_name
'python-forum'
import requests
from bs4 import BeautifulSoup  # NOTE(review): imported but unused in this script
from pathlib import Path


class URLSiteSaver:
    """Read URLs from a text file and save each page's HTML to its own folder.

    Each URL gets a directory under ``save_path`` named after the URL's
    ``pathlib`` stem (e.g. ``https://python-forum.io/`` -> ``python-forum``),
    containing a single ``index.html`` with the raw response bytes.
    """

    def __init__(self, file_path, save_path):
        """Store paths and make sure the base save directory exists.

        :param file_path: text file with one URL per line.
        :param save_path: base directory that will hold one folder per site.
        """
        self.file_path = Path(file_path)
        self.save_path = Path(save_path)
        # Ensure the base save directory exists
        self.save_path.mkdir(parents=True, exist_ok=True)

    def fetch_save(self, url, timeout=30):
        """Fetch *url* and write its body to ``<save_path>/<stem>/index.html``.

        :param url: the URL to download.
        :param timeout: seconds before the request is aborted (default 30).
            Without a timeout, ``requests.get`` can hang indefinitely on a
            stalled server.
        """
        try:
            response = requests.get(url, timeout=timeout)
            # Fail on 4xx/5xx instead of silently saving an error page;
            # HTTPError is a subclass of RequestException, so the except
            # clause below still catches it.
            response.raise_for_status()
            # Derive the folder name from the site name, e.g.
            # 'https://abc.com' -> 'abc'
            folder_name = Path(url).stem
            site_folder = self.save_path / folder_name
            site_folder.mkdir(parents=True, exist_ok=True)
            html_file_path = site_folder / 'index.html'
            # Save the HTML content to a file in the site's directory.
            # Write raw bytes so the server's own encoding is preserved.
            with html_file_path.open('wb') as html_file:
                html_file.write(response.content)
            print(f"Saved content to {site_folder}")
        except requests.exceptions.RequestException as e:
            # Best-effort: report the failure and continue with other URLs.
            print(f"Error fetching {url}: {e}")

    def process_urls(self):
        """Iterate over the URL file and fetch/save every non-blank line."""
        with self.file_path.open() as fp:
            for url in fp:
                url = url.strip()
                if not url:
                    # Skip blank lines in the URL file.
                    continue
                print(f"Processing URL: {url}")
                self.fetch_save(url)


if __name__ == '__main__':
    file_path = r'G:\div_code\myurls.txt'
    save_path = r'G:\div_code\urlsites'
    url_saver = URLSiteSaver(file_path, save_path)
    url_saver.process_urls()
Output:
Processing URL: https://python-forum.io/
Saved content to G:\div_code\urlsites\python-forum
Processing URL: https://books.toscrape.com/
Saved content to G:\div_code\urlsites\books.toscrape