import os
import urllib.request
# Directory that holds the HTML files (and, at most, one level of sub-folders).
path = r"C:\Users\The Capricorn\Documents\Html"


def _report_html(location, name):
    """Print *location*, read the file there and report when it has content."""
    print(location)
    print('Reading', name, '....')
    html = open(location, 'r').read()
    if html:
        print('Successfully fetched Html')


for entry in os.listdir(path):
    entry_path = os.path.join(path, entry)
    if entry_path.endswith('.html'):
        # An HTML file sitting directly in the top-level directory.
        _report_html(entry_path, entry)
        continue
    # Every other entry is listed as if it were a sub-folder, one level deep.
    for child in os.listdir(entry_path):
        child_path = os.path.join(entry_path, child)
        if child_path.endswith('.html'):
            _report_html(child_path, child)
This code fetches the local HTML files in a directory. It works fine when
path contains no sub-folders or only a single level of sub-folders, but not when there are folders nested inside those sub-folders. It also raises an error if files with an extension other than .html are present directly inside
path. What should I do to correct this?
You can use
os.walk(), which walks the whole directory tree recursively.
Example:
import os
# Walk the whole tree and print the bare name of every .html file found.
for _folder, _subfolders, names in os.walk(r'E:\1\web_title'):
    for name in names:
        if not name.endswith('.html'):
            continue
        print(name)
Output:
Pipfile_1.html
Pipfile_2.html
Pipfile_3.html
Join the directory and the file name together, and you can see that these files are in nested sub-folders.
import os
# Walk the whole tree and print each .html file joined with its folder.
for folder, _subfolders, names in os.walk(r'E:\1\web_title'):
    html_names = [name for name in names if name.endswith('.html')]
    for name in html_names:
        print(os.path.join(folder, name))
Output:
E:\1\web_title\Pipfile_1.html
E:\1\web_title\New folder\Pipfile_2.html
E:\1\web_title\New folder\New folder\New folder\Pipfile_3.html
The pathlib module has been available since Python 3.4.
It gives you a better abstraction for working with paths.
Here is the search written as a generator:
def find_by_ext(root, suffix):
    """Yield a ``pathlib.Path`` for every file under *root* ending in *suffix*.

    Args:
        root: Directory where the recursive search starts.
        suffix: Extension to match, including the leading dot (e.g. ``'.html'``).

    Yields:
        One ``Path`` per matching file, in ``os.walk`` order.
    """
    # Imported locally so the function runs on its own; pathlib was never
    # imported by the surrounding code.
    import os
    import pathlib

    # A separate loop name keeps the ``root`` argument from being overwritten.
    for folder, _dirs, files in os.walk(root):
        for file in files:
            path = pathlib.Path(folder, file)
            if path.suffix == suffix:
                yield path
The argument root is the starting point of the search.
The suffix should be '.html' in your case.
On each iteration, the generator yields a Path object.
To get the same behaviour as your code, you can write a second function that
iterates over the generator:
def open_all_html(root):
    """Read every ``.html`` file found under *root* and report the outcome.

    Args:
        root: Directory where the recursive search starts.

    Prints one line per file: a success message, or the error that prevented
    the file from being opened or read.
    """
    for file in find_by_ext(root, '.html'):
        try:
            # The context manager closes the handle right after reading, so
            # walking a large tree does not accumulate open files.
            with file.open('r', encoding='utf-8', errors='ignore') as handle:
                data = handle.read()
        except OSError as error:
            print('Could not open file {}. Error: {}'.format(file, error))
        else:
            print('Successfully opened file {}.'.format(file))
            # normally you do something with the data
            # this can also be put into a extra function
Calling it:
open_all_html('Downloads/')
Output:
Successfully opened file Downloads/asterisk-15.2.2/asterisk-15.2.2-summary.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjmedia/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjmedia/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjnath/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjnath/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib-util/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib-util/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjsip/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjsip/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/static-http/mantest.html.
Successfully opened file Downloads/asterisk-15.2.2/static-http/ajamdemo.html.
Successfully opened file Downloads/skyradar-gui/ui/index.html.
from glob import glob
# recursive=True only descends into sub-folders when the pattern contains
# '**'; without it just the current directory is searched.
htmls = glob('**/*.htm*', recursive=True)