Python Forum

import os
import urllib.request

# Top-level folder that is scanned for HTML files
path = r"C:\Users\The Capricorn\Documents\Html"


for entry in os.listdir(path):
    # Full path of the current entry inside `path`
    entry_path = os.path.join(path, entry)

    if not entry_path.endswith('.html'):
        # Every entry without the .html suffix is listed as if it were a
        # sub-folder, and only its direct children are inspected.
        for inner in os.listdir(entry_path):
            inner_path = os.path.join(entry_path, inner)
            if not inner_path.endswith('.html'):
                continue
            print(inner_path)
            print('Reading', inner, '....')
            html = open(inner_path, 'r').read()
            if html:
                print('Successfully fetched Html')
        continue

    # An HTML file located directly inside `path`
    print(entry_path)
    print('Reading', entry, '....')
    html = open(entry_path, 'r').read()
    if html:
        print('Successfully fetched Html')

This code fetches local HTML files from a directory. It works fine when path contains no sub-folders or only a single level of sub-folders, but not when there are folders nested inside those sub-folders, and it raises an error if files with an extension other than .html are present inside path. What should I do to correct this?

Use os.walk(), which recursively walks the whole directory tree.
Example:

import os

# Only the file names of each visited folder are needed here, so the
# folder path and sub-folder list are ignored.
for _folder, _subfolders, names in os.walk(r'E:\1\web_title'):
    for name in (n for n in names if n.endswith('.html')):
        print(name)

Output:
Pipfile_1.html
Pipfile_2.html
Pipfile_3.html

Join the folder path and the file name together with os.path.join() and you can see that these files are in nested sub-folders.

import os

# Prefix every matching file name with the folder it was found in, so the
# printed path shows how deeply the file is nested.
for folder, _subfolders, names in os.walk(r'E:\1\web_title'):
    html_names = [name for name in names if name.endswith('.html')]
    for name in html_names:
        print(os.path.join(folder, name))

Output:
E:\1\web_title\Pipfile_1.html
E:\1\web_title\New folder\Pipfile_2.html
E:\1\web_title\New folder\New folder\New folder\Pipfile_3.html

The pathlib module has been available since Python 3.4.
It gives you a better abstraction for file-system paths.
Written as a generator:

import os
import pathlib


def find_by_ext(root, suffix):
    """Yield a ``pathlib.Path`` for every file below *root* ending in *suffix*.

    Args:
        root: Directory where the recursive search starts.
        suffix: File extension including the leading dot, e.g. ``'.html'``.
            The comparison is exact and case-sensitive.

    Yields:
        One ``pathlib.Path`` per matching file, in ``os.walk()`` order.
    """
    # A separate loop name keeps the ``root`` argument from being overwritten.
    for folder, _dirs, files in os.walk(root):
        for file in files:
            path = pathlib.Path(folder, file)
            # Path.suffix is only the last extension ('.gz' for 'a.tar.gz').
            if path.suffix == suffix:
                yield path

The argument root is the starting point.
The suffix should be '.html' in your case.
The generator yields a Path object on each iteration.

To get the same behaviour, you can write a second function, which
iterates over the generator:

def open_all_html(root):
    """Read every ``.html`` file below *root* and report the outcome.

    Files are located with ``find_by_ext(root, '.html')``.  One line is
    printed per file: a success message, or the error if the file could not
    be opened or read.

    Args:
        root: Directory where the recursive search starts.
    """
    for file in find_by_ext(root, '.html'):
        try:
            # The with-block closes the handle again; a bare file.open()
            # would leave one open handle behind for every file found.
            # errors='ignore' drops bytes that are not valid UTF-8.
            with file.open('r', encoding='utf-8', errors='ignore') as handle:
                data = handle.read()
        except OSError as error:
            print('Could not open file {}. Error: {}'.format(file, error))
        else:
            print('Successfully opened file {}.'.format(file))
            # normally you do something with the data
            # this can also be put into a extra function

Calling it:

# 'Downloads/' is a relative path, so it is resolved against the current
# working directory.
open_all_html('Downloads/')

Output:
Successfully opened file Downloads/asterisk-15.2.2/asterisk-15.2.2-summary.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjmedia/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjmedia/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjnath/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjnath/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib-util/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib-util/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjsip/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjsip/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib/docs/footer.html.
Successfully opened file Downloads/asterisk-15.2.2/third-party/pjproject/source/pjlib/docs/header.html.
Successfully opened file Downloads/asterisk-15.2.2/static-http/mantest.html.
Successfully opened file Downloads/asterisk-15.2.2/static-http/ajamdemo.html.
Successfully opened file Downloads/skyradar-gui/ui/index.html.

from glob import glob

# recursive=True only has an effect when the pattern contains '**', which
# matches the current folder and every folder below it.
htmls = glob('**/*.htm*', recursive=True)

shiva

snippsat

DeaD_EyE

wavic