Python Forum

Any ideas/tips on how to better optimize this code or better and cleaner than it is?

import requests
import argparse
from urllib.request import urlparse, urljoin
from bs4 import BeautifulSoup

# initialize the set of links (unique links)
internal_urls = set()
external_urls = set()

total_urls_visited = 0

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    """
    Returns all URLs that is found on `url` in which it belongs to the same website
    """
    # all URLs of `url`
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.findAll("a"):
        href = a_tag.attrs.get("href")
        if href == "" or href is None:
            # href empty tag
            continue
        # join the URL if it's relative (not absolute link)
        href = urljoin(url, href)
        parsed_href = urlparse(href)
        # remove URL GET parameters, URL fragments, etc.
        href = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href):
            # not a valid URL
            continue
        if href in internal_urls:
            # already in the set
            continue
        if domain_name not in href:
            # external link
            if href not in external_urls:
                print(f"External link: {href}")
                external_urls.add(href)
            continue
        print(f"Internal link: {href}")
        urls.add(href)
        internal_urls.add(href)
    return urls


def crawl(url, max_urls=50):
    """
    Crawls a web page and extracts all links.
    You'll find all links in `external_urls` and `internal_urls` global set variables.
    params:
        max_urls (int): number of max urls to crawl, default is 30.
    """
    global total_urls_visited
    total_urls_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_urls_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
    
    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("Total Internal links:", len(internal_urls))
    print("Total External links:", len(external_urls))
    print("Total URLs:", len(external_urls) + len(internal_urls))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_urls:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_urls:
            print(external_link.strip(), file=f)

I don't know if it is better, but I like to use generic tools for iteration or graph traversal. In this case, you are doing a breadth first walk of a graph of pages linked by internal hrefs. I wrote a small code for generic breadth first walk. It can be reused here. It makes the code more robust by removing a reinvention of the wheel and it separates the concerns of parsing html from that of walking the graph.

from breadthfirst import breadth_first
from bs4 import BeautifulSoup
from itertools import islice
import requests
from urllib.request import urlparse, urljoin
 
__version__ = '2020.07.13'

def is_valid(url):
    """
    Checks whether `url` is a valid URL.
    """
    parsed = urlparse(url)
    return bool(parsed.netloc and parsed.scheme)


class Crawler:

    def crawl(self, url, max_url=30):
        self.root = url
        self.domain_name = urlparse(url).netloc
        self.external_links = set()
        for x in islice(
            breadth_first(self.root, self.internal_hrefs), 0, max_url):
            print(x)

    def internal_hrefs(self, url):
        """Yield internal hrefs found in url"""
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for a_tag in soup.findAll("a"):
            href = a_tag.attrs.get("href")
            if not href:
                # href empty tag
                continue
            # join the URL if it's relative (not absolute link)
            href = urljoin(url, href)
            parsed_href = urlparse(href)
            # remove URL GET parameters, URL fragments, etc.
            href = (parsed_href.scheme
                        + "://" + parsed_href.netloc + parsed_href.path)
            if not is_valid(href):
                continue
            elif self.is_internal(href):
                yield href
            else:
                self.external_links.add(href)
                
    def is_internal(self, link):
        return self.domain_name in link

if __name__ == '__main__':
    Crawler().crawl('https://www.google.com')

This code is not complete, I removed the argparse part...

Output:https://www.google.com
https://www.google.com/preferences
https://www.google.com/advanced_search
https://www.google.com/intl/fr/ads/
https://www.google.com/services/
https://www.google.com/intl/fr/about.html
https://www.google.com/setprefdomain
https://www.google.com/intl/fr/policies/privacy/
https://www.google.com/intl/fr/policies/terms/
https://www.google.com/webhp
https://www.google.com/support/websearch
https://www.google.com/history/optout
https://www.google.com/calendar
https://www.google.com/chrome/
https://www.google.com/url
https://www.google.com/
https://www.google.com/intl/fr/home/
https://www.google.com/intl/fr/home/how-it-works/
https://www.google.com/intl/fr/home/pricing/
https://www.google.com/intl/fr/home/faq/
https://www.google.com/intl/fr/home/resources/advanced/
https://www.google.com/intl/en/retail/solutions/shopping-campaigns/
https://www.google.com/intl/en/business/
https://www.google.com/intl/en/chrome/
https://www.google.com/intl/en/services/
https://www.google.com/intl/en/adsense/start/
https://www.google.com/intl/fr_fr/admob/
https://www.google.com/intl/fr/
https://www.google.com/intl/fr/impact/
https://www.google.com/intl/fr/products/

Note that the program considers https://www.google.com and https://www.google.com/ as two different urls because the latter has a terminating slash. Something should perhaps be done about this.

samlee916

Gribouillis