Python Forum

import requests
import argparse
import os
from urllib.request import urlparse, urljoin
from bs4 import BeautifulSoup

# Shared crawl state; the functions below mutate these in place.
urls = []                  # internal links in discovery order (doubles as the crawl queue)
total_links_visited = 0    # number of pages crawl() has processed so far
external_links = set()     # unique links pointing to other sites
internal_links = set()     # unique links pointing to the crawled site

#function to check if url is valid
def is_valid(url):
   """Return True when *url* has both a scheme and a network location.

   Relative paths such as "/about" and empty strings yield False. Strings
   that urlparse rejects outright (for example an unbalanced IPv6 bracket,
   "http://[::1") are reported as invalid instead of raising ValueError.
   """
   try:
      parsed = urlparse(url)
   except ValueError:
      # urlparse raises on malformed netlocs; for a validity check that simply means "no"
      return False
   return bool(parsed.netloc) and bool(parsed.scheme)

#this function finds and prints out the internal and external links
def get_all_website_links(url, timeout=10):
   """Download *url* and record every new link found in its <a href> tags.

   Each href is resolved against *url*, validated with is_valid(), and then
   reduced to scheme://netloc/path (query string and fragment are dropped).
   A link is internal when its host equals the host of *url* or is a
   subdomain of it; every other link is external. New internal links are
   appended to the module-level ``urls`` list and added to
   ``internal_links``; new external links are added to ``external_links``.
   Every newly recorded link is printed.

   Args:
      url: Absolute URL of the page to download.
      timeout: Seconds to wait for the server before giving up.

   Returns:
      None. If the page cannot be downloaded a message is printed and no
      links are recorded.
   """
   page_host = urlparse(url).hostname or ""
   try:
      response = requests.get(url, timeout=timeout)
   except requests.RequestException as exc:
      # one unreachable page should not abort the whole crawl
      print(f"Failed to fetch {url}: {exc}")
      return
   soup = BeautifulSoup(response.content, "html.parser")
   for a_tag in soup.find_all("a"):
      href_tag = a_tag.attrs.get("href")
      if not href_tag:
         continue
      try:
         href_tag = urljoin(url, href_tag)
         # validate BEFORE rebuilding: gluing "://" onto mailto:/javascript:/tel:
         # hrefs would turn them into strings that look like real URLs
         if not is_valid(href_tag):
            continue
         parsed_href = urlparse(href_tag)
         link_host = parsed_href.hostname or ""
      except ValueError:
         # malformed href (e.g. a broken IPv6 literal); nothing useful to record
         continue
      href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
      # compare whole host names; a substring test would treat "notexample.com"
      # as part of "example.com"
      is_internal = link_host == page_host or link_host.endswith("." + page_host)
      if not is_internal:
         # the duplicate check is nested so that an already-seen external link
         # is skipped here and can never reach the internal branch below
         if href_tag not in external_links:
            print(f"External link: {href_tag}")
            external_links.add(href_tag)
         continue
      if href_tag not in urls:
         print(f"Internal link: {href_tag}")
         urls.append(href_tag)
         internal_links.add(href_tag)

#this function crawls a web page and extracts all links
def crawl(url, max_urls=50):
   """Crawl *url* and then the internal links discovered, in discovery order.

   Pages are processed one at a time with get_all_website_links(), which
   appends newly found internal links to the module-level ``urls`` list;
   this function walks that list from the front while it grows. A page is
   processed at most once per call, even if it is linked from many places.

   Args:
      url: Absolute URL to start from.
      max_urls: Upper bound on the number of pages processed. The count is
         kept in the module-level ``total_links_visited``, so pages counted
         by earlier crawl() calls in the same process use up the same budget.
   """
   global total_links_visited
   visited = set()
   position = 0          # index of the next entry of ``urls`` to process
   link = url
   while total_links_visited < max_urls:
      if link not in visited:
         visited.add(link)
         total_links_visited += 1
         get_all_website_links(link)
      if position >= len(urls):
         # queue exhausted: every discovered internal link has been handled
         break
      link = urls[position]
      position += 1

#main function
def main():
   """Parse the command line, crawl the given URL and save the links found.

   Exits with an argparse usage error when the URL is not absolute. After
   crawling, prints the link totals and writes the internal and external
   links, sorted and one per line, to ``<netloc>_internal_links.txt`` and
   ``<netloc>_external_links.txt`` in the current directory.
   """
   parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
   parser.add_argument("url", help="The URL to extract links from.")
   parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

   args = parser.parse_args()
   url = args.url
   max_urls = args.max_urls

   # fail early with a usage message instead of a traceback from the HTTP layer
   if not is_valid(url):
      parser.error(f"invalid URL {url!r}: expected an absolute URL such as https://example.com")

   domain_name = urlparse(url).netloc

   # NOTE: disabled resume support (reloads links saved by a previous run), kept for reference:
   # if os.path.exists(f"{domain_name}_internal_links.txt"):
   #    with open(f"{domain_name}_internal_links.txt", "r") as f:
   #       for line in f:
   #          internal_links.add(line.strip())
   #          urls.append(line.strip())
   #    with open(f"{domain_name}_external_links.txt", "r") as f:
   #       for line in f:
   #          external_links.add(line.strip())

   crawl(url, max_urls=max_urls)

   print("Total Internal Links:", len(internal_links))
   print("Total External Links:", len(external_links))
   print("Total URLs:", len(external_links) + len(internal_links))

   # explicit UTF-8 so links with non-ASCII characters can be written on any platform;
   # sorted so that repeated runs produce identical, diff-able files
   with open(f"{domain_name}_internal_links.txt", "w", encoding="utf-8") as f:
      for internal_link in sorted(internal_links):
         print(internal_link.strip(), file=f)
   with open(f"{domain_name}_external_links.txt", "w", encoding="utf-8") as f:
      for external_link in sorted(external_links):
         print(external_link.strip(), file=f)

#run main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
   main()

What is the issue?
Please explain what is happening now.
Give an example URL.
Show any error messages unaltered and complete, within error tags.
Add anything else that might help diagnose the problem.

The code crawls a website and finds its internal and external links.
I would like help getting the response of the request to the main URL, so that I can print whether it succeeded.
For example, to run the script:
python <filename> <url>

Output:
<Success>? trying to print that out for <url>
Internal Links: ..
External Links ..
Total Links: ..

At least, I would do/recommend the following:

1) Use classes and avoid globals; you could declare, e.g., a class LinkExtractor with specific methods.
2) There are magic numbers in your code, e.g. max-urls; I would put them at the beginning of the file or into a separate settings file.
3) You don't check the input URL for validity; you probably need to call is_valid on the input URL too.
4) There are no docstrings; it is highly recommended to have them.
5) Optionally, you can provide type hints (however, for such a small project they might be redundant).
6) Some tests (within the main function) would be good.
7) If a link is valid you do something with it, but if it is not valid nothing happens; you probably need to log such cases.

samlee916

Larz60+

samlee916

scidam