Python Forum
Need Help Getting Request Response From Main URL
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Need Help Getting Request Response From Main URL
#1
import requests
import argparse
import os
from urllib.request import urlparse, urljoin
from bs4 import BeautifulSoup

# Shared crawl state, mutated by the functions below.
internal_links, external_links = set(), set()  # unique links, split by kind
urls = []  # internal links in the order they were discovered
total_links_visited = 0  # number of crawl() invocations so far

# Check whether a URL is absolute, i.e. carries both a scheme and a host.
def is_valid(url):
   """Return True when ``url`` has a non-empty scheme and network location."""
   parts = urlparse(url)
   return all((parts.netloc, parts.scheme))

# Find, print and record the internal and external links of a single page.
def get_all_website_links(url):
   """Fetch ``url`` and record every link found in its ``<a href>`` tags.

   Each href is resolved against ``url`` and reduced to scheme, host and
   path (query string and fragment are dropped). A link is internal when its
   host equals the host of ``url`` or is a subdomain of it; internal links
   are appended to ``urls`` and added to ``internal_links``. Every other
   valid link is added to ``external_links``. A link is printed the first
   time it is recorded.

   If the page cannot be fetched, a message is printed and nothing is
   recorded, so one unreachable page does not abort the whole crawl.
   """
   domain_name = urlparse(url).netloc.lower()
   try:
      # Without a timeout a stalled server would block the crawl forever.
      response = requests.get(url, timeout=10)
   except requests.RequestException as error:
      print(f"Skipping {url}: {error}")
      return

   soup = BeautifulSoup(response.content, "html.parser")
   for a_tag in soup.find_all("a"):
      href_tag = a_tag.attrs.get("href")
      if not href_tag:
         continue
      parsed_href = urlparse(urljoin(url, href_tag))
      href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
      if not is_valid(href_tag):
         continue

      # Compare whole host names: a substring test would treat
      # "notexample.com" as part of "example.com".
      link_host = parsed_href.netloc.lower()
      is_internal = link_host == domain_name or link_host.endswith("." + domain_name)

      # Decide internal/external first and de-duplicate second, so that an
      # external link seen twice is never mistaken for an internal one.
      if is_internal:
         if href_tag not in internal_links:
            print(f"Internal link: {href_tag}")
            urls.append(href_tag)
            internal_links.add(href_tag)
      elif href_tag not in external_links:
         print(f"External link: {href_tag}")
         external_links.add(href_tag)

# Crawl a site starting at ``url``, following the internal links it discovers.
def crawl(url, max_urls=50):
   """Visit ``url`` and then the discovered internal links, breadth-first.

   Pages are taken from the global ``urls`` list in discovery order; the
   list grows while the crawl runs because ``get_all_website_links`` appends
   to it. Each page is fetched at most once per call, and the crawl stops
   once the global ``total_links_visited`` counter reaches ``max_urls`` or
   there are no pages left.

   Args:
      url: Page to start crawling from.
      max_urls: Upper bound on the number of pages fetched. A value of 0 or
         less fetches nothing.
   """
   global total_links_visited
   visited = set()
   next_index = 0
   page = url
   # Iterative on purpose: the recursive form restarted at urls[0] on every
   # level, re-fetching the same page, and its depth grew with max_urls.
   while total_links_visited < max_urls:
      if page not in visited:
         visited.add(page)
         total_links_visited += 1
         get_all_website_links(page)
      if next_index >= len(urls):
         break
      page = urls[next_index]
      next_index += 1

# Command-line entry point: parse arguments, crawl, then report and save links.
def main():
   """Crawl the URL given on the command line and save the links found.

   Prints the internal, external and combined link counts, then writes the
   links to ``<domain>_internal_links.txt`` and ``<domain>_external_links.txt``
   in the current directory, one link per line.
   """
   parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
   parser.add_argument("url", help="The URL to extract links from.")
   parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)

   args = parser.parse_args()
   url = args.url
   max_urls = args.max_urls

   domain_name = urlparse(url).netloc

   # Disabled: resume from the link files written by a previous run.
   # if os.path.exists(f"{domain_name}_internal_links.txt"):
   #    with open(f"{domain_name}_internal_links.txt", "r") as f:
   #       for line in f:
   #          internal_links.add(line.strip())
   #          urls.append(line.strip())
   #    with open(f"{domain_name}_external_links.txt", "r") as f:
   #       for line in f:
   #          external_links.add(line.strip())

   crawl(url, max_urls=max_urls)

   print("Total Internal Links:", len(internal_links))
   print("Total External Links:", len(external_links))
   print("Total URLs:", len(external_links) + len(internal_links))

   for kind, links in (("internal", internal_links), ("external", external_links)):
      # Explicit UTF-8 so links with non-ASCII characters can be written on
      # any platform; sorted so the files are stable from run to run.
      with open(f"{domain_name}_{kind}_links.txt", "w", encoding="utf-8") as f:
         f.writelines(f"{link.strip()}\n" for link in sorted(links))

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
   main()
Reply


Messages In This Thread
Need Help Getting Request Response From Main URL - by samlee916 - Jul-17-2020, 04:15 PM

Possibly Related Threads…
Thread Author Replies Views Last Post
  malformed header from script 'main.py': Bad header: * Serving Flask app "main" anuragsapanbharat 2 4,542 Jun-12-2019, 07:26 AM
Last Post: anuragsapanbharat

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020