Any ideas on making this python web crawler code more secure and optimal?
The code prints out internal and external links and stores them in four different file formats (TXT, JSON, CSV, and XML).
How to execute the program: python filename url
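For example, a run against a placeholder site, using the optional -m/--max-urls flag defined below (the script name and URL here are only illustrative):

python crawler.py https://example.com -m 50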
import requests
import argparse
import time
import json
import random
import pandas as pd
import os
import xml.etree.ElementTree as xml
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

internal_links = set()
external_links = set()
urls = []
total_links_visited = 0

#check if url is valid
def is_valid(url):
   parsed = urlparse(url)
   return bool(parsed.netloc) and bool(parsed.scheme)

#this function finds and prints out the internal and external links
def get_all_website_links(url):
   global urls
   domain_name = urlparse(url).netloc
   res1 = requests.get(url)
   soup = BeautifulSoup(res1.content, "html.parser")
   for a_tag in soup.find_all("a"):
      href_tag = a_tag.attrs.get("href")
      if href_tag:
         href_tag = urljoin(url, href_tag)
         parsed_href = urlparse(href_tag)
         href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
         if is_valid(href_tag):
            if domain_name not in urlparse(href_tag).netloc:
               #external link: record it once, then skip to the next tag
               if href_tag not in external_links:
                  print(f"External link: {href_tag}")
                  external_links.add(href_tag)
               continue
            if href_tag not in urls:
               print(f"Internal link: {href_tag}")
               urls.append(href_tag)
               internal_links.add(href_tag)

#this function crawls a web page and extracts all links
def crawl(url, max_urls=50):
   global total_links_visited, urls
   total_links_visited += 1
   get_all_website_links(url)
   for link in urls:
      if total_links_visited > max_urls:
         break
      crawl(link, max_urls=max_urls)

#main function
def main():
   parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
   parser.add_argument("url", help="The URL to extract links from.")
   parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
   args = parser.parse_args()
   url = args.url
   max_urls = args.max_urls
   domain_name = urlparse(url).netloc
   res = requests.get(url)
   statuscode = res.status_code
   print("Status Code:", statuscode)
   if statuscode == 200: 
      crawl(url, max_urls=max_urls)
   else:
      print("Failed to get a request response back.")

   print("Total Internal Links:", len(internal_links))
   print("Total External Links:", len(external_links))
   print("Total Links:", len(external_links) + len(internal_links))

   with open(f"{domain_name}_internal_links.txt", "w") as f:
      for internal_link in internal_links:
         print(internal_link.strip(), file=f)
   with open(f"{domain_name}_external_links.txt", "w") as f:
      for external_link in external_links:
         print(external_link.strip(), file=f)

   #writing to json files
   with open(f"{domain_name}_internal_links.json", "w") as f:
      json.dump({'internal_links': list(internal_links)}, f, indent=6)
   with open(f"{domain_name}_external_links.json", "w") as f:
      json.dump({'external_links': list(external_links)}, f, indent=6)

   #writing to csv
   df = pd.DataFrame(list(internal_links))
   df.to_csv(f"{domain_name}_internal_links.csv", index=False, header=False)
   df = pd.DataFrame(list(external_links))
   df.to_csv(f"{domain_name}_external_links.csv", index=False, header=False)

   #writing to xml: create one <link> element per URL
   xmlformat = xml.Element("internal_links")
   for l in internal_links:
      link_element = xml.SubElement(xmlformat, "link")
      link_element.text = str(l)
   tree = xml.ElementTree(xmlformat)
   tree.write(f"{domain_name}_internal_links.xml")

   xmlformat = xml.Element("external_links")
   for l in external_links:
      link_element = xml.SubElement(xmlformat, "link")
      link_element.text = str(l)
   tree = xml.ElementTree(xmlformat)
   tree.write(f"{domain_name}_external_links.xml")

#executing the python script
if __name__ == "__main__":
   main()
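Regarding the "more secure and optimal" part of the question, one possible direction would be to route every fetch through a small helper that adds a timeout, an explicit User-Agent, a short delay, and basic error handling. This is only a sketch of that idea; fetch_page, the header value, and the delay are illustrative names and defaults, not part of the code above.

import time
import requests

#sketch of a hardened fetch helper: timeout, explicit User-Agent,
#polite delay between requests, and basic error handling
def fetch_page(url, delay=1.0, timeout=10):
   headers = {"User-Agent": "simple-link-crawler"}
   time.sleep(delay)  #avoid hammering the target server
   try:
      res = requests.get(url, headers=headers, timeout=timeout)
      res.raise_for_status()  #treat 4xx/5xx responses as failures
   except requests.RequestException as exc:
      print(f"Request failed for {url}: {exc}")
      return None
   return res

get_all_website_links() could then call fetch_page(url) and simply return when it gets None back, instead of calling requests.get() directly with no timeout or exception handling.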