import argparse

import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

# initialize the sets of links
internal_links = set()
external_links = set()
total_links_visited = 0


def is_valid(url):
    # checks whether url is a valid URL
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)


def get_all_website_links(url):
    # returns all URLs found on `url` that belong to the same website
    urls = set()
    # domain name of the URL without the protocol
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href_tag = a_tag.attrs.get("href")
        if not href_tag:
            # empty href attribute
            continue
        # join the URL if it's relative
        href_tag = urljoin(url, href_tag)
        parsed_href = urlparse(href_tag)
        # remove URL GET parameters, URL fragments, etc.
        href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
        if not is_valid(href_tag):
            # not a valid URL
            continue
        if href_tag in internal_links:
            # already in the set
            continue
        if domain_name not in href_tag:
            # external link
            if href_tag not in external_links:
                print(f"External link: {href_tag}")
                external_links.add(href_tag)
            continue
        print(f"Internal link: {href_tag}")
        urls.add(href_tag)
        internal_links.add(href_tag)
    return urls


def crawl(url, max_urls=50):
    # crawls a web page and extracts all links
    global total_links_visited
    total_links_visited += 1
    links = get_all_website_links(url)
    for link in links:
        if total_links_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls

    crawl(url, max_urls=max_urls)

    print("Total Internal Links:", len(internal_links))
    print("Total External Links:", len(external_links))
    print("Total URLs:", len(external_links) + len(internal_links))

    domain_name = urlparse(url).netloc

    # save the internal links to a file
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_links:
            print(internal_link.strip(), file=f)

    # save the external links to a file
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_links:
            print(external_link.strip(), file=f)
Any ideas on making this Python web crawler code better, simpler, or more efficient?
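For reference, one possible direction is sketched below (this is not the original poster's code; the function name crawl_site and the specific limits are assumptions): replace the recursive crawl() and the module-level globals with an iterative breadth-first loop, reuse a single requests.Session so TCP connections are shared across requests, and add a timeout plus basic error handling so one bad page does not stop the crawl.

# A possible refactor (sketch only): iterative crawl, no globals, shared Session.
from collections import deque
from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup


def crawl_site(start_url, max_urls=30):
    # breadth-first crawl; returns (internal, external) link sets
    domain_name = urlparse(start_url).netloc
    internal, external = set(), set()
    queue = deque([start_url])
    session = requests.Session()  # reuses connections across requests

    # stop expanding once max_urls internal links have been collected
    while queue and len(internal) < max_urls:
        url = queue.popleft()
        try:
            response = session.get(url, timeout=10)
        except requests.RequestException:
            continue  # skip pages that fail to load
        soup = BeautifulSoup(response.content, "html.parser")
        for a_tag in soup.find_all("a", href=True):
            href = urljoin(url, a_tag["href"])
            parsed = urlparse(href)
            # drop query strings and fragments, keep scheme://host/path
            href = parsed.scheme + "://" + parsed.netloc + parsed.path
            if not parsed.scheme or not parsed.netloc:
                continue
            if parsed.netloc != domain_name:
                external.add(href)
            elif href not in internal:
                internal.add(href)
                queue.append(href)
    return internal, external

Comparing the domain with parsed.netloc != domain_name (instead of a substring check on the whole URL) also avoids misclassifying external URLs that merely contain the domain name somewhere in their path.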