Need Help Getting Request Response From Main URL - Printable Version
Python Forum (https://python-forum.io) - Web Scraping & Web Development - Thread: /thread-28402.html
Need Help Getting Request Response From Main URL - samlee916 - Jul-17-2020

import requests
import argparse
import os
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

internal_links = set()
external_links = set()
urls = []
total_links_visited = 0

# function to check if a URL is valid
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

# this function finds and prints the internal and external links
def get_all_website_links(url):
    global urls
    domain_name = urlparse(url).netloc
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a_tag in soup.find_all("a"):
        href_tag = a_tag.attrs.get("href")
        if href_tag:
            href_tag = urljoin(url, href_tag)
            parsed_href = urlparse(href_tag)
            href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if is_valid(href_tag):
                if domain_name not in urlparse(href_tag).netloc:
                    # external link: record it once, then skip to the next tag
                    if href_tag not in external_links:
                        print(f"External link: {href_tag}")
                        external_links.add(href_tag)
                    continue
                if href_tag not in urls:
                    print(f"Internal link: {href_tag}")
                    urls.append(href_tag)
                    internal_links.add(href_tag)

# this function crawls a web page and extracts all links
def crawl(url, max_urls=50):
    global total_links_visited, urls
    total_links_visited += 1
    get_all_website_links(url)
    for link in urls:
        if total_links_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

# main function
def main():
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls
    domain_name = urlparse(url).netloc
    """Ignore this piece of commented out code
    if os.path.exists(f"{domain_name}_internal_links.txt"):
        with open(f"{domain_name}_internal_links.txt", "r") as f:
            for line in f:
                internal_links.add(line.strip())
                urls.append(line.strip())
        with open(f"{domain_name}_external_links.txt", "r") as f:
            for line in f:
                external_links.add(line.strip())
    """
    crawl(url, max_urls=max_urls)
    print("Total Internal Links:", len(internal_links))
    print("Total External Links:", len(external_links))
    print("Total URLs:", len(external_links) + len(internal_links))
    with open(f"{domain_name}_internal_links.txt", "w") as f:
        for internal_link in internal_links:
            print(internal_link.strip(), file=f)
    with open(f"{domain_name}_external_links.txt", "w") as f:
        for external_link in external_links:
            print(external_link.strip(), file=f)

# executing the python script
if __name__ == "__main__":
    main()

RE: Need Help Getting Request Response From Main URL - Larz60+ - Jul-17-2020

What is the issue? Please explain what is happening now, give an example URL, show any error messages unaltered and complete within error tags, and include anything else that might help diagnose the problem.

RE: Need Help Getting Request Response From Main URL - samlee916 - Jul-17-2020

The code crawls a website and finds the internal and external links. I would like help getting the request response from the main URL. For example, running the script as

python <filename> <url>

should print something like:

<Success>? (trying to print that out for <url>)
Internal Links: ..
External Links: ..
Total Links: ..
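One way to get the behavior samlee916 describes is to request the main URL once before crawling and print its status. A minimal sketch, assuming the requests library already used in the script; the check_main_url name and the Success/Failure wording are illustrative, not from the thread:

import requests

def check_main_url(url):
    # fetch the starting URL once and report whether the request succeeded
    try:
        response = requests.get(url, timeout=10)
    except requests.RequestException as exc:
        print(f"Failure: request to {url} raised {exc}")
        return False
    # response.ok is True for any status code below 400
    status = "Success" if response.ok else "Failure"
    print(f"{status}: {url} returned {response.status_code}")
    return response.ok

In main(), the call crawl(url, max_urls=max_urls) could then be guarded with if check_main_url(url): so the crawler only runs when the main URL actually responds.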
RE: Need Help Getting Request Response From Main URL - scidam - Jul-18-2020

At the least, I would do/recommend the following:

1) Use classes and avoid globals; you could declare, e.g., a class LinkExtractor with specific methods (see the sketch after this list).
2) There are magic numbers in your code, e.g. max-urls; I would put them at the beginning of the file or into a separate settings file.
3) You don't check the input URL for validity; you probably need to call is_valid for the input URL too.
4) No docstrings; it is highly recommended to have docstrings.
5) Optionally, you could provide type hints (however, for such a small project they might be redundant...).
6) Some tests (within the main function) would be good.
7) If a link is valid you do something, but if it is not valid, nothing is recorded; you probably need to log such cases.
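A rough outline of how recommendations 1) through 4) could look; the class name LinkExtractor comes from the post itself, while the MAX_URLS constant and the method layout are assumptions for illustration:

from urllib.parse import urlparse, urljoin

import requests
from bs4 import BeautifulSoup

MAX_URLS = 30  # named setting instead of a magic number (point 2)

class LinkExtractor:
    """Crawl a site and collect its internal and external links (point 4: docstrings)."""

    def __init__(self, start_url, max_urls=MAX_URLS):
        self.start_url = start_url
        self.max_urls = max_urls
        self.internal_links = set()  # state lives on the instance, not in globals (point 1)
        self.external_links = set()
        self.visited = 0

    @staticmethod
    def is_valid(url):
        """Return True if the URL has both a scheme and a network location."""
        parsed = urlparse(url)
        return bool(parsed.scheme) and bool(parsed.netloc)

    def crawl(self):
        """Validate the starting URL first (point 3), then crawl from it."""
        if not self.is_valid(self.start_url):
            raise ValueError(f"Invalid start URL: {self.start_url}")
        self._visit(self.start_url)

    def _visit(self, url):
        """Collect links on one page and recurse into unseen internal links."""
        if self.visited >= self.max_urls:
            return
        self.visited += 1
        domain = urlparse(self.start_url).netloc
        soup = BeautifulSoup(requests.get(url).content, "html.parser")
        for a_tag in soup.find_all("a"):
            href = a_tag.attrs.get("href")
            if not href:
                continue
            href = urljoin(url, href)
            if not self.is_valid(href):
                continue  # point 7: a real version might log skipped links here
            if domain not in urlparse(href).netloc:
                self.external_links.add(href)
            elif href not in self.internal_links:
                self.internal_links.add(href)
                self._visit(href)

Usage would then be something like extractor = LinkExtractor("https://example.com") followed by extractor.crawl(), after which extractor.internal_links and extractor.external_links hold the results.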