Python web crawler and input command not having the correct results (see below for more)
The web crawler crawls a website and exports the results to a txt, JSON, XML, or CSV file, whichever format you choose. I am getting the wrong results when I run python filename.py -t ALL https://www.youtube.com: the ALL option is supposed to write all four file formats, but it doesn't. I am able to pick individual file formats, and if I don't specify one, a txt file is the default. Also, if you can help make this script more efficient or better in general, please do; a couple of sketches of possible changes follow the script below. Thanks in advance.
import requests
import argparse
import time
import json
import random
import pandas as pd
import os
import xml.etree.ElementTree as xml
from urllib.parse import urlparse, urljoin
from bs4 import BeautifulSoup

internal_links = set()
external_links = set()
urls = []
total_links_visited = 0

#check if url is valid
def is_valid(url):
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)

#this function finds and prints out the internal and external links
def get_all_website_links(url):
    global urls
    domain_name = urlparse(url).netloc
    res1 = requests.get(url)
    soup = BeautifulSoup(res1.content, "html.parser")
    for a_tag in soup.find_all("a"):
        href_tag = a_tag.attrs.get("href")
        if href_tag:
            href_tag = urljoin(url, href_tag)
            parsed_href = urlparse(href_tag)
            href_tag = parsed_href.scheme + "://" + parsed_href.netloc + parsed_href.path
            if is_valid(href_tag):
                if domain_name not in urlparse(href_tag).netloc and href_tag not in external_links:
                    print(f"External link: {href_tag}")
                    external_links.add(href_tag)
                    continue
                elif href_tag not in urls:
                    print(f"Internal link: {href_tag}")
                    urls.append(href_tag)
                    internal_links.add(href_tag)

#this function crawls the given url and then recursively crawls every internal link it finds, up to max_urls
def crawl(url, max_urls=50):
    global total_links_visited, urls
    total_links_visited += 1
    get_all_website_links(url)
    for link in urls:
        if total_links_visited > max_urls:
            break
        crawl(link, max_urls=max_urls)

def save(output_file_format, domain_name, internal_links, external_links):
    if (output_file_format == "json" or output_file_format == "ALL"):
        #writing to json files
        f = open(f"{domain_name}_internal_links.json","w")
        json.dump({'internal_links':list(internal_links)}, f, indent=6)
        f.close()
        f = open(f"{domain_name}_external_links.json","w")
        json.dump({'external_links':list(external_links)}, f, indent=6)
        f.close()

    elif (output_file_format == "csv" or output_file_format == "ALL"):
        #writing to csv
        df = pd.DataFrame(list(internal_links))
        df.to_csv(f"{domain_name}_internal_links.csv", index=False, header=False)
        df = pd.DataFrame(list(external_links))
        df.to_csv(f"{domain_name}_external_links.csv", index=False, header=False)

    elif (output_file_format == "xml" or output_file_format == "ALL"):
        #writing to xml
        xmlformat = xml.Element("internal_links")
        xmlformat_1 = xml.SubElement(xmlformat, "link")
        for l in list(internal_links):
            xmlformat_1.text = str(l)
            xmlformat.append(xmlformat_1)
        tree = xml.ElementTree(xmlformat)
        tree.write(f"{domain_name}_internal_links.xml")

        xmlformat = xml.Element("external_links")
        xmlformat_1 = xml.SubElement(xmlformat, "link")
        for l in list(external_links):
            xmlformat_1.text = str(l)
            xmlformat.append(xmlformat_1)
        tree = xml.ElementTree(xmlformat)
        tree.write(f"{domain_name}_external_links.xml")
      
    elif (output_file_format == "ALL"):
        with open(f"{domain_name}_internal_links.txt", "w") as f:
            for internal_link in internal_links:
                print(internal_link.strip(), file=f)
        with open(f"{domain_name}_external_links.txt", "w") as f:
            for external_link in external_links:
                print(external_link.strip(), file=f)
    
    else:
        with open(f"{domain_name}_internal_links.txt", "w") as f:
            for internal_link in internal_links:
                print(internal_link.strip(), file=f)
        with open(f"{domain_name}_external_links.txt", "w") as f:
            for external_link in external_links:
                print(external_link.strip(), file=f)

#main function
def main():
    parser = argparse.ArgumentParser(description="Link Extractor Tool with Python")
    parser.add_argument("url", help="The URL to extract links from.")
    parser.add_argument("-m", "--max-urls", help="Number of max URLs to crawl, default is 30.", default=30, type=int)
    parser.add_argument("-t", "--output-file-format", help="Output file format to store the data. Write ALL to get all file formats Default text", default="txt")
    args = parser.parse_args()
    url = args.url
    max_urls = args.max_urls
    output_file_format = args.output_file_format
    domain_name = urlparse(url).netloc
    res = requests.get(url)
    statuscode = res.status_code
    print("Status Code:", statuscode)
    if statuscode == 200:
        crawl(url, max_urls=max_urls)
        print("Total Internal Links:", len(internal_links))
        print("Total External Links:", len(external_links))
        print("Total Links:", len(external_links) + len(internal_links))
        save(output_file_format, domain_name, internal_links, external_links)
    else:
        print("Failed to get a request response back.")
'''
    print("Total Internal Links:", len(internal_links))
    print("Total External Links:", len(external_links))
    print("Total Links:", len(external_links) + len(internal_links))
    
    save(output_file_format, domain_name, internal_links, external_links)
'''
#executing the python script
if __name__ == "__main__":
    main()
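
For reference, here is a minimal sketch of one way the format dispatch in save() could be restructured so that ALL actually writes every format. With the elif chain above, only the first matching branch runs, so ALL never gets past the json block. This is only a sketch that assumes the rest of the script stays as posted, not a tested drop-in replacement:

import json
import pandas as pd
import xml.etree.ElementTree as ET

def save(output_file_format, domain_name, internal_links, external_links):
    # expand ALL into every format up front, then test membership with
    # independent if blocks so more than one branch can run
    known = {"txt", "json", "csv", "xml"}
    if output_file_format == "ALL":
        formats = known
    elif output_file_format in known:
        formats = {output_file_format}
    else:
        formats = {"txt"}   # keep txt as the fallback default, as in the original

    if "json" in formats:
        with open(f"{domain_name}_internal_links.json", "w") as f:
            json.dump({"internal_links": list(internal_links)}, f, indent=6)
        with open(f"{domain_name}_external_links.json", "w") as f:
            json.dump({"external_links": list(external_links)}, f, indent=6)

    if "csv" in formats:
        pd.DataFrame(list(internal_links)).to_csv(f"{domain_name}_internal_links.csv", index=False, header=False)
        pd.DataFrame(list(external_links)).to_csv(f"{domain_name}_external_links.csv", index=False, header=False)

    if "xml" in formats:
        for name, links in (("internal_links", internal_links), ("external_links", external_links)):
            root = ET.Element(name)
            for link in links:
                ET.SubElement(root, "link").text = str(link)   # a new element per link
            ET.ElementTree(root).write(f"{domain_name}_{name}.xml")

    if "txt" in formats:
        with open(f"{domain_name}_internal_links.txt", "w") as f:
            for link in internal_links:
                print(link.strip(), file=f)
        with open(f"{domain_name}_external_links.txt", "w") as f:
            for link in external_links:
                print(link.strip(), file=f)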
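
On the "make it better" point, one possible change is to replace the recursive crawl() with an explicit queue, so the max_urls limit is checked in one place and already-visited pages are skipped. This sketch reuses the global urls list and get_all_website_links() from the script above and is untested:

from collections import deque

def crawl(start_url, max_urls=50):
    # breadth-first crawl with an explicit queue instead of recursion
    queue = deque([start_url])
    visited = set()
    while queue and len(visited) < max_urls:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)
        before = len(urls)
        get_all_website_links(url)   # appends newly found internal links to the global urls list
        queue.extend(urls[before:])  # only enqueue the links discovered on this page
    return visited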