I am working on this web crawler and I am using the parsed_links variable to store links that have been found in crawled pages. When I print the variable though, it is empty. What is going wrong? I tried directly adding links to self.links.
git.r3df0x.com/snippets/1
git.r3df0x.com/snippets/1
import requests from bs4 import BeautifulSoup import threading import sys import re ### DEBUGING VARIABLES domain = 'https://en.wikipedia.org' all_urls = [] crawled_urls = [] operator_email = '[email protected]' stealth_mode = False ### These will be removed in the main version ## and replaced with ways of taking input ## from the user. system_user_agent = 'Stratofortress web crawler - Version 0.1 - Operator ' + operator_email stealth_user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20100101 Firefox/10.0' user_agent = '' if (stealth_mode == True): user_agent = stealth_user_agent else: user_agent = system_user_agent class indexer(threading.Thread): def __init__(self, url): self.__url = url self.links = [] self.domain = 'unset' def __get(self, url, stealth=False): if (stealth == True): user_agent = stealth_user_agent else: user_agent = system_user_agent return requests.get(url, headers={'user-agent': user_agent}) def run(self): parsed_links = [] r = self.__get(self.__url) html = r.text soup = BeautifulSoup(html) links = soup.findAll('a', href=True) for link in links: print link['href'] #if (link['href'].startswith('http://') or link['href'].startswith('https://')): if (re.match('(http|https):\/\/*', link['href'])): found_domain = link['href'].split('//')[1].split('/')[0] print '======= DOMAIN ==== from http(s):// =====> ' + found_domain if (found_domain == domain): parsed_links.append(link['href']) if (re.match('^(\/\/)*', link['href'])): print '======== Matched // URL ===========> ' + link['href'] print parsed_links def main(): i = indexer(domain) i.run() if __name__ == '__main__': main()