Okay, I made another web crawler, but with Tor proxies this time...
Here's the repository.
Here's the repository.
#! /usr/bin/python3
# rootVIII
# Open a Firefox ESR Browser on Kali Linux
# using Tor Socks Proxy and Selenium Geckodriver.
# Take up thy stethoscope and crawl
from sys import exit
from selenium import webdriver
from random import choice, randint, random
from os import popen, system
from time import sleep


class TorBot:
    """Random-walk crawler that drives Firefox through a local Tor proxy.

    The SOCKS proxy is expected on 127.0.0.1:9050.  A crawl clicks random
    links until something raises; the driver loop at the bottom of the
    file then reloads Tor and starts a new crawl from the last page seen.
    """

    def __init__(self):
        self.fp = webdriver.FirefoxProfile()
        # Set by set_tor_details(); None while no browser is open.
        self.browser = None
        # Page the next crawl starts from; overwritten after every click.
        self.first_page = 'https://en.wikipedia.org/wiki/link_rot'
        # Fallback start page used when the bot appears to be stuck.
        self.default_page = self.first_page
        # Start pages of the crawls made since the last loop reset.
        self.visited = []

    # Ensure that the Tor service is running
    @staticmethod
    def service_running():
        """Return True if netstat shows tor bound to 127.0.0.1:9050."""
        # The context manager closes the pipe instead of leaving it to GC.
        with popen('netstat -nap --tcp | grep tor') as output:
            for line in output:
                if '127.0.0.1:9050' not in line:
                    continue
                fields = line.split()
                # Field 6 is checked for the program name; guard against
                # short lines so a malformed row cannot raise IndexError.
                if len(fields) > 6 and 'tor' in fields[6]:
                    return True
        return False

    def recurring_visits(self):
        """Return how many times self.first_page occurs in self.visited."""
        return self.visited.count(self.first_page)

    # generate a new Tor IP address
    def new_ip(self):
        """Shut down the current browser, then reload the tor service."""
        if self.browser is not None:
            try:
                # quit() ends the whole WebDriver session; close() only
                # closes the current window, and a new Firefox is started
                # on every iteration of the driver loop.
                self.browser.quit()
            finally:
                self.browser = None
        system('service tor reload')

    # set Firefox ESR config settings
    def set_tor_details(self):
        """Set proxy/user-agent preferences and launch Firefox.

        The new driver is stored in self.browser.
        """
        ua = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"
        )
        self.fp.set_preference('network.proxy.type', 1)
        self.fp.set_preference('network.proxy.socks', '127.0.0.1')
        self.fp.set_preference('network.proxy.socks_port', 9050)
        self.fp.set_preference('general.useragent.override', ua)
        self.fp.set_preference("http.response.timeout", 10)
        self.fp.set_preference("dom.max_script_run_time", 10)
        self.fp.update_preferences()
        self.browser = webdriver.Firefox(firefox_profile=self.fp)

    def crawl(self):
        """Crawl indefinitely by clicking random links.

        Each page is temporarily stored in self.first_page so that if an
        exception occurs and a new IP is generated, the stored page is
        revisited with the new IP before the crawl continues.  If that
        page is in self.visited more than once, the bot is probably stuck
        in a loop, so the default page is visited instead and the history
        is cleared.

        This method only returns by raising.

        Raises:
            RuntimeError: if the IP-check page has an unexpected title,
                or the current page has no links to follow.
        """
        self.browser.get('https://www.iplocation.net/find-ip-address')
        title = self.browser.title
        # Explicit check instead of `assert`, which is removed under -O.
        if 'What is my IP address?' not in title:
            raise RuntimeError(
                'unexpected title on IP check page: %r' % title
            )

        # Slice before reading .text: only the first 8 rows are printed,
        # and each .text read is a separate WebDriver call.
        rows = self.browser.find_elements_by_tag_name('tr')
        print('\n'.join(row.text for row in rows[:8]))

        if self.recurring_visits() > 1:
            print(self.first_page)
            print('\nretrieving default page to break loop...\n')
            self.browser.get(self.default_page)
            self.visited = []
        else:
            self.browser.get(self.first_page)
        self.visited.append(self.browser.current_url)
        print('\nVisting page: %s' % self.browser.current_url)

        while True:
            sleep(randint(10, 20) + random())
            links = self.browser.find_elements_by_xpath("//a[@href]")
            if not links:
                # Previously surfaced as an opaque "empty range"
                # ValueError from randint(0, -1).
                raise RuntimeError(
                    'no links found on %s' % self.browser.current_url
                )
            choice(links).click()
            self.first_page = self.browser.current_url
            print('\nVisting random link: %s' % self.browser.current_url)


if __name__ == "__main__":
    bot = TorBot()
    while True:
        if not bot.service_running():
            message = 'Ensure Tor is running/listening on 127.0.0.1:9050'
            print(message)
            exit(1)
        bot.set_tor_details()
        try:
            print('\nstarting new crawl:\n')
            bot.crawl()
        except Exception as err:
            # Top-level boundary: report the failure and carry on with a
            # fresh browser and Tor circuit.
            print('\nException caught:')
            print(err)
        finally:
            print('\nGenerating new IP address...\n')
            bot.new_ip()