selenium proxy web crawler
#1
Here is the repository. Performance will vary depending on the current proxy; if an exception is caught, the next proxy/socket is used. It should run more or less continuously if you just leave it going on your machine. pyvirtualdisplay can easily be added if you don't want to see the GUI/browser running (terminal/stdout only).

As a POC, a geolocation check run through one of the proxies:

[screenshot: "I'm definitely not in Russia" ... Liar]
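
If you'd rather run it headless, something like this should work (a minimal sketch, assuming pyvirtualdisplay and Xvfb are installed; it's not part of the repo itself):

from pyvirtualdisplay import Display

# run the whole crawl inside a virtual X display (no visible browser)
display = Display(visible=0, size=(1280, 1024))
display.start()
try:
    bot = ProxyCrawler('https://example.com', 'search keyword')
    bot.start_search()
finally:
    display.stop()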

#! /usr/bin/python3
# rootVIII
from sys import exit
from time import sleep
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from re import findall
from argparse import ArgumentParser
from random import randint, random, choice
from requests import get


class ProxyCrawler:
    def __init__(self, url, keyword):
        self.sockets, self.agents = [], []
        self.keyword = keyword
        self.url = url
        self.proxy_host = ''
        self.proxy_port = 0
        self.fp = None
        self.browser = None
        self.request_count = 0
        # request_MAX: number of requests to make
        # before refreshing the list of proxies
        self.request_MAX = 5
        self.set_agents()
        self.scrape_sockets()

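    # Sleep for a randomized short or long interval to look less bot-like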
    @staticmethod
    def random_sleep(length):
        if length != 'long':
            sleep(randint(5, 10) + random())
        else:
            sleep(randint(30, 60) + random())

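    # Build a Firefox profile that routes HTTP/SSL traffic through the
    # current proxy and overrides the user-agent with a random one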
    def set_current_proxy(self):
        self.fp = webdriver.FirefoxProfile()
        self.fp.set_preference('network.proxy.type', 1)
        self.fp.set_preference('network.proxy.http', self.proxy_host)
        self.fp.set_preference('network.proxy.http_port', self.proxy_port)
        self.fp.set_preference('network.proxy.ssl', self.proxy_host)
        self.fp.set_preference('network.proxy.ssl_port', self.proxy_port)
        self.fp.set_preference('general.useragent.override', self.agent())
        self.fp.update_preferences()

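    # Scrape fresh ip:port pairs from the free list at sslproxies.org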
    def scrape_sockets(self):
        print('acquiring new proxies')
        r = get('https://www.sslproxies.org/')
        matches = findall(r'<td>(\d+\.\d+\.\d+\.\d+)</td><td>(\d+)</td>', r.text)
        self.sockets = ['%s:%s' % (ip, port) for ip, port in matches]

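    # Drive Firefox through the current proxy: search Bing for the keyword,
    # page through the results until self.url appears, then visit it and
    # click a random link on the target page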
    def search(self, socket):
        print('starting search')
        temp_socket = socket.split(':')
        self.proxy_host = temp_socket[0]
        self.proxy_port = int(temp_socket[1])
        self.set_current_proxy()
        self.browser = webdriver.Firefox(firefox_profile=self.fp)
        self.browser.get('https://www.bing.com/')
        assert 'Bing' in self.browser.title
        print('using socket: %s:%d' % (self.proxy_host, self.proxy_port))
        print('searching for keyword(s):   %s' % self.keyword)
        self.random_sleep('short')
        search_box = self.browser.find_element_by_name('q')
        sub_button = self.browser.find_element_by_name('go')
        self.random_sleep('short')
        search_box.send_keys(self.keyword)
        self.random_sleep('long')
        sub_button.send_keys(Keys.RETURN)
        self.random_sleep('long')
        page_index = 0
        while self.url not in self.browser.current_url:
            page_index += 1
            print('current index:   %s' % str(page_index))
            page_links = self.browser.find_elements_by_xpath("//a[@href]")
            found_link = ''
            for link in page_links:
                if self.url in link.get_attribute('href'):
                    found_link = link
            if found_link:
                print('found %s at index %d' % (self.url, page_index))
                found_link.click()
            self.random_sleep('short')
            if self.url not in self.browser.current_url:
                self.random_sleep('short')
                idx = str(page_index + 1)
                self.browser.find_element_by_link_text(idx).click()
        self.random_sleep('short')
        target_links = self.browser.find_elements_by_xpath("//a[@href]")
        self.random_sleep('long')
        target_link = choice(target_links)
        target_link.click()
        self.random_sleep('long')
        print('visiting random page: ' + self.browser.current_url)
        self.random_sleep('short')

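    # Try each scraped socket in turn; on any exception, move on to the
    # next one, refreshing the proxy list every request_MAX requests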
    def start_search(self):
        for socket in self.sockets:
            try:
                self.search(socket)
            except Exception as e:
                print('\nexception caught:')
                print(type(e).__name__, e)
                print('trying next socket...')
                continue
            finally:
                # browser is None if the failure happened before launch
                if self.browser is not None:
                    self.browser.quit()
                self.request_count += 1
                if self.request_count > self.request_MAX:
                    self.request_count = 0
                    # the refreshed list takes effect on the next
                    # start_search() pass, not mid-iteration
                    self.scrape_sockets()

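    # Pick a random user-agent for the next browser session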
    def agent(self):
        return choice(self.agents)

    # Add/remove desired user-agents below
    def set_agents(self):
        self.agents = [
            "Opera/9.80 (S60; SymbOS; Opera Mobi/498; U; sv)",
            "Mozilla/2.02 [fr] (WinNT; I)",
            "WeatherReport/1.2.2 CFNetwork/485.12.7 Darwin/10.4.0",
            "W3C_Validator/1.432.2.10",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)",
            "Cyberdog/2.0 (Macintosh; 68k)",
            "MJ12bot/v1.0.8 (http://majestic12.co.uk/bot.php?+)",
            "Exabot/2.0",
            "Mozilla/5.0 (compatible; news bot /2.1)",
            "curl/7.19.6 (i686-pc-cygwin)",
            "ConveraCrawler/0.4",
            "Mozilla/4.0 (MSIE 6.0; Windows NT 5.1; Search)",
            "EARTHCOM.info/1.6 [www.earthcom.info]",
            "librabot/1.0 (+http://search.msn.com/msnbot.htm)",
            "NetResearchServer/2.5(loopimprovements.com/robot.html)",
            "PHP/5.2.10",
            "msnbot-Products/1.0 (+http://search.msn.com/msnbot.htm)",
            "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1;)"
        ]


if __name__ == "__main__":
    description = 'Usage: python proxy_crawler.py -u '
    description += '<https://example.com> -k <search keyword>'
    parser = ArgumentParser(description=description)
    parser.add_argument('-u', '--url', required=True, help='url')
    parser.add_argument('-k', '--keyword', required=True, help='keyword')
    d = parser.parse_args()
    if not d.url.startswith(('http://', 'https://')):
        print('please use an absolute URL')
        exit(1)
    while True:
        bot = ProxyCrawler(d.url, d.keyword)
        bot.start_search()
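
For reference, an example invocation (the URL and keyword below are just placeholders, and geckodriver needs to be on your PATH for Selenium's Firefox driver):

python3 proxy_crawler.py -u https://example.com -k "some keyword"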