Dec-18-2017, 05:46 AM
I got bored today so I started coding up this little web crawler. Not knowing much about the standard library's HTMLParser I figured a web crawler would be a good way to learn the API. I was surprised how easy it was to get the content from the pages that I wanted. The only major issue I had with HTMLParser was making it fault tolerant. The fix was actually very simple. I had to overwrite the error method of HTMLParser. I'm not sure but I think it stops parsing the page on broken html. Before overwriting the error method it would just break by way of an exception that would be very hard and hacky to catch.
The problem I'm taking on now is recording the 404 links. I thought this would be trivial but for some reason I can't get it to work. No exceptions are raised but the set that's supposed to contain the bad links stays empty.
#-*-coding:utf8;-*- #qpy:3 #qpy:console from html.parser import HTMLParser as _HTMLParser from urllib.request import urlopen import urllib.robotparser from urllib.parse import urlparse, urljoin from json import dumps as parse_json import cProfile import mimetypes import time import math import random mimetypes.init() PROFILE = cProfile.Profile() ENABLE_PROFILING = False if ENABLE_PROFILING: PROFILE.enable() DEFAULT_ENCODING = 'latin-1' PARANT = '../' def profile(func): def wrap(*args, **kw): PROFILE.enable() return func(*args, **kw) PROFILE.disable() return wrap def average(lst): return float(sum(lst) / len(lst)) def random_wait(mini, maxi): time.sleep(random.randint(mini, maxi)) USER_AGENTS = [ 'Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11', 'Opera/9.25 (Windows NT 5.1; U; en)', 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)', 'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12', 'Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9' ] def random_ua(): return random.choice(USER_AGENTS) class CrawlerClient(object): def __init__(self, **kw): self.robotparser = urllib.robotparser.RobotFileParser() self.ua = random_ua() self.referer = kw.get('referer', 'www.google.com') self.not_found = set() def can_fetch(self, url): return self.robotparser.can_fetch(self.ua, url) def get(self, url): self.ua = random_ua() req = urllib.request.Request(url) req.add_header('User-Agent', self.ua) req.add_header('Connection', 'keep-alive') req.add_header('Accept','text/html,xhtml,xml') req.add_header('Referer', self.referer) parsed_url = urlparse(url) robot_file_path = parsed_url.scheme + '://' + parsed_url.netloc + '/robots.txt' self.robotparser.set_url(robot_file_path) self.robotparser.read() if self.can_fetch(url): try: with urlopen(req) as res: http_headers = res.headers status_code = res.getcode() if status_code == 404: self.not_found.add(url) return if status_code == 500: return content_type, *charset = http_headers.get('content-type').split(';') # Try to guess the charset. if charset: charset = charset[0].strip().split('=')[1] # Use fallback encoding. else: charset = DEFAULT_ENCODING # If the downloaded content is of type text/* # feed the content too the parser. if content_type.split('/')[0] == 'text': return res.read().decode(charset) except Exception as e: #print(e) pass class HTMLParser(_HTMLParser): def __init__(self, url, strict=False): self.url = urlparse(url) self.size = 0 self.client = CrawlerClient(referer='https://rickys-python-notes.blogspot.com') # If strict is True the parser will break # on broken html. Othewise it will ignore # broken html and keep on parsing. if not strict: self.error = self._do_nothing _HTMLParser.__init__(self) # Links holds all the links that parser finds. # The parser looks for links in anchor and link tags. self.links = set() self.base_url = '{}://{}'.format(self.url.scheme, self.url.netloc) # title will hold the value of the pages title if # the page has a title self.title = None # This variable lets the handle_data method know # that we are curently reading the title data so # it can store it in self.title self.recording_title = False html = self.client.get(url) if html: self.feed(html) def handle_starttag(self, tag, attrs): try: key, val, *_ = attrs[0] except IndexError: key, val = 0, 0 if key == 'href': if val.startswith('//'): val = self.url.scheme + ':' + val url = urlparse(val) if not url.netloc: url = urlparse(urljoin(self.base_url, url.path)) self.links.add(url.geturl()) if tag == 'title': self.recording_title = True def handle_endtag(self, tag): if tag == 'title': self.recording_title = False def handle_data(self, data): if self.recording_title: self.title = data.strip() def _do_nothing(self, *_, **__): return class CrawlerQueue(object): def __init__(self, seed, **kw): self.seed = seed self.tocrawl = [seed] self.crawled = list() self.non_html_links = list() self.domain = urlparse(seed).netloc self.same_domain = kw.get('same_domain', True) self.exclude_parant_links = kw.get('exclude_parant_links', True) def next(self): random.shuffle(self.tocrawl) link = self.tocrawl.pop() self.crawled.append(link) return link def is_same_domain(self, link): return urlparse(link).netloc == self.domain def add_link(self, link): guessed_type = mimetypes.guess_type(link)[0] or 'text/html' if not guessed_type == 'text/html': return else: self.non_html_links.append(link) if link in self.crawled: return if self.exclude_parant_links and PARANT in link: return if not self.same_domain: self.tocrawl.append(link) else: if self.is_same_domain(link): self.tocrawl.append(link) def add_links(self, links): [self.add_link(link) for link in links] @property def total_crawled(self): return len(self.crawled) @property def in_queue(self): return len(self.tocrawl) @property def total_non_html_links(self): return len(self.non_html_links) @property def has_links(self): return bool(self.tocrawl) @property def empty(self): return self.has_links is False q = CrawlerQueue('http://reddit.com', same_domain=0) not_found = set([]) while q.has_links: crawling = q.next() page = HTMLParser(crawling) [not_found.add(link) for link in page.client.not_found] q.add_links(page.links) title = page.title if title: print(title, not_found)