Python Forum
A simple web crawler that does nothing special. - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: General (https://python-forum.io/forum-1.html)
+--- Forum: Code sharing (https://python-forum.io/forum-5.html)
+--- Thread: A simple web crawler that does nothing special. (/thread-6746.html)



A simple web crawler that does nothing special. - RickyWilson - Dec-05-2017

I wrote this on my Android phone using QPython3.

#-*-coding:utf8;-*-
#qpy:3
#qpy:console


import urllib.request
from urllib.parse import urlparse
import re

# Crawl frontier state (module-level; mutated by crawl()).
seed = 'http://www.google.com'  # start page; also used to absolutize scheme-less links
tocrawl = {seed}                # URLs queued to visit (set literal, not set([...]))
crawled = set()                 # URLs already visited


def get_links(url):
    """Fetch *url* and yield each distinct absolute link found in its body.

    Yields ``urllib.parse.ParseResult`` objects, one per unique http(s)
    URL matched in the fetched page. Any link without a scheme is prefixed
    with the module-level ``seed`` before parsing (defensive: the regex
    below only matches scheme-qualified URLs, so this rarely fires).

    On any fetch/decode failure the error is printed and nothing is
    yielded — the crawler is deliberately best-effort, so the broad
    ``except`` is intentional.
    """
    try:
        # latin-1 never raises on arbitrary bytes, so any byte stream decodes.
        html = urllib.request.urlopen(url).read().decode('latin-1')
    except Exception as error:
        print(error)
        return  # yield nothing on failure instead of tracking a sentinel
    # Raw string: the original non-raw literal depended on Python passing
    # the unknown escapes \( and \) through, which is deprecated syntax.
    pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    for link in set(re.findall(pattern, html)):
        if not urlparse(link).scheme:
            link = '{}{}'.format(seed, link)
        yield urlparse(link)

def crawl(root):
    """Visit *root*: mark it crawled and queue its same-site links.

    Mutates the module-level ``crawled`` and ``tocrawl`` sets. A link is
    queued only when its network location matches the seed's and it has
    not been crawled yet.
    """
    print('Crawling:--> {}'.format(root))
    # Same output as before: visited count, then queue size.
    print(len(crawled), len(tocrawl))
    crawled.add(root)
    # Hoist the loop-invariant seed netloc out of the loop.
    seed_netloc = urlparse(seed).netloc
    for link in get_links(root):
        if link.netloc == seed_netloc and link.geturl() not in crawled:
            tocrawl.add(link.geturl())
    
# Drive the crawl: keep popping from the frontier until it is exhausted.
while tocrawl:
    crawl(tocrawl.pop())