Dec-05-2017, 05:51 PM
I wrote this on my Android phone using QPython3.
#-*-coding:utf8;-*-
#qpy:3
#qpy:console
"""Minimal same-domain web crawler.

Starting from *seed*, repeatedly pops a URL from the *tocrawl* queue,
fetches it, and enqueues any not-yet-visited links on the seed's host.
"""
import re
import urllib.request
from urllib.parse import urlparse

seed = 'http://www.google.com'
tocrawl = {seed}   # URLs waiting to be visited
crawled = set()    # URLs already visited

# Compiled once (it was recompiled on every page before) and made a raw
# string: the original non-raw '\(' '\)' are invalid escape sequences in
# modern Python. NOTE(review): '[$-_@.&+]' is a character *range* from
# '$' to '_' (it also matches digits, uppercase letters, '/', '?', '=');
# kept byte-identical to preserve the original matching behavior.
_URL_RE = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def extract_links(html):
    """Return the set of absolute http(s) URLs found in *html*."""
    return set(_URL_RE.findall(html))


def get_links(url):
    """Fetch *url* and yield each discovered link as a ParseResult.

    Best-effort: on any fetch/decode error the error is printed and the
    generator yields nothing, so one dead page cannot stop the crawl.
    """
    try:
        # latin-1 maps every byte, so decoding can never raise on content.
        html = urllib.request.urlopen(url).read().decode('latin-1')
    except Exception as error:
        print(error)
        return  # replaces the old 'links = 0' falsy sentinel
    for link in extract_links(html):
        # The regex only produces absolute URLs, so a missing scheme should
        # not occur; the fallback is kept from the original for safety.
        if not urlparse(link).scheme:
            link = '{}{}'.format(seed, link)
        yield urlparse(link)


def crawl(root):
    """Visit *root*, mark it crawled, and queue unseen same-host links."""
    print('Crawling:--> {}'.format(root))
    queue_size = len(tocrawl)
    total_crawled = len(crawled)
    print(total_crawled, queue_size)
    crawled.add(root)
    seed_netloc = urlparse(seed).netloc  # hoisted: invariant across links
    for link in get_links(root):
        if link.netloc == seed_netloc and link.geturl() not in crawled:
            tocrawl.add(link.geturl())


# Guarded so importing this module does not start a network crawl.
if __name__ == '__main__':
    while tocrawl:
        crawl(tocrawl.pop())