I am trying to build a web crawler to extract all the links on a webpage. I have created 2 python files. (class: scanner.py and object: vulnerability-scanner.py). When I run the script there is an error shows up. I am unable to find the error. Help me to solve this.
Source code
----------------------------------------------------------------------------
scanner.py
------------------------------------------------------------------------------------------
vulnerability-scanner.py
-------------------------------------------------------------------------------------------
error
Source code
----------------------------------------------------------------------------
scanner.py
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
import requests import re import urllib.parse class Scanner: def __init__( self , url): self .target_url = url self .target_links = [] def extract_links_from( self , url): response = requests.get(url) return re.findall( '"((http|ftp)s?://.*?)"' , response.content.decode( 'utf-8' )) def crawl( self , url = None ): if url = = None : url = self .target_url href_links = self .extract_links_from(url) for link in href_links: link = urllib.parse.urljoin(url, link) if '#' in link: link = link.split( "#" )[ 0 ] if self .target_url in link and link not in self .target_links: self .target_links.append(link) print (link) self .crawl(link) |
vulnerability-scanner.py
1 2 3 4 5 |
import scanner vul_scanner = scanner.Scanner(target_url) vul_scanner.crawl(target_url) |
error
Error:Traceback (most recent call last):
File "C:/xampp/htdocs/WebVIM/vulnerability-scanner.py", line 5, in <module>
vul_scanner.crawl(target_url)
File "C:\xampp\htdocs\WebVIM\scanner.py", line 19, in crawl
link = urllib.parse.urljoin(url, link)
File "C:\Users\HouseMoNaRa\AppData\Local\Programs\Python\Python37-32\lib\urllib\parse.py", line 487, in urljoin
base, url, _coerce_result = _coerce_args(base, url)
File "C:\Users\HouseMoNaRa\AppData\Local\Programs\Python\Python37-32\lib\urllib\parse.py", line 120, in _coerce_args
raise TypeError("Cannot mix str and non-str arguments")
TypeError: Cannot mix str and non-str arguments