Python Forum
Python - Why multi threads are not working in this web crawler? - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html)
+--- Thread: Python - Why multi threads are not working in this web crawler? (/thread-6346.html)



Python - Why multi threads are not working in this web crawler? - ratanbhushan - Nov-17-2017



Hi Team

I have a program in which I want threads want to be run in parallel.

import requests
from queue import Queue
import time
from bs4 import BeautifulSoup
import threading

urlList = []
q = Queue()

def url_c(url):

    try:
        r = requests.get(url)
        htmldoc = r.content

        if r.status_code in [400,404,403,408,409,501,502,503]:print (str(r.status_code)+"-"+str(r.status_code)+"-->"+url)               
        else: print ("no problem in-->",url)
        
        soup= BeautifulSoup(htmldoc,'html.parser')
        links = []
        links = soup.findAll('a')

        if len(links)>0:
            for link in links:
                if link.get('href') not in urlList and link.get('href') is not None and len(link.get('href'))>10 and 'JavaScript' not in link.get('href'):
                    if 'http' not in link.get('href'):
                        urlList.append(url + link.get('href'))
                    else:
                        urlList.append(link.get('href'))
    except:
        print("ERROR ",url)


def threader():
    while True:
        url = q.get
        url_c(url)
        q.task_done()



# how many threads are we going to allow for
for x in range(10):
     t = threading.Thread(target=threader)

     # classifying as a daemon, so they will die when the main dies
     t.daemon = True

     # begins, must come after daemon definition
     t.start()


    
def main():
    global end 
    global start

    end = 0
    start =1

    print('Enter the URL')
    url = input()
    
    url_c(url)
    end = len(urlList)
    
    while start != end:
        end = len(urlList)
        print(start)
        url_c(urlList[start])
        q.put(urlList[start])
        print(end)
        start +=1
        
    i =1
    for u in urlList:
        print('length is ->',len(u),'-',u)
        i +=1

    print('There are ',len(urlList),' links.')


main()
Please advise what to do?


RE: Python - Why multi threads are not working in this web crawler? - Larz60+ - Nov-17-2017

what are the specific issues?
what are the full, verbatim error tracebacks, if any?