Python Forum
Find a specific word on a webpage and save the url
Thread Rating:
  • 1 Vote(s) - 5 Average
  • 1
  • 2
  • 3
  • 4
  • 5
Find a specific word on a webpage and save the url
#1
Hello Everyone,

I have an assignment whose due date is quickly approaching. I was given some code to correct almost a week ago, and I have tried my best so far, but it's obviously not enough.

The job the code is supposed to do is to search through a list of URLs to find whether a specific product (text), in this case NIKE, appears on each website. If it is on the site, then save the URL to the output file (open("NikeShoes.txt","a")); if not, do nothing and go to the next site in the list as fast as possible.

Now my problem is that no matter what I do, I just can't get it to save the URL. Instead it saves the string 'nike', which is what I searched for but not what I intend to get as output; the output is supposed to be the website on which the string is found. Below is the entire code.

import urllib2
import re
import sys
import cookielib
from threading import Timer
from multiprocessing import Process, Queue
 
class GetResults(Process): 
    def __init__(self, rezqueue):
        Process.__init__(self)
        self.rezqueue = rezqueue
   
    def run(self):
        while True:
            shoe = self.rezqueue.get()
            if shoe is None:   return False
            with open("NikeShoes.txt","a") as Product:
                Product.write(shoe.rstrip()+"\n")
            print shoe
 
class Crawler(Process): 
    def __init__(self, queue, rezqueue):
        Process.__init__(self)
        self.queue = queue
        self.rezqueue = rezqueue
   
    
            

    def run(self):
        while True:
            site = self.queue.get()
            if site is None:    return False
            self.crawl(site)
            
            
    def crawl(self,site):
        try:
            WatchIt = Timer(15.0, self.WatchDog)
            WatchIt.start()
 
            cj = cookielib.CookieJar()        
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            opener.addheaders = [('Accept:','*'),("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0")]
            opener.addheaders = [('Content-Type', 'text/html; charset=utf-8'),("Accept-Encoding", "")]
            resp = opener.open(site,timeout=10)
            WatchIt.cancel()
            self.getem(resp.read())
            
        except Exception, e:
            #print e
            f = 1
             
    def getem(self,resp):
        try:
            shoes = re.findall('nike', str(resp))
            CleanProducts = set(shoes)
            for em in CleanProducts:
                self.rezqueue.put(em.lower())
        except Exception, e:
            return False
 
    def WatchDog(self):
        return False           
              
 
             
if __name__ == "__main__":
 
    if len(sys.argv) < 3:
        print "\tExample: ",sys.argv[0],"30 dom.txt"
        sys.exit()
 
    queue = Queue(maxsize=3000)
    rezqueue = Queue()
    ThreadNumber = int(sys.argv[1])
    ThreadList = []
 
    for i in range(ThreadNumber):
        t = Crawler(queue,rezqueue)
        t.daemon = True
        t.start()
        ThreadList.append(t)
         
    GR = GetResults(rezqueue)
    GR.daemon = True
    GR.start()
     
    with open(sys.argv[2],"rU") as urls:
        for url in urls:
            try:
                if url.startswith('http://'):
                    queue.put(url.rstrip())
                else:
                    url = 'http://'+url.rstrip()
                    queue.put(url.rstrip())
            except Exception, e:
                print e
                 
    for i in range(ThreadNumber):
        queue.put(None)
              
    for Worker in ThreadList:
        Worker.join()
         
    GR.join()
A quick response will be appreciated; I have to submit before the weekend.
Thank you everyone.
Moses.
Reply


Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020