Find a specific word on a webpage and save the url

moses · May-16-2018, 08:29 PM

Hello Everyone,

I have an assignment that is quickly nearing due date, I was giving a code to correct for almost a week and i have tried my best so far but it's not enough obviously.

The job the code is suppose to do is to search through a list of url to find if a specific Product (text)in this case NIKE appear on each website, if it is on the site then save the url in ouput open("NikeShoes.txt","a") and if not do nothing and got to the next site in the list as fast as possible.

Now my problem is that no matter what I do I just don't get it to save url instead it saves the string 'nike' which was what I searched for but not what I intend to get as output, the output is supposed to be the website which the string is found on. below is the entire code.

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

import urllib2
import re
import sys
import cookielib
from threading import Timer
from multiprocessing import Process, Queue
  
class GetResults(Process): 
    def __init__(self, rezqueue):
        Process.__init__(self)
        self.rezqueue = rezqueue
    
    def run(self):
        while True:
            shoe = self.rezqueue.get()
            if shoe is None:   return False
            with open("NikeShoes.txt","a") as Product:
                Product.write(shoe.rstrip()+"\n")
            print shoe
  
class Crawler(Process): 
    def __init__(self, queue, rezqueue):
        Process.__init__(self)
        self.queue = queue
        self.rezqueue = rezqueue
    
     
             
 
    def run(self):
        while True:
            site = self.queue.get()
            if site is None:    return False
            self.crawl(site)
             
             
    def crawl(self,site):
        try:
            WatchIt = Timer(15.0, self.WatchDog)
            WatchIt.start()
  
            cj = cookielib.CookieJar()        
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
            opener.addheaders = [('Accept:','*'),("User-Agent", "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:31.0) Gecko/20100101 Firefox/31.0")]
            opener.addheaders = [('Content-Type', 'text/html; charset=utf-8'),("Accept-Encoding", "")]
            resp = opener.open(site,timeout=10)
            WatchIt.cancel()
            self.getem(resp.read())
             
        except Exception, e:
            #print e
            f = 1
              
    def getem(self,resp):
        try:
            shoes = re.findall('nike', str(resp))
            CleanProducts = set(shoes)
            for em in CleanProducts:
                self.rezqueue.put(em.lower())
        except Exception, e:
            return False
  
    def WatchDog(self):
        return False           
               
  
              
if __name__ == "__main__":
  
    if len(sys.argv) < 3:
        print "\tExample: ",sys.argv[0],"30 dom.txt"
        sys.exit()
  
    queue = Queue(maxsize=3000)
    rezqueue = Queue()
    ThreadNumber = int(sys.argv[1])
    ThreadList = []
  
    for i in range(ThreadNumber):
        t = Crawler(queue,rezqueue)
        t.daemon = True
        t.start()
        ThreadList.append(t)
          
    GR = GetResults(rezqueue)
    GR.daemon = True
    GR.start()
      
    with open(sys.argv[2],"rU") as urls:
        for url in urls:
            try:
                if url.startswith('http://'):
                    queue.put(url.rstrip())
                else:
                    url = 'http://'+url.rstrip()
                    queue.put(url.rstrip())
            except Exception, e:
                print e
                  
    for i in range(ThreadNumber):
        queue.put(None)
               
    for Worker in ThreadList:
        Worker.join()
          
    GR.join()

Quick response will be appreciated i have to submit before weekend.
Thank you everyone.
Moses.

Find a specific word on a webpage and save the url

User Panel Messages

Announcements