Apr-09-2018, 06:38 PM
Is it possible to multiprocess this script without breaking it up into functions?
I'm trying to keep it as barebones and simple as possible.
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
# EXTREMELY SIMPLE SCRAPING SCRIPT
#
# Fetches each URL in ``list1`` in parallel, reports the HTTP status and the
# Alexa "popularity" rank scraped from the page, and collects any exceptions.
#
# NOTE(review): the original snippet was syntactically broken — ``list1`` was
# never assigned (the URL literals floated loose), ``p.map(,list1[...])`` passed
# no worker function, ``= =`` appeared instead of ``==``, and the pool was
# terminated before any work ran while a serial loop did the real scraping.
# ``Pool.map`` requires a callable, so the per-URL work must live in a function;
# the script below is the repaired, actually-parallel version.

from time import sleep
from multiprocessing import Pool
import re

import requests
from bs4 import BeautifulSoup

# Browser-like UA so sites don't reject the default python-requests agent.
HEADERS = {
    "user-agent": "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.90 Safari/537.36"
}

list1 = [
    "http://www.thesportsbank.net/football/liverpool/jurgen-klopp-very-positive-about-mo-salah-injury/",
    "http://bangaloreweekly.com/2018-04-06-city-holding-co-invests-in-cvs-health-corporation-cvs-shares/",
    "http://www.businessdayonline.com/1bn-eca-withdrawal-commence-action-president-buhari-pdp-tasks-nass/",
    "https://www.pmnewsnigeria.com/2018/04/03/apc-governors-keep-sealed-lips-after-meeting-with-buhari/",
    "https://stocknewstimes.com/2018/04/05/amazon-com-inc-amzn-shares-bought-by-west-oak-capital-llc.html",
]


def scrape_one(url):
    """Fetch *url* and extract its popularity rank.

    Returns a 4-tuple ``(status_code, url, rank, error)`` where ``status_code``
    and ``rank`` are ``None`` on failure and ``error`` is the caught exception
    (or ``None`` on success).  Returning — rather than raising — keeps
    ``Pool.map`` from aborting the whole batch on one bad URL.
    """
    try:
        scrape = requests.get(url, headers=HEADERS, timeout=10)
        if scrape.status_code == 200:
            sleep(0.15)  # small politeness delay between request and parse
            soup = BeautifulSoup(scrape.content, "html.parser")
            # The rank is embedded as <popularity ... text="NNN"> in the markup.
            ranks = re.findall(r'<popularity[^>]*text="(\d+)"', str(soup))
            # Guard against pages with no <popularity> tag (original raised
            # IndexError on rank[0] here).
            rank = ranks[0] if ranks else None
            return (scrape.status_code, url, rank, None)
        return (scrape.status_code, url, None, None)
    except Exception as e:  # narrowed from BaseException: don't swallow SystemExit/KeyboardInterrupt
        return (None, url, None, e)


if __name__ == "__main__":
    print()
    print("Total URLS:", len(list1), "- Starting Task...")
    print()

    exceptions = []
    # Pool.map needs a worker FUNCTION plus the full iterable of URLs;
    # the with-block replaces the manual terminate()/join() pair.
    with Pool(10) as p:  # process count
        results = p.map(scrape_one, list1)

    for counter, (status, url, rank, err) in enumerate(results):
        if err is not None:
            exceptions.append(err)
            print()
            print(err)
            print()
        elif status == 200:
            print("Server Status:", status, "-", u"\u2713", "-",
                  counter, "-", url, "-", "Rank:", rank)
        else:
            print("Server Status:", status)

    if len(exceptions) > 0:
        print("OUTPUT ERROR LOGS:", exceptions)
    else:
        print("No Errors To Report")