Dec-29-2017, 12:55 AM
This is my completed Python script so far. It parses a news website and organizes the companies' news into a Google Sheets file. Articles are matched against a list of keywords, and only companies of certain market caps are kept.
import datetime
import time
import urllib.request

import bs4 as bs
import pygsheets
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('https://login.globenewswire.com/?ReturnUrl=%2fSecurity%2fLogin%3fculture%3den-US&culture=en-US#login')
# MANUALLY DO THE LOGIN

gc = pygsheets.authorize()
sh = gc.open('GNW API')
wks = sh.sheet1

x = 18  # hour of the next run
y = 26  # minute of the hour to run at

KeyWords = ['develop', 'contract', 'award', 'certif', 'execut', 'research', 'drug',
            'theraputic', 'pivotal', 'trial', 'patient', 'data', 'fda', 'stud',
            'phase', 'licenc', 'cancer', 'agree', 'clinical', 'acquisition',
            'translational', 'trial', 'worldwide', 'world wide', 'world-wide',
            'exclusiv', 'positive', 'successful', 'enter', 'sell', 'acquir',
            'buy', 'bought', 'payment', 'availiab', 'design', 'transaction',
            'increas', 'sale', 'record', 'clearance', 'right', 'launch',
            'introduc', 'payment', 'meet', 'endpoint', 'primary', 'secondary',
            'major', 'milestone', 'collaborat', 'beat', 'astound', 'sign',
            'order', 'suppl', 'produc', 'made', 'make', 'making', 'customer',
            'client', 'mulitpl', 'result', 'distribut', 'disease', 'treat',
            'chmp', 'priority', 'promis', 'patent', 'purchas', 'allianc',
            'strategic', 'team', 'commercializ', 'approv', 'select', 'strong',
            'strength', 'grow', 'profit', 'improv', 'partner', 'cannabis',
            'crypto', 'bitcoin', 'platform', 'expands', 'extends']

while True:
    now = datetime.datetime.today()
    # note: when the hour does not match x, this outer loop spins without sleeping
    while now.hour == x:
        while now.minute == y:
            list = []       # all headline links (note: shadows the built-in list)
            listfinal = []  # only the links that match a keyword

            # collect every headline link from the saved search
            driver.get('https://globenewswire.com/Search?runSearchId=41556723')
            elementals = driver.find_elements_by_class_name('post-title16px')
            for elements in elementals:
                list.append(elements.find_element_by_css_selector('a').get_attribute('href'))

            # keep the links whose URL contains one of the keywords
            for elements in list:
                if any(KeyWords_item in elements.lower() for KeyWords_item in KeyWords):
                    listfinal.append(elements)

            for elementals in listfinal:
                sauce = urllib.request.urlopen(elementals).read()
                soup = bs.BeautifulSoup(sauce, 'lxml')
                desc = soup.find_all(attrs={"name": "ticker"}, limit=1)
                decodedticker = desc[0]['content']  # already a str; no encode/decode needed
                souptitle = soup.title.text

                # work out the exchange and ticker from the meta tag
                if ', ' in decodedticker.lower():
                    # several listings, e.g. "NYSE: ABC, TSX: ABC"; take the first recognized one
                    finaltickerlist = decodedticker.split(', ')
                    for elements in finaltickerlist:
                        if 'nyse' in elements.lower():
                            if ':' in elements:
                                a, b = elements.split(':')
                                finaltickerexchange = 'NYSE'
                                finalticker = b.replace(' ', '')
                                break
                            else:
                                finalticker = 'NoTicker'
                                finaltickerexchange = 'NoTicker'
                        elif 'nasdaq' in elements.lower():
                            if ':' in elements:
                                a, b = elements.split(':')
                                finaltickerexchange = 'NASDAQ'
                                finalticker = b.replace(' ', '')
                                break
                            else:
                                finalticker = 'NoTicker'
                                finaltickerexchange = 'NoTicker'
                        elif 'tsx' in elements.lower():
                            if ':' in elements:
                                a, b = elements.split(':')
                                finaltickerexchange = 'TSX'
                                finalticker = b.replace(' ', '')
                                break
                            else:
                                finalticker = 'NoTicker'
                                finaltickerexchange = 'NoTicker'
                        else:
                            finalticker = 'NoTicker'
                            finaltickerexchange = 'NoTicker'
                elif 'nasdaq' in decodedticker.lower():
                    if ':' in decodedticker:
                        a, b = decodedticker.split(':', maxsplit=1)
                        finalticker = b.replace(' ', '')
                        finaltickerexchange = 'NASDAQ'
                    else:  # no colon means there is no ticker to extract
                        finalticker = 'NoTicker'
                        finaltickerexchange = 'NoTicker'
                elif 'nyse' in decodedticker.lower():
                    if ':' in decodedticker:
                        a, b = decodedticker.split(':', maxsplit=1)
                        finalticker = b.replace(' ', '')
                        finaltickerexchange = 'NYSE'
                    else:
                        finalticker = 'NoTicker'
                        finaltickerexchange = 'NoTicker'
                elif 'tsx' in decodedticker.lower():
                    if ':' in decodedticker:
                        a, b = decodedticker.split(':', maxsplit=1)
                        finalticker = b.replace(' ', '')
                        finaltickerexchange = 'TSX'
                    else:
                        finalticker = 'NoTicker'
                        finaltickerexchange = 'NoTicker'
                else:
                    finalticker = 'NoTicker'
                    finaltickerexchange = 'NoTicker'

                # look up the market cap on Yahoo Finance
                if finalticker != 'NoTicker':
                    sauce = urllib.request.urlopen('https://finance.yahoo.com/quote/' + finalticker + '?p=' + finalticker).read()
                    soup = bs.BeautifulSoup(sauce, 'lxml')
                    mc_elm = soup.find(attrs={"data-test": "MARKET_CAP-value"})
                    if mc_elm:
                        marketcap = mc_elm.get_text()
                    else:
                        marketcap = "TickerNotFound"
                    if 'B' in marketcap:
                        marketcap = 'Billion Kalppa'  # Yahoo marks billions with a 'B'; those companies are skipped
                    else:
                        values_list = [finalticker, finaltickerexchange, marketcap, souptitle, elementals]  # elementals is the article URL
                        wks.insert_rows(row=0, number=1, values=values_list)

            # advance the target hour (wrapping at midnight) and sleep ~50 minutes
            if x == 23:
                x = 0
            else:
                x += 1
            time.sleep(3000)
            break
        time.sleep(55)
        break

I intend to have this program run in the background and scrape news every hour, at the 26th minute of the hour.
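For what it's worth, this is the behaviour the nested clock-checking loops above are trying to produce. A rough sketch of the same idea that sleeps straight through to the next 26th minute instead of polling (the helper name is mine, not part of the script):

import datetime
import time

def sleep_until_minute(target_minute=26):
    # Work out how far away the next :26 is and sleep exactly that long.
    now = datetime.datetime.now()
    run_at = now.replace(minute=target_minute, second=0, microsecond=0)
    if run_at <= now:
        run_at += datetime.timedelta(hours=1)
    time.sleep((run_at - now).total_seconds())

while True:
    sleep_until_minute(26)
    # ... scrape, filter, and upload here ...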
However, after a while (not immediately; the code can run for 7-8 hours, i.e. iterations, before this happens) I get this error:

Traceback (most recent call last):
  File "<pyshell#3>", line 93, in <module>
    sauce = urllib.request.urlopen('https://finance.yahoo.com/quote/' + finalticker + '?p=' + finalticker).read()
  File "C:\Users\Arbi717\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "C:\Users\Arbi717\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 532, in open
    response = meth(req, response)
  File "C:\Users\Arbi717\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "C:\Users\Arbi717\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 570, in error
    return self._call_chain(*args)
  File "C:\Users\Arbi717\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 504, in _call_chain
    result = func(*args)
  File "C:\Users\Arbi717\AppData\Local\Programs\Python\Python36-32\lib\urllib\request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 503: Service Unavailable

I believe urllib.request is the problem, because it appears all through the traceback, but I have no idea what the solution is. Any help is much appreciated.
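One direction I have been wondering about, since a 503 comes from Yahoo's server refusing or throttling the request rather than from urllib itself: catch the HTTPError and retry after a pause, with a browser-like User-Agent on the request. A rough, untested sketch (the helper name and the retry counts are placeholders):

import time
import urllib.error
import urllib.request

def fetch_with_retries(url, attempts=3, delay=30):
    # Retry transient HTTP errors such as 503, pausing between tries.
    # The User-Agent header makes the request look less like a bare script.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    for attempt in range(attempts):
        try:
            return urllib.request.urlopen(req).read()
        except urllib.error.HTTPError as e:
            if e.code == 503 and attempt < attempts - 1:
                time.sleep(delay)  # back off, then try again
            else:
                raise

# would stand in for the Yahoo Finance call above, e.g.:
# sauce = fetch_with_retries('https://finance.yahoo.com/quote/' + finalticker + '?p=' + finalticker)

Is a retry like that the right way to handle a 503, or is something else going on?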
