Jan-09-2018, 06:21 PM
Here is my code:
import asyncio
import aiohttp
import urllib
import webbrowser
from bs4 import BeautifulSoup
import re

global link
link = ""

async def get(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(link) as resp:
            return await resp.text()

async def main():
    address = 'https://google.com/search?q='  # Default Google search address start
    file = open("OCR.txt", "rt")  # Open text document that contains the question
    word = file.read()
    file.close()
    myList = [item for item in word.split('\n')]
    newString = ' '.join(myList)  # The question is on multiple lines so this joins them together with proper spacing
    qstr = urllib.parse.quote_plus(newString)  # Encode the string
    newWord = address + qstr  # Combine the base and the encoded query
    text = await get(newWord)

    answers = open("ocr2.txt", "rt")
    ansTable = answers.read()
    answers.close()
    ans = ansTable.splitlines()
    ans1 = str(ans[0])
    ans2 = str(ans[2])
    ans3 = str(ans[4])
    ans1Score = 0
    ans2Score = 0
    ans3Score = 0

    links = []
    soup = BeautifulSoup(text, 'lxml')
    for r in soup.find_all(class_='r'):
        linkRaw = str(r)
        link = re.search("(?P<url>https?://[^\s]+)", linkRaw).group("url")
        if '&' in link:
            finalLink = link.split('&')
            link = str(finalLink[0])
        links.append(link)

    coros = [process_single_g(g) for g in soup.find_all(class_='g')]
    results = await asyncio.gather(*coros)
    for res in results:
        ans1Score, ans2Score, ans3Score = res

    print(' ')
    print('-----')
    print(ans1+": "+str(ans1Score))
    print(ans2+": "+str(ans2Score))
    print(ans3+": "+str(ans3Score))
    print('-----')

async def process_single_g(g):
    for g in soup.find_all(class_='g'):
        webBlock = str(g).lower()
        ans1Tally = webBlock.count(ans1)
        ans2Tally = webBlock.count(ans2)
        ans3Tally = webBlock.count(ans3)
        ans1Found = True
        ans2Found = True
        ans3Found = True
        if ans1 in webBlock:
            ans1Score += ans1Tally
        else:
            ans1Found = False
        if ans2 in webBlock:
            ans2Score += ans2Tally
        else:
            ans2Found = False
        if ans3 in webBlock:
            ans3Score += ans3Tally
        else:
            ans3Found = False
        if (ans1Found == False) and (ans2Found == False) and (ans3Found == False):
            print("Searching Link!")
            try:
                searchLink = str(links[0])
                if searchLink.endswith('pdf'):
                    pass
                else:
                    response2 = requests.get(searchLink)
                    soup2 = BeautifulSoup(response2.text, 'lxml')
                    for p in soup2.find_all('p'):
                        extraBlock = str(p)
                        extraAns1Tally = extraBlock.count(ans1)
                        extraAns2tally = extraBlock.count(ans2)
                        extraAns3Tally = extraBlock.count(ans3)
                        if ans1.lower() in extraBlock.lower():
                            ans1Score += extraAns1Tally
                        if ans2.lower() in extraBlock.lower():
                            ans2Score += extraAns2Tally
                        if ans3.lower() in extraBlock.lower():
                            ans3Score += extraAns3Tally
            except:
                pass
        if len(links) > 0:
            links.pop(0)
        else:
            pass
    text2 = await get(searchLink)

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()

Here is my error:
Traceback (most recent call last):
  File "scottbot.py", line 179, in <module>
    loop.run_until_complete(main())
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/asyncio/base_events.py", line 467, in run_until_complete
    return future.result()
  File "scottbot.py", line 36, in main
    text = await get(newWord)
  File "scottbot.py", line 14, in get
    async with session.get(link) as resp:
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/aiohttp/client.py", line 565, in __aenter__
    self._resp = yield from self._coro
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/aiohttp/client.py", line 195, in _request
    proxy=proxy, proxy_auth=proxy_auth, timeout=timeout)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/aiohttp/client_reqrep.py", line 91, in __init__
    self.update_host(url)
  File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/aiohttp/client_reqrep.py", line 111, in update_host
    raise ValueError('Host could not be detected.')
ValueError: Host could not be detected.

I am sure there are many things wrong with it, as this was another user's suggestion for making my code asynchronous. I am really lost on how to use aiohttp and asyncio, but my synchronous version of the program works perfectly. The only trouble I am having is converting it to scrape all of the "g" classes at the same time instead of going through them one at a time. There will always be ten of them, since each "g" class is a Google search result. I have looked over the docs and searched SO, but I cannot find anyone trying to do the same thing. If anybody could provide some assistance it would be greatly appreciated. I am sorry if this code is really messed up; I am really lost, so ANY kind of help and clarification (or even better, just showing me what the proper code looks like so I can learn from it) is massively appreciated. I have posted and looked on SO before, but nobody seems to have covered my kind of situation. Thanks for reading!
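One more detail in case it helps: rereading the traceback, I think get() is requesting the global link (which is still an empty string at that point) instead of the url argument it was given, which would explain the "Host could not be detected" error. Below is a rough sketch of the shape I think I am aiming for, pieced together from the aiohttp docs. It is not my full program (the extra link-searching logic is left out), the file names and the 'g' class lookup are just carried over from the code above, and I am not sure it is correct, but it shows what I mean by scraping all of the "g" blocks at once: get() uses the url it is given, and each "g" block gets its own coroutine that takes its inputs as arguments and returns its tallies so asyncio.gather() can collect them in main().

import asyncio
import urllib.parse

import aiohttp
from bs4 import BeautifulSoup


async def get(session, url):
    # Use the url that was passed in (not a global) and reuse one session.
    async with session.get(url) as resp:
        return await resp.text()


async def process_single_g(g, answers):
    # Count how often each candidate answer appears in this one result block.
    web_block = str(g).lower()
    return [web_block.count(ans.lower()) for ans in answers]


async def main():
    with open("OCR.txt", "rt") as f:       # question text from OCR
        question = ' '.join(f.read().split('\n'))
    with open("ocr2.txt", "rt") as f:      # candidate answers (every other line)
        lines = f.read().splitlines()
    answers = [lines[0], lines[2], lines[4]]

    url = 'https://google.com/search?q=' + urllib.parse.quote_plus(question)

    async with aiohttp.ClientSession() as session:
        text = await get(session, url)
        soup = BeautifulSoup(text, 'lxml')

        # One coroutine per "g" block, all awaited together.
        coros = [process_single_g(g, answers) for g in soup.find_all(class_='g')]
        results = await asyncio.gather(*coros)

    # Sum the per-block tallies into one score per answer.
    scores = [sum(col) for col in zip(*results)]
    print('-----')
    for ans, score in zip(answers, scores):
        print(ans + ": " + str(score))
    print('-----')


if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main())
    finally:
        loop.close()

If that is roughly the right pattern, I think I can fold the link-searching fallback back in myself; I mainly want to know whether this is how the coroutines are supposed to be wired together.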