This is one that has worked before with no issues. I thought this was true of all of my Python files but now it seems is just this one. I did however have the wrong file that actually worked in the end. This file has given A LOT of traceback errors that I hope will be rectified once the initial issue is resolved.
Hold on though - let me try the virtual environment and I will let you know.
Thank you!
Hold on though - let me try the virtual environment and I will let you know.
Thank you!
import requests from bs4 import BeautifulSoup from pathlib import Path import CheckInternet import sys class GetCompletions: def __init__(self, infile): self.check_network = CheckInternet.CheckInternet() self.homepath = Path('.') self.rootpath = self.homepath / '..' self.datapath = self.rootpath / 'data' self.commandpath = self.datapath / 'command_files' self.completionspath = self.datapath / 'completions' self.htmlpath = self.datapath / 'html' self.reportspath = self.datapath / 'reports' if self.check_network.check_availability(): # use: Api_May_27_2018.txt for testing # self.infilename = 'Api_May_27_2018.txt' self.infilename = input('Please enter api filename: ') self.infile = self.commandpath / self.infilename self.api = [] with self.infile.open() as f: for line in f: self.api.append(line.strip()) self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date', 'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls'] self.get_all_pages() self.parse_and_save(getpdfs=True) else: print('Internet access required, and not found.') print('Please make Internet available and try again') def get_url(self): for entry in self.api: print("http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10])) yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10])) def get_all_pages(self): for entry, url in self.get_url(): print('Fetching main page for entry: {}'.format(entry)) response = requests.get(url) if response.status_code == 200: filename = self.htmlpath / 'api_{}.html'.format(entry) with filename.open('w') as f: f.write(response.text) else: print('error downloading {}'.format(entry)) def parse_and_save(self, getpdfs=False): filelist = [file for file in self.htmlpath.iterdir() if file.is_file()] for file in filelist: with file.open('r') as f: soup = BeautifulSoup(f.read(), 'lxml') if getpdfs: links = soup.find_all('a') for link in links: url = link['href'] if 'www' in url: continue print('downloading pdf at: {}'.format(url)) p = url.index('=') response = requests.get(url, stream=True, allow_redirects=False) if response.status_code == 200: try: header_info = response.headers['Content-Disposition'] idx = header_info.index('filename') filename = self.completionspath / header_info[idx + 9:] except ValueError: filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:]) print("couldn't locate filename for {} will use: {}".format(file, filename)) except KeyError: filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:]) print('got KeyError on {}, response.headers = {}'.format(file, response.headers)) print('will use name: {}'.format(filename)) print(response.headers) with filename.open('wb') as f: f.write(response.content) sfname = self.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10]) tds = soup.find_all('td') with sfname.open('w') as f: for td in tds: if td.text: if any(field in td.text for field in self.fields): f.write('{}\n'.format(td.text)) # Delete html file when finished file.unlink() if __name__ == '__main__': GetCompletions('apis.txt')
Error:Warning (from warnings module):
File "C:\Python365\lib\site-packages\requests\__init__.py", line 91
RequestsDependencyWarning)
RequestsDependencyWarning: urllib3 (dev) or chardet (3.0.4) doesn't match a supported version!
Please enter api filename: Api_May_27_2018.txt
http://wogcc.state.wy.us/wyocomp.cfm?nAPI=0906469
Fetching main page for entry: 49009064690000
Traceback (most recent call last):
File "C:\Python365\lib\urllib3\connectionpool.py", line 603, in urlopen
chunked=chunked)
File "C:\Python365\lib\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Python365\lib\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Python365\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Python365\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Python365\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python365\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python365\lib\site-packages\requests\adapters.py", line 445, in send
timeout=timeout
File "C:\Python365\lib\urllib3\connectionpool.py", line 641, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Python365\lib\urllib3\util\retry.py", line 357, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Python365\lib\urllib3\packages\six.py", line 685, in reraise
raise value.with_traceback(tb)
File "C:\Python365\lib\urllib3\connectionpool.py", line 603, in urlopen
chunked=chunked)
File "C:\Python365\lib\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Python365\lib\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Python365\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Python365\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Python365\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python365\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "O:\Python\Wellinfo\src\FetchCompletions.py", line 96, in <module>
GetCompletions('apis.txt')
File "O:\Python\Wellinfo\src\FetchCompletions.py", line 33, in __init__
self.get_all_pages()
File "O:\Python\Wellinfo\src\FetchCompletions.py", line 47, in get_all_pages
response = requests.get(url)
File "C:\Python365\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Python365\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python365\lib\site-packages\requests\sessions.py", line 512, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python365\lib\site-packages\requests\sessions.py", line 622, in send
r = adapter.send(request, **kwargs)
File "C:\Python365\lib\site-packages\requests\adapters.py", line 495, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))