This is a script that has worked before with no issues. I thought that was true of all of my Python files, but now it seems to be just this one; it turned out I had been running the wrong file, which is the one that actually worked in the end. This file has given a LOT of traceback errors that I hope will be rectified once the initial issue is resolved.
Hold on, though - let me try the virtual environment first and I will let you know.
Thank you!
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet
import sys


class GetCompletions:
    def __init__(self, infile):
        self.check_network = CheckInternet.CheckInternet()
        self.homepath = Path('.')
        self.rootpath = self.homepath / '..'
        self.datapath = self.rootpath / 'data'
        self.commandpath = self.datapath / 'command_files'
        self.completionspath = self.datapath / 'completions'
        self.htmlpath = self.datapath / 'html'
        self.reportspath = self.datapath / 'reports'

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            # self.infilename = 'Api_May_27_2018.txt'
            self.infilename = input('Please enter api filename: ')

            self.infile = self.commandpath / self.infilename
            self.api = []

            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                           'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']

            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        for entry in self.api:
            print("http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')

            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        try:
                            header_info = response.headers['Content-Disposition']
                            idx = header_info.index('filename')
                            filename = self.completionspath / header_info[idx + 9:]
                        except ValueError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print("couldn't locate filename for {} will use: {}".format(file, filename))
                        except KeyError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                            print('will use name: {}'.format(filename))
                            print(response.headers)
                        with filename.open('wb') as f:
                            f.write(response.content)

            sfname = self.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))

            # Delete html file when finished
            file.unlink()


if __name__ == '__main__':
    GetCompletions('apis.txt')
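Side note, in case it helps narrow things down: the traceback below dies inside get_all_pages on the first connection reset, since nothing catches the exception. An untested variant that catches the ConnectionError and skips that entry instead of crashing might look like this (the timeout value is my own guess, not something from the version that worked):

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            try:
                # the timeout here is an assumption, not part of the original script
                response = requests.get(url, timeout=30)
            except requests.exceptions.ConnectionError as err:
                # skip this entry instead of letting one reset kill the run
                print('connection error for {}: {}'.format(entry, err))
                continue
            if response.status_code == 200:
                filename = self.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))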
Error:
Warning (from warnings module):
File "C:\Python365\lib\site-packages\requests\__init__.py", line 91
RequestsDependencyWarning)
RequestsDependencyWarning: urllib3 (dev) or chardet (3.0.4) doesn't match a supported version!
Please enter api filename: Api_May_27_2018.txt
http://wogcc.state.wy.us/wyocomp.cfm?nAPI=0906469
Fetching main page for entry: 49009064690000
Traceback (most recent call last):
File "C:\Python365\lib\urllib3\connectionpool.py", line 603, in urlopen
chunked=chunked)
File "C:\Python365\lib\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Python365\lib\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Python365\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Python365\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Python365\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python365\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
ConnectionResetError: [WinError 10054] An existing connection was forcibly closed by the remote host
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Python365\lib\site-packages\requests\adapters.py", line 445, in send
timeout=timeout
File "C:\Python365\lib\urllib3\connectionpool.py", line 641, in urlopen
_stacktrace=sys.exc_info()[2])
File "C:\Python365\lib\urllib3\util\retry.py", line 357, in increment
raise six.reraise(type(error), error, _stacktrace)
File "C:\Python365\lib\urllib3\packages\six.py", line 685, in reraise
raise value.with_traceback(tb)
File "C:\Python365\lib\urllib3\connectionpool.py", line 603, in urlopen
chunked=chunked)
File "C:\Python365\lib\urllib3\connectionpool.py", line 387, in _make_request
six.raise_from(e, None)
File "<string>", line 2, in raise_from
File "C:\Python365\lib\urllib3\connectionpool.py", line 383, in _make_request
httplib_response = conn.getresponse()
File "C:\Python365\lib\http\client.py", line 1331, in getresponse
response.begin()
File "C:\Python365\lib\http\client.py", line 297, in begin
version, status, reason = self._read_status()
File "C:\Python365\lib\http\client.py", line 258, in _read_status
line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
File "C:\Python365\lib\socket.py", line 586, in readinto
return self._sock.recv_into(b)
urllib3.exceptions.ProtocolError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "O:\Python\Wellinfo\src\FetchCompletions.py", line 96, in <module>
GetCompletions('apis.txt')
File "O:\Python\Wellinfo\src\FetchCompletions.py", line 33, in __init__
self.get_all_pages()
File "O:\Python\Wellinfo\src\FetchCompletions.py", line 47, in get_all_pages
response = requests.get(url)
File "C:\Python365\lib\site-packages\requests\api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "C:\Python365\lib\site-packages\requests\api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python365\lib\site-packages\requests\sessions.py", line 512, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python365\lib\site-packages\requests\sessions.py", line 622, in send
r = adapter.send(request, **kwargs)
File "C:\Python365\lib\site-packages\requests\adapters.py", line 495, in send
raise ConnectionError(err, request=request)
requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
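From what I can tell, the RequestsDependencyWarning at the top just means my requests install is paired with a development build of urllib3 that it doesn't officially support, so reinstalling requests (for example, pip install --upgrade --force-reinstall requests) should quiet that. For the WinError 10054 resets themselves, one thing I may try is a Session configured with retries and a browser-like User-Agent; here is a minimal sketch, where the retry counts, backoff, and User-Agent string are all my own assumptions rather than tested values:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


def make_session():
    # retry transient failures with exponential backoff; the specific
    # numbers here are assumptions, not values from the original script
    retries = Retry(total=5, backoff_factor=1,
                    status_forcelist=[500, 502, 503, 504])
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=retries)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    # some servers reset connections from clients with no User-Agent
    session.headers.update({'User-Agent': 'Mozilla/5.0'})
    return session

# usage: build one session in __init__ and call session.get(url, timeout=30)
# wherever the script currently calls requests.get(url)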