I made copies of both versions in case something I messed up also messed up yours. That is something I MOST CERTAINLY don't want or need (for either of us)!!
Initially, I changed your completions path and folder to "self.wellgeopath = self.datapath / 'wellgeo'". I ran it with PyCharm (I am so glad you showed me this! It's truly a lifesaver!), and it showed me 2 other places that needed the 'completionspath' change as well, so I changed those too.
Also - there is the matter of these fields. They are not on the new reports page. There are values to obtain but I don't need them. Can we just leave them out? One of the issues I see is there may be from 1 - 6 reports (5 or 6 is the largest number I remember seeing). This means if we pursue the download of information like what's shown below, it may cause some issues.
This link will show you two reports.
http://wogcc.state.wy.us/wellapi.cfm?nAp...ps=ID88472
Initially, I changed your completions path and folder to "self.wellgeopath = self.datapath / 'wellgeo'". I ran it with PyCharm (I am so glad you showed me this! It's truly a lifesaver!), and it showed me 2 other places that needed the 'completionspath' change as well, so I changed those too.
import sys
from pathlib import Path

import requests
from bs4 import BeautifulSoup

import CheckInternet


class GetCompletions:
    """Download well pages from wogcc.state.wy.us and save their reports.

    For every API number listed in a command file the class:

    1. fetches ``wellapi.cfm?nAPI=<digits>`` and stores the HTML under
       ``data/html``;
    2. optionally downloads every linked document into ``data/wellgeo``;
    3. writes the table cells that mention one of ``self.fields`` to
       ``data/reports/summary_<digits>.txt``;
    4. deletes the cached HTML file.

    All of this runs from the constructor.
    """

    # Seconds to wait on a single HTTP request, so a stalled server cannot
    # hang the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, infile):
        """Set up the folder layout, read the API list and run the download.

        Args:
            infile: Kept for backward compatibility. It is not used; the API
                list filename is always read from the interactive prompt.
        """
        self.check_network = CheckInternet.CheckInternet()
        self.homepath = Path('.')
        self.rootpath = self.homepath / '..'
        self.datapath = self.rootpath / 'data'
        self.commandpath = self.datapath / 'command_files'
        self.wellgeopath = self.datapath / 'wellgeo'
        self.htmlpath = self.datapath / 'html'
        self.reportspath = self.datapath / 'reports'

        if not self.check_network.check_availability():
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')
            return

        # use: Api_May_27_2018.txt for testing
        self.infilename = input('Please enter api filename: ')
        self.infile = self.commandpath / self.infilename

        # Blank lines would otherwise turn into requests with an empty nAPI.
        with self.infile.open() as f:
            self.api = [line.strip() for line in f if line.strip()]

        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls',
                       'Reservoir Class', 'Completion Date', 'Plug Back',
                       'IP Gas Mcf', 'TD Formation', 'Formation',
                       'IP Water Bbls']

        # The output folders must exist before anything is written to them.
        for folder in (self.htmlpath, self.wellgeopath, self.reportspath):
            folder.mkdir(parents=True, exist_ok=True)

        self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        """Yield ``(entry, url)`` for every API entry.

        The URL is built from characters 3 to 9 of the entry (``entry[3:10]``)
        and is also printed as a progress message.
        """
        for entry in self.api:
            url = "http://wogcc.state.wy.us/wellapi.cfm?nAPI={}".format(entry[3:10])
            print(url)
            yield entry, url

    def get_all_pages(self):
        """Fetch the main page of every entry into ``htmlpath``.

        A failed request (network error or non-200 status) is reported and
        skipped so that the remaining entries are still processed.
        """
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as err:
                print('error downloading {}: {}'.format(entry, err))
                continue
            if response.status_code != 200:
                # Show the status so a failure can be diagnosed from the log.
                print('error downloading {} (HTTP status {})'.format(
                    entry, response.status_code))
                continue
            filename = self.htmlpath / 'api_{}.html'.format(entry)
            # Explicit encoding: the locale default (e.g. cp1252) can fail on
            # characters outside its range.
            with filename.open('w', encoding='utf-8') as f:
                f.write(response.text)

    def parse_and_save(self, getpdfs=False):
        """Process every cached HTML file, then delete it.

        Args:
            getpdfs: When true, also download the documents linked from each
                page into ``wellgeopath``.
        """
        filelist = [file for file in self.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            # The summary name is taken from the text after the first '_';
            # a file without one was not written by get_all_pages.
            if '_' not in file.name:
                print('skipping unexpected file: {}'.format(file))
                continue
            # errors='replace' keeps pages readable that were cached with a
            # different encoding.
            with file.open('r', encoding='utf-8', errors='replace') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                self._download_pdfs(soup, file)
            self._write_summary(soup, file)
            # Delete html file when finished
            file.unlink()

    def _download_pdfs(self, soup, file):
        """Download every linked document of one page into ``wellgeopath``."""
        # href=True skips anchors that carry no href attribute at all.
        for link in soup.find_all('a', href=True):
            url = link['href']
            if 'www' in url:
                continue
            print('downloading pdf at: {}'.format(url))
            try:
                response = requests.get(url, allow_redirects=False,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as err:
                # Covers unreachable hosts and hrefs that are not full URLs.
                print('error downloading {}: {}'.format(url, err))
                continue
            if response.status_code != 200:
                continue
            filename = self._pdf_path(response, url, file)
            print(response.headers)
            with filename.open('wb') as f:
                f.write(response.content)

    def _pdf_path(self, response, url, file):
        """Return the target path for a downloaded document.

        The name comes from the ``Content-Disposition`` header when it has a
        usable ``filename``; otherwise it falls back to ``comp<query>.pdf``,
        where ``<query>`` is the part of ``url`` after the first ``=``.
        """
        fallback = self.wellgeopath / 'comp{}.pdf'.format(url.partition('=')[2])
        try:
            header_info = response.headers['Content-Disposition']
            idx = header_info.index('filename')
        except ValueError:
            print("couldn't locate filename for {} will use: {}".format(file, fallback))
            return fallback
        except KeyError:
            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
            print('will use name: {}'.format(fallback))
            return fallback

        # Skip 'filename=' (9 characters), drop any following parameters and
        # surrounding quotes, and keep only the final path component so the
        # server cannot choose the target directory.
        raw = header_info[idx + 9:].split(';')[0].strip().strip('"\'')
        name = Path(raw.replace('\\', '/')).name
        if name in ('', '..'):
            print("couldn't locate filename for {} will use: {}".format(file, fallback))
            return fallback
        return self.wellgeopath / name

    def _write_summary(self, soup, file):
        """Write the cells of one page that mention a known field name."""
        api_part = file.name.split('_')[1].split('.')[0][3:10]
        sfname = self.reportspath / 'summary_{}.txt'.format(api_part)
        with sfname.open('w', encoding='utf-8') as f:
            for td in soup.find_all('td'):
                text = td.text
                if text and any(field in text for field in self.fields):
                    f.write('{}\n'.format(text))


if __name__ == '__main__':
    GetCompletions('apis.txt')

# I ran it again and this is what I got:
Error:C:\Python365\python.exe "O:/Python/Wellinfo - GEO/src/FetchCompletions.py"
Please enter api filename: Api_May_27_2018.txt
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510778
Fetching main page for entry: 49025107780001
error downloading 49025107780001
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510781
Fetching main page for entry: 49025107810000
error downloading 49025107810000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510788
Fetching main page for entry: 49025107880001
error downloading 49025107880001
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510792
Fetching main page for entry: 49025107920000
error downloading 49025107920000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510807
Fetching main page for entry: 49025108070000
error downloading 49025108070000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510831
Fetching main page for entry: 49025108310001
error downloading 49025108310001
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510864
Fetching main page for entry: 49025108640000
error downloading 49025108640000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510869
Fetching main page for entry: 49025108690000
error downloading 49025108690000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510876
Fetching main page for entry: 49025108760000
error downloading 49025108760000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510882
Fetching main page for entry: 49025108820000
error downloading 49025108820000
Process finished with exit code 0
Also - there is the matter of these fields. They are not on the new reports page. There are values to obtain but I don't need them. Can we just leave them out? One of the issues I see is there may be from 1 - 6 reports (5 or 6 is the largest number I remember seeing). This means if we pursue the download of information like what's shown below, it may cause some issues.
This link will show you two reports.
http://wogcc.state.wy.us/wellapi.cfm?nAp...ps=ID88472
self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date', 'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
As always - thank you!