All - if you're game - I would like to continue with the collaborative effort! I really appreciate the time and energy everyone has given up until now. I think we are at the tail end of this!
I've made the change Lars60+ suggested when he said I was overwriting my file so here is the latest...
Alright, I think we may have a better idea of how to do this. For now, lets forgo the idea of fixing this one. More later! Thank you!
I've made the change Lars60+ suggested when he said I was overwriting my file so here is the latest...
Error: RESTART: C:/Users/toliver/AppData/Local/Programs/Python/Python36/OilWellCompletions/OilWellCompletions.py
Traceback (most recent call last):
File "C:/Users/toliver/AppData/Local/Programs/Python/Python36/OilWellCompletions/OilWellCompletions.py", line 81, in <module>
GetCompletions('apis.txt')
File "C:/Users/toliver/AppData/Local/Programs/Python/Python36/OilWellCompletions/OilWellCompletions.py", line 19, in __init__
with self.infile.open() as f:
File "C:\Users\toliver\AppData\Local\Programs\Python\Python36\lib\pathlib.py", line 1181, in open
opener=self._opener)
File "C:\Users\toliver\AppData\Local\Programs\Python\Python36\lib\pathlib.py", line 1035, in _opener
return self._accessor.open(self, flags, mode)
File "C:\Users\toliver\AppData\Local\Programs\Python\Python36\lib\pathlib.py", line 387, in wrapped
return strfunc(str(pathobj), *args)
FileNotFoundError: [Errno 2] No such file or directory: 'text\\apis.txt'
This is what I've found so far on the error - "Obviously, based on the error message, mkdir returns None." I will keep looking.import requests from bs4 import BeautifulSoup from pathlib import Path import sys class GetCompletions: def __init__(self, infile): self.homepath = Path('.') self.completionspath = self.homepath / 'xx_completions_xx' self.completionspath.mkdir(exist_ok=True) self.log_pdfpath = self.homepath / 'logpdfs' self.log_pdfpath.mkdir(exist_ok=True) self.textpath = self.homepath / 'text' self.textpath.mkdir(exist_ok=True) self.infile = self.textpath / infile self.apis = [] with self.infile.open() as f: for line in f: self.apis.append(line.strip()) self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date', 'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls'] # self.get_all_pages() self.parse_and_save(getpdfs=True) def get_url(self): for entry in self.apis: yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10])) def get_all_pages(self): for entry, url in self.get_url(): print('Fetching main page for entry: {}'.format(entry)) response = requests.get(url) if response.status_code == 200: filename = self.completionspath / 'api_{}.html'.format(entry) with filename.open('w') as f: f.write(response.text) else: print('error downloading {}'.format(entry)) def parse_and_save(self, getpdfs=False): filelist = [file for file in self.completionspath.iterdir() if file.is_file()] for file in filelist: with file.open('r') as f: soup = BeautifulSoup(f.read(), 'lxml') if getpdfs: links = soup.find_all('a') for link in links: url = link['href'] if 'www' in url: continue print('downloading pdf at: {}'.format(url)) p = url.index('=') response = requests.get(url, stream=True, allow_redirects=False) if response.status_code == 200: try: header_info = response.headers['Content-Disposition'] idx = header_info.index('filename') filename = self.log_pdfpath / header_info[idx+9:] except ValueError: filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:]) print("couldn't locate filename for {} will use: {}".format(file, filename)) except KeyError: filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:]) print('got KeyError on {}, response.headers = {}'.format(file, response.headers)) print('will use name: {}'.format(filename)) print(response.headers) with filename.open('wb') as f: f.write(response.content) sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10]) tds = soup.find_all('td') with sfname.open('w') as f: for td in tds: if td.text: if any(field in td.text for field in self.fields): f.write('{}\n'.format(td.text)) if __name__ == '__main__': GetCompletions('apis.txt')Thanks again!
Alright, I think we may have a better idea of how to do this. For now, lets forgo the idea of fixing this one. More later! Thank you!