May-12-2018, 04:57 PM
Ok - the modules issue is solved! Now I have additional issues.
This is the code I'm trying to run - thank you Lars60+
As always, any help is appreciated!
Thanks!
This is the code I'm trying to run - thank you Lars60+

import requests from bs4 import BeautifulSoup from pathlib import Path import sys class GetCompletions: def __init__(self, infile): self.homepath = Path('.') self.completionspath = self.homepath / 'xx_completions_xx' self.completionspath.mkdir(exist_ok=True) self.log_pdfpath = self.homepath / 'logpdfs' self.log_pdfpath.mkdir(exist_ok=True) self.textpath = self.homepath / 'text' self.textpath.mkdir(exist_ok=True) self.infile = self.textpath / infile self.apis = [] with self.infile.open() as f: for line in f: self.apis.append(line.strip()) self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date', 'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls'] # self.get_all_pages() self.parse_and_save(getpdfs=True) def get_url(self): for entry in self.apis: yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10])) def get_all_pages(self): for entry, url in self.get_url(): print('Fetching main page for entry: {}'.format(entry)) response = requests.get(url) if response.status_code == 200: filename = self.completionspath / 'api_{}.html'.format(entry) with filename.open('w') as f: f.write(response.text) else: print('error downloading {}'.format(entry)) def parse_and_save(self, getpdfs=False): filelist = [file for file in self.completionspath.iterdir() if file.is_file()] for file in filelist: with file.open('r') as f: soup = BeautifulSoup(f.read(), 'lxml') if getpdfs: links = soup.find_all('a') for link in links: url = link['href'] if 'www' in url: continue print('downloading pdf at: {}'.format(url)) p = url.index('=') response = requests.get(url, stream=True, allow_redirects=False) if response.status_code == 200: try: header_info = response.headers['Content-Disposition'] idx = header_info.index('filename') filename = self.log_pdfpath / header_info[idx+9:] except ValueError: filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:]) print("couldn't locate filename for {} will use: {}".format(file, 
filename)) except KeyError: filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:]) print('got KeyError on {}, response.headers = {}'.format(file, response.headers)) print('will use name: {}'.format(filename)) print(response.headers) with filename.open('wb') as f: f.write(response.content) sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10]) tds = soup.find_all('td') with sfname.open('w') as f: for td in tds: if td.text: if any(field in td.text for field in self.fields): f.write('{}\n'.format(td.text)) if __name__ == '__main__': GetCompletions('apis.txt')I get the following traceback errors.
Error: === RESTART: C:\Users\Tonya\AppData\Local\Programs\Python\Python36\Test.py ===
= RESTART: C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py =
Traceback (most recent call last):
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py", line 81, in <module>
GetCompletions('apis.txt')
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py", line 26, in __init__
self.parse_and_save(getpdfs=True)
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py", line 47, in parse_and_save
soup = BeautifulSoup(f.read(), 'lxml')
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\lib\site-packages\bs4\__init__.py", line 165, in __init__
% ",".join(features))
bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
I've installed 'Parsy' for my parser. Should I use something else? As for the rest of the error, I have no clue. As always, any help is appreciated!
Thanks!