May-16-2018, 05:03 PM
I mentioned this before:
This was perfectly ok in the original code.
Again, especially with the plethora of errors (all caused since original was severely modified), perhaps it's time to go back to the original.
In case you've lost it, here is a copy (which I just ran without a hitch)
Just remember you must have the apis.txt file in the text directory, and it must contain the API numbers for the documents you want.
I am attaching the original file (to avoid reloading the same info, clean the file out and start with new numbers, or it will download the original set again). This file directs the software: it will download whatever is specified in this file, nothing more, nothing less.
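self.log.pdfpath = self.homepath / 'comppdf'
self.log.pdfpath.mkdir(exist_ok=True)
self.log.pdfpath = self.homepath / 'geocorepdf'
self.log.pdfpath.mkdir(exist_ok=True)

You are overwriting self.log.pdfpath: the second assignment replaces the first, so after these lines the attribute only points at 'geocorepdf'.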
This was perfectly ok in the original code.
Again, especially with the plethora of errors (all caused since original was severely modified), perhaps it's time to go back to the original.
In case you've lost it, here is a copy (which I just ran without a hitch)
Just remember you must have the apis.txt file in the text directory, and it must contain the API numbers for the documents you want.
I am attaching the original file (to avoid reloading the same info, clean the file out and start with new numbers, or it will download the original set again). This file directs the software: it will download whatever is specified in this file, nothing more, nothing less.
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import sys


class GetCompletions:
    """Fetch completion pages for a list of API numbers and extract data.

    The constructor creates the working directories below the current
    directory, reads the API numbers from ``text/<infile>`` and then
    immediately runs :meth:`parse_and_save` with PDF downloading enabled.

    Directories used (all created on demand):

    * ``xx_completions_xx`` - saved HTML pages, one ``api_<entry>.html`` each
    * ``logpdfs``           - downloaded PDF documents
    * ``text``              - the input file and the ``summary_*.txt`` output
    """

    def __init__(self, infile):
        """Set up directories, load the API list and process saved pages.

        Args:
            infile: Name of the file inside the ``text`` directory holding
                one API number per line.

        Raises:
            FileNotFoundError: If ``text/<infile>`` does not exist.
        """
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)

        self.infile = self.textpath / infile
        with self.infile.open() as f:
            # Blank lines would otherwise become empty entries and produce
            # requests for a URL with no API number.
            self.apis = [line.strip() for line in f if line.strip()]

        # A table cell is written to the summary when it contains any of
        # these labels.
        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls',
                       'Reservoir Class', 'Completion Date', 'Plug Back',
                       'IP Gas Mcf', 'TD Formation', 'Formation',
                       'IP Water Bbls']

        # Page download is disabled here; parse_and_save() only processes
        # pages already present in the completions directory.
        # self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        """Yield ``(entry, url)`` for every loaded API entry.

        The URL query value is characters 3 through 9 of the entry.
        """
        for entry in self.apis:
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        """Download the main page of every entry into the completions dir.

        Pages answering with a status other than 200 are reported and
        skipped.
        """
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.completionspath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        """Parse every saved page and write its summary file.

        Args:
            getpdfs: When true, also download the documents linked from
                each page into the PDF directory.
        """
        filelist = [file for file in self.completionspath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                self._download_pdfs(soup, file)
            self._write_summary(soup, file)

    def _download_pdfs(self, soup, file):
        """Download every non-'www' link found in *soup*.

        *file* is the page the links came from; it is only used in
        progress and diagnostic messages.
        """
        # href=True skips anchors without an href attribute, which would
        # otherwise raise KeyError on link['href'].
        for link in soup.find_all('a', href=True):
            url = link['href']
            if 'www' in url:
                continue
            print('downloading pdf at: {}'.format(url))
            # Text after the first '=' names the fallback file. A link
            # without '=' cannot be named that way, so skip it instead of
            # aborting the whole run.
            _, sep, suffix = url.partition('=')
            if not sep:
                print('skipping {}: no "=" in link'.format(url))
                continue
            # The context manager releases the streamed connection even
            # when the body is never read (non-200 answers).
            with requests.get(url, stream=True, allow_redirects=False) as response:
                if response.status_code != 200:
                    print('error downloading {} (status {})'.format(
                        url, response.status_code))
                    continue
                filename = self._pdf_filename(response, suffix, file)
                with filename.open('wb') as f:
                    f.write(response.content)

    def _pdf_filename(self, response, suffix, file):
        """Return the path a downloaded document should be saved under.

        Prefers the name announced in the ``Content-Disposition`` header
        and falls back to ``comp<suffix>.pdf`` when the header is missing
        or carries no filename.
        """
        fallback = self.log_pdfpath / 'comp{}.pdf'.format(suffix)
        try:
            header_info = response.headers['Content-Disposition']
            idx = header_info.index('filename')
        except ValueError:
            print("couldn't locate filename for {} will use: {}".format(file, fallback))
            return fallback
        except KeyError:
            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
            print('will use name: {}'.format(fallback))
            print(response.headers)
            return fallback

        # Skip past 'filename=' (9 characters), then drop any trailing
        # header parameters and surrounding quotes.
        name = header_info[idx + 9:].split(';')[0].strip().strip('"\'')
        # The name is supplied by the server: keep only its final path
        # component so it cannot point outside the PDF directory.
        name = Path(name).name
        if not name:
            print("couldn't locate filename for {} will use: {}".format(file, fallback))
            return fallback
        return self.log_pdfpath / name

    def _write_summary(self, soup, file):
        """Write the matching table cells of *soup* to a summary file.

        The summary name is derived from the page file name: the part
        between the first '_' and the following '.', characters 3 through 9.
        """
        sfname = self.textpath / 'summary_{}.txt'.format(
            (file.name.split('_'))[1].split('.')[0][3:10])
        with sfname.open('w') as f:
            for td in soup.find_all('td'):
                text = td.text
                if text and any(field in text for field in self.fields):
                    f.write('{}\n'.format(text))


if __name__ == '__main__':
    GetCompletions('apis.txt')