Lars60+ - I appreciate your help! OK, here is the code we started with. What I am trying to do is get the completions reports and anything on the Cores / Pressures / Reports link (there is a rough guess at that near the end of the code below). I was simply trying to get rid of some of the things I don't need and add the things I do.
For example: these things are great to have, but they are already in my database, so there is just no way for me to use them (one way I might trim them out is sketched just below):

    self.apis.append(line.strip())
    self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                   'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
Thank you for your help!
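Here is the kind of trim I have in mind - just a rough, standalone sketch, and the names all_fields and already_in_db are placeholders I made up, not anything from the script below:

# Keep one master list, mark what the database already stores, and hand
# the remainder to the scraper as its fields list.
all_fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class',
              'Completion Date', 'Plug Back', 'IP Gas Mcf', 'TD Formation',
              'Formation', 'IP Water Bbls']
already_in_db = ['IP Oil Bbls', 'IP Gas Mcf', 'IP Water Bbls']  # example entries only
fields = [f for f in all_fields if f not in already_in_db]
print(fields)

The resulting list could then be assigned to self.fields in __init__, or passed in as an extra constructor argument.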
Wavic - thank you - I will look and see where I need to make my changes! Thank you for your support!
nilamo - Thank you for your assistance! I need to look at this more closely. It is probably something I messed up when I was 'refurbishing'. I will look at it and get back to you! Thanks for your input!
import requests
from bs4 import BeautifulSoup
from pathlib import Path
class GetCompletions:
    def __init__(self, infile):
        # working folders; created on first run, reused afterwards
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)

        # input file holds one API number per line
        self.infile = self.textpath / infile
        self.apis = []
        with self.infile.open() as f:
            for line in f:
                self.apis.append(line.strip())

        # table cells whose text mentions any of these labels are copied
        # into the per-well summary file
        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                       'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']

        self.get_all_pages()
        self.parse_and_save(getpdfs=True)
    def get_url(self):
        # entry[3:10] is the seven-digit portion of the API number that the
        # state site expects in its query string
        for entry in self.apis:
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))
    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                # save the raw HTML so it can be re-parsed without re-fetching
                filename = self.completionspath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))
    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.completionspath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                for link in soup.find_all('a'):
                    # .get() instead of link['href'] avoids a KeyError on
                    # anchors that have no href attribute
                    url = link.get('href')
                    if not url or 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    # the document number follows the '=' in the link
                    p = url.index('=')
                    filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:])
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        with filename.open('wb') as f:
                            # stream the body in chunks rather than loading
                            # the whole PDF into memory at once
                            for chunk in response.iter_content(chunk_size=8192):
                                f.write(chunk)
            # recover the API digits from the saved name 'api_<entry>.html'
            sfname = self.textpath / 'summary_{}.txt'.format(file.name.split('_')[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text and any(field in td.text for field in self.fields):
                        f.write('{}\n'.format(td.text))
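    # --- sketch, not part of the original script --------------------------
    # For the Cores / Pressures / Reports link I am guessing the same nAPI
    # pattern carries over; 'corespressures.cfm' is a placeholder page name
    # I made up, so the real URL needs to be copied from the browser.
    def get_report_url(self):
        for entry in self.apis:
            yield (entry, "http://wogcc.state.wy.us/corespressures.cfm?nAPI={}".format(entry[3:10]))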
if __name__ == '__main__':
    GetCompletions('apis.txt')
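The get_report_url sketch above would still need a matching get_all_report_pages() that mirrors get_all_pages(), ideally saving into its own folder so the completions HTML does not get mixed in - again, that is just the direction I am thinking, since I have not confirmed the actual Cores / Pressures / Reports URL yet.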