Apr-11-2018, 03:53 AM
The following program will download the PDF files of the completion logs, the HTML completion pages (which can be scanned to extract further information), and a summary file for each API number.
I would suggest that if you plan on using this for future work that you move the api list to a file, and
read it in at the start. This will allow you to reuse without having to change the program each time.
The program will create three directories directly below the source code:
xx_completions_xx - contains the completion text pages by name
logpdfs - contains completion logs in the following format -- compnnnn.pdf (example: comp18098.pdf)
text - contains summary information, as unparsed strings, for each element in self.fields; files are named:
summary_apino.txt
these are the completion logs in pdf format.
the files in the xx_completions_xx are the original html pages, so can be displayed in a browser as is (just double click on one)
Enjoy!
I would suggest that if you plan on using this for future work that you move the api list to a file, and
read it in at the start. This will allow you to reuse without having to change the program each time.
The program will create three directories directly below the source code:
xx_completions_xx - contains the completion text pages by name
logpdfs - contains completion logs in the following format -- compnnnn.pdf (example: comp18098.pdf)
text - contains summary information, as unparsed strings, for each element in self.fields; files are named:
summary_apino.txt
these are the completion logs in pdf format.
the files in the xx_completions_xx are the original html pages, so can be displayed in a browser as is (just double click on one)
Enjoy!
import requests
from bs4 import BeautifulSoup
from pathlib import Path

# Default list of 14-digit API well numbers to fetch.  Pass your own list to
# GetCompletions() (e.g. one read from a text file) to reuse this program
# without editing the source each time.
DEFAULT_APIS = [
    '49009229900000', '49009226390000', '49009278600000', '49009226340000', '49009200210000',
    '49009065760000', '49009201380000', '49009230130000', '49009278800000', '49009222250000',
    '49009225900000', '49009219970000', '49009225890000', '49009225140000', '49009225760000',
    '49009212630000', '49009205440000', '49009211590000', '49009203660000', '49009203940000',
    '49009204340000', '49009226780000', '49009220310000', '49009229730000', '49009212240000',
    '49009214450000', '49009213790000', '49009222660000', '49009227960000', '49009222100000',
    '49009228020000', '49009228260000', '49009228290000', '49009229090000', '49009228250000',
    '49009229340000', '49009229360000', '49009227890000', '49009228010000', '49009228030000',
    '49009228450000', '49009224160000', '49009221890000', '49009222760000', '49009214980000',
    '49009214620000', '49009213800000', '49009214380000', '49009214730000', '49009228150000',
    '49009228190000', '49009227710000', '49009215280000', '49009228940000', '49009227920000',
    '49009227980000', '49009228170000', '49009219540000', '49009227870000', '49009228370000',
    '49009204330000', '49009205120000', '49009227860000', '49009228360000', '49009228160000',
    '49009216100000', '49009229000000', '49009229150000', '49009229490000', '49009215680000',
    '49009229350000', '49009215210000', '49009217070000', '49009216610000', '49009206800000',
    '49009205590000', '49009206310000', '49009217960000', '49009223190000', '49009210640000',
    '49009209260000', '49009213710000', '49009212360000', '49009212740000', '49009218680000',
    '49009210130000', '49009211420000', '49009224280000', '49009213750000', '49009220880000',
    '49009225300000', '49009218090000', '49009227720000', '49009225830000', '49009223170000',
    '49009209370000', '49009214990000', '49009207260000', '49009211540000', '49009227380000',
]


class GetCompletions:
    """Scrape Wyoming OGCC well-completion reports.

    Creates three directories directly below the current working directory:

    * ``xx_completions_xx`` -- the raw HTML completion pages, one per API
      number (``api_<API>.html``), viewable as-is in a browser.
    * ``logpdfs``           -- completion-log PDFs (``comp<NNNN>.pdf``).
    * ``text``              -- per-well summaries of the rows matching
      ``self.fields`` (``summary_<num>.txt``).

    Constructing an instance immediately downloads and parses everything.
    """

    # Seconds to wait on any single HTTP request before giving up, so a
    # stalled server cannot hang the whole run.
    REQUEST_TIMEOUT = 30

    def __init__(self, apis=None):
        """Fetch and parse completion data.

        apis: optional iterable of 14-digit API number strings; defaults to
              DEFAULT_APIS for backward compatibility.
        """
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)
        self.apis = list(apis) if apis is not None else list(DEFAULT_APIS)
        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class',
                       'Completion Date', 'Plug Back', 'IP Gas Mcf', 'TD Formation',
                       'Formation', 'IP Water Bbls']
        self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        """Yield (api, completion-page URL) pairs for every configured API.

        Characters 3..9 of the API number form the well id the site expects.
        """
        for entry in self.apis:
            yield (entry,
                   "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        """Download each completion page and save it under completionspath."""
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                # Network failure on one well should not abort the batch.
                print('error downloading {}: {}'.format(entry, exc))
                continue
            if response.status_code == 200:
                filename = self.completionspath / 'api_{}.html'.format(entry)
                # Write with the server-declared encoding so the saved page
                # re-opens cleanly regardless of the platform default.
                with filename.open('w', encoding=response.encoding or 'utf-8') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        """Parse every saved HTML page; write field summaries.

        getpdfs: when True, also download every completion-log PDF linked
                 from each page into log_pdfpath.
        """
        filelist = [file for file in self.completionspath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                self._download_pdfs(soup)
            self._write_summary(file, soup)

    def _download_pdfs(self, soup):
        """Download every completion-log PDF linked from a parsed page."""
        for link in soup.find_all('a'):
            # .get avoids a KeyError on anchors that carry no href at all.
            url = link.get('href')
            if not url or 'www' in url:
                # No target, or an off-site navigation link -- skip it.
                continue
            # PDF links end in '=<number>'; the number names the saved file.
            p = url.find('=')
            if p == -1:
                # index() here used to raise ValueError and kill the run.
                continue
            print('downloading pdf at: {}'.format(url))
            filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:])
            try:
                response = requests.get(url, stream=True, allow_redirects=False,
                                        timeout=self.REQUEST_TIMEOUT)
            except requests.RequestException as exc:
                print('error downloading pdf {}: {}'.format(url, exc))
                continue
            if response.status_code == 200:
                with filename.open('wb') as pdf_file:
                    pdf_file.write(response.content)

    def _write_summary(self, file, soup):
        """Write the <td> rows matching self.fields to text/summary_<num>.txt."""
        # 'api_49009229900000.html' -> characters 3..9 of the API number.
        apino = file.name.split('_')[1].split('.')[0][3:10]
        sfname = self.textpath / 'summary_{}.txt'.format(apino)
        with sfname.open('w') as f:
            for td in soup.find_all('td'):
                if td.text and any(field in td.text for field in self.fields):
                    f.write('{}\n'.format(td.text))


if __name__ == '__main__':
    GetCompletions()