Quote: Lars60+ - How did you get the API numbers to be seen as text and Python to observe the commas as separators? When put between brackets '[]' the data becomes a Python list (https://docs.python.org/3/tutorial/datastructures.html), so the standard separator for list items is the comma. If each entry is placed in single or double quotes, it indicates that the value is a string.
This data could have come from a file; in that case, you wouldn't need to put quotes around the data. Example:
(NOTE: I attached a starting 'apis.txt' file with old data, copy to text directory)
let's say you have a file named 'apis.txt' (it must be in the 'text' directory) which looks like:
Output:49009229900000
49009226390000
49009278600000
49009226340000
49009200210000
49009065760000
49009201380000
... Add rest of completion codes here
change this:

    def __init__(self):

to this:

    def __init__(self, infile):

In the code, replace the list (starting on line 15) with:
    self.infile = self.textpath / infile
    self.apis = []
    with self.infile.open() as f:
        for line in f:
            self.apis.append(line.strip())

Now you can change the input file to control the code.
finally, change the following at bottom of code:
from this:
    if __name__ == '__main__':
        GetCompletions()

to this:

    if __name__ == '__main__':
        GetCompletions('apis.txt')

So now the complete program looks like:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import sys


class GetCompletions:
    """Download Wyoming well-completion reports and extract summary fields.

    Reads 14-digit API well numbers from *infile* (one per line, located in
    the ``text`` directory), fetches the completion page for each well from
    the WOGCC site, optionally downloads linked PDF reports, and writes a
    per-well text summary containing only the fields listed in
    ``self.fields``.

    NOTE(review): the constructor performs all network I/O as a side effect;
    instantiating the class runs the whole pipeline.
    """

    def __init__(self, infile):
        # Working directories are created on demand next to the script.
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)

        # Load the API numbers to process, one per line.
        self.infile = self.textpath / infile
        self.apis = []
        with self.infile.open() as f:
            for line in f:
                api = line.strip()
                # Fix: skip blank lines (a trailing newline in the input
                # file used to produce an empty API and a bogus request URL).
                if api:
                    self.apis.append(api)

        # Only table cells containing one of these labels are written to
        # the summary files.
        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls',
                       'Reservoir Class', 'Completion Date', 'Plug Back',
                       'IP Gas Mcf', 'TD Formation', 'Formation',
                       'IP Water Bbls']

        self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        """Yield ``(api, completion-page-url)`` pairs for every API number.

        Characters 3:10 of the 14-digit API form the site's ``nAPI`` query
        parameter (state prefix and trailing zeros are dropped).
        """
        for entry in self.apis:
            yield (entry,
                   "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}"
                   .format(entry[3:10]))

    def get_all_pages(self):
        """Fetch each well's completion page and save it as HTML.

        Pages are written to ``self.completionspath`` as ``api_<API>.html``;
        a failed download is reported and skipped.
        """
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.completionspath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        """Parse saved HTML pages; optionally fetch PDFs; write summaries.

        :param getpdfs: when True, every on-site ``<a href>`` link in a page
            is downloaded as a PDF into ``self.log_pdfpath``.
        """
        filelist = [file for file in self.completionspath.iterdir()
                    if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')

            if getpdfs:
                for link in soup.find_all('a'):
                    # Fix: an <a> tag without an href attribute used to
                    # raise KeyError via link['href'].
                    url = link.get('href')
                    if url is None or 'www' in url:
                        continue  # external links are not completion PDFs
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:])
                    response = requests.get(url, stream=True,
                                            allow_redirects=False)
                    if response.status_code == 200:
                        # Renamed handle (was 'f') so it no longer shadows
                        # the summary-file handle below.
                        with filename.open('wb') as pdf_file:
                            pdf_file.write(response.content)

            # Recover the 7-digit nAPI from 'api_<API>.html' for the
            # summary filename.
            sfname = self.textpath / 'summary_{}.txt'.format(
                (file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))

    # You can run the following once to create your original data as a file.
    def add_input(self):
        """Write the original hard-coded API list to ``self.infile``."""
        apis = ['49009229900000', '49009226390000', '49009278600000',
                '49009226340000', '49009200210000', '49009065760000',
                '49009201380000', '49009230130000', '49009278800000',
                '49009222250000', '49009225900000', '49009219970000',
                '49009225890000', '49009225140000', '49009225760000',
                '49009212630000', '49009205440000', '49009211590000',
                '49009203660000', '49009203940000', '49009204340000',
                '49009226780000', '49009220310000', '49009229730000',
                '49009212240000', '49009214450000', '49009213790000',
                '49009222660000', '49009227960000', '49009222100000',
                '49009228020000', '49009228260000', '49009228290000',
                '49009229090000', '49009228250000', '49009229340000',
                '49009229360000', '49009227890000', '49009228010000',
                '49009228030000', '49009228450000', '49009224160000',
                '49009221890000', '49009222760000', '49009214980000',
                '49009214620000', '49009213800000', '49009214380000',
                '49009214730000', '49009228150000', '49009228190000',
                '49009227710000', '49009215280000', '49009228940000',
                '49009227920000', '49009227980000', '49009228170000',
                '49009219540000', '49009227870000', '49009228370000',
                '49009204330000', '49009205120000', '49009227860000',
                '49009228360000', '49009228160000', '49009216100000',
                '49009229000000', '49009229150000', '49009229490000',
                '49009215680000', '49009229350000', '49009215210000',
                '49009217070000', '49009216610000', '49009206800000',
                '49009205590000', '49009206310000', '49009217960000',
                '49009223190000', '49009210640000', '49009209260000',
                '49009213710000', '49009212360000', '49009212740000',
                '49009218680000', '49009210130000', '49009211420000',
                '49009224280000', '49009213750000', '49009220880000',
                '49009225300000', '49009218090000', '49009227720000',
                '49009225830000', '49009223170000', '49009209370000',
                '49009214990000', '49009207260000', '49009211540000',
                '49009227380000']
        with self.infile.open('w') as f:
            for item in apis:
                f.write(f'{item}\n')


if __name__ == '__main__':
    GetCompletions('apis.txt')
Attached Files
apis.txt (Size: 1.56 KB / Downloads: 229)