May-13-2018, 08:56 PM
When I run this I get an "unexpected indent error" at 'def __init__ (self, infile):. When I remove the space that's marked, I get a "invalid syntax error".
I 'refurbished' this from what Lars60+ invented for me so this is just getting started.
As always - any help is most appreciated!
Thank you!
I 'refurbished' this from what Lars60+ invented for me so this is just getting started.
As always - any help is most appreciated!
Thank you!
import requests from bs4 import BeautifulSoup from pathlib import path class GetCompletions: def __init__(self, infile): self.homepath = Path('.') self.log.pdfpath = self.homepath / 'comppdf' self.log.pdfpath.mkdir(exist_ok=True) self.log.pdfpath = self.homepath / 'geocorepdf' self.log.pdfpath.mkdir(exist_ok=True) self.textpath = self.homepath / 'text' self.text.mkdir(exist_ok=True) self.infile = self.textpath / infile self.api = [] self.parse_and_save(getpdfs=True) """Above will create a folder called comppdf, and wsgeo wherever the WOGCC File Downloads file is run from as well as a text file for my api file to reside.""" def get_url(self): for entry in self.apis: yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI=[]".format(entry[3:10])) yield (entry, "http://wogcc.state.wy.us/whatupcores.cfm?autonum=[]".format(entry[3:10])) """Above will get the URL that matches my API numbers.""" def parse_and_save(self, getpdfs=False): for file in filelist: with file.open('r') as f: soup = BeautifulSoup(f.read(), 'lxml') if getpdfs: links = soup.find_all('a') for link in links: url in link['href'] if 'www' in url: continue print('downloading pdf at: {}'.format(url)) p = url.index('=') response = requests.get(url, stream=True, allow_redirects=False) if response.status_code == 200: try: header_info = response.headers['Content-Disposition'] idx = header_info.index('filename') filename = self.log_pdfpath / header[idx+9:] except ValueError: filename = self.log_path / 'comp{}'.format(url[p 1:]) print("couldn't locate filename for {} will use: {}".format(file, filename)) except KeyError filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:]) print('got KeyError on {}, respnse.headers = {}'.format(file, response.headers)) print('will use name: {}'.format(filename)) print(repsonse.headers) with filename.open('wb') as f: f.write(respnse.content) sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10]) tds = soup.find_all('td') with sfname.open('w') as f: for td in tds: if td.txt if any(field in td.text for field in self.fields): f.write('{}\n'.format(td.text) if__name__ == '__main__': GetCompletions('api.txt')