May-12-2018, 04:57 PM
Ok - the modules issue is solved! Now I have additional issues.
This is the code I'm trying to run - thank you Lars60+
As always, any help is appreciated!
Thanks!
This is the code I'm trying to run - thank you Lars60+

import requests from bs4 import BeautifulSoup from pathlib import Path import sys class GetCompletions: def __init__(self, infile): self.homepath = Path('.') self.completionspath = self.homepath / 'xx_completions_xx' self.completionspath.mkdir(exist_ok=True) self.log_pdfpath = self.homepath / 'logpdfs' self.log_pdfpath.mkdir(exist_ok=True) self.textpath = self.homepath / 'text' self.textpath.mkdir(exist_ok=True) self.infile = self.textpath / infile self.apis = [] with self.infile.open() as f: for line in f: self.apis.append(line.strip()) self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date', 'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls'] # self.get_all_pages() self.parse_and_save(getpdfs=True) def get_url(self): for entry in self.apis: yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10])) def get_all_pages(self): for entry, url in self.get_url(): print('Fetching main page for entry: {}'.format(entry)) response = requests.get(url) if response.status_code == 200: filename = self.completionspath / 'api_{}.html'.format(entry) with filename.open('w') as f: f.write(response.text) else: print('error downloading {}'.format(entry)) def parse_and_save(self, getpdfs=False): filelist = [file for file in self.completionspath.iterdir() if file.is_file()] for file in filelist: with file.open('r') as f: soup = BeautifulSoup(f.read(), 'lxml') if getpdfs: links = soup.find_all('a') for link in links: url = link['href'] if 'www' in url: continue print('downloading pdf at: {}'.format(url)) p = url.index('=') response = requests.get(url, stream=True, allow_redirects=False) if response.status_code == 200: try: header_info = response.headers['Content-Disposition'] idx = header_info.index('filename') filename = self.log_pdfpath / header_info[idx+9:] except ValueError: filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:]) print("couldn't locate filename for {} will use: {}".format(file, 
filename)) except KeyError: filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p + 1:]) print('got KeyError on {}, response.headers = {}'.format(file, response.headers)) print('will use name: {}'.format(filename)) print(response.headers) with filename.open('wb') as f: f.write(response.content) sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10]) tds = soup.find_all('td') with sfname.open('w') as f: for td in tds: if td.text: if any(field in td.text for field in self.fields): f.write('{}\n'.format(td.text)) if __name__ == '__main__': GetCompletions('apis.txt')I get the following traceback errors.
Error: === RESTART: C:\Users\Tonya\AppData\Local\Programs\Python\Python36\Test.py ===
= RESTART: C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py =
Traceback (most recent call last):
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py", line 81, in <module>
GetCompletions('apis.txt')
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py", line 26, in __init__
self.parse_and_save(getpdfs=True)
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\Test.py", line 47, in parse_and_save
soup = BeautifulSoup(f.read(), 'lxml')
File "C:\Users\Tonya\AppData\Local\Programs\Python\Python36 - 2\lib\site-packages\bs4\__init__.py", line 165, in __init__
% ",".join(features))
bs4.FeatureNotFound: Couldn't find a tree builder with the features you requested: lxml. Do you need to install a parser library?
I've installed 'Parsy' for my parser. Should I use something else? As for the rest of the error, I have no clue. As always, any help is appreciated!
Thanks!