Python Forum

Full Version: Trying to add logs to webscraping module
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hello all - It's been a while!

I am trying to download the raster and LAS logs from the WOGCC site. Previously, I wanted to download reports - which work beautifully - thank you Lars60+!

Essentially, all I've done is change the download location, but I've run into an issue and I'm not sure how to adapt the code so it works here.

Here is my code. I'm sure there is more to it than this but see line 36.

import WellPaths
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet
import sys

class GetCompletions:
    """Download well-completion pages from the WOGCC site, save linked PDFs,
    and write a per-well summary of selected table fields.

    NOTE(review): the forum paste stripped every ``x.open(...)`` /
    ``f.read()`` style call, the site URL, and the ``else:`` branches.
    Those spans are reconstructed below and marked -- confirm against the
    original working script.
    """

    def __init__(self, infile):
        """Read a file of API numbers, then fetch and parse every well page.

        infile: name of a command file listing one API number per line
                (resolved against ``self.wpath.commandpath``).
        """
        self.wpath = WellPaths.WellPaths()
        self.check_network = CheckInternet.CheckInternet()

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            self.infilename = input('Please enter api filename: ')

            self.infile = self.wpath.commandpath / self.infilename
            self.api = []

            # One API number per line; strip the newline before storing.
            # (open() call reconstructed -- stripped from the forum paste.)
            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            # Table labels worth keeping in the summary report.
            self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                           'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']

            # The traceback shows __init__ drives the whole pipeline:
            # fetch every page, then parse and download the PDFs.
            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        """Yield (api_number, page_url) pairs for each stored API number.

        Digits 3-9 of the 14-digit API number form the site's well key.
        """
        for entry in self.api:
            # NOTE(review): the URL prefix was stripped by the forum paste;
            # reconstructed from the original completions downloader --
            # confirm the host/path against the working script.
            yield (entry, 'http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}'.format(entry[3:10]))

    def get_all_pages(self):
        """Fetch each well's main page and save the raw html to htmlpath."""
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.wpath.htmlpath / 'api_{}.html'.format(entry)
                # (open()/write reconstructed -- stripped from the paste.)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        """Parse each saved html page; optionally download linked PDFs, then
        write a summary text file of the interesting <td> fields.

        getpdfs: when True, follow every link containing 'www' and save the
                 PDF it serves into ``self.wpath.completionspath``.
        """
        filelist = [file for file in self.wpath.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            # Guard for the reported IndexError: only process files named
            # like 'api_<api-number>.html' before splitting on '_' below.
            if not file.name.startswith('api_'):
                continue
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        print('downloading pdf at: {}'.format(url))
                        p = url.index('=')
                        response = requests.get(url, stream=True, allow_redirects=False)
                        if response.status_code == 200:
                            try:
                                # Prefer the server-supplied filename;
                                # len('filename=') == 9, hence idx + 9.
                                header_info = response.headers['Content-Disposition']
                                idx = header_info.index('filename')
                                filename = self.wpath.completionspath / header_info[idx + 9:]
                            except ValueError:
                                filename = self.wpath.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                                print("couldn't locate filename for {} will use: {}".format(file, filename))
                            except KeyError:
                                filename = self.wpath.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                                print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                                print('will use name: {}'.format(filename))
                            # (open()/write reconstructed -- stripped from
                            # the paste.)
                            with filename.open('wb') as f:
                                f.write(response.content)
            # Summary file is keyed by digits 3-9 of the API number embedded
            # in the html filename: api_<api>.html -> summary_<digits>.txt
            sfname = self.wpath.reportspath / 'summary_{}.txt'.format(
                file.name.split('_')[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))
            # Delete html file when finished
            file.unlink()

if __name__ == '__main__':
    # Body reconstructed from the traceback ("GetCompletions('apis.txt')");
    # the forum paste dropped it.
    GetCompletions('apis.txt')
Here are the errors:

Fetching main page for entry: 49009225080000 error downloading 49009225080000 Fetching main page for entry: 49009225250000 error downloading 49009225250000 Fetching main page for entry: 49009225260000 error downloading 49009225260000 Fetching main page for entry: 49009225270000 error downloading 49009225270000 Fetching main page for entry: 49009225660000 error downloading 49009225660000 Traceback (most recent call last): File "O:/Python/well_info-master/src/", line 102, in <module> GetCompletions('apis.txt') File "O:/Python/well_info-master/src/", line 36, in __init__ self.parse_and_save(getpdfs=True) File "O:/Python/well_info-master/src/", line 90, in parse_and_save sfname = self.wpath.reportspath / 'summary_{}.txt'.format(('_'))[1].split('.')[0][3:10]) IndexError: list index out of range
As always - I appreciate any help you can provide!