Python Forum
Trying to add logs to webscraping module - Printable Version




Trying to add logs to webscraping module - tjnichols - Jul-19-2018

Hello all - It's been a while!

I am trying to download the raster and LAS logs from the WOGCC site. Previously, I wanted to download reports - which worked beautifully - thank you Lars60+!

Essentially, all I've done is change the location, but I've run into an issue and I'm not sure how to change the code so it works for this.

Here is my code. I'm sure there is more to it than this, but see line 36 (the self.parse_and_save(getpdfs=True) call in the traceback below).

import WellPaths
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet
import sys


class GetCompletions:
    def __init__(self, infile):
        self.wpath = WellPaths.WellPaths()
        self.check_network = CheckInternet.CheckInternet()
        # self.homepath = Path('.')
        # self.rootpath = self.homepath / '..'
        # self.datapath = self.rootpath / 'data'
        # self.commandpath = self.datapath / 'command_files'
        # self.completionspath = self.datapath / 'completions'
        # self.htmlpath = self.datapath / 'html'
        # self.reportspath = self.datapath / 'reports'

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            # self.infilename = 'Api_May_27_2018.txt'
            self.infilename = input('Please enter api filename: ')

            self.infile = self.wpath.commandpath / self.infilename
            self.api = []

            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                           'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        """

        :return:
        """
        for entry in self.api:
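            # entry[3:10] keeps the 7 digits of the 14-digit API number that the
            # wyologs.cfm query expects (e.g. 49009225250000 -> nAPI=0922525)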
            print("http://pipeline.wyo.gov/wyologs.cfm?nAPI={}".format(entry[3:10]))
            yield (entry, "http://pipeline.wyo.gov/wyologs.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.wpath.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.wpath.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
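                        # take the pdf filename from the Content-Disposition header when
                        # possible, otherwise fall back to comp<id>.pdf built from the url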
                        try:
                            header_info = response.headers['Content-Disposition']
                            idx = header_info.index('filename')
                            filename = self.wpath.completionspath / header_info[idx + 9:]
                        except ValueError:
                            filename = self.wpath.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print("couldn't locate filename for {} will use: {}".format(file, filename))
                        except KeyError:
                            filename = self.wpath.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                            print('will use name: {}'.format(filename))
                            print(response.headers)
                        with filename.open('wb') as f:
                            f.write(response.content)
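            # summary filename is derived from the html filename:
            # api_<14-digit API>.html -> summary_<7-digit>.txt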
            sfname = self.wpath.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))
            # Delete html file when finished
            file.unlink()


if __name__ == '__main__':
    GetCompletions('apis.txt')
Here are the errors...

Error:
Fetching main page for entry: 49009225080000
error downloading 49009225080000
http://pipeline.wyo.gov/wyologs.cfm?nAPI=0922525
Fetching main page for entry: 49009225250000
error downloading 49009225250000
http://pipeline.wyo.gov/wyologs.cfm?nAPI=0922526
Fetching main page for entry: 49009225260000
error downloading 49009225260000
http://pipeline.wyo.gov/wyologs.cfm?nAPI=0922527
Fetching main page for entry: 49009225270000
error downloading 49009225270000
http://pipeline.wyo.gov/wyologs.cfm?nAPI=0922566
Fetching main page for entry: 49009225660000
error downloading 49009225660000
Traceback (most recent call last):
  File "O:/Python/well_info-master/src/FetchLogs.py", line 102, in <module>
    GetCompletions('apis.txt')
  File "O:/Python/well_info-master/src/FetchLogs.py", line 36, in __init__
    self.parse_and_save(getpdfs=True)
  File "O:/Python/well_info-master/src/FetchLogs.py", line 90, in parse_and_save
    sfname = self.wpath.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
IndexError: list index out of range
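For reference, the expression from line 90 assumes the html filename has something after an underscore. Here is a minimal standalone sketch of that slicing (the filenames are made-up examples: the first matches the api_<API>.html pattern that get_all_pages() writes, the second stands in for any other file sitting in the html folder); it raises the same IndexError whenever there is no underscore in the name:

for name in ['api_49009225080000.html', 'leftover.html']:
    try:
        # same slicing as line 90: text after '_', before '.', characters 3..9
        well = name.split('_')[1].split('.')[0][3:10]
        print(name, '->', 'summary_{}.txt'.format(well))
    except IndexError:
        # split('_') gives a one-element list when there is no underscore,
        # so [1] raises IndexError: list index out of range
        print(name, '->', 'IndexError: list index out of range')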
As always - I appreciate any help you can provide!