Python Forum

Lars60+ crafted the first version of this. When I run his version, it runs fine. I don't know what I did to mess mine up. I didn't change much but I've learned it doesn't take much to turn it right on its head! Let me know if you want to see his to compare. I am downloading different reports but from the same website.

As always - any help is most appreciated!

import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet
import sys


class GetCompletions:
    def __init__(self, infile):
        self.check_network = CheckInternet.CheckInternet()
        self.homepath = Path('.')
        self.rootpath = self.homepath / '..'
        self.datapath = self.rootpath / 'data'
        self.commandpath = self.datapath / 'command_files'
        self.wellgeopath = self.datapath / 'wellgeo'
        self.htmlpath = self.datapath / 'html'
        self.reportspath = self.datapath / 'reports'

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            # self.infilename = 'Api_May_27_2018.txt'
            self.infilename = input('Please enter api filename: ')

            self.infile = self.commandpath / self.infilename
            self.api = []

            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            self.fields = ['API Number', 'Field', 'Formation', 'Well', 'location']
            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        for entry in self.api:
            print("http://http://wogcc.state.wy.us/coreapi.cfm?API={}".format(entry[3:10]))
            yield (entry, "http://http://wogcc.state.wy.us/coreapi.cfm?API={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        try:
                            header_info = response.headers['Content-Disposition']
                            idx = header_info.index('filename')
                            filename = self.wellgeopath / header_info[idx + 9:]
                        except ValueError:
                            filename = self.wellgeopath / 'comp{}.pdf'.format(url[p + 1:])
                            print("couldn't locate filename for {} will use: {}".format(file, filename))
                        except KeyError:
                            filename = self.wellgeopath / 'comp{}.pdf'.format(url[p + 1:])
                            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                            print('will use name: {}'.format(filename))
                            print(response.headers)
                        with filename.open('wb') as f:
                            f.write(response.content)
            sfname = self.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))
            # Delete html file when finished
            file.unlink()


if __name__ == '__main__':
    GetCompletions('apis.txt')

Error:C:\Python365\python.exe "O:/Python/WellInfo - GEO/src/FetchCompletions.py"
Please enter api filename: Api_May_27_2018.txt
http://http://wogcc.state.wy.us/coreapi.cfm?API=0527389
Fetching main page for entry: 49005273890000
Traceback (most recent call last):
  File "C:\Python365\lib\site-packages\urllib3\connection.py", line 141, in _new_conn
    (self.host, self.port), self.timeout, **extra_kw)
  File "C:\Python365\lib\site-packages\urllib3\util\connection.py", line 60, in create_connection
    for res in socket.getaddrinfo(host, port, family, socket.SOCK_STREAM):
  File "C:\Python365\lib\socket.py", line 745, in getaddrinfo
    for res in _socket.getaddrinfo(host, port, family, type, proto, flags):
socket.gaierror: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Python365\lib\site-packages\urllib3\connectionpool.py", line 601, in urlopen
    chunked=chunked)
  File "C:\Python365\lib\site-packages\urllib3\connectionpool.py", line 357, in _make_request
    conn.request(method, url, **httplib_request_kw)
  File "C:\Python365\lib\http\client.py", line 1239, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "C:\Python365\lib\http\client.py", line 1285, in _send_request
    self.endheaders(body, encode_chunked=encode_chunked)
  File "C:\Python365\lib\http\client.py", line 1234, in endheaders
    self._send_output(message_body, encode_chunked=encode_chunked)
  File "C:\Python365\lib\http\client.py", line 1026, in _send_output
    self.send(msg)
  File "C:\Python365\lib\http\client.py", line 964, in send
    self.connect()
  File "C:\Python365\lib\site-packages\urllib3\connection.py", line 166, in connect
    conn = self._new_conn()
  File "C:\Python365\lib\site-packages\urllib3\connection.py", line 150, in _new_conn
    self, "Failed to establish a new connection: %s" % e)
urllib3.exceptions.NewConnectionError: <urllib3.connection.HTTPConnection object at 0x000000000372ECC0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Python365\lib\site-packages\requests\adapters.py", line 440, in send
    timeout=timeout
  File "C:\Python365\lib\site-packages\urllib3\connectionpool.py", line 639, in urlopen
    _stacktrace=sys.exc_info()[2])
  File "C:\Python365\lib\site-packages\urllib3\util\retry.py", line 388, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPConnectionPool(host='http', port=80): Max retries exceeded with url: //wogcc.state.wy.us/coreapi.cfm?API=0527389 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000000000372ECC0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "O:/Python/WellInfo - GEO/src/FetchCompletions.py", line 95, in <module>
    GetCompletions('apis.txt')
  File "O:/Python/WellInfo - GEO/src/FetchCompletions.py", line 32, in __init__
    self.get_all_pages()
  File "O:/Python/WellInfo - GEO/src/FetchCompletions.py", line 46, in get_all_pages
    response = requests.get(url)
  File "C:\Python365\lib\site-packages\requests\api.py", line 72, in get
    return request('get', url, params=params, **kwargs)
  File "C:\Python365\lib\site-packages\requests\api.py", line 58, in request
    return session.request(method=method, url=url, **kwargs)
  File "C:\Python365\lib\site-packages\requests\sessions.py", line 508, in request
    resp = self.send(prep, **send_kwargs)
  File "C:\Python365\lib\site-packages\requests\sessions.py", line 618, in send
    r = adapter.send(request, **kwargs)
  File "C:\Python365\lib\site-packages\requests\adapters.py", line 508, in send
    raise ConnectionError(e, request=request)
requests.exceptions.ConnectionError: HTTPConnectionPool(host='http', port=80): Max retries exceeded with url: //wogcc.state.wy.us/coreapi.cfm?API=0527389 (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000000000372ECC0>: Failed to establish a new connection: [Errno 11004] getaddrinfo failed',))

Process finished with exit code 1

Whenever you make changes you should:

change just one item
Verify that the change works as expected
Repeat

Doing otherwise will cost you much time and aggravation in the long run.

Now you went back to a very old version. Why did you do that!

Ok - I get your point. Change one thing and if it fails, you know what was changed and you can fix it from there. What I don't understand is: If you change something in your code above how do you know it isn't something further down that is matched but not changed that's causing your issue?

I don't understand about 'an old version'? This is the one that worked so beautifully. Let me recopy it and do as you suggested. Then I will let you know what errors I run into.

Thank you!

The code you provided is here and what I started with for my current reports:

import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet
import sys


class GetCompletions:
    def __init__(self, infile):
        self.check_network = CheckInternet.CheckInternet()
        self.homepath = Path('.')
        self.rootpath = self.homepath / '..'
        self.datapath = self.rootpath / 'data'
        self.commandpath = self.datapath / 'command_files'
        self.completionspath = self.datapath / 'completions'
        self.htmlpath = self.datapath / 'html'
        self.reportspath = self.datapath / 'reports'

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            # self.infilename = 'Api_May_27_2018.txt'
            self.infilename = input('Please enter api filename: ')

            self.infile = self.commandpath / self.infilename
            self.api = []

            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                           'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        for entry in self.api:
            print("http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        try:
                            header_info = response.headers['Content-Disposition']
                            idx = header_info.index('filename')
                            filename = self.completionspath / header_info[idx + 9:]
                        except ValueError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print("couldn't locate filename for {} will use: {}".format(file, filename))
                        except KeyError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                            print('will use name: {}'.format(filename))
                            print(response.headers)
                        with filename.open('wb') as f:
                            f.write(response.content)
            sfname = self.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))
            # Delete html file when finished
            file.unlink()


if __name__ == '__main__':
    GetCompletions('apis.txt')

Somehow your error messages are from an older version
But you also had a timeout.
Is your internet connection reliable?

I don't have the issues that you have, you are doing something fundamentally wrong.

After reloading the latest unmodified code that I gave you (And creating a backup ... Please ...)
Make sure it runs without a hitch.

Then proceed to modify one step at a time, test, continue ...

I made copies of both versions in case something I messed actually messed yours as well. That I MOST CERTAINLY don't want or need (for either of us)!!

Initially, I changed "'self.wellgeopath = self.datapath / 'wellgeo'" from your respective completions path and folder. Ran it with PyCharms (I am so glad you showed me this! It's truly a life saver!) it showed me 2 other areas that needed the 'completionspath' change as well so I did so.

import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet
import sys


class GetCompletions:
    def __init__(self, infile):
        self.check_network = CheckInternet.CheckInternet()
        self.homepath = Path('.')
        self.rootpath = self.homepath / '..'
        self.datapath = self.rootpath / 'data'
        self.commandpath = self.datapath / 'command_files'
        self.wellgeopath = self.datapath / 'wellgeo'
        self.htmlpath = self.datapath / 'html'
        self.reportspath = self.datapath / 'reports'

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            # self.infilename = 'Api_May_27_2018.txt'
            self.infilename = input('Please enter api filename: ')

            self.infile = self.commandpath / self.infilename
            self.api = []

            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                           'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        for entry in self.api:
            print("http://wogcc.state.wy.us/wellapi.cfm?nAPI={}".format(entry[3:10]))
            yield (entry, "http://wogcc.state.wy.us/wellapi.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        try:
                            header_info = response.headers['Content-Disposition']
                            idx = header_info.index('filename')
                            filename = self.wellgeopath / header_info[idx + 9:]
                        except ValueError:
                            filename = self.wellgeopath / 'comp{}.pdf'.format(url[p + 1:])
                            print("couldn't locate filename for {} will use: {}".format(file, filename))
                        except KeyError:
                            filename = self.wellgeopath / 'comp{}.pdf'.format(url[p + 1:])
                            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                            print('will use name: {}'.format(filename))
                            print(response.headers)
                        with filename.open('wb') as f:
                            f.write(response.content)
            sfname = self.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))
            # Delete html file when finished
            file.unlink()


if __name__ == '__main__':
    GetCompletions('apis.txt')

I ran it again and this is what I got:

Error:C:\Python365\python.exe "O:/Python/Wellinfo - GEO/src/FetchCompletions.py"
Please enter api filename: Api_May_27_2018.txt
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510778
Fetching main page for entry: 49025107780001
error downloading 49025107780001
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510781
Fetching main page for entry: 49025107810000
error downloading 49025107810000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510788
Fetching main page for entry: 49025107880001
error downloading 49025107880001
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510792
Fetching main page for entry: 49025107920000
error downloading 49025107920000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510807
Fetching main page for entry: 49025108070000
error downloading 49025108070000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510831
Fetching main page for entry: 49025108310001
error downloading 49025108310001
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510864
Fetching main page for entry: 49025108640000
error downloading 49025108640000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510869
Fetching main page for entry: 49025108690000
error downloading 49025108690000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510876
Fetching main page for entry: 49025108760000
error downloading 49025108760000
http://wogcc.state.wy.us/wellapi.cfm?nAPI=2510882
Fetching main page for entry: 49025108820000
error downloading 49025108820000

Process finished with exit code 0

Also - there is the matter of these fields. They are not on the new reports page. There are values to obtain but I don't need them. Can we just leave them out? One of the issues I see is there may be from 1 - 6 reports (5 or 6 is the largest number I remember seeing). This means if we pursue the download of information like what's shown below, it may cause some issues.

This link will show you two reports.

http://wogcc.state.wy.us/wellapi.cfm?nAp...ps=ID88472

            self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                           'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']

As always - thank you!

Ok,

Removing or modifying that list without removing the corresponding code will cause the program to explode!

Now before you do anything new ... I have been working on the code to make it more pythonic and expandable.
So, there is a new github version out there.

It is very similar, but has the new code module, and has paths separated into a WellPaths.py file which means you can make changes to that file and test them before integrating into the main program.

If you want me (or someone else) to work on new code, place post under Jobs. I will answer that post (and provide conditions) and perhaps someone else will as well.

Read the following before running clone commsnd:
to get the new software, rename your existing path (by adding bak or a date or something, then clone with the following statement)

git clone https://github.com/Larz60p/well_info

You will need to copy your most recent command file to data/command_files

Thank you for posting the revision to github. I appreciate your help. The problem is the only thing I need is for the pdf to be downloaded. All of the information you've added is fine but I already have it in our database.

If you would / can help me with getting the pdfs to download, that would be awesome. If you want to be paid for your efforts, I understand. Please contact me personally so we can try to work something out.

This is what I have that works except for the downloads:

import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet
import sys


class GetCompletions:
    def __init__(self, infile):
        self.check_network = CheckInternet.CheckInternet()
        self.homepath = Path('.')
        self.rootpath = self.homepath / '..'
        self.datapath = self.rootpath / 'data'
        self.commandpath = self.datapath / 'command_files'
        self.completionspath = self.datapath / 'completions'
        self.htmlpath = self.datapath / 'html'
        self.reportspath = self.datapath / 'reports'

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            # self.infilename = 'Api_May_27_2018.txt'
            self.infilename = input('Please enter api filename: ')

            self.infile = self.commandpath / self.infilename
            self.api = []

            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                           'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        for entry in self.api:
            print("http://wogcc.state.wy.us/wellapi.cfm?nAPIno={}".format(entry[3:10]))
            yield (entry, "http://wogcc.state.wy.us/wellapi.cfm?nAPIno={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        try:
                            header_info = response.headers['Content-Disposition']
                            idx = header_info.index('filename')
                            filename = self.completionspath / header_info[idx + 9:]
                        except ValueError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print("couldn't locate filename for {} will use: {}".format(file, filename))
                        except KeyError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                            print('will use name: {}'.format(filename))
                            print(response.headers)
                        with filename.open('wb') as f:
                            f.write(response.content)
            sfname = self.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))
            # Delete html file when finished
            file.unlink()


if __name__ == '__main__':
    GetCompletions('apis.txt')

I have also set it up so it has its own file structure. I just thought for the sake of understanding this - I would only change the download locations of the files in question.

As always - I appreciate your help!

The code does nothing other than what it did before.
It is just written in a better format which will make it easier to maintain,
and if ever desired, turning it into a package.
If you are going to use what I created, don't mess it up by creating a new file structure.

I appreciate this and I'm sure it will be extremely useful!

My intent is to download yet another report from http://wogcc.state.wy.us/wellapi.cfm?nAp...ps=ID88472 from the "Cores/Pressures/Reports" section.

I have it setup so it will go "fetch the page". It just won't download the reports (there are 2). I'm not trying to 'mess up your file structure' but rather keep it unchanged while still downloading this report. I think the problem is likely with my BeautifulSoup but I'm not sure where to go.

I will appreciate any help I can get with this. I'm hoping it's something simple but I've come to understand that, with Python, nothing is simple.

Lars60+ you've gone far beyond what I've ever thought to hope for! I've really enjoyed working with you. If you would like to continue to work together and would like to be paid for your time, I understand. If we can work something out as far as payment is concerned that would be great! Either way, you're an excellent teacher and I've learned a lot from you.

As always - I appreciate your help! Thank you!

import requests
from bs4 import BeautifulSoup
from pathlib import Path
import CheckInternet


class GetCompletions:
    def __init__(self, infile):
        self.check_network = CheckInternet.CheckInternet()
        self.homepath = Path('.')
        self.rootpath = self.homepath / '..'
        self.datapath = self.rootpath / 'data'
        self.commandpath = self.datapath / 'command_files'
        self.completionspath = self.datapath / 'completions'
        self.htmlpath = self.datapath / 'html'
        self.reportspath = self.datapath / 'reports'

        if self.check_network.check_availability():
            # use: Api_May_27_2018.txt for testing
            # self.infilename = 'Api_May_27_2018.txt'
            self.infilename = input('Please enter api filename: ')

            self.infile = self.commandpath / self.infilename
            self.api = []

            with self.infile.open() as f:
                for line in f:
                    self.api.append(line.strip())

            self.fields = []
            self.get_all_pages()
            self.parse_and_save(getpdfs=True)
        else:
            print('Internet access required, and not found.')
            print('Please make Internet available and try again')

    def get_url(self):
        for entry in self.api:
            print("http://wogcc.state.wy.us/wellapi.cfm?nAPIno={}".format(entry[3:10]))
            yield (entry, "http://wogcc.state.wy.us/wellapi.cfm?nAPIno={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.htmlpath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.htmlpath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link['href']
                    if 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        try:
                            header_info = response.headers['Content-Disposition']
                            idx = header_info.index('filename')
                            filename = self.completionspath / header_info[idx + 9:]
                        except ValueError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print("couldn't locate filename for {} will use: {}".format(file, filename))
                        except KeyError:
                            filename = self.completionspath / 'comp{}.pdf'.format(url[p + 1:])
                            print('got KeyError on {}, response.headers = {}'.format(file, response.headers))
                            print('will use name: {}'.format(filename))
                            print(response.headers)
                        with filename.open('wb') as f:
                            f.write(response.content)
            sfname = self.reportspath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))
            # Delete html file when finished
            file.unlink()


if __name__ == '__main__':
    GetCompletions('apis.txt')

As a project grows, it is extremely important to discipline the layout of code, otherwise it will become unmanageable very quickly.

The core reports are built on the fly, so you will have to scrape the information from the html file and then rebuild the report from scratch. I would suggest that you have one main report for an api number. This can have the information from such things as the core report in an excel file, along with clickable links that will bring up any associated pdf'd or other reports. The html file for Core information is located at: http://wogcc.state.wy.us/coreapi.cfm?nAPI=, and api number (930476) after the = so you will need to create a variable (this should be done in the WellPaths.py file from the new file structure in github). If you want to work with this structure, I'll help, but I don't want to do it otherwise, this means installing the github version (which is set up to work in a virtual environment).

This main report can be in the form of an excel spreadsheet.

If you're game, let me know and I will give you a step by step.

tjnichols

Larz60+

tjnichols

Larz60+

tjnichols

Larz60+

tjnichols

Larz60+

tjnichols

Larz60+