"EOL While Scanning String Literal"
#11
The following program will get the PDF files of the completion logs, the HTML completion page (which can be scanned to extract further information), and a summary file for each API number.

I would suggest that if you plan on using this for future work, you move the API list to a file and
read it in at the start. That will let you reuse the program without changing it each time.

The program will create three directories directly below the source code:

xx_completions_xx - contains the completion text pages, by name
logpdfs - contains the completion logs in PDF format, named compnnnn.pdf (example: comp18098.pdf)
text - contains summary information, as unparsed strings, for each element in self.fields; files are named summary_apino.txt

The files in xx_completions_xx are the original HTML pages, so they can be displayed in a browser as-is (just double-click on one).
Enjoy!
import requests
from bs4 import BeautifulSoup
from pathlib import Path

class GetCompletions:
    def __init__(self):
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)

        self.apis = ['49009229900000','49009226390000','49009278600000','49009226340000','49009200210000',
                     '49009065760000','49009201380000','49009230130000','49009278800000','49009222250000',
                     '49009225900000','49009219970000','49009225890000','49009225140000','49009225760000',
                     '49009212630000','49009205440000','49009211590000','49009203660000','49009203940000',
                     '49009204340000','49009226780000','49009220310000','49009229730000','49009212240000',
                     '49009214450000','49009213790000','49009222660000','49009227960000','49009222100000',
                     '49009228020000','49009228260000','49009228290000','49009229090000','49009228250000',
                     '49009229340000','49009229360000','49009227890000','49009228010000','49009228030000',
                     '49009228450000','49009224160000','49009221890000','49009222760000','49009214980000',
                     '49009214620000','49009213800000','49009214380000','49009214730000','49009228150000',
                     '49009228190000','49009227710000','49009215280000','49009228940000','49009227920000',
                     '49009227980000','49009228170000','49009219540000','49009227870000','49009228370000',
                     '49009204330000','49009205120000','49009227860000','49009228360000','49009228160000',
                     '49009216100000','49009229000000','49009229150000','49009229490000','49009215680000',
                     '49009229350000','49009215210000','49009217070000','49009216610000','49009206800000',
                     '49009205590000','49009206310000','49009217960000','49009223190000','49009210640000',
                     '49009209260000','49009213710000','49009212360000','49009212740000','49009218680000',
                     '49009210130000','49009211420000','49009224280000','49009213750000','49009220880000',
                     '49009225300000','49009218090000','49009227720000','49009225830000','49009223170000',
                     '49009209370000','49009214990000','49009207260000','49009211540000','49009227380000']

        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                       'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
        self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        # the WOGCC query takes a 7-digit slice of the full 14-digit API number
        for entry in self.apis:
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.completionspath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.completionspath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    # skip anchors without an href, and external 'www' links
                    url = link.get('href')
                    if not url or 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    # the link ends in '=nnnnn'; use that id to name the pdf
                    p = url.index('=')
                    filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p+1:])
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        with filename.open('wb') as f:
                            f.write(response.content)
            # name the summary after the 7-digit API slice embedded in the html filename
            sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))

if __name__ == '__main__':
    GetCompletions()
#12
You're right - David Beazley is funny. I like the way he teaches too because it's not dry.

I've tried running your code (above) and got a traceback error.

Error:
Traceback (most recent call last):
  File "C:/Python Tutorials/LARS60+.py", line 1, in <module>
    import requests
ModuleNotFoundError: No module named 'requests'
I've installed the requests module with the 'pip' command. I tried it again after I got the error and it said "Requirement already satisfied: requests in c:\users\toliver\appdata\local\enthought\canopy\user\lib\site-packages".

I am enjoying his video. I've got his cookbook now too. I've also found his website so I'll just soak up some of Mr. Beazley's knowledge and hope to be wiser because of it!

I appreciate your help and pointing me in the direction of such a wealth of information!
#13
(Apr-11-2018, 05:16 PM)tjnichols Wrote: I've installed the requests module with the 'pip' command. I tried it again after I got the error and it said "Requirement already satisfied: requests in c:\users\toliver\appdata\local\enthought\canopy\user\lib\site-packages".
At some point you installed the Enthought Python Distribution;
it set the environment variable Path so that python and pip in cmd point to Enthought.

I have set up my main Python as described in part-1, part-2.
Then it looks like this:
Microsoft Windows [Version 10.0.16299.309]
(c) 2017 Microsoft Corporation. All rights reserved.

C:\Windows\System32>cd\

C:\>pip -V
pip 9.0.3 from c:\python36\lib\site-packages (python 3.6)

C:\>python -c "import sys; print(sys.executable)"
C:\python36\python.exe

C:\>
I also have the Anaconda distribution | my tutorial, but I have not added it to the Windows Path.
It has its own place I can access it from:
G:\Anaconda3
λ python -V
Python 3.6.5 :: Anaconda custom (64-bit)

G:\Anaconda3
λ cd scripts

G:\Anaconda3\Scripts
λ pip -V
pip 9.0.1 from G:\Anaconda3\lib\site-packages (python 3.6)

G:\Anaconda3\Scripts
λ conda -V
conda 4.5.0

G:\Anaconda3\Scripts
λ
So you have to think about how you want to set it up; I of course like the way I have it ;)
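
A quick way to be sure requests lands in the interpreter you actually run scripts with (assuming the python on your Path is that interpreter) is to call pip through it:

C:\>python -m pip install requests

C:\>python -c "import requests; print(requests.__version__)"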
#14
tjnichols: Did you get requests installed properly?
#15
Hey Larz60+ - yes (the short answer) I did. I cannot thank you enough for your help! This is something I couldn't have come up with on my own, but it gives me something to strive for! If I knew how to give you a resounding round of applause I would do it!

The one question I have is: I can't find where the PDFs are saved.

Thanks again!
#16
The PDFs should be in the logpdfs directory, directly below the source.
#17
If you're using Larz's code, they're saved in logpdfs, in whatever folder you're currently running the script from.
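
If in doubt, a quick check (a sketch assuming the same directory layout as the script above) prints the absolute location of that folder:

from pathlib import Path
print((Path('.') / 'logpdfs').resolve())  # absolute path of the pdf download directory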
#18
Larz60+ - How did you get the API numbers to be seen as text and Python to observe the commas as separators?

self.apis = ['49009229900000','49009226390000','49009278600000','49009226340000','49009200210000',
                     '49009065760000','49009201380000','49009230130000','49009278800000','49009222250000',
                     '49009225900000','49009219970000','49009225890000','49009225140000','49009225760000',
                     '49009212630000','49009205440000','49009211590000','49009203660000','49009203940000',
                     '49009204340000','49009226780000','49009220310000','49009229730000','49009212240000',
                     '49009214450000','49009213790000','49009222660000','49009227960000','49009222100000',
                     '49009228020000','49009228260000','49009228290000','49009229090000','49009228250000',
                     '49009229340000','49009229360000','49009227890000','49009228010000','49009228030000',
                     '49009228450000','49009224160000','49009221890000','49009222760000','49009214980000',
                     '49009214620000','49009213800000','49009214380000','49009214730000','49009228150000',
                     '49009228190000','49009227710000','49009215280000','49009228940000','49009227920000',
                     '49009227980000','49009228170000','49009219540000','49009227870000','49009228370000',
                     '49009204330000','49009205120000','49009227860000','49009228360000','49009228160000',
                     '49009216100000','49009229000000','49009229150000','49009229490000','49009215680000',
                     '49009229350000','49009215210000','49009217070000','49009216610000','49009206800000',
                     '49009205590000','49009206310000','49009217960000','49009223190000','49009210640000',
                     '49009209260000','49009213710000','49009212360000','49009212740000','49009218680000',
                     '49009210130000','49009211420000','49009224280000','49009213750000','49009220880000',
                     '49009225300000','49009218090000','49009227720000','49009225830000','49009223170000',
                     '49009209370000','49009214990000','49009207260000','49009211540000','49009227380000']
Thanks in advance!

T
#19
Quote: Larz60+ - How did you get the API numbers to be seen as text and Python to observe the commas as separators?
When put between brackets '[]', the data becomes a Python list (https://docs.python.org/3/tutorial/datastructures.html), and the standard separator for list elements is the comma. Placing each entry in single or double quotes makes the value a string. Python also lets a bracketed literal span multiple lines, which is why the list can be laid out the way it is above.
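
A minimal illustration with the first three entries; the quotes make each item a string, the commas separate the elements, and the bracket stays open across lines until ']':

apis = ['49009229900000',   # each quoted entry is one string element
        '49009226390000',   # commas separate the elements
        '49009278600000']   # the literal continues until the closing bracket
print(len(apis))       # 3
print(type(apis[0]))   # <class 'str'>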

This data could have come from a file;
in that case, you wouldn't need to put quotes around the data. For example:

(NOTE: I attached a starting 'apis.txt' file with old data; copy it to the text directory.)
Let's say you have a file named 'apis.txt' (it must be in the 'text' directory) which looks like:
Output:
49009229900000
49009226390000
49009278600000
49009226340000
49009200210000
49009065760000
49009201380000
... Add rest of completion codes here
change this:
    def __init__(self):
to this:
    def __init__(self, infile):
In the code, replace the list (starting on line 15) with:
        self.infile = self.textpath / infile
        self.apis = []
        with self.infile.open() as f:
            for line in f:
                self.apis.append(line.strip())
Now you can change the input file to control the code.
Finally, change the following at the bottom of the code:
from this:
if __name__ == '__main__':
    GetCompletions()
to this:
if __name__ == '__main__':
    GetCompletions('apis.txt')
So now the complete program looks like:
import requests
from bs4 import BeautifulSoup
from pathlib import Path

class GetCompletions:
    def __init__(self, infile):
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)

        self.infile = self.textpath / infile
        self.apis = []

        with self.infile.open() as f:
            for line in f:
                self.apis.append(line.strip())

        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                       'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
        self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        for entry in self.apis:
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.completionspath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.completionspath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link.get('href')
                    if not url or 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p+1:])
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        with filename.open('wb') as f:
                            f.write(response.content)
            sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))

    # You can run the following once to create your original data as a file
    def add_input(self):
        apis = ['49009229900000', '49009226390000', '49009278600000', '49009226340000', '49009200210000',
                '49009065760000', '49009201380000', '49009230130000', '49009278800000', '49009222250000',
                '49009225900000', '49009219970000', '49009225890000', '49009225140000', '49009225760000',
                '49009212630000', '49009205440000', '49009211590000', '49009203660000', '49009203940000',
                '49009204340000', '49009226780000', '49009220310000', '49009229730000', '49009212240000',
                '49009214450000', '49009213790000', '49009222660000', '49009227960000', '49009222100000',
                '49009228020000', '49009228260000', '49009228290000', '49009229090000', '49009228250000',
                '49009229340000', '49009229360000', '49009227890000', '49009228010000', '49009228030000',
                '49009228450000', '49009224160000', '49009221890000', '49009222760000', '49009214980000',
                '49009214620000', '49009213800000', '49009214380000', '49009214730000', '49009228150000',
                '49009228190000', '49009227710000', '49009215280000', '49009228940000', '49009227920000',
                '49009227980000', '49009228170000', '49009219540000', '49009227870000', '49009228370000',
                '49009204330000', '49009205120000', '49009227860000', '49009228360000', '49009228160000',
                '49009216100000', '49009229000000', '49009229150000', '49009229490000', '49009215680000',
                '49009229350000', '49009215210000', '49009217070000', '49009216610000', '49009206800000',
                '49009205590000', '49009206310000', '49009217960000', '49009223190000', '49009210640000',
                '49009209260000', '49009213710000', '49009212360000', '49009212740000', '49009218680000',
                '49009210130000', '49009211420000', '49009224280000', '49009213750000', '49009220880000',
                '49009225300000', '49009218090000', '49009227720000', '49009225830000', '49009223170000',
                '49009209370000', '49009214990000', '49009207260000', '49009211540000', '49009227380000']

        with self.infile.open('w') as f:
            for item in apis:
                f.write(f'{item}\n')

if __name__ == '__main__':
    GetCompletions('apis.txt')
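
Note: __init__ opens text/apis.txt right away, so the file must exist before GetCompletions('apis.txt') runs. Either copy the attached apis.txt into the text directory, or seed it once yourself; a minimal sketch:

from pathlib import Path

textpath = Path('.') / 'text'
textpath.mkdir(exist_ok=True)
# seed the input file with the first few codes (add the rest as needed)
(textpath / 'apis.txt').write_text('49009229900000\n49009226390000\n49009278600000\n')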

Attached Files

apis.txt (1.56 KB)
#20
Ok - this is awesome but I get the following error...

Error:
Traceback (most recent call last):
  File "C:/Python Tutorials/LARS with API.py", line 97, in <module>
    GetCompletions('apis.txt')
  File "C:/Python Tutorials/LARS with API.py", line 19, in __init__
    with self.infile.open() as f:
  File "C:\Python 36\lib\pathlib.py", line 1181, in open
    opener=self._opener)
  File "C:\Python 36\lib\pathlib.py", line 1035, in _opener
    return self._accessor.open(self, flags, mode)
  File "C:\Python 36\lib\pathlib.py", line 387, in wrapped
    return strfunc(str(pathobj), *args)
FileNotFoundError: [Errno 2] No such file or directory: 'text\\apis.txt'
Any ideas?

I hope you've had a good weekend!

T

