"EOL While Scanning String Literal"
#11
The following program will get the PDF files of the completion logs, the HTML completion page (which can be scanned to extract further information), and a summary file for each API number.

I would suggest that if you plan on using this for future work, you move the API list to a file and
read it in at the start. That will let you reuse the program without changing it each time.

The program will create three directories directly below the source code:

xx_completions_xx - contains the completion text pages, by name
logpdfs - contains the completion logs in PDF format, named compnnnn.pdf (example: comp18098.pdf)
text - contains summary information, as unparsed strings, for each element in self.fields; files are named summary_apino.txt

The files in xx_completions_xx are the original HTML pages, so they can be displayed in a browser as-is (just double-click on one).
Enjoy!
import requests
from bs4 import BeautifulSoup
from pathlib import Path

class GetCompletions:
    def __init__(self):
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)

        self.apis = ['49009229900000','49009226390000','49009278600000','49009226340000','49009200210000',
                     '49009065760000','49009201380000','49009230130000','49009278800000','49009222250000',
                     '49009225900000','49009219970000','49009225890000','49009225140000','49009225760000',
                     '49009212630000','49009205440000','49009211590000','49009203660000','49009203940000',
                     '49009204340000','49009226780000','49009220310000','49009229730000','49009212240000',
                     '49009214450000','49009213790000','49009222660000','49009227960000','49009222100000',
                     '49009228020000','49009228260000','49009228290000','49009229090000','49009228250000',
                     '49009229340000','49009229360000','49009227890000','49009228010000','49009228030000',
                     '49009228450000','49009224160000','49009221890000','49009222760000','49009214980000',
                     '49009214620000','49009213800000','49009214380000','49009214730000','49009228150000',
                     '49009228190000','49009227710000','49009215280000','49009228940000','49009227920000',
                     '49009227980000','49009228170000','49009219540000','49009227870000','49009228370000',
                     '49009204330000','49009205120000','49009227860000','49009228360000','49009228160000',
                     '49009216100000','49009229000000','49009229150000','49009229490000','49009215680000',
                     '49009229350000','49009215210000','49009217070000','49009216610000','49009206800000',
                     '49009205590000','49009206310000','49009217960000','49009223190000','49009210640000',
                     '49009209260000','49009213710000','49009212360000','49009212740000','49009218680000',
                     '49009210130000','49009211420000','49009224280000','49009213750000','49009220880000',
                     '49009225300000','49009218090000','49009227720000','49009225830000','49009223170000',
                     '49009209370000','49009214990000','49009207260000','49009211540000','49009227380000']

        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                       'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
        self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        # the WOGCC query takes a 7-digit slice of the full 14-digit API number
        for entry in self.apis:
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.completionspath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.completionspath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    # skip anchors without an href, and external 'www' links
                    url = link.get('href')
                    if not url or 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    # the link ends in '=nnnnn'; use that id to name the pdf
                    p = url.index('=')
                    filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p+1:])
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        with filename.open('wb') as f:
                            f.write(response.content)
            # name the summary after the 7-digit API slice embedded in the html filename
            sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))

if __name__ == '__main__':
    GetCompletions()
#12
You're right - David Beazley is funny. I like the way he teaches too because it's not dry.

I've tried running your code (above) and got a traceback error.

Error:
Traceback (most recent call last):
  File "C:/Python Tutorials/LARS60+.py", line 1, in <module>
    import requests
ModuleNotFoundError: No module named 'requests'
I've installed the requests module with the 'pip' command. I tried it again after I got the error and it said "Requirement already satisfied: requests in c:\users\toliver\appdata\local\enthought\canopy\user\lib\site-packages".

I am enjoying his video. I've got his cookbook now too. I've also found his website so I'll just soak up some of Mr. Beazley's knowledge and hope to be wiser because of it!

I appreciate your help and pointing me in the direction of such a wealth of information!
#13
(Apr-11-2018, 05:16 PM)tjnichols Wrote: I've installed the requests module with the 'pip' command. I tried it again after I got the error and it said "Requirement already satisfied: requests in c:\users\toliver\appdata\local\enthought\canopy\user\lib\site-packages".
At some point you installed the Enthought Python Distribution;
it set the environment variable Path so that python and pip in cmd point to Enthought.

I have set up my main Python as described in part-1, part-2.
Then it looks like this:
Microsoft Windows [Version 10.0.16299.309]
(c) 2017 Microsoft Corporation. All rights reserved.

C:\Windows\System32>cd\

C:\>pip -V
pip 9.0.3 from c:\python36\lib\site-packages (python 3.6)

C:\>python -c "import sys; print(sys.executable)"
C:\python36\python.exe

C:\>
I also have the Anaconda distribution | my tutorial, but I have not added it to the Windows Path.
It has its own place I can access it from:
G:\Anaconda3
λ python -V
Python 3.6.5 :: Anaconda custom (64-bit)

G:\Anaconda3
λ cd scripts

G:\Anaconda3\Scripts
λ pip -V
pip 9.0.1 from G:\Anaconda3\lib\site-packages (python 3.6)

G:\Anaconda3\Scripts
λ conda -V
conda 4.5.0

G:\Anaconda3\Scripts
λ
So you have to think about how you want to set it up; I of course like the way I have it ;)
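
A quick way to be sure requests lands in the interpreter you actually run scripts with (assuming the python on your Path is that interpreter) is to call pip through it:

C:\>python -m pip install requests

C:\>python -c "import requests; print(requests.__version__)"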
#14
tjnichols: Did you get requests installed properly?
#15
Hey Larz60+ - yes (the short answer) I did. I cannot thank you enough for your help! This is something I couldn't have come up with on my own, but it gives me something to strive for! If I knew how to give you a resounding round of applause I would do it!

The one question I have is: I can't find where the PDFs are saved.

Thanks again!
#16
The PDFs should be in the logpdfs directory, directly below the source.
#17
If you're using Larz's code, they're saved in logpdfs, in whatever folder you're currently running the script from.
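
If in doubt, a quick check (a sketch assuming the same directory layout as the script above) prints the absolute location of that folder:

from pathlib import Path
print((Path('.') / 'logpdfs').resolve())  # absolute path of the pdf download directory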
#18
Larz60+ - How did you get the API numbers to be seen as text and Python to observe the commas as separators?

self.apis = ['49009229900000','49009226390000','49009278600000','49009226340000','49009200210000',
                     '49009065760000','49009201380000','49009230130000','49009278800000','49009222250000',
                     '49009225900000','49009219970000','49009225890000','49009225140000','49009225760000',
                     '49009212630000','49009205440000','49009211590000','49009203660000','49009203940000',
                     '49009204340000','49009226780000','49009220310000','49009229730000','49009212240000',
                     '49009214450000','49009213790000','49009222660000','49009227960000','49009222100000',
                     '49009228020000','49009228260000','49009228290000','49009229090000','49009228250000',
                     '49009229340000','49009229360000','49009227890000','49009228010000','49009228030000',
                     '49009228450000','49009224160000','49009221890000','49009222760000','49009214980000',
                     '49009214620000','49009213800000','49009214380000','49009214730000','49009228150000',
                     '49009228190000','49009227710000','49009215280000','49009228940000','49009227920000',
                     '49009227980000','49009228170000','49009219540000','49009227870000','49009228370000',
                     '49009204330000','49009205120000','49009227860000','49009228360000','49009228160000',
                     '49009216100000','49009229000000','49009229150000','49009229490000','49009215680000',
                     '49009229350000','49009215210000','49009217070000','49009216610000','49009206800000',
                     '49009205590000','49009206310000','49009217960000','49009223190000','49009210640000',
                     '49009209260000','49009213710000','49009212360000','49009212740000','49009218680000',
                     '49009210130000','49009211420000','49009224280000','49009213750000','49009220880000',
                     '49009225300000','49009218090000','49009227720000','49009225830000','49009223170000',
                     '49009209370000','49009214990000','49009207260000','49009211540000','49009227380000']
Thanks in advance!

T
#19
Quote: Larz60+ - How did you get the API numbers to be seen as text and Python to observe the commas as separators?
When put between brackets '[]', the data becomes a Python list (https://docs.python.org/3/tutorial/datastructures.html), and the standard separator for list elements is the comma. Placing each entry in single or double quotes makes the value a string. Python also lets a bracketed literal span multiple lines, which is why the list can be laid out the way it is above.
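
A minimal illustration with the first three entries; the quotes make each item a string, the commas separate the elements, and the bracket stays open across lines until ']':

apis = ['49009229900000',   # each quoted entry is one string element
        '49009226390000',   # commas separate the elements
        '49009278600000']   # the literal continues until the closing bracket
print(len(apis))       # 3
print(type(apis[0]))   # <class 'str'>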

This data could have come from a file;
in that case, you wouldn't need to put quotes around the data. For example:

(NOTE: I attached a starting 'apis.txt' file with old data; copy it to the text directory.)
Let's say you have a file named 'apis.txt' (it must be in the 'text' directory) which looks like:
Output:
49009229900000
49009226390000
49009278600000
49009226340000
49009200210000
49009065760000
49009201380000
... Add rest of completion codes here
change this:
    def __init__(self):
to this:
    def __init__(self, infile):
In the code, replace the list (starting on line 15) with:
        self.infile = self.textpath / infile
        self.apis = []
        with self.infile.open() as f:
            for line in f:
                self.apis.append(line.strip())
Now you can change the input file to control the code.
Finally, change the following at the bottom of the code:
from this:
if __name__ == '__main__':
    GetCompletions()
to this:
if __name__ == '__main__':
    GetCompletions('apis.txt')
So now the complete program looks like:
import requests
from bs4 import BeautifulSoup
from pathlib import Path

class GetCompletions:
    def __init__(self, infile):
        self.homepath = Path('.')
        self.completionspath = self.homepath / 'xx_completions_xx'
        self.completionspath.mkdir(exist_ok=True)
        self.log_pdfpath = self.homepath / 'logpdfs'
        self.log_pdfpath.mkdir(exist_ok=True)
        self.textpath = self.homepath / 'text'
        self.textpath.mkdir(exist_ok=True)

        self.infile = self.textpath / infile
        self.apis = []

        with self.infile.open() as f:
            for line in f:
                self.apis.append(line.strip())

        self.fields = ['Spud Date', 'Total Depth', 'IP Oil Bbls', 'Reservoir Class', 'Completion Date',
                       'Plug Back', 'IP Gas Mcf', 'TD Formation', 'Formation', 'IP Water Bbls']
        self.get_all_pages()
        self.parse_and_save(getpdfs=True)

    def get_url(self):
        for entry in self.apis:
            yield (entry, "http://wogcc.state.wy.us/wyocomp.cfm?nAPI={}".format(entry[3:10]))

    def get_all_pages(self):
        for entry, url in self.get_url():
            print('Fetching main page for entry: {}'.format(entry))
            response = requests.get(url)
            if response.status_code == 200:
                filename = self.completionspath / 'api_{}.html'.format(entry)
                with filename.open('w') as f:
                    f.write(response.text)
            else:
                print('error downloading {}'.format(entry))

    def parse_and_save(self, getpdfs=False):
        filelist = [file for file in self.completionspath.iterdir() if file.is_file()]
        for file in filelist:
            with file.open('r') as f:
                soup = BeautifulSoup(f.read(), 'lxml')
            if getpdfs:
                links = soup.find_all('a')
                for link in links:
                    url = link.get('href')
                    if not url or 'www' in url:
                        continue
                    print('downloading pdf at: {}'.format(url))
                    p = url.index('=')
                    filename = self.log_pdfpath / 'comp{}.pdf'.format(url[p+1:])
                    response = requests.get(url, stream=True, allow_redirects=False)
                    if response.status_code == 200:
                        with filename.open('wb') as f:
                            f.write(response.content)
            sfname = self.textpath / 'summary_{}.txt'.format((file.name.split('_'))[1].split('.')[0][3:10])
            tds = soup.find_all('td')
            with sfname.open('w') as f:
                for td in tds:
                    if td.text:
                        if any(field in td.text for field in self.fields):
                            f.write('{}\n'.format(td.text))

    # You can run the following once to create your original data as a file
    def add_input(self):
        apis = ['49009229900000', '49009226390000', '49009278600000', '49009226340000', '49009200210000',
                '49009065760000', '49009201380000', '49009230130000', '49009278800000', '49009222250000',
                '49009225900000', '49009219970000', '49009225890000', '49009225140000', '49009225760000',
                '49009212630000', '49009205440000', '49009211590000', '49009203660000', '49009203940000',
                '49009204340000', '49009226780000', '49009220310000', '49009229730000', '49009212240000',
                '49009214450000', '49009213790000', '49009222660000', '49009227960000', '49009222100000',
                '49009228020000', '49009228260000', '49009228290000', '49009229090000', '49009228250000',
                '49009229340000', '49009229360000', '49009227890000', '49009228010000', '49009228030000',
                '49009228450000', '49009224160000', '49009221890000', '49009222760000', '49009214980000',
                '49009214620000', '49009213800000', '49009214380000', '49009214730000', '49009228150000',
                '49009228190000', '49009227710000', '49009215280000', '49009228940000', '49009227920000',
                '49009227980000', '49009228170000', '49009219540000', '49009227870000', '49009228370000',
                '49009204330000', '49009205120000', '49009227860000', '49009228360000', '49009228160000',
                '49009216100000', '49009229000000', '49009229150000', '49009229490000', '49009215680000',
                '49009229350000', '49009215210000', '49009217070000', '49009216610000', '49009206800000',
                '49009205590000', '49009206310000', '49009217960000', '49009223190000', '49009210640000',
                '49009209260000', '49009213710000', '49009212360000', '49009212740000', '49009218680000',
                '49009210130000', '49009211420000', '49009224280000', '49009213750000', '49009220880000',
                '49009225300000', '49009218090000', '49009227720000', '49009225830000', '49009223170000',
                '49009209370000', '49009214990000', '49009207260000', '49009211540000', '49009227380000']

        with self.infile.open('w') as f:
            for item in apis:
                f.write(f'{item}\n')

if __name__ == '__main__':
    GetCompletions('apis.txt')
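
Note: __init__ opens text/apis.txt right away, so the file must exist before GetCompletions('apis.txt') runs. Either copy the attached apis.txt into the text directory, or seed it once yourself; a minimal sketch:

from pathlib import Path

textpath = Path('.') / 'text'
textpath.mkdir(exist_ok=True)
# seed the input file with the first few codes (add the rest as needed)
(textpath / 'apis.txt').write_text('49009229900000\n49009226390000\n49009278600000\n')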

Attached Files

apis.txt (1.56 KB)
#20
Ok - this is awesome but I get the following error...

Error:
Traceback (most recent call last):
  File "C:/Python Tutorials/LARS with API.py", line 97, in <module>
    GetCompletions('apis.txt')
  File "C:/Python Tutorials/LARS with API.py", line 19, in __init__
    with self.infile.open() as f:
  File "C:\Python 36\lib\pathlib.py", line 1181, in open
    opener=self._opener)
  File "C:\Python 36\lib\pathlib.py", line 1035, in _opener
    return self._accessor.open(self, flags, mode)
  File "C:\Python 36\lib\pathlib.py", line 387, in wrapped
    return strfunc(str(pathobj), *args)
FileNotFoundError: [Errno 2] No such file or directory: 'text\\apis.txt'
Any ideas?

I hope you've had a good weekend!

T

