Nov-24-2018, 05:44 PM
This is wierd. I don't see the problem.
I am passing a posix path to a module that I have been using in the same way for months, and now all of a sudden there is a data type change when calling an external module.
I will show all of the code:
This module simply contains paths, url's and filenames (no problem here):
GovernmentPaths.py
GetPage.py
I am passing a posix path to a module that I have been using in the same way for months, and now all of a sudden there is a data type change when calling an external module.
I will show all of the code:
This module simply contains paths, url's and filenames (no problem here):
GovernmentPaths.py
import os from pathlib import Path class GovernmentPaths: def __init__(self): os.chdir(os.path.abspath(os.path.dirname(__file__))) self.homepath = Path('.') self.rootpath = self.homepath / '..' self.docpath = self.rootpath / 'docs' self.docpath.mkdir(exist_ok=True) self.datapath = self.rootpath / 'data' self.datapath.mkdir(exist_ok=True) self.tmppath = self.datapath / 'tmp' self.tmppath.mkdir(exist_ok=True) self.jsonpath = self.datapath / 'json' self.jsonpath.mkdir(exist_ok=True) self.htmlpath = self.datapath / 'html' self.htmlpath.mkdir(exist_ok=True) self.developer_url = 'https://www.govinfo.gov/developers' self.developer_filename = self.htmlpath / 'DevelopersPage.html' self.detail_url = 'https://www.govinfo.gov/app/details/CDIR-2018-07-27/context' self.detail_filename = self.htmlpath / 'CongDir115thContext.html' self.bulk_data_repository = 'https://www.govinfo.gov/bulkdata' self.bulk_data_filename = self.htmlpath / 'BulkData.html' if __name__ == '__main__': GovernmentPaths()This is my 'GetPage' module (error occurs here in method get_page):
GetPage.py
import GovernmentPaths import requests import os import time import sys import codecs class GetPage: def __init__(self): os.chdir(os.path.dirname(__file__)) self.gpath = GovernmentPaths.GovernmentPaths() self.user_agent = { 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) ' 'Gecko/20100101 Firefox/60.0 AppleWebKit/537.36 (KHTML, ' 'like Gecko) Chrome/68.0.3440.75 Safari/537.36'} def get_filename(self, url, abbr=None): # problem with 'https://www.cia.gov/library/publications/resources/the-world-factbook/docs/notesanddefs.html#2028' # filename should be modified (last entry in urlpart) to notesanddefs_2028.html urlpart = url.split('/') name = urlpart[-1] if '#' in name: name = name.split('.') name2 = name[1].split('#') urlpart[-1] = f'{name[0]}_{name2[1]}.{name2[0]}' filename = self.gpath.cia_homepath for n in range (3, len(urlpart)): filename = filename / urlpart[n] return filename def get_page(self, url, filename, verbose=False, image=False, encoding='utf-8'): print(f'in GetPage type filename: {type(filename)}') htmlpage = None if verbose: print(f'\nget url: {url}') if not filename.exists(): response = requests.get(url, headers=self.user_agent) time.sleep(5) if response.status_code == 200: if verbose: print('success') if len(response.content) > 0: if image: with filename.open('wb') as zp: zp.write(response.content) else: if encoding is None: with filename.open('wb') as zp: zp.write(response.content) else: with filename.open('wb', encoding=encoding) as zp: zp.write(response.content) htmlpage = response.content else: print('length is zero') else: print(f'Problem fetching {url}\n') else: file_name = filename.resolve() with codecs.open(file_name, "r",encoding='utf-8', errors='ignore') as fp: # with filename.open() as fp: htmlpage = fp.read() return htmlpage if __name__ == '__main__': GetPage()And the program which calls get_page:
import GovernmentPaths import GetPage from bs4 import BeautifulSoup class GetData: def __init__(self): self.gpath = GovernmentPaths.GovernmentPaths() gp = GetPage.GetPage() self.getpage = gp.get_page filename = self.gpath.htmlpath / 'DevelopersPage.html' print(f'Before call, type filename: {type(filename)}') self.devpage = self.getpage(filename, self.gpath.developer_url) def scrape_devpage(self): soup = BeautifulSoup(self.devpage, 'lxml') print(soup.prettify()) if __name__ == '__main__': GetData()running produces error:
Error:Traceback (most recent call last):
File "./src/GetData.py", line 22, in <module>
GetData()
File "./src/GetData.py", line 14, in __init__
self.devpage = self.getpage(filename, self.gpath.developer_url)
File "/media/larz60/Data-2TB/Projects/Government/src/GetPage.py", line 36, in get_page
if not filename.exists():
AttributeError: 'str' object has no attribute 'exists'
there is a printout before and during call to get _page:Output:(gov_venv) larz60@larz60-Z97MX-Gaming-5 /media/larz60/Data-2TB/Projects/Government $ python ./src/GetData.py
[b]Before call, type filename: <class 'pathlib.PosixPath'>
in GetPage type filename: <class 'str'>[/b]
Traceback (most recent call last):
File "./src/GetData.py", line 22, in <module>
GetData()
File "./src/GetData.py", line 14, in __init__
self.devpage = self.getpage(filename, self.gpath.developer_url)
File "/media/larz60/Data-2TB/Projects/Government/src/GetPage.py", line 36, in get_page
if not filename.exists():
AttributeError: 'str' object has no attribute 'exists'
(gov_venv) larz60@larz60-Z97MX-Gaming-5 /media/larz60/Data-2TB/Projects/Government $