Python Forum

Full Version: Wierd file type problem
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
This is wierd. I don't see the problem.
I am passing a posix path to a module that I have been using in the same way for months, and now all of a sudden there is a data type change when calling an external module.

I will show all of the code:

This module simply contains paths, url's and filenames (no problem here):
GovernmentPaths.py
import os
from pathlib import Path


class GovernmentPaths:
    def __init__(self):
        os.chdir(os.path.abspath(os.path.dirname(__file__)))
        self.homepath = Path('.')

        self.rootpath = self.homepath / '..'

        self.docpath = self.rootpath / 'docs'
        self.docpath.mkdir(exist_ok=True)

        self.datapath = self.rootpath / 'data'
        self.datapath.mkdir(exist_ok=True)

        self.tmppath = self.datapath / 'tmp'
        self.tmppath.mkdir(exist_ok=True)

        self.jsonpath = self.datapath / 'json'
        self.jsonpath.mkdir(exist_ok=True)

        self.htmlpath = self.datapath / 'html'
        self.htmlpath.mkdir(exist_ok=True)

        self.developer_url = 'https://www.govinfo.gov/developers'
        self.developer_filename = self.htmlpath / 'DevelopersPage.html'

        self.detail_url = 'https://www.govinfo.gov/app/details/CDIR-2018-07-27/context'
        self.detail_filename = self.htmlpath / 'CongDir115thContext.html'

        self.bulk_data_repository = 'https://www.govinfo.gov/bulkdata'
        self.bulk_data_filename = self.htmlpath / 'BulkData.html'

if __name__ == '__main__':
    GovernmentPaths()
This is my 'GetPage' module (error occurs here in method get_page):
GetPage.py
import GovernmentPaths
import requests
import os
import time
import sys
import codecs

class GetPage:
    def __init__(self):
        os.chdir(os.path.dirname(__file__))
        self.gpath = GovernmentPaths.GovernmentPaths()
        self.user_agent = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) '
            'Gecko/20100101 Firefox/60.0 AppleWebKit/537.36 (KHTML, '
            'like Gecko)  Chrome/68.0.3440.75 Safari/537.36'}

    def get_filename(self, url, abbr=None):
        # problem with 'https://www.cia.gov/library/publications/resources/the-world-factbook/docs/notesanddefs.html#2028'
        # filename should be modified (last entry in urlpart) to notesanddefs_2028.html
        urlpart = url.split('/')
        name = urlpart[-1]
        if '#' in name:
            name = name.split('.')
            name2 = name[1].split('#')
            urlpart[-1] = f'{name[0]}_{name2[1]}.{name2[0]}'
        filename = self.gpath.cia_homepath
        for n in range (3, len(urlpart)):
            filename = filename / urlpart[n]
        return filename

    def get_page(self, url, filename, verbose=False, image=False, encoding='utf-8'):
        print(f'in GetPage type filename: {type(filename)}')
        htmlpage = None
        if verbose:
            print(f'\nget url: {url}')
        if not filename.exists():
            response = requests.get(url, headers=self.user_agent)
            time.sleep(5)
            if response.status_code == 200:
                if verbose:
                    print('success')
                if len(response.content) > 0:
                    if image:
                        with filename.open('wb') as zp:
                            zp.write(response.content)
                    else:
                        if encoding is None:
                            with filename.open('wb') as zp:
                                zp.write(response.content)
                        else:
                            with filename.open('wb', encoding=encoding) as zp:
                                zp.write(response.content)
                    htmlpage = response.content
                else:
                    print('length is zero')
            else:
                print(f'Problem fetching {url}\n')
        else:
            file_name = filename.resolve()
            with codecs.open(file_name, "r",encoding='utf-8', errors='ignore') as fp:
            # with filename.open() as fp:
                htmlpage = fp.read()
        return htmlpage

if __name__ == '__main__':
    GetPage()
And the program which calls get_page:
import GovernmentPaths
import GetPage
from bs4 import BeautifulSoup


class GetData:
    def __init__(self):
        self.gpath = GovernmentPaths.GovernmentPaths()
        gp = GetPage.GetPage()
        self.getpage = gp.get_page

        filename = self.gpath.htmlpath / 'DevelopersPage.html'
        print(f'Before call, type filename: {type(filename)}')
        self.devpage = self.getpage(filename, self.gpath.developer_url)
    
    def scrape_devpage(self):
        soup = BeautifulSoup(self.devpage, 'lxml')
        print(soup.prettify())


if __name__ == '__main__':
    GetData()
running produces error:
Error:
Traceback (most recent call last): File "./src/GetData.py", line 22, in <module> GetData() File "./src/GetData.py", line 14, in __init__ self.devpage = self.getpage(filename, self.gpath.developer_url) File "/media/larz60/Data-2TB/Projects/Government/src/GetPage.py", line 36, in get_page if not filename.exists(): AttributeError: 'str' object has no attribute 'exists'
there is a printout before and during call to get _page:
Output:
(gov_venv) larz60@larz60-Z97MX-Gaming-5 /media/larz60/Data-2TB/Projects/Government $ python ./src/GetData.py [b]Before call, type filename: <class 'pathlib.PosixPath'> in GetPage type filename: <class 'str'>[/b] Traceback (most recent call last): File "./src/GetData.py", line 22, in <module> GetData() File "./src/GetData.py", line 14, in __init__ self.devpage = self.getpage(filename, self.gpath.developer_url) File "/media/larz60/Data-2TB/Projects/Government/src/GetPage.py", line 36, in get_page if not filename.exists(): AttributeError: 'str' object has no attribute 'exists' (gov_venv) larz60@larz60-Z97MX-Gaming-5 /media/larz60/Data-2TB/Projects/Government $
def get_page(self, url, filename, verbose=False, image=False, encoding='utf-8'):
self.devpage = self.getpage(filename, self.gpath.developer_url)
Looks like you mixed up the argument order.
I would have looked at that till the cows came home and never noticed!
Thanks.
I generally copy the def to where I want to call the function, and remove the def and colon, modify where self is, etc. I too have been bitten by that bug.
Quote:I generally copy the def to where I want to call the function
After 50 years of writing code, I have shelved most things like this, and usually write code as fast as I can type (unless something new), I have used the GetPage class hundreds of times (and wrote it myself). I think it's just my age inverting things that shouldn't be inverted.