I use a method that saves the response to a file and checks the file's modification time each time it's
run. If the file is younger than the refresh interval, it is loaded from disk rather than fetching a new copy;
otherwise a new copy is downloaded.
This may be a bit more than you want, but you can look at the code to get an idea.
There are three modules involved:
CheckInternet.py - checks that there is a network connection available
GetUrl.py - Does the actual fetching of a url
GetPage.py - takes care of caching, calls GetUrl, which does the request
I modified the GetPage.py testit routine on the fly for this post without testing,
so if it doesn't work, let me know.
This code was part of a presentation I gave for a makeit meeting, and has a full jupyter notebook
document with the full code on github here:
https://github.com/Larz60p/MakerProject
if you have any interest in how the above code was used in a full package.
prog1 CheckInternet.py:
import socket
class CheckInternet:
    """Report whether the local host appears to have a network address.

    The check resolves this machine's own hostname and treats any result
    other than the loopback address ``127.0.0.1`` as "available".  No remote
    host is contacted, so this is a heuristic, not proof of connectivity.
    """

    def __init__(self):
        # Result of the most recent check_availability() call.
        self.internet_available = False

    def check_availability(self):
        """Refresh and return ``self.internet_available``.

        Returns:
            True when the local hostname resolves to something other than
            ``127.0.0.1``; False when it resolves to loopback or cannot be
            resolved at all.
        """
        try:
            host_ip = socket.gethostbyname(socket.gethostname())
        except OSError:
            # socket.gaierror / socket.herror are OSError subclasses; an
            # unresolvable hostname means "not available", not a crash.
            self.internet_available = False
        else:
            self.internet_available = host_ip != '127.0.0.1'
        return self.internet_available
def testit():
    """Interactive self-test: ask the user to toggle the network off, then on.

    After each prompt the availability check is re-run and its result is
    compared with the state the user was asked to set up.
    """
    checker = CheckInternet()
    # (prompt, expected availability, message on pass, message on fail)
    scenarios = (
        ('Please turn internet OFF, then press Enter', False,
         ' Off test successful', ' Off test failed'),
        ('Please turn internet ON, then press Enter', True,
         ' On test successful', ' On test failed'),
    )
    for prompt, expected_online, passed_msg, failed_msg in scenarios:
        print(prompt)
        input()
        checker.check_availability()
        print(f'ci.internet_available: {checker.internet_available}')
        matches = bool(checker.internet_available) == expected_online
        print(passed_msg if matches else failed_msg)


# Run the interactive self-test only when executed as a script.
if __name__ == '__main__':
    testit()
prog2 GetUrl.py:
import requests
import CheckInternet
import sys
class GetUrl:
    """Fetch a URL with ``requests`` after a connectivity pre-check."""

    def __init__(self):
        self.ci = CheckInternet.CheckInternet()
        self.ok_status = 200
        # Response from the most recent fetch_url() call, or None.
        self.r = None

    def fetch_url(self, url, timeout=30):
        """Issue a GET request for ``url`` without following redirects.

        Args:
            url: Address to request.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled server cannot block the caller indefinitely.

        Returns:
            The ``requests`` response object, or None when the connectivity
            check reports that no network is available (no request is made).

        Raises:
            requests.exceptions.RequestException: propagated unchanged from
                ``requests.get`` (including timeouts).
        """
        self.r = None
        if self.ci.check_availability():
            self.r = requests.get(url, allow_redirects=False, timeout=timeout)
        return self.r
def testit():
    """Fetch the Google home page and print at most ``maxcount`` lines of it.

    Prints a hint instead when the fetch returned None (no network detected)
    and an error line when the status code is not 200.
    """
    gu = GetUrl()
    page = gu.fetch_url('https://www.google.com/')
    maxcount = 20
    if page is None:
        # fetch_url returns None when the connectivity check fails.
        print('Please enable internet and try again')
    elif page.status_code == 200:
        # splitlines() replaces the original split('/n'), which never split.
        for line in page.text.splitlines()[:maxcount]:
            print(f'{line}\n')
    else:
        print(f'Error retreving file status code: {page.status_code}')


# Run the self-test only when executed as a script.
if __name__ == '__main__':
    testit()
prog3 GetPage.py:
import GetUrl
import time
import sys
class GetPage:
    """Fetch a URL through GetUrl, optionally caching the body in a file."""

    def __init__(self):
        """
        Initialize - instantiate imported modules, initialize class variables
        """
        # Age of the cache file, in hours, as measured by the last get_page().
        self.elapsed_hours = 0
        self.gu = GetUrl.GetUrl()
        self.savefile = None

    def get_page(self, url, savefile=None, refresh_hours_every=48):
        """Return the page for ``url``, using ``savefile`` as a cache.

        Args:
            url: Address to download when the cache cannot be used.
            savefile: Optional path object providing ``exists()``, ``lstat()``
                and ``open()`` (e.g. ``pathlib.Path``).  When omitted, the
                page is always downloaded and nothing is written to disk.
            refresh_hours_every: Maximum age of the cache file in hours; an
                older or empty file triggers a new download.

        Returns:
            The cached file's text (``str``) when the cache is used;
            otherwise whatever ``download_new_file`` returns (a response
            object, or None).  NOTE: the two paths return different types,
            so callers must handle both.
        """
        self.url = url
        self.savefile = savefile
        self.refresh_hours_every = refresh_hours_every
        self.page = None

        if self._cache_is_fresh():
            with self.savefile.open('r') as f:
                self.page = f.read()
        else:
            self.page = self.download_new_file()
        return self.page

    def _cache_is_fresh(self):
        """Return True when savefile exists, is non-empty and is young enough."""
        if not self.savefile or not self.savefile.exists():
            return False
        lstats = self.savefile.lstat()
        self.elapsed_hours = (time.time() - lstats.st_mtime) / 3600
        return (lstats.st_size != 0
                and self.elapsed_hours <= self.refresh_hours_every)

    def download_new_file(self):
        """Download ``self.url`` and, on status 200, save the body to savefile.

        Returns:
            The object returned by ``self.gu.fetch_url`` - None when the
            fetcher reported no network; otherwise the response, whether or
            not its status code was 200.
        """
        page = self.gu.fetch_url(self.url)
        if page is None:
            print('Please enable internet and try again')
        elif page.status_code != 200:
            print(f'Invalid status code: {page.status_code}')
        elif self.savefile:
            # Only write when a cache file was requested; binary mode keeps
            # the body exactly as received.
            with self.savefile.open('wb') as f:
                f.write(page.content)
        return page
def testit():
    """Download the RFC index page into data/html/rfc_index.html and print it.

    ``refresh_hours_every=0`` is passed so that an existing cache file is
    treated as stale and the page is downloaded again.
    """
    from pathlib import Path

    # Plain locals: this is a module-level function, so there is no ``self``.
    homepath = Path('.')
    datapath = homepath / 'data'
    htmlpath = datapath / 'html'
    htmlpath.mkdir(parents=True, exist_ok=True)
    rfc_index_html = htmlpath / 'rfc_index.html'

    # Test url = rfc index download page, save to data/html/rfc_index.html, refresh always
    gp = GetPage()
    page = gp.get_page(url='https://www.rfc-editor.org/rfc/', savefile=rfc_index_html,
                       refresh_hours_every=0)
    if page and page.status_code == 200:
        print(f'Page contents: {page.text}')
    else:
        print('Page is empty or in')


# Run the self-test only when executed as a script.
if __name__ == '__main__':
    testit()