Some improvements: for example,
time.sleep
(blocking) is not the best choice for scheduling tasks. So use schedule for the scheduling, and loguru (great) for logging.
"""Poll a web page on a fixed interval and log whether its HTML changed."""

import os
import time

import requests
import schedule
from bs4 import BeautifulSoup
from loguru import logger

try:
    # Imported only to fail fast with a clear message when lxml is missing;
    # BeautifulSoup is asked for the "lxml" parser in process_html().
    from lxml import etree  # noqa: F401
except ImportError as exc:
    raise RuntimeError("Please install lxml with `pip install lxml`") from exc

logger.add("log_file.log", rotation="2 days")

URL_TO_MONITOR = "https://hckrnews.com/"
CHECK_INTERVAL = 15  # seconds between checks (see main())
# Single source of truth for the snapshot path, so the existence check and
# the read/write always refer to the same file.
PREVIOUS_CONTENT_FILE = "previous_content.html"
REQUEST_TIMEOUT = 10  # seconds; keeps a stalled request from blocking the loop


def process_html(site_content):
    """Return the page markup as a string with volatile parts removed.

    All ``<script>`` and ``<meta>`` tags are dropped and carriage returns
    are stripped before the markup is returned.
    """
    soup = BeautifulSoup(site_content, features="lxml")
    # Combining tag selections
    for tag in soup(["script", "meta"]):
        tag.extract()
    return str(soup).replace("\r", "")


def webpage_was_changed():
    """Fetch URL_TO_MONITOR and compare it with the stored snapshot.

    Returns True when the processed HTML differs from the content of
    PREVIOUS_CONTENT_FILE (the snapshot is then overwritten with the new
    content), and False otherwise. On the very first run the snapshot is
    empty, so the page is reported as changed.
    """
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/39.0.2171.95 Safari/537.36"
        ),
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }
    response = requests.get(
        URL_TO_MONITOR, headers=headers, timeout=REQUEST_TIMEOUT
    )

    # Create an empty snapshot only if none exists yet. The original checked
    # for "previous_content.txt", which never exists, so the .html snapshot
    # was wiped on every call and every check reported a change.
    if not os.path.exists(PREVIOUS_CONTENT_FILE):
        open(PREVIOUS_CONTENT_FILE, "w", encoding="utf-8").close()

    # Explicit UTF-8 so pages with non-ASCII text can be stored regardless
    # of the platform's default encoding.
    with open(PREVIOUS_CONTENT_FILE, "r+", encoding="utf-8") as filehandle:
        previous_response_html = filehandle.read()
        processed_response_html = process_html(response.content)
        if processed_response_html != previous_response_html:
            # Rewind, overwrite, then cut off any leftover tail of the
            # previous (possibly longer) snapshot.
            filehandle.seek(0)
            filehandle.write(processed_response_html)
            filehandle.truncate()
            return True
        return False


def check_webpage():
    """Run one change check and log the outcome.

    Any exception is logged instead of propagated, so a single failed
    check does not stop the scheduler loop in main().
    """
    try:
        if webpage_was_changed():
            logger.info("WEBPAGE WAS CHANGED.")
        else:
            logger.info("Webpage was not changed.")
    except Exception as e:
        logger.exception(e)


def main():
    """Schedule check_webpage every CHECK_INTERVAL seconds and run forever."""
    schedule.every(CHECK_INTERVAL).seconds.do(check_webpage)
    logger.info("Running Website Monitor")
    while True:
        schedule.run_pending()
        # Short sleep so the loop does not spin while waiting for the next job.
        time.sleep(1)


if __name__ == "__main__":
    main()

# Also, one tip I would add is that
soup.prettify()
is broken: it inserts new lines inside tags, so the output does not look like standard HTML at all. Prettier has a command-line tool, so just run
prettier --write .
in the folder, and you get correctly formatted HTML.