May-11-2025, 05:32 PM
For others' benefit, here's how to do it in Beautiful Soup:
# Strip line breaks and surrounding whitespace from every *.html file in ROOT,
# in place, using Beautiful Soup.
#
# For each file the script:
#   1. saves an untouched copy as "<name>.html.orig" carrying the original
#      access/modification times,
#   2. deletes every CR/LF from the markup,
#   3. trims leading/trailing whitespace from each text node,
#   4. writes the result back as UTF-8 and restores the original timestamps.
import sys
import os
import glob
import shutil

from bs4 import BeautifulSoup
from bs4.element import PreformattedString

ROOT = r"c:\temp"

os.chdir(ROOT)

for file in glob.glob("*.html"):
    print("Handling ", file)

    # Grab the original times once so both the backup and the rewritten file
    # can be stamped with them.
    info = os.stat(file)
    tup = (info.st_atime, info.st_mtime)

    # Save the original file next to the one being rewritten.
    ORIGFILE = f"{file}.orig"
    shutil.copyfile(file, ORIGFILE)
    os.utime(ORIGFILE, tup)

    # Read raw bytes instead of text: decoding with the platform default
    # encoding and then writing UTF-8 corrupts non-ASCII characters whenever
    # the two differ. Beautiful Soup detects the encoding from the bytes.
    # Deleting CR and LF at byte level assumes an ASCII-compatible encoding
    # (UTF-8, cp1252, latin-1, ...), where those byte values only ever mean
    # line breaks.
    # NOTE(review): line breaks are deleted, not replaced by a space, so words
    # separated only by a line break end up joined - confirm this is intended.
    with open(file, "rb") as f:
        dna = f.read().translate(None, b"\r\n")

    soup = BeautifulSoup(dna, "lxml")

    # Trim each text node. Comments, doctypes, CDATA sections and similar
    # nodes are PreformattedString subclasses; replacing them with a plain
    # string would turn them into ordinary text, so they are left untouched.
    # find_all() returns a list, so replacing nodes while looping is safe.
    for s in soup.find_all(string=True):
        if isinstance(s, PreformattedString):
            continue
        s.replace_with(s.strip())

    # Save the soup back to the file.
    with open(file, "w", encoding="utf-8") as outp:
        outp.write(str(soup))

    # The file must be closed before updating its times, otherwise the final
    # flush on close would bump the modification time again.
    os.utime(file, tup)