![]() |
python-forum.io on way back machine - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html) +--- Thread: python-forum.io on way back machine (/thread-13394.html) |
python-forum.io on way back machine - metulburr - Oct-12-2018 I wanted to write a script to automatically archive the forum's latest activities from time to time. I don't see a method via their API, so I just wrote a quick script to use with Selenium to archive the forums, help pages, and the latest 50 threads. It has some things hard-coded that are not professional, but I'll update it when I get the chance. If anyone is interested, they can run it on their own to help archive the forum forever. I could put it on the server to run every night, but I'm not sure if it's worth the resources or not. from selenium import webdriver import time import os from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC CHROMEPATH = '/home/metulburr/chromedriver' PHANTOMPATH = '/home/metulburr/phantomjs' URLSAVE = 'https://python-forum.io' FORUMS = ['https://python-forum.io', 'https://python-forum.io/Forum-General-Coding-Help', 'https://python-forum.io/Forum-Homework', 'https://python-forum.io/Forum-GUI', 'https://python-forum.io/Forum-Game-Development', 'https://python-forum.io/Forum-Networking', 'https://python-forum.io/Forum-Web-Development', 'https://python-forum.io/Forum-General', 'https://python-forum.io/Forum-News-and-Discussions', 'https://python-forum.io/Forum-Tutorials', 'https://python-forum.io/Forum-Tutorial-Requests-and-Submissions', 'https://python-forum.io/Forum-Python-Installation-and-Execution', 'https://python-forum.io/Forum-Fundamentals', 'https://python-forum.io/Forum-Common-pitfalls-and-what-to-do', 'https://python-forum.io/Forum-Web-Scraping', 'https://python-forum.io/Forum-Web-Tutorials', 'https://python-forum.io/Forum-GUI-tutorials', 'https://python-forum.io/Forum-Game-Tutorials', 'https://python-forum.io/Forum-Networking-Tutorials', 'https://python-forum.io/Forum-Jobs', 'https://python-forum.io/Forum-Forum-Off-Topic', 
'https://python-forum.io/Forum-Board', 'https://python-forum.io/Forum-Bar', 'https://python-forum.io/online.php', 'https://python-forum.io/online.php?action=today', 'https://python-forum.io/misc.php?action=help', 'https://python-forum.io/misc.php?action=help&hid=40', 'https://python-forum.io/misc.php?action=help&hid=41', 'https://python-forum.io/misc.php?action=help&hid=25', 'https://python-forum.io/misc.php?action=help&hid=35', 'https://python-forum.io/misc.php?action=help&hid=19', 'https://python-forum.io/misc.php?action=help&hid=20', 'https://python-forum.io/misc.php?action=help&hid=21', 'https://python-forum.io/misc.php?action=help&hid=22', 'https://python-forum.io/misc.php?action=help&hid=30', 'https://python-forum.io/misc.php?action=help&hid=32', 'https://python-forum.io/misc.php?action=help&hid=42', 'https://python-forum.io/misc.php?action=help&hid=46', 'https://python-forum.io/misc.php?action=help&hid=28', 'https://python-forum.io/misc.php?action=help&hid=33', 'https://python-forum.io/misc.php?action=help&hid=10', 'https://python-forum.io/misc.php?action=help&hid=11', 'https://python-forum.io/misc.php?action=help&hid=13', 'https://python-forum.io/misc.php?action=help&hid=37', 'https://python-forum.io/misc.php?action=help&hid=29', 'https://python-forum.io/misc.php?action=help&hid=31', 'https://python-forum.io/misc.php?action=help&hid=34', 'https://python-forum.io/misc.php?action=help&hid=38', 'https://python-forum.io/misc.php?action=help&hid=44', 'https://python-forum.io/misc.php?action=help&hid=45', 'https://python-forum.io/misc.php?action=help&hid=47', 'https://python-forum.io/misc.php?action=help&hid=14', 'https://python-forum.io/misc.php?action=help&hid=15', 'https://python-forum.io/misc.php?action=help&hid=16', 'https://python-forum.io/misc.php?action=help&hid=27', 'https://python-forum.io/misc.php?action=help&hid=24', 'https://python-forum.io/misc.php?action=help&hid=43', 'https://python-forum.io/misc.php?action=help&hid=48', 
'https://python-forum.io/misc.php?action=help&hid=39', ] class App: def __init__(self): self.setup_chrome() #self.setup_headless() self.latest = [] for url in FORUMS: self.archive_url(url) self.get_latest() for url in self.latest: self.archive_url(url) def archive_url(self, url): self.browser.get('https://web.archive.org/') self.delay() self.browser.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div[3]/div[3]/div[2]/form/input[1]").click() self.delay() self.browser.find_element_by_class_name('web-save-url-input').send_keys(url) self.delay() self.browser.find_element_by_xpath('/html/body/div[3]/div/div[2]/div/div[3]/div[3]/div[2]/form/input[2]').click() WebDriverWait(self.browser, 10).until(EC.presence_of_element_located((By.ID,"wmtbURL"))) print(f'Archived: {url}') def delay(self): time.sleep(1.5) def setup_chrome(self): #options = self.chrome_prep() os.environ["webdriver.chrome.driver"] = CHROMEPATH self.browser = webdriver.Chrome(CHROMEPATH) self.browser.set_window_position(0,0) self.delay() def setup_headless(self): self.browser = webdriver.PhantomJS(PHANTOMPATH) self.delay() def get_latest(self): self.browser.get('https://python-forum.io/latest50.php') elems = self.browser.find_elements_by_xpath("//a[@href]") for elem in elems: url = elem.get_attribute("href") self.latest.append(url) App() RE: python-forum.io on way back machine - metulburr - Jan-13-2019 updated as xpath changed, and added tutorial links, removed time sleeps from selenium import webdriver import time import os from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.chrome.options import Options MEMBERS = 'https://python-forum.io/memberlist.php?sort=regdate&order=ascending&perpage=20&page=1' CHROMEPATH = '/home/metulburr/chromedriver' URLSAVE = 'https://python-forum.io' FORUMS = ['https://python-forum.io', 
'https://python-forum.io/Forum-General-Coding-Help', 'https://python-forum.io/Forum-Homework', 'https://python-forum.io/Forum-GUI', 'https://python-forum.io/Forum-Game-Development', 'https://python-forum.io/Forum-Networking', 'https://python-forum.io/Forum-Web-Development', 'https://python-forum.io/Forum-General', 'https://python-forum.io/Forum-News-and-Discussions', 'https://python-forum.io/Forum-Tutorials', 'https://python-forum.io/Forum-Tutorial-Requests-and-Submissions', 'https://python-forum.io/Forum-Python-Installation-and-Execution', 'https://python-forum.io/Forum-Fundamentals', 'https://python-forum.io/Forum-Common-pitfalls-and-what-to-do', 'https://python-forum.io/Forum-Web-Scraping', 'https://python-forum.io/Forum-Web-Tutorials', 'https://python-forum.io/Forum-GUI-tutorials', 'https://python-forum.io/Forum-Game-Tutorials', 'https://python-forum.io/Forum-Networking-Tutorials', 'https://python-forum.io/Forum-Jobs', 'https://python-forum.io/Forum-Forum-Off-Topic', 'https://python-forum.io/Forum-Board', 'https://python-forum.io/Forum-Bar', 'https://python-forum.io/online.php', 'https://python-forum.io/online.php?action=today', 'https://python-forum.io/misc.php?action=help', 'https://python-forum.io/misc.php?action=help&hid=40', 'https://python-forum.io/misc.php?action=help&hid=41', 'https://python-forum.io/misc.php?action=help&hid=25', 'https://python-forum.io/misc.php?action=help&hid=35', 'https://python-forum.io/misc.php?action=help&hid=19', 'https://python-forum.io/misc.php?action=help&hid=20', 'https://python-forum.io/misc.php?action=help&hid=21', 'https://python-forum.io/misc.php?action=help&hid=22', 'https://python-forum.io/misc.php?action=help&hid=30', 'https://python-forum.io/misc.php?action=help&hid=32', 'https://python-forum.io/misc.php?action=help&hid=42', 'https://python-forum.io/misc.php?action=help&hid=46', 'https://python-forum.io/misc.php?action=help&hid=28', 'https://python-forum.io/misc.php?action=help&hid=33', 
'https://python-forum.io/misc.php?action=help&hid=10', 'https://python-forum.io/misc.php?action=help&hid=11', 'https://python-forum.io/misc.php?action=help&hid=13', 'https://python-forum.io/misc.php?action=help&hid=37', 'https://python-forum.io/misc.php?action=help&hid=29', 'https://python-forum.io/misc.php?action=help&hid=31', 'https://python-forum.io/misc.php?action=help&hid=34', 'https://python-forum.io/misc.php?action=help&hid=38', 'https://python-forum.io/misc.php?action=help&hid=44', 'https://python-forum.io/misc.php?action=help&hid=45', 'https://python-forum.io/misc.php?action=help&hid=47', 'https://python-forum.io/misc.php?action=help&hid=14', 'https://python-forum.io/misc.php?action=help&hid=15', 'https://python-forum.io/misc.php?action=help&hid=16', 'https://python-forum.io/misc.php?action=help&hid=27', 'https://python-forum.io/misc.php?action=help&hid=24', 'https://python-forum.io/misc.php?action=help&hid=43', 'https://python-forum.io/misc.php?action=help&hid=48', 'https://python-forum.io/misc.php?action=help&hid=39', 'https://python-forum.io/misc.php?action=help&hid=49', 'https://python-forum.io/showteam.php', 'https://python-forum.io/Thread-Python3-2-differences-input-vs-raw-input', 'https://python-forum.io/Thread-Basic-Part-1-Python-3-6-3-7-and-pip-installation-under-Windows', 'https://python-forum.io/Thread-Anaconda-and-other-ways-to-run-Python', 'https://python-forum.io/Thread-Packaging-Modules-Wheel-pip-setup-py-Freeze', 'https://python-forum.io/Thread-Part-1-Linux-Python-3-environment', 'https://python-forum.io/Thread-Part-2-Python-environment-Windows', 'https://python-forum.io/Thread-Install-a-library-manually', 'https://python-forum.io/Thread-Building-an-exe-with-Py2exe', 'https://python-forum.io/Thread-Creating-C-extensions', 'https://python-forum.io/Thread-windows-command-prompt-modifications-add-options-start-loc', 'https://python-forum.io/Thread-How-to-Execute-python-code', 'https://python-forum.io/Thread-Indentation', 
'https://python-forum.io/Thread-Annotations', 'https://python-forum.io/Thread-Simple-debugging-and-how-to-read-tracebacks', 'https://python-forum.io/Thread-Generators-Iterators', 'https://python-forum.io/Thread-Comprehension-Expressions', 'https://python-forum.io/Thread-Classes-Classes-advanced-Dependent-attributes-and-Descriptors', 'https://python-forum.io/Thread-Classes-Classes-advanced-Descriptors-managed-attributes', 'https://python-forum.io/Thread-Basic-Ternary-Conditional-Expressions', 'https://python-forum.io/Thread-Lambda-How-Why-and-Why-not', 'https://python-forum.io/Thread-Functions', 'https://python-forum.io/Thread-Basic-Set-Windows-Python-Path-to-run-python-in-any-directory', 'https://python-forum.io/Thread-Basic-Lists', 'https://python-forum.io/Thread-Basic-Modules-part-2', 'https://python-forum.io/Thread-Basic-Strings-index-and-slicing', 'https://python-forum.io/Thread-Basic-string-format-and-string-expressions', 'https://python-forum.io/Thread-Basic-Dictionaries', 'https://python-forum.io/Thread-Files', 'https://python-forum.io/Thread-Classes-Class-Basics', 'https://python-forum.io/Thread-Classes-Class-Intermediate-Inheritance', 'https://python-forum.io/Thread-Classes-Class-Intermediate-Operator-Overloading', 'https://python-forum.io/Thread-Basic-Modules-part-3', 'https://python-forum.io/Thread-Basic-Modules-part-1', 'https://python-forum.io/Thread-Basic-Never-use-for-i-in-range-len-sequence', 'https://python-forum.io/Thread-Efficiency-Crash-Course', 'https://python-forum.io/Thread-Basic-Python-Gotchas', 'https://python-forum.io/Thread-Basic-Naming-Conventions-PEP-8', 'https://python-forum.io/Thread-Multiple-expressions-with-or-keyword', 'https://python-forum.io/Thread-Namespace-flooding-with-imports', 'https://python-forum.io/Thread-Web-scraping-part-2', 'https://python-forum.io/Thread-Web-Scraping-part-1', 'https://python-forum.io/Thread-Regular-Expression-re-module', 'https://python-forum.io/Thread-Web-scraping-with-Scrapy', 
'https://python-forum.io/Thread-Flask-Weather-app-Updatet', 'https://python-forum.io/Thread-Flask-Bootstrap-node-npm-gulp-bower', 'https://python-forum.io/Thread-Flask-Trow-away-JS-function-and-use-a-Python-function', 'https://python-forum.io/Thread-Run-Python-CGI-from-Apache', 'https://python-forum.io/Thread-Flask-Starting-web-development-part-1', 'https://python-forum.io/Thread-Flask-Ajax-in-Flask', 'https://python-forum.io/Thread-wxPython-phoenix-install-step-by-step', 'https://python-forum.io/Thread-Tkinter-Getting-Tkinter-Grid-Sizing-Right-the-first-time', 'https://python-forum.io/Thread-WxPython-Very-Basic-Example-Only-Hello-World', 'https://python-forum.io/Thread-Tkinter-Very-Basic-Example-Only-Hello-World', 'https://python-forum.io/Thread-Tkinter-How-to-determine-if-tkinter-attribute-accepts-color', 'https://python-forum.io/Thread-Tkinter-Find-tkinter-widgets-for-arguments-or-arguments-for-widgets', 'https://python-forum.io/Thread-WxPython-Tutorial-Notespad-W-I-P', 'https://python-forum.io/Thread-PyGame-Creating-a-state-machine', 'https://python-forum.io/Thread-PyGame-User-Interface', 'https://python-forum.io/Thread-PyGame-Install-and-Troubleshooting', 'https://python-forum.io/Thread-PyGame-Structure-and-Organizing-part-8', 'https://python-forum.io/Thread-PyGame-Enemy-AI-and-collision-part-6', 'https://python-forum.io/Thread-PyGame-Flair-and-Organizing-part-7', 'https://python-forum.io/Thread-PyGame-Common-Tasks', 'https://python-forum.io/Thread-Intermediate-Command-Line-Interfaces', 'https://python-forum.io/Thread-PyGame-Basic-animation-part-5', 'https://python-forum.io/Thread-PyGame-Adding-player-effects-part-4', 'https://python-forum.io/Thread-PyGame-Basic-event-handling-part-3', 'https://python-forum.io/Thread-PyGame-Loading-images-transparency-handling-spritesheets-part-2', 'https://python-forum.io/Thread-PyGame-Creating-a-window-part-1', 'https://python-forum.io/Thread-PyGame-python3-x-installation', 
'https://python-forum.io/Thread-Text-Adventure-Tutorial-if-structure-to-dictionary', 'https://python-forum.io/Thread-PyGame-warnings-of-sentdex-pygame-tutorials', 'https://python-forum.io/Thread-Bare-Minimum-Sockets', 'https://python-forum.io/Thread-Basic-IRC-bot-with-socket', 'https://python-forum.io/Thread-VS-Code-from-start', 'https://python-forum.io/Thread-A-Neat-way-to-use-pathlib', 'https://python-forum.io/Thread-A-look-at-dataclass', 'https://python-forum.io/Thread-Sound-player-standalone', 'https://python-forum.io/Thread-simulate-static-local-varibles-with-getattr', 'https://python-forum.io/Thread-Underscores-And-You-A-Guide-To-Turning-Magic-Into-Science', 'https://python-forum.io/Thread-Basic-DC-Electronics-Resistors', 'https://python-forum.io/Thread-Database-the-easy-way-dataset', 'https://python-forum.io/Thread-Basic-Creating-a-repo-for-your-completed-scripts', 'https://python-forum.io/Thread-A-List-of-Free-Python-Resources', 'https://python-forum.io/Thread-A-List-of-Free-Game-Resources', 'https://python-forum.io/Thread-Collection-of-programming-ideas-and-challenges', 'https://python-forum.io/Thread-New-Users-Introduce-Yourself', ] class App: def __init__(self): self.setup_chrome() self.latest = [] for url in FORUMS: self.archive_url(url) self.get_latest() for url in self.latest: self.archive_url(url) def archive_url(self, url): self.browser.get('https://web.archive.org/') WebDriverWait(self.browser, 10).until(EC.presence_of_element_located((By.ID,"web_save_div"))) self.browser.find_element_by_xpath("/html/body/div[3]/div/div[3]/div/div[2]/div[3]/div[2]/form/input").click() self.browser.find_element_by_class_name('web-save-url-input').send_keys(url) self.delay() self.browser.find_element_by_xpath('/html/body/div[3]/div/div[3]/div/div[2]/div[3]/div[2]/form/button').click() WebDriverWait(self.browser, 10).until(EC.presence_of_element_located((By.ID,"wmtbURL"))) print(f'Archived: {url}') def delay(self): time.sleep(1.5) def setup_chrome(self): options = 
Options() options.add_argument("--headless") os.environ["webdriver.chrome.driver"] = CHROMEPATH self.browser = webdriver.Chrome(CHROMEPATH, chrome_options=options) self.browser.set_window_position(0,0) self.delay() def get_latest(self): self.browser.get('https://python-forum.io/latest50.php') elems = self.browser.find_elements_by_xpath("//a[@href]") for elem in elems: url = elem.get_attribute("href") self.latest.append(url) App() |