I managed to scrape the article's post date using some PyQt5 code I found:
import bs4 as bs
import sys
from PyQt5.QtWebEngineWidgets import QWebEnginePage
from PyQt5.QtWidgets import QApplication
from PyQt5.QtCore import QUrl
class Page(QWebEnginePage):
    """Load a URL in a QtWebEngine page and capture the resulting HTML.

    Constructing a ``Page`` blocks inside the Qt event loop until the page
    has finished loading and its HTML has been delivered; afterwards the
    markup is available as the ``html`` attribute.

    Args:
        url: Address of the page to load, as a string.

    Attributes:
        app: The ``QApplication`` driving the event loop.
        html: HTML of the loaded page as a string ('' until it is captured).
    """

    def __init__(self, url):
        # A QApplication must exist before any web-engine object is created,
        # and only one may exist per process, so reuse a running instance
        # instead of unconditionally constructing a second one.
        self.app = QApplication.instance() or QApplication(sys.argv)
        QWebEnginePage.__init__(self)
        self.html = ''
        self.loadFinished.connect(self._on_load_finished)
        self.load(QUrl(url))
        # Block here; Callable() quits the loop once the HTML has arrived.
        self.app.exec_()

    def _on_load_finished(self):
        """Request the page HTML once loading has finished."""
        # toHtml() is asynchronous and returns nothing: the markup is handed
        # to the callback, so its return value must not be stored in `html`.
        self.toHtml(self.Callable)

    def Callable(self, html_str):
        """Store the HTML delivered by ``toHtml`` and leave the event loop."""
        self.html = html_str
        self.app.quit()
def main():
    """Print the publication date of a Fox News article.

    Loads the article through ``Page``, parses the captured HTML with
    BeautifulSoup and prints the text of the ``div`` whose class is
    ``article-date``.

    Raises:
        SystemExit: If no ``div.article-date`` element is present in the
            captured HTML.
    """
    page = Page('http://www.foxnews.com/tech/2018/01/11/how-deadly-drone-swarms-will-help-us-troops-on-frontline.html')
    soup = bs.BeautifulSoup(page.html, 'html.parser')
    date_div = soup.find('div', class_='article-date')
    if date_div is None:
        # find() returns None when nothing matches; report that clearly
        # instead of failing with an AttributeError on `.text`.
        sys.exit('No div with class "article-date" found in the page HTML')
    print(date_div.text)


if __name__ == '__main__':
    main()
If I print the soup (results here: https://pastebin.com/f9na0nNc), the comments section is not loaded. The page has some kind of JS detection that appears to load the comments only when the user scrolls down:
<script type="text/x-spotim-options">SPOTIM_OPTIONS = {"prerenderDeferred":true}</script><script src="https://app-cdn.spot.im/modules/prerender/3.1.385/conversation/host/host-bundle.js"></script><script data-ready="true" data-spotim-module="essential" src="https://app-cdn.spot.im/modules/essential/3.1.385/bundle.js" type="text/javascript"></script><script type="text/javascript">if (!window['SPOTIM_AUTH0_ENABLED']) {
// Polls localStorage for the Janrain capture token and, whenever its value
// changes, starts an SSO login or a logout on window.SPOTIM accordingly.
(function () {
//VERSION 3
// Handle returned by setInterval; -1 until polling has started.
var intervalId = -1;
// Polling period handed to setInterval (milliseconds).
var CHECKER_INTERVAL = 1000;
// localStorage key under which the token is looked up.
var JANRAIN_CAPTURE_TOKEN_KEY = 'janrainCaptureToken';
// Last token value seen by the poller; starts as the empty string.
var captureToken = '';
// Entry point: start polling only when window.localStorage is defined.
function startJanrainListener() {
if (typeof window.localStorage !== 'undefined') {
tokenChecker();
}
}
// Every CHECKER_INTERVAL ms, read the token and report any change.
function tokenChecker() {
intervalId = window.setInterval(function () {
var captureTokenResult = localStorage.getItem(JANRAIN_CAPTURE_TOKEN_KEY);
if (captureTokenResult !== captureToken) {
onTokenChanged(captureTokenResult, captureToken);
captureToken = captureTokenResult;
}
}, CHECKER_INTERVAL)
}
// React to a token change. Three cases:
//   newToken set             -> start SSO with the new token
//   newToken empty, old set  -> log the user out
//   both empty               -> log out with a null token
// In each case the SPOTIM call is made immediately when the API is available,
// otherwise it is deferred until the 'spot-im-conversation-loaded' event.
function onTokenChanged(newToken, oldToken) {
//Don't remove listener due to multiple login possibility
// window.clearInterval(intervalId);
if (newToken) {
console.log('Janrain user detected');
if (window.SPOTIM && window.SPOTIM.getConversations && window.SPOTIM.getConversations().length > 0) {
window.SPOTIM.startSSOForProvider({provider: 'janrain', token: newToken});
} else {
document.addEventListener('spot-im-conversation-loaded', function () {
// Re-check storage: only log in if a token is still present by now.
if (!!localStorage.getItem(JANRAIN_CAPTURE_TOKEN_KEY)) {
window.SPOTIM.startSSOForProvider({provider: 'janrain', token: newToken});
}
}, false)
}
} else {
if (oldToken) {
console.log('Janrain user logged out');
if (window.SPOTIM && window.SPOTIM.getConversations && window.SPOTIM.getConversations().length > 0) {
window.SPOTIM.logout({provider: 'janrain', token: newToken});
} else {
document.addEventListener('spot-im-conversation-loaded', function () {
// Re-check storage: only log out if the token is still absent by now.
if (!localStorage.getItem(JANRAIN_CAPTURE_TOKEN_KEY)) {
window.SPOTIM.logout({provider: 'janrain', token: newToken});
}
}, false)
}
}
else {
// Reached on the first poll when nothing is stored: getItem() returns
// null, which differs from the initial '' value of captureToken.
console.log('No janrain user detected');
if (window.SPOTIM && window.SPOTIM.logout) {
window.SPOTIM.logout({provider: 'janrain', token: null});
} else {
document.addEventListener('spot-im-conversation-loaded', function () {
if (!localStorage.getItem(JANRAIN_CAPTURE_TOKEN_KEY)) {
window.SPOTIM.logout({provider: 'janrain', token: null});
}
}, false)
}
}
}
}
startJanrainListener();
})();
}</script>
Can I force `"prerenderDeferred": false`? Would that work?
Is there any other way I can get the comments to load without resorting to using Selenium?