I rewrote this code (original written at 1:00 AM, no doubt after being at it since 4 the previous morning)
I've added code to better prettify the HTML, and it displays only the table at index 6, which contains bizarre HTML:
there are tr's nested directly within td's, etc. It's quite strange markup.
Good luck with this one.
PrettifyPage.py (place in the same directory as the other script)
# PrettifyPage.py
from bs4 import BeautifulSoup
import requests
import pathlib
class PrettifyPage:
    """Re-indent BeautifulSoup's ``prettify()`` output with a configurable indent width.

    ``BeautifulSoup.prettify()`` always indents by one space per nesting level;
    this class rescales that to ``indent`` spaces per level.
    """

    def prettify(self, soup, indent):
        """Return ``soup.prettify()`` re-indented at *indent* spaces per level.

        Parameters:
            soup: any object exposing a ``prettify()`` method returning HTML text
                  (normally a BeautifulSoup tree or tag).
            indent: desired number of spaces per nesting level.

        Returns:
            The re-indented text as a single string (each line newline-terminated).
        """
        # Accumulate lines in a list and join once — the original concatenated
        # with ``+=`` which is quadratic on long documents.
        pretty_lines = []
        previous_indent = 0
        for line in soup.prettify().split("\n"):
            # prettify() indents one space per level, so the column of the
            # first "<" is the nesting depth.
            current_indent = str(line).find("<")
            # Text nodes (no "<" found) and implausible jumps of more than
            # 2 levels are treated as one level deeper than the previous line.
            if current_indent == -1 or current_indent > previous_indent + 2:
                current_indent = previous_indent + 1
            previous_indent = current_indent
            pretty_lines.append(self.write_new_line(line, current_indent, indent))
        return "".join(pretty_lines)

    def write_new_line(self, line, current_indent, desired_indent):
        """Return *line* left-padded from depth *current_indent* to
        ``current_indent * desired_indent`` columns, with a trailing newline.

        The line already carries ``current_indent`` leading spaces from
        prettify(), so only the difference is added.
        """
        spaces_to_add = (current_indent * desired_indent) - current_indent
        # String multiplication replaces the original character-append loop.
        prefix = " " * spaces_to_add if spaces_to_add > 0 else ""
        return prefix + str(line) + "\n"
if __name__ == '__main__':
    # Demo driver: pretty-print a saved HTML capture sitting next to this script.
    pp = PrettifyPage()
    # BUG FIX: the original read ``pp.bpath.htmlpath``, but PrettifyPage defines
    # no ``bpath`` attribute, so this line always raised AttributeError.
    # Resolve the file relative to this script's own directory instead.
    pfilename = pathlib.Path(__file__).resolve().parent / 'BusinessEntityRecordsAA.html'
    # Read raw bytes and let BeautifulSoup/lxml handle encoding detection.
    with pfilename.open('rb') as fp:
        page = fp.read()
    soup = BeautifulSoup(page, 'lxml')
    pretty = pp.prettify(soup, indent=2)
    print(pretty)
import requests
from bs4 import BeautifulSoup
import PrettifyPage
def parsepage(page):
    """Pretty-print every <tr> of the table at index 6 in *page*.

    Parameters:
        page: raw HTML bytes/str as returned by ``get_page``, or None/empty
              (in which case nothing is printed).

    The target page nests tables inside td cells, so the row dump below
    includes the nested sub-tables verbatim.
    """
    pp = PrettifyPage.PrettifyPage()
    if page:
        soup = BeautifulSoup(page, 'lxml')
        tables = soup.find_all('table')
        # BUG FIX: the original indexed ``find_all('table')[6]`` directly, which
        # raises IndexError when fewer than 7 tables exist — its
        # ``if table is not None`` guard could therefore never fire.
        if len(tables) > 6:
            table = tables[6]
            trs = table.find_all('tr')
            for n, tr in enumerate(trs):
                print(f"\n------------------------------ tr_{n} ------------------------------")
                print(f"{pp.prettify(tr, 2)}")
                # tds = tr.find_all('td')
                # for n1, td in enumerate(tds):
                #     print(f"\n------------------------------ tr_{n}, td_{n1} ------------------------------")
                #     print(f"{pp.prettify(td, 2)}")
        else:
            # Typo fix: "Cound" -> "Could".
            print("Could not find table")
def get_page(url):
    """Fetch *url* and return the response body as bytes.

    Returns:
        The raw response content on HTTP 200, or None on any other status
        (after printing a diagnostic message).
    """
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    # Typo fix: "retreive" -> "retrieve"; make the None return explicit
    # rather than falling off the end of the function.
    print(f"unable to retrieve {url}")
    return None
def scrape_url(url):
    """Download *url* and pretty-print its target table (see ``parsepage``)."""
    page = get_page(url)
    parsepage(page)
if __name__ == '__main__':
    # Scrape and dump the LIBOR overview page from global-rates.com.
    target_url = 'https://www.global-rates.com/interest-rates/libor/libor.aspx'
    scrape_url(target_url)
Partial output:
------------------------------ tr_0 ------------------------------
<tr style="height:100%;" valign="top">
<td>
<table cellpadding="0" cellspacing="0" style="height:100%;">
<tr>
<td colspan="4">
<table cellpadding="0" cellspacing="0" style="width:100%;">
<tr>
<td>
<table style="margin:6px 0px 0px 0px;width:100%;">
<tr>
<td align="center">
<script type="text/javascript">
<!--
google_ad_client = "ca-pub-8844689419180727";
/* GR 728x90 positie 1 */
google_ad_slot = "6980580330";
google_ad_width = 728;
google_ad_height = 90;
//-->
</script>
<script src="https://pagead2.googlesyndication.com/pagead/show_ads.js" type="text/javascript">
</script>
</td>
</tr>
</table>
</td>
</tr>
</table>
</td>
</tr>