Oct-23-2019, 10:09 AM
I rewrote this code (original written at 1:00 AM, no doubt after being at it since 4 the previous morning)
I've added code to better prettify the HTML, and to display only the 6th table, which is bizarre HTML
there are tr's nested within td's etc. it's quite bizarre.
Good luck with this one.
PrettifyPage.py (place in the same directory as the other script)
I've added code to better prettify the HTML, and to display only the 6th table, which is bizarre HTML
there are tr's nested within td's etc. it's quite bizarre.
Good luck with this one.
PrettifyPage.py (place in the same directory as the other script)
# PrettifyPage.py
"""Re-indent BeautifulSoup ``prettify()`` output with a wider indent step."""
import pathlib


class PrettifyPage:
    """Reformats ``soup.prettify()`` text so each nesting level is
    ``indent`` spaces deep instead of bs4's fixed one space per level."""

    def prettify(self, soup, indent):
        """Return the re-indented prettified HTML for *soup*.

        soup   : any object exposing a ``prettify()`` method that returns
                 one-space-per-level indented markup (e.g. a bs4 soup/tag).
        indent : desired number of spaces per nesting level.
        """
        # Accumulate lines in a list and join once — the original built the
        # result with repeated str += which is quadratic on large pages.
        pretty_lines = []
        previous_indent = 0
        for line in soup.prettify().split("\n"):
            # bs4 indents one space per level, so the column of the first
            # '<' is the nesting depth of a tag line.
            current_indent = line.find("<")
            # Lines with no tag (-1), or with a '<' far beyond the previous
            # depth (text content containing '<'), count as one level deeper.
            if current_indent == -1 or current_indent > previous_indent + 2:
                current_indent = previous_indent + 1
            previous_indent = current_indent
            pretty_lines.append(self.write_new_line(line, current_indent, indent))
        return "".join(pretty_lines)

    def write_new_line(self, line, current_indent, desired_indent):
        """Return *line*, newline-terminated, left-padded so its existing
        one-space-per-level indent widens to *desired_indent* spaces/level."""
        # Pad with the difference between the target width and the one
        # space per level the line already carries.
        spaces_to_add = (current_indent * desired_indent) - current_indent
        prefix = " " * max(spaces_to_add, 0)
        return prefix + str(line) + "\n"


if __name__ == '__main__':
    # bs4 is only needed when run as a script; importing it here keeps the
    # PrettifyPage class importable without the third-party dependency.
    from bs4 import BeautifulSoup

    pp = PrettifyPage()
    # BUG FIX: the original read `pp.bpath.htmlpath`, but PrettifyPage
    # defines no `bpath` attribute, so this always raised AttributeError.
    # Resolve the HTML file relative to this script instead.
    pfilename = pathlib.Path(__file__).parent / 'BusinessEntityRecordsAA.html'
    with pfilename.open('rb') as fp:
        page = fp.read()
    soup = BeautifulSoup(page, 'lxml')
    pretty = pp.prettify(soup, indent=2)
    print(pretty)
import requests
from bs4 import BeautifulSoup

import PrettifyPage

# 0-based index of the <table> holding the LIBOR rates on the target page.
TABLE_INDEX = 6


def parsepage(page):
    """Pretty-print every <tr> of the rates table found in *page* (HTML bytes)."""
    if not page:
        return
    pp = PrettifyPage.PrettifyPage()
    soup = BeautifulSoup(page, 'lxml')
    tables = soup.find_all('table')
    # BUG FIX: the original indexed find_all('table')[6] and then tested
    # `is not None` — but indexing raises IndexError before that test can
    # run, and find_all()[i] never returns None, so the message branch was
    # unreachable.  Check the table count up front instead.
    if len(tables) <= TABLE_INDEX:
        print("Could not find table")  # fixed typo: "Cound"
        return
    table = tables[TABLE_INDEX]
    for n, tr in enumerate(table.find_all('tr')):
        print(f"\n------------------------------ tr_{n} ------------------------------")
        print(f"{pp.prettify(tr, 2)}")


def get_page(url):
    """Return the response body for *url*, or None on a non-200 status."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    print(f"unable to retrieve {url}")  # fixed typo: "retreive"
    return None  # explicit, instead of falling off the end


def scrape_url(url):
    """Fetch *url* and dump its rates table row by row."""
    parsepage(get_page(url))


if __name__ == '__main__':
    url = 'https://www.global-rates.com/interest-rates/libor/libor.aspx'
    scrape_url(url)
Output:------------------------------ tr_0 ------------------------------
<tr style="height:100%;" valign="top">
<td>
<table cellpadding="0" cellspacing="0" style="height:100%;">
<tr>
<td colspan="4">
<table cellpadding="0" cellspacing="0" style="width:100%;">
<tr>
<td>
<table style="margin:6px 0px 0px 0px;width:100%;">
<tr>
<td align="center">
<script type="text/javascript">
<!--
google_ad_client = "ca-pub-8844689419180727";
/* GR 728x90 positie 1 */
google_ad_slot = "6980580330";
google_ad_width = 728;
google_ad_height = 90;
//-->
</script>
<script src="https://pagead2.googlesyndication.com/pagead/show_ads.js" type="text/javascript">
</script>
</td>
</tr>
</table>
</td>
</tr>
</table>
</td>
</tr>