Aug-14-2020, 02:30 AM
Here's a bit of a parser that uses BeautifulSoup to get you started:
It still needs polishing.
# Sample markup from the page being scraped. The <table> element is left
# unclosed, exactly as in the captured sample.
data = """
<table align="center" border="0" cellpadding="0" cellspacing="0" class="forum_header_border" width="950">
  <tr>
    <td class="section_post_header" colspan="12">
      <h1 style="display: inline;"><u>The String I have Searched for </u> Statictext1 </h1> - <h2 style="display: inline;"><i>Statictext2; Statictext7 The String I have Searched for Statictext3</i></h2>
    </td>
  </tr>
  <tr>
    <td class="forum_thread_header" title="Search Information" width="35">Show</td>
    <td class="forum_thread_header" style="text-align: left; padding-left: 10px;">Item Name</td>
    <td class="forum_thread_header">Column3</td>
    <td class="forum_thread_header">Column4</td>
    <td class="forum_thread_header">Column5</td>
    <td class="forum_thread_header_end">Column6</td>
  </tr>
  <tr class="forum_header_border" name="hover">
    <td align="center" class="forum_thread_post" width="35">
      <a href="/searches/103304/the-string-i-have-searched-for/" title="The String I have Searched for Statictext4"><img alt="Info" border="0" src="/images/sdfsdf_sdfdsfs_info3.png" title="The String I have searched for Statictext5" /></a>
    </td>
    <td class="forum_thread_post">
      <a alt="The String I have Searched for d1f4 [website] (50 MB)" class="searchinfo" href="/si/146wew1729/the-string-i-have-searched-for-sdfsdfs-asdad/" title="The String I have Searched for d1f4 [website] (50 MB)">The String I have Searched for d1f4 [website]</a>
    </td>
    <td align="center" class="forum_thread_post">
      <a class="customlink" href="https://sdfsdfs"></a>
    </td>
    <td align="center" class="forum_thread_post">50 MB</td>
    <td align="center" class="forum_thread_post">1 mo</td>
    <td align="center" class="forum_thread_post_end">
      <font color="green">6</font>
    </td>
  </tr>
"""


# This code is not my own, but can't remember where I found it
def prettify(soup, indent):
    """Return ``soup.prettify()`` re-indented to ``indent`` spaces per level.

    The nesting level of each line is taken from the column of its first
    ``<`` character. Lines without a ``<`` (plain text), and lines whose
    ``<`` sits more than two columns deeper than the previous level, are
    treated as one level below the previous line.

    Args:
        soup: Any object exposing a ``prettify()`` method that returns a
            newline-separated string (e.g. a BeautifulSoup tag).
        indent: Desired number of spaces per nesting level.

    Returns:
        The re-indented markup; every line is terminated by ``"\\n"``.
    """
    rendered = []
    previous_indent = 0
    for line in soup.prettify().split("\n"):
        current_indent = str(line).find("<")
        if current_indent == -1 or current_indent > previous_indent + 2:
            current_indent = previous_indent + 1
        previous_indent = current_indent
        rendered.append(write_new_line(line, current_indent, indent))
    # Join once instead of growing a string with += on every line.
    return "".join(rendered)


def write_new_line(line, current_indent, desired_indent):
    """Return ``line`` padded on the left for the desired indent, plus ``"\\n"``.

    ``line`` already carries ``current_indent`` leading columns, so only the
    difference up to ``current_indent * desired_indent`` is added. When that
    difference is zero or negative the line is returned unpadded.

    Args:
        line: The text of one line, without its trailing newline.
        current_indent: Nesting level (and existing indent width) of ``line``.
        desired_indent: Desired number of spaces per nesting level.

    Returns:
        The padded line followed by a newline character.
    """
    spaces_to_add = (current_indent * desired_indent) - current_indent
    return " " * max(spaces_to_add, 0) + str(line) + "\n"


def parse_html(*, debug=False):
    """Print a one-line summary for every ``<td>`` cell found in ``data``.

    For each cell the first matching rule wins:

    * contains an ``<a>``: prints ``"<link text>: <href>"``
    * contains an ``<h1>``: prints ``"h1: <text>"``
    * contains an ``<h2>``: prints ``"h2: <text>"``
    * contains a ``<font>``: prints ``"font: <text>"``
    * otherwise: prints ``"td text: <text>"``

    Args:
        debug: When true, also print a separator naming the row/cell index
            and the re-indented markup of each cell before its summary.
    """
    # Imported here so the formatting helpers above stay importable (and
    # testable) on machines without the third-party bs4 package installed.
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(data, 'lxml')
    for n, tr in enumerate(soup.find_all('tr')):
        for n1, td in enumerate(tr.find_all('td')):
            if debug:
                print(f"\n---------------------- tr{n}, td{n1} ----------------------")
                print(f"{prettify(td, 2)}")
            if td.a is not None:
                link = td.a.get('href')
                title = td.a.text.strip()
                print(f"{title}: {link}")
            elif td.h1 is not None:
                print(f"h1: {td.h1.text.strip()}")
            elif td.h2 is not None:
                print(f"h2: {td.h2.text.strip()}")
            elif td.font is not None:
                print(f"font: {td.font.text.strip()}")
            else:
                print(f"td text: {td.text.strip()}")


# Only run the demo when executed as a script, not when imported.
if __name__ == "__main__":
    parse_html()

# Produces:
Output:
h1: The String I have Searched for Statictext1
td text: Show
td text: Item Name
td text: Column3
td text: Column4
td text: Column5
td text: Column6
: /searches/103304/the-string-i-have-searched-for/
The String I have Searched for d1f4 [website]: /si/146wew1729/the-string-i-have-searched-for-sdfsdfs-asdad/
: https://sdfsdfs
td text: 50 MB
td text: 1 mo
font: 6
h1: The String I have Searched for Statictext1
td text: Show
td text: Item Name
td text: Column3
td text: Column4
td text: Column5
td text: Column6
: /searches/103304/the-string-i-have-searched-for/
The String I have Searched for d1f4 [website]: /si/146wew1729/the-string-i-have-searched-for-sdfsdfs-asdad/
: https://sdfsdfs
td text: 50 MB
td text: 1 mo
font: 6