"""Scrape the Miami-Dade lobbyist-registration list and print one flat row per
registration, expanding cells that carry rowspan="N" so that every output row
has the full column count."""
import requests
from bs4 import BeautifulSoup, SoupStrainer
import pandas as pd

# Registration list for the period 07/01/2019 - 07/05/2019.
URL = ('https://www8.miamidade.gov/Apps/COB/LobbyistOnline/Views/Queries/'
       'Registration_ByPeriod_List.aspx'
       '?startdate=07%2f01%2f2019&enddate=07%2f05%2f2019')
# id of the ASP.NET GridView that holds the registrations.
TABLE_ID = 'ctl00_mainContentPlaceHolder_gvLobbyistRegList'


def _expand_rowspans(table):
    """Yield one list of <td> Tags per data row of *table*.

    A cell with rowspan="N" is re-inserted, at its original column position,
    into the N-1 following (shorter) rows, so every yielded list has the same
    number of columns.  The first <tr> (header row) is skipped.
    """
    # One slot per column: None, or {"rows_left": n, "value": Tag} for a
    # rowspan cell that must still be repeated in n upcoming rows.
    pending = []
    for row in table.findAll('tr')[1:]:  # [1:] skips the header row
        cells = row.findAll('td')
        if not pending:
            # The first data row fixes the table width.
            pending = [None] * len(cells)
        if len(cells) != len(pending):
            # Short row: splice the cached rowspan cells back in.
            for col, cached in enumerate(pending):
                if cached is not None:
                    cells.insert(col, cached["value"])
                    if cached["rows_left"] == 1:
                        pending[col] = None       # rowspan exhausted
                    else:
                        cached["rows_left"] -= 1  # bugfix: original did -= 3
        else:
            # Full-width row: remember rowspan cells for the rows below.
            # Registering ONLY on full rows fixes the original bug where a
            # cached cell spliced into a short row still had its rowspan
            # attribute and was re-registered, restarting its countdown.
            for col, cell in enumerate(cells):
                span = int(cell["rowspan"]) if cell.has_attr("rowspan") else 1
                # span - 1: this row already displays the cell itself.
                pending[col] = (
                    {"rows_left": span - 1, "value": cell} if span > 1 else None
                )
        yield cells


def main():
    """Fetch the page, locate the registration table, and print its rows."""
    response = requests.get(URL)
    response.raise_for_status()  # fail loudly on HTTP errors
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find('table', attrs={'id': TABLE_ID})
    if table is None:
        raise ValueError('registration table %r not found in page' % TABLE_ID)
    for cells in _expand_rowspans(table):
        print([cell.get_text(strip=True) for cell in cells])


if __name__ == '__main__':
    main()
Formatting Output after Web Scrape
Messages In This Thread |
Formatting Output after Web Scrape - by yoitspython - Jul-30-2019, 07:26 PM
RE: Formatting Output after Web Scrape - by cvsae - Jul-30-2019, 08:34 PM
RE: Formatting Output after Web Scrape - by yoitspython - Jul-30-2019, 08:39 PM
|
Possibly Related Threads… | |||||
Thread | Author | Replies | Views | Last Post | |
Scrape for html based on url string and output into csv | dana | 13 | 5,477 |
Jan-13-2021, 03:52 PM Last Post: snippsat |
|
scrape data 1 go to next page scrape data 2 and so on | alkaline3 | 6 | 5,203 |
Mar-13-2020, 07:59 PM Last Post: alkaline3 |
|
Formatting Output After Web Scraping | yoitspython | 3 | 2,921 |
Aug-01-2019, 01:22 PM Last Post: snippsat |
|
Problem formatting output text | aj347 | 5 | 4,147 |
Sep-10-2017, 04:54 PM Last Post: nilamo |
Users browsing this thread: 2 Guest(s)