Hi,
I am learning BeautifulSoup to scrape data, but whenever I run my code it raises an AttributeError suggesting that I called find_all() on a result where I should have used find() (or vice versa).
this is my code below:
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Scrape the Santa Clara County facility-closure table and print, for each
# <td> cell, the list of links it contains and its visible text.
url = "https://services.sccgov.org/facilityinspection/Closure/Index?sortField=sortbyEDate"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")

# find() returns a single Tag (or None when nothing matches);
# find_all() returns a LIST of Tags. Calling find_all() on that list —
# instead of on one Tag from it — is what raises the AttributeError.
tble = soup.find("table", {"class": "table text-left table-responsive"})
if tble is None:
    # Guard: without this, tble.find_all(...) below would raise
    # AttributeError: 'NoneType' object has no attribute 'find_all'.
    raise SystemExit("Closure table not found - page layout may have changed")

for cell in tble.find_all("td"):   # iterate the list; each `cell` is a Tag
    links = cell.find_all("a")     # valid: called on a Tag, not on a list
    contents = cell.get_text()
    print(links)
    print(contents)
First I kept find_all("a") outside the loop and it gave the error above. Then I tried calling it on the list itself, as in `for i in getname.find_all("a")`, and it gave the same error. But when I call it on each item `i` inside the loop, it prints results (the results still need more work, but it prints). Why?
I have searched everywhere to understand how to use find() and find_all() without the AttributeError. Can someone explain what is going on and how to use find() and find_all() correctly?
I think I found the solution. I tried the same approach on a different website and wrote up my observations below — please correct me if I am wrong.
from bs4 import BeautifulSoup
from urllib.request import urlopen

# Demonstrates the find() / find_all() contract:
#   soup.find(tag)      -> the FIRST matching Tag, or None
#   soup.find_all(tag)  -> a LIST of every matching Tag; index or loop over it
url = "http://www.labormarketinfo.edd.ca.gov/majorer/countymajorer.asp?CountyCode=000001"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")

# The page contains 4 tables; find_all("table")[1] selects the second one.
# find("table") would always return the first table — it takes no index.
get_table = soup.find_all("table")[1]

# Nested loops, one per level of the HTML structure:
#   table -> rows (<tr>) -> cells (<td>) -> links (<a>)
# Each level uses its own loop variable; reusing one name (e.g. `i`) at every
# level shadows the outer value and makes the code hard to follow.
for row in get_table.find_all("tr"):         # every row in the chosen table
    for cell in row.find_all("td"):          # every cell in that row
        for anchor in cell.find_all("a"):    # every link in that cell
            content = anchor.get_text()      # the link's visible text
            href = anchor.get("href")        # the link's target URL
            print(href)
            print(content)
# The loops simply walk each level until it is exhausted —
# more practice will make this clearer.
tble = soup.find("table", {"class":"table text-left table-responsive"})
This contains all the info, so you can loop over it like this:
from bs4 import BeautifulSoup
from urllib.request import urlopen   # Python 3 location (urllib.urlopen is Python 2)

# Print every absolute (http...) link found inside the closure table.
# NOTE: `url` must be defined BEFORE urlopen(url) is called — the original
# snippet opened the URL first and assigned it afterwards (NameError).
url = "https://services.sccgov.org/facilityinspection/Closure/Index?sortField=sortbyEDate"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")

tble = soup.find("table", {"class": "table text-left table-responsive"})
for link in tble.find_all('a'):
    # Default to "" so a link without an href cannot crash startswith().
    href = link.get('href', '')
    if href.startswith('http'):
        print(href)
        # print(link.text)
Output:
http://maps.google.com/maps?daddr=228%20BARBER%20CT%20MILPITAS,%20CA%2095035
http://maps.google.com/maps?daddr=228%20BARBER%20CT%20MILPITAS,%20CA%2095035
http://maps.google.com/maps?daddr=333%20SANTANA%20ROW%201100%20SAN%20JOSE,%20CA%2095128
http://maps.google.com/maps?daddr=333%20SANTANA%20ROW%201100%20SAN%20JOSE,%20CA%2095128
http://maps.google.com/maps?daddr=4040%20MONTEREY%20RD%20SAN%20JOSE,%20CA%2095111
http://maps.google.com/maps?daddr=4040%20MONTEREY%20RD%20SAN%20JOSE,%20CA%2095111
http://maps.google.com/maps?daddr=306%20WILLOW%20ST%20%20SAN%20JOSE,%20CA%2095110
http://maps.google.com/maps?daddr=306%20WILLOW%20ST%20%20SAN%20JOSE,%20CA%2095110
.......
But this can get messy if you are going to save the data as in your last posts.
Figure how to format one
tr
tag,before any looping.
Here is a quick test with a single
tr
tag. Figure out what you need, format the output for that one tag, and then loop over all of them.
>>> tble = soup.find("table", {"class":"table text-left table-responsive"})
>>> tr = tble.find_all('tr')[1]
>>> tr
<tr>
<td class="text-left"><strong><a href="/facilityinspection/Home/ShowHistory/PR0379302">HIT</a></strong><br/>
<a href="http://maps.google.com/maps?daddr=228%20BARBER%20CT%20MILPITAS,%20CA%2095035" target="_blank">228 BARBER CT<br/>MILPITAS, CA 95035</a>
<a href="http://maps.google.com/maps?daddr=228%20BARBER%20CT%20MILPITAS,%20CA%2095035" target="_blank"><i class="fa fa-map-marker fa-lg" style="color: #912D25; border-style: none none none none;"></i></a></td>
<td class="text-left">Sep. 15, 2017<br/>Failure to maintain a valid operating permit.
</td>
<td class="text-left"></td>
</tr>
>>> # There are 3 link adress
>>> tr.find_all('a')[0].get('href')
u'/facilityinspection/Home/ShowHistory/PR0379302'
>>> tr.find_all('a')[1].get('href')
u'http://maps.google.com/maps?daddr=228%20BARBER%20CT%20MILPITAS,%20CA%2095035'
>>> tr.find_all('a')[2].get('href')
u'http://maps.google.com/maps?daddr=228%20BARBER%20CT%20MILPITAS,%20CA%2095035'
>>> # There are two text object
>>> tr.find_all('td', class_="text-left")[0].text
u'HIT\n228 BARBER CTMILPITAS, CA 95035 '
>>> tr.find_all('td', class_="text-left")[1].text
u'Sep. 15, 2017Failure to maintain a valid operating permit.\r\n'
Hi,
I wrote this code and it prints everything except the text in the second column. I tried your code above but was not able to loop with it, so I tried every way I could think of, but I am still unable to get the second column's text, or to get the first column's text into the CSV.
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv

# Scrape the closure table and write one CSV row per table row, matching the
# header: Link, Restaurant Name, Gmapslink, Address, Gmapslink, Closuredate,
# reopendate.
url = "https://services.sccgov.org/facilityinspection/Closure/Index?sortField=sortbyEDate"
html = urlopen(url)
soup = BeautifulSoup(html, "html.parser")
tble = soup.find("table", {"class": "table text-left table-responsive"})

# `with` guarantees the file is closed even on error (the original never
# closed it, so the last buffered rows could be lost).
with open('Mynew.csv', 'w', newline='') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["Link", "Restaurant Name", "Gmapslink", "Address",
                     "Gmapslink", "Closuredate", "reopendate"])

    for row in tble.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) != 3:          # skip the header / malformed rows
            continue

        # Each data row carries up to three <a> tags:
        #   [0] history link (text = restaurant name)
        #   [1] Google-Maps link (text = address)
        #   [2] Google-Maps link again (map-marker icon, no text)
        # The original wrote the row INSIDE the innermost loop, so only the
        # last link/text pair survived into each CSV row.
        anchors = row.find_all("a")

        def _href(idx):
            # href of anchor idx, or "" when the row has fewer links.
            return anchors[idx].get("href", "") if len(anchors) > idx else ""

        def _text(idx):
            return anchors[idx].get_text(strip=True) if len(anchors) > idx else ""

        # The date columns are plain text, not links.
        closuredate = cells[1].find(text=True)
        reopen_date = cells[2].find(text=True)
        print(_href(0), _text(0))
        print(closuredate, reopen_date)

        writer.writerow([_href(0), _text(0), _href(1), _text(1), _href(2),
                         closuredate, reopen_date])