Hi Everyone, I am very new to this but managed to last night get a succesful scrape using the code below. However, I must have an error somewhere in the code as while the code creates a csv file the results of the scrape don't end up in that file; they just stay in the terminal on my MAC. I am using python 3.6.2
Can anyone with more experience let me know which code I need to change to fix this?
from collections import deque
from urllib.parse import urljoin
import requests
import csv
b =open('bars1.csv','a')
a =csv.writer(b)
data = [['sales-info', 'email-business']]
a.writerows(data)
b.close()
from lxml import html
class YellowPage:
main_url = "https://www.yellowpages.com/search?search_terms=Bars&geo_location_terms=Miami%2C+FL"
def __init__(self):
self.links = [self.main_url]
self.storage =
def crawl(self):
for link in self.links :
self.get_link(link)
def get_link(self, link):
print('Scraping Now: ' + link)
url = "https://www.yellowpages.com"
response = requests.get(link)
tree = html.fromstring(response.text)
#scraping links of each bars
for items in tree.xpath("//div[@class='info']"):
link_page = items.xpath(".//a[@class='business-name'][not(@itemprop='name')]/@href")
for page in link_page:
if page and url + page not in self.links:
self.links += [url + page]
#parsing the links to the next page
next_page = tree.xpath("//div[@class='pagination']//li/a/@href")
for nepage in next_page:
if nepage and url + nepage not in self.links:
self.links += [url + nepage]
#going to the main page of each bars and harvest the record
for posts in tree.xpath("//*[@id='main-header']"):
name = posts.xpath(".//div[@class='sales-info']/h1/text()")[0] if posts.xpath(".//div[@class='sales-info']/h1/text()") else ""
email = posts.xpath(".//a[@class='email-business']/@href")[0] if posts.xpath(".//a[@class='email-business']/@href") else ""
records = name, email
self.storage.append(records)
def __str__(self):
return self.storage
crawler = YellowPage()
crawler.crawl()
for item in crawler.storage:
print(item)
Can anyone with more experience let me know which code I need to change to fix this?
from collections import deque
from urllib.parse import urljoin
import requests
import csv
b =open('bars1.csv','a')
a =csv.writer(b)
data = [['sales-info', 'email-business']]
a.writerows(data)
b.close()
from lxml import html
class YellowPage:
main_url = "https://www.yellowpages.com/search?search_terms=Bars&geo_location_terms=Miami%2C+FL"
def __init__(self):
self.links = [self.main_url]
self.storage =
def crawl(self):
for link in self.links :
self.get_link(link)
def get_link(self, link):
print('Scraping Now: ' + link)
url = "https://www.yellowpages.com"
response = requests.get(link)
tree = html.fromstring(response.text)
#scraping links of each bars
for items in tree.xpath("//div[@class='info']"):
link_page = items.xpath(".//a[@class='business-name'][not(@itemprop='name')]/@href")
for page in link_page:
if page and url + page not in self.links:
self.links += [url + page]
#parsing the links to the next page
next_page = tree.xpath("//div[@class='pagination']//li/a/@href")
for nepage in next_page:
if nepage and url + nepage not in self.links:
self.links += [url + nepage]
#going to the main page of each bars and harvest the record
for posts in tree.xpath("//*[@id='main-header']"):
name = posts.xpath(".//div[@class='sales-info']/h1/text()")[0] if posts.xpath(".//div[@class='sales-info']/h1/text()") else ""
email = posts.xpath(".//a[@class='email-business']/@href")[0] if posts.xpath(".//a[@class='email-business']/@href") else ""
records = name, email
self.storage.append(records)
def __str__(self):
return self.storage
crawler = YellowPage()
crawler.crawl()
for item in crawler.storage:
print(item)