Jan-30-2019, 08:35 AM
(Jan-28-2019, 12:39 PM)Stoss Wrote: Do you have a final version of the code you can share with me?
Thanks!
Sure, but it stopped working after I had used it for about two weeks. I only needed it temporarily, so I didn't look into how to get it working again.
import csv
import re

import requests
from bs4 import BeautifulSoup

OUTPUT_FILE = 'output.csv'
# Seconds to wait for the server before giving up, so a stalled connection
# cannot hang the scraper forever.
REQUEST_TIMEOUT = 30

# Start every run with an empty output file; fundaSpider() only appends to it.
open(OUTPUT_FILE, 'w').close()


def _fetch_soup(url):
    """Download *url* and return its HTML parsed into a BeautifulSoup tree."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'html.parser')


def _parse_area(soup):
    """Return the link text of the third breadcrumb item, or None if absent."""
    crumbs = soup.find_all('li', {'class': 'breadcrumb-listitem'})
    try:
        return crumbs[2].a.text
    except (AttributeError, IndexError):
        # Fewer than three breadcrumbs, or the third one holds no <a> tag.
        return None


def _parse_since(soup):
    """Return the value listed under 'Aangeboden sinds', or None if absent."""
    details = soup.find('dl', {'class': 'object-kenmerken-list'})
    try:
        # 'Aangeboden sinds' is Dutch for 'Offered since'. Newer BeautifulSoup
        # releases call this keyword `string`; `text` is kept for
        # compatibility with the version the script was written against.
        label = details.find('dt', text='Aangeboden sinds')
        return label.find_next_sibling('dd').text.strip()
    except AttributeError:
        # The list, the label, or its <dd> sibling is missing from the page.
        return None


def _parse_status(soup):
    """Return the first <li> text found in a 'labels' list, or None."""
    for label_list in soup.find_all('ul', {'class': 'labels'}):
        item = label_list.find('li')
        if item is not None:
            return item.text.strip()
    return None


def _parse_ad(ad):
    """Extract the fields available on the search-result page for one ad.

    Returns a tuple ``(title, address, street, price, size, room, href)`` of
    strings. ``size`` and ``room`` fall back to ``'Unknown'`` when their list
    item is missing; ``href`` is ``''`` when the ad has no usable third link.
    """
    # Split the heading on whitespace and drop its last three tokens.
    heading_words = ad.find('h3').get_text(separator='\n', strip=True).split()
    title = ' '.join(heading_words[:-3])

    # Drop the last space-separated token, then any digits left at the end.
    street = re.sub(r'\d+$', '', title.rpartition(' ')[0])

    address = ad.find('small').text.strip()

    price_box = ad.find(
        'div', {'class': 'search-result-info search-result-info-price'})
    # Keep only the digits of the displayed price.
    price = ''.join(re.findall(r'\d', price_box.find('span').text.strip()))

    features = ad.find('ul', {'class': 'search-result-kenmerken'}).find_all('li')
    # Only the first space-separated token of each feature is kept.
    if features:
        size = features[0].get_text(strip=True).split(" ")[0]
    else:
        size = 'Unknown'
    if len(features) > 1:
        room = features[1].text.strip().split(" ")[0]
    else:
        room = 'Unknown'

    try:
        href = 'http://www.funda.nl' + ad.find_all('a')[2]['href']
    except (IndexError, KeyError):
        # No third link (or it has no href): signal "no detail page" rather
        # than leaving href unbound or reusing the previous ad's URL.
        href = ''

    return title, address, street, price, size, room, href


def fundaSpider(max_pages):
    """Scrape search-result pages 1..max_pages and append rows to output.csv.

    For every ad on each page the listing fields are read from the search
    result itself, and area / since / status are read from the ad's detail
    page, which is downloaded once. Each row is printed and written as
    ``title,address,street,price,size,room,area,since,status,href``.
    Detail fields that cannot be found are written as the string ``'None'``.

    Args:
        max_pages: Number of the last result page to scrape (pages start at 1).
    """
    # newline='' lets the csv module control line endings; '\n' matches the
    # line terminator the file has always used.
    with open(OUTPUT_FILE, 'a', newline='', encoding='utf-8') as out_file:
        # csv.writer quotes fields containing commas or quotes, so such
        # values can no longer shift the columns.
        writer = csv.writer(out_file, lineterminator='\n')

        for page in range(1, max_pages + 1):
            url = 'http://www.funda.nl/koop/rotterdam/p{}'.format(page)
            results = _fetch_soup(url)

            for ad in results.find_all('li', {'class': 'search-result'}):
                title, address, street, price, size, room, href = _parse_ad(ad)

                if href:
                    # One download serves all three detail lookups.
                    detail = _fetch_soup(href)
                    area = _parse_area(detail) or 'None'
                    since = _parse_since(detail) or 'None'
                    status = _parse_status(detail) or 'None'
                else:
                    area = since = status = 'None'

                row = [title, address, street, price, size, room,
                       area, since, status, href]
                print(','.join(row))
                writer.writerow(row)


def get_single_item_data(item_url):
    """Fetch *item_url* and return its third breadcrumb's link text, or None."""
    return _parse_area(_fetch_soup(item_url))


def get_single_item_data_2(item_url):
    """Fetch *item_url* and return its 'Aangeboden sinds' value, or None."""
    return _parse_since(_fetch_soup(item_url))


def get_single_item_data_3(item_url):
    """Fetch *item_url* and return its first 'labels' list item text, or None."""
    return _parse_status(_fetch_soup(item_url))


# Script entry point: scrape the first two result pages.
fundaSpider(2)