Jul-31-2021, 09:10 AM
Hello,
I am trying to scrape a real estate site in Greece for multiple locations in Athens. The site uses some kind of protection, so I always send cookies and headers. (I know I must rotate the user agent so that I will not be blocked; that will be the next step, because for now I must always change my IP to scrape. Sorry, I am still a newbie with Scrapy.) Each page of the site shows 10 flats. The loop iterates correctly over the first 4, and afterwards it returns [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.spitogatos.gr/aggelia/1111057856> (referer: https://www.spitogatos.gr/pwliseis-katoi...-proastia/), which puzzles me.
I am trying to scrape a real estate site in Greece for multiple locations in Athens. The site uses some kind of protection, so I always send cookies and headers. (I know I must rotate the user agent so that I will not be blocked; that will be the next step, because for now I must always change my IP to scrape. Sorry, I am still a newbie with Scrapy.) Each page of the site shows 10 flats. The loop iterates correctly over the first 4, and afterwards it returns [scrapy.core.engine] DEBUG: Crawled (200) <GET https://www.spitogatos.gr/aggelia/1111057856> (referer: https://www.spitogatos.gr/pwliseis-katoi...-proastia/), which puzzles me.
import scrapy


class MainprojectSpider(scrapy.Spider):
    """Scrape property listings from spitogatos.gr (Athens, southern suburbs).

    The spider requests one search-results page, follows the link found in
    every listing tile on it, and yields one item per listing detail page.
    Every request carries the browser-like ``headers`` and ``cookies``
    defined below.
    """

    name = 'mainProject'
    allowed_domains = ['www.spitogatos.gr']

    # Candidate user-agent strings. Nothing in this class reads ``dummies``
    # or ``user_agent_list``; they are kept for the planned UA rotation.
    dummies = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    ]
    user_agent_list = [
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.1.1 Safari/605.1.15',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36',
    ]

    # NOTE(review): as a plain class attribute this dict is not picked up by
    # Scrapy; per-spider settings are read from ``custom_settings`` (or the
    # project's settings.py). Move it there, together with a ``USER_AGENTS``
    # list, once the scrapy_useragents package is confirmed to be installed.
    DOWNLOADER_MIDDLEWARES = {
        'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,
        'scrapy_useragents.downloadermiddlewares.useragents.UserAgentsMiddleware': 500,
    }

    # Browser-like request headers sent with every request.
    headers = {
        "authority": "www.spitogatos.gr",
        "pragma": "no-cache",
        "cache-control": "no-cache",
        "sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"",
        "sec-ch-ua-mobile": "?0",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "sec-fetch-site": "same-origin",
        "sec-fetch-mode": "navigate",
        "sec-fetch-user": "?1",
        "sec-fetch-dest": "document",
        "accept-language": "en-GB,en;q=0.9,el-GR;q=0.8,el;q=0.7,en-US;q=0.6",
    }

    # NOTE(review): these look like cookies copied from a live browser
    # session (session id, analytics ids, anti-bot token). They presumably
    # expire, and should not be committed to shared code -- confirm and
    # consider loading them from configuration instead.
    cookies = {
        "PHPSESSID": "946hl5lpbcf6jalm271g2450u8",
        "spitogatosHomepageMap": "0",
        "currentCurrency": "EUR",
        "_ga": "GA1.2.1080968683.1626031679",
        "_gid": "GA1.2.1871094347.1627582033",
        "_hjTLDTest": "1",
        "_hjid": "66a31a57-9c80-47f1-8767-9bca91d47b1f",
        "_fbp": "fb.1.1626031679317.910543448",
        "__qca": "P0-1840509725-1625417320200",
        "_hjAbsoluteSessionInProgress": "0",
        "openedTabs": "1",
        "_gat_UA-3455846-10": "1",
        "_gat_UA-3455846-2": "1",
        "_hjIncludedInSessionSample": "1",
        "reese84": "3:5Stu7C2tUWBSIdWSdzMOSQ==:ppmbagk94sf6IvvS66908AApyTMfCE+K7i7PgkeyRs6C9VGkCcqBSz8ZsgbOx56c46ktjL+1iyfp8zL1PuiT7AUsmA9XLdcmMoDQm30MnEPgcbQl/dMQV1PgqtVJgWVwGabZlMhGM+T6D8zf5ENVuGhLJ81U74a+gr+GySA5Xx/CqUPcGa/YG2zNICEMnZN7D4bRwJq6vxEvOU+wbSfAE6OquI4ipeHR3dz8jBwY961ka2PfY7MoLCLeGdzPUu07yOxv41lvdcZbaj9/peyxLnLSFqD9QnV5MXsXy7mKE3eNoT46F/ITB8/GAVpc/zqW792F+7HuUkWJD/pWaNOsr6+rc75kpKw15xtN5oCw9Qh3Fw9SYUtfbFMTRXBrUt0Ow/Lv2C3oOLBQyVex80cr76c4ibxS/niuNvKA87f7XZc=:THUtE26ivNhlKtaznqNuX7swpAf4x5S8pF+xoBg5KwE=",
    }

    def start_requests(self):
        """Yield the initial search-page request.

        ``start_requests`` is used instead of ``start_urls`` so that the
        custom headers and cookies are attached to the very first request.
        """
        url = 'https://www.spitogatos.gr/pwliseis-katoikies/athina-notia-proastia/'
        yield scrapy.Request(
            url=url,
            method='GET',
            cookies=self.cookies,
            headers=self.headers,
            callback=self.parse,
        )

    def parse(self, response):
        """Follow every listing link found on a search-results page.

        Yields one request per listing tile that has a link; each request is
        handled by ``parse_housing``.
        """
        prices = response.xpath(
            '//span[@class="tile-v5-sr__details-price"]/text()'
        ).getall()
        houses = response.xpath('//h4[@class="tile-v5-sr__title"]')
        self.logger.debug("prices: %s", prices)
        self.logger.debug("houses: %d", len(houses))

        for house in houses:
            link = house.xpath('.//@href').get()
            if not link:
                # A tile without an href cannot be followed; passing None to
                # response.follow would raise and abort the whole callback.
                self.logger.warning(
                    "listing tile without a link on %s", response.url
                )
                continue
            self.logger.debug("following %s", link)
            yield response.follow(
                url=link,
                callback=self.parse_housing,
                meta={'description': "name"},
                headers=self.headers,
                cookies=self.cookies,
            )

    def parse_housing(self, response):
        """Extract the detail lines of a single listing page.

        Yields a dict item. Scrapy accepts only requests, items (dict/Item)
        or None from a callback, so the extracted strings are wrapped in a
        dict rather than yielded as a bare list.
        """
        yield {
            'url': response.url,
            'description': response.meta.get('description'),
            'details': response.xpath(
                './/h6[@class="line__text nowrap"]/text()'
            ).getall(),
        }