Python Forum

Full Version: scraping code misses listings
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hey everyone
I've got this Python scraping code that was made for me; it worked perfectly for a few months, but lately it misses some parameters while scraping.
You can see the missing cells starting from row 94.


[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]

I'll be glad to get some help with that. Thanks :-)

# -*- coding: utf-8 -*-
"""ad.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17lhluDBQi5LVjkQ8kPXNezFnqpEvU17P
"""



import requests
import time
import lxml.html
from bs4 import BeautifulSoup as soup
import pandas as pd

# Search-results URL template.  The rpNNN parameters are the site's saved
# filter values (presumably year/price/mileage ranges -- confirm against the
# site) and the trailing {} is filled with the 1-based page index.
start_url = "https://www.ad.co.il/car?rp264=2022,2022&rp270=5000,120000&rp271=5000,5000&pageindex={}"
# Listing hrefs on the results page are relative; this is their prefix.
base_url = "https://www.ad.co.il"
# First/last page of the crawl.  NOTE(review): END is never used -- the main
# loop below iterates range(START, 100) and breaks when a short page appears.
START, END = (1,8)

def get_gallery_image(tree):
    """Return the absolute URL of the listing's main gallery image.

    tree: parsed HTML document (lxml) of a single listing page.
    Returns "" when the image is absent -- the original returned None here,
    which became a NaN cell in the DataFrame; "" keeps the column consistent
    with the other getters (get_price_metadata, get_contact_info).
    """
    img = tree.cssselect("img.main-gallery-image.justify-content-center")
    if img:
        # src is protocol-relative ("//host/..."), so prepend the scheme.
        img_link = img[0].attrib.get('src')
        return f"https:{img_link}"
    print("Image not found")
    return ""

def get_price_metadata(tree):
    """Extract the two card-title strings (price tag, model) from a listing.

    tree: parsed HTML document (lxml) of a single listing page.
    Returns a 2-tuple; after the reverse, index 0 is the last title on the
    page and index 1 the first.  Missing entries come back as "".
    """
    titles = [
        node.text_content().strip()
        for node in tree.cssselect("div.d-flex.justify-content-between h2.card-title")
    ]
    titles.reverse()
    if len(titles) > 1:
        return titles[0], titles[1]
    if len(titles) == 1:
        return titles[0], ""
    return "", ""

def get_contact_info(Soup):
    """Return (seller_name, seller_phone) from the page's JSON-LD metadata.

    The listing page embeds a <script type="application/ld+json"> block;
    the values live at offers.seller.name and
    offers.seller.contactPoint.telephone.  Any missing or malformed piece
    yields "" for that field.

    BUG FIX: the original read offers["seller"] outside any try/except, so
    a JSON-LD block without a "seller" key raised KeyError and killed the
    whole crawl; the bare excepts elsewhere also hid real errors.
    """
    contact_name = ""
    contact_num = ""
    script_tag = Soup.find("script", attrs={"type": "application/ld+json"})
    if script_tag:
        try:
            data = json.loads(script_tag.string)
        except (TypeError, ValueError):
            # Empty tag or malformed JSON: nothing usable.
            return contact_name, contact_num
        offers = data.get("offers") if isinstance(data, dict) else None
        seller = offers.get("seller") if isinstance(offers, dict) else None
        if isinstance(seller, dict):
            contact_name = seller.get("name", "") or ""
            contact = seller.get("contactPoint")
            if isinstance(contact, dict):
                contact_num = contact.get("telephone", "") or ""
    return contact_name, contact_num

def scrap_tables(Soup):
    """Return the first two <table> elements of the page (None when absent).

    BUG FIX: the original declared second_table but never assigned it from
    the table list, so create_df_row silently dropped every row of the
    second spec table -- the likely cause of the "missing cells" symptom.
    """
    tables = Soup.findAll("table")
    first_table = tables[0] if len(tables) > 0 else None
    second_table = tables[1] if len(tables) > 1 else None
    if first_table is None:
        print("No table found.")
    return first_table, second_table

def create_df_row(link, gallary_image, price_tag, price, contact_tag, contact_num, first_table, second_table,AgencyCar,dic):
    """Assemble one output row (a plain dict) for a single car listing.

    Fixed columns first, then the free-form key/value pairs scraped into
    *dic*, then one column per key/value <tr> found in the two spec tables.
    NOTE(review): "car_model" is filled from *price* and "price" from
    *price_tag* -- the naming looks swapped upstream; preserved as-is.
    """
    row = {
        "link": link,
        "img": gallary_image,
        "car_model": price,
        "price": price_tag,
        "contact_name": contact_tag,
        "contact_num": contact_num,
        "newListing": True,
        "isItAvailable": True,
        "agency car": AgencyCar,
    }
    row.update(dic)
    # Each table row is a (label, value) pair of <td> cells.
    for table in (first_table, second_table):
        if table is None:
            continue
        for table_row in table.findAll("tr"):
            cells = table_row.findAll('td')
            row[cells[0].text.strip()] = cells[1].text.strip()
    return row

# Load last run's spreadsheet so existing listings can be updated in place
# (price changes, availability) instead of re-scraped.
df=pd.read_excel("output.xlsx")
final=df.to_dict('records')  # NOTE(review): `final` is never used again

# Reset per-run flags: every pre-existing row starts as "not new" and
# "not available"; the crawl loop flips these as listings are re-found.
df["newListing"]=['']*df.shape[0]
df["isItAvailable"]=[False]*df.shape[0]

print(df.shape)

# Known listing URLs, used for the duplicate check inside the crawl loop.
links=list(df["link"])

#df = pd.DataFrame()
import json

# Crawl the paginated search results.  A full page holds 48 cards; the first
# short page marks the end of the results, so 100 is only a safety bound.
# NOTE(review): END (defined at the top) is unused -- confirm whether
# range(START, END) was intended.
for page_index in range(START, 100):
    page_url = start_url.format(page_index)
    print(page_url)
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"invalid status: {response.status_code} for {page_url}")
        continue
    page_soup = soup(response.text, "html.parser")
    cards_div = page_soup.find("div", attrs={"id": "cards"})
    car_cards = cards_div.findAll("div", attrs={"class": "card overflow-hidden"})
    print(len(car_cards))
    for card in car_cards:
        href = card.find("a").get("href")
        if not href:
            print("No car link found!")
            continue
        car_link = f"{base_url}{href}"
        print(car_link)
        if car_link in links:
            # Already known from a previous run: refresh price/availability
            # in place instead of re-scraping the detail page.
            print("already")
            ind = links.index(car_link)
            try:
                oldprice = df["price"][ind].strip()
            except (AttributeError, KeyError):
                # NaN (float) or missing column: treat as no previous price.
                oldprice = ''
            oldprice1 = df["oldPrice"][ind]
            curprice = card.find("div", attrs={"class": "price"}).text.strip()
            if oldprice != curprice:
                # Extend the "old - new" price-history chain.  Use .loc:
                # chained assignment (df["col"][ind] = ...) can silently
                # write to a copy and lose the update.
                if pd.isna(oldprice1):
                    df.loc[ind, "oldPrice"] = oldprice + " - " + curprice
                else:
                    df.loc[ind, "oldPrice"] = oldprice1 + " - " + curprice
                df.loc[ind, "price"] = curprice
            df.loc[ind, "newListing"] = ''
            df.loc[ind, "isItAvailable"] = ''
            continue
        # New listing: fetch and parse the detail page.
        car_html = requests.get(car_link).text
        detail_soup = soup(car_html, 'html.parser')
        detail_tree = lxml.html.fromstring(car_html)
        gallary_image = get_gallery_image(detail_tree)
        price_tag, price = get_price_metadata(detail_tree)
        contact_tag, contact_num = get_contact_info(detail_soup)
        first_table, second_table = scrap_tables(detail_soup)
        # A highlighted banner div marks dealer ("agency") listings.
        agency_div = detail_soup.find("div", attrs={"class": "px-3 text-primary font-weight-bold"})
        AgencyCar = "yes" if agency_div else "no"
        dic = {}
        desc = detail_soup.find("p", attrs={"class": "text-word-break"})
        if desc:
            dic["Description"] = desc.text.strip()
        # "key: value" attribute chips scattered over the page; skip chips
        # with no colon instead of relying on a bare except.
        for chip in detail_soup.findAll("div", attrs={"class": "px-3"}):
            parts = chip.text.strip().split(":")
            if len(parts) > 1:
                dic[parts[0].strip()] = parts[1].strip()
        extra = detail_soup.find("div", attrs={"class": "d-inline-flex text-end fs--1"})
        if extra:
            try:
                val = extra.find("a").text.strip()
                dic[extra.text.strip().replace(val, "")] = val
            except AttributeError:
                # No <a> inside the div.
                pass
        row = create_df_row(car_link, gallary_image, price_tag, price, contact_tag,
                            contact_num, first_table, second_table, AgencyCar, dic)
        # DataFrame.append was removed in pandas 2.0; concat a one-row frame.
        df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)
    if len(car_cards) < 48:
        break

# Spec tables differ between listings, so concat leaves NaNs in columns a
# given car doesn't have; blank them for a clean spreadsheet.
df = df.fillna("")

# Overwrite the workbook that seeds the next run.
# NOTE(review): the original computed a datetime string here but never used
# it (dead code, removed); add it to the filename if dated backups are wanted.
df.to_excel("output.xlsx",index=False)