Python Forum
Thread Rating:
  • 0 Vote(s) - 0 Average
  • 1
  • 2
  • 3
  • 4
  • 5
scraping code misses listings
#1
Hey everyone
I've got this Python scraping code that was made for me. It worked perfectly for a few months, but lately it misses some parameters while scraping.
You can see the missing cells starting from row 94.


[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]

i'll be glad to get some help here with that. thanks :-)

# -*- coding: utf-8 -*-
"""ad.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17lhluDBQi5LVjkQ8kPXNezFnqpEvU17P
"""



import requests
import time
import lxml.html
from bs4 import BeautifulSoup as soup
import pandas as pd

start_url = "https://www.ad.co.il/car?rp264=2022,2022&rp270=5000,120000&rp271=5000,5000&pageindex={}"
base_url = "https://www.ad.co.il"
START, END = (1,8)

def get_gallery_image(tree):
    """Return the absolute URL of the listing's main gallery image.

    Parameters:
        tree: parsed lxml HTML element supporting ``.cssselect()``.

    Returns:
        The image URL as a string, or "" when no gallery image (or src
        attribute) is found — consistent with the other extractors in
        this file, which return "" for missing values.
    """
    matches = tree.cssselect("img.main-gallery-image.justify-content-center")
    if not matches:
        print("Image not found")
        return ""
    src = matches[0].attrib.get('src')
    if not src:
        print("Image not found")
        return ""
    # The site serves protocol-relative links ("//host/path"); only
    # prepend the scheme when it is actually missing so absolute URLs
    # are not corrupted into "https:https://...".
    if src.startswith("http"):
        return src
    return f"https:{src}"

def get_price_metadata(tree):
    """Extract the price and car-model headings from a listing page.

    Selects the ``h2.card-title`` headings, strips them, and returns them
    in reverse document order as a (price_tag, model) pair.  Missing
    values come back as "".
    """
    titles = [
        node.text_content().strip()
        for node in tree.cssselect("div.d-flex.justify-content-between h2.card-title")
    ]
    titles.reverse()
    if not titles:
        return "", ""
    if len(titles) == 1:
        return titles[0], ""
    return titles[0], titles[1]

def get_contact_info(Soup):
    """Extract the seller's name and phone number from a listing page.

    Reads the page's ``application/ld+json`` structured-data block and
    walks offers -> seller -> name / contactPoint.telephone.

    Parameters:
        Soup: a BeautifulSoup document for the listing page.

    Returns:
        (contact_name, contact_num) as strings; "" for anything missing.
    """
    # Local import: the original relied on a module-level `import json`
    # that only appears later in the file; this keeps the function
    # self-contained.
    import json

    contact_name = ""
    contact_num = ""
    script = Soup.find("script", attrs={"type": "application/ld+json"})
    # `script.string` can be None for an empty tag; json.loads(None)
    # would raise TypeError, so guard both cases up front.
    if script is None or not script.string:
        return contact_name, contact_num
    try:
        data = json.loads(script.string)
    except (json.JSONDecodeError, TypeError):
        return contact_name, contact_num
    offers = data.get("offers")
    if not isinstance(offers, dict):
        return contact_name, contact_num
    # The original indexed offers["seller"] unprotected and crashed with
    # KeyError when the seller block was absent.
    seller = offers.get("seller")
    if not isinstance(seller, dict):
        return contact_name, contact_num
    contact_name = seller.get("name", "")
    try:
        contact_num = seller["contactPoint"]["telephone"]
    except (KeyError, TypeError):
        contact_num = ""
    return contact_name, contact_num

def scrap_tables(Soup):
    """Return the first two ``<table>`` elements of a listing page.

    BUG FIX: the original never assigned ``second_table`` — it was always
    None even though ``create_df_row`` iterates it, so key/value rows held
    in a second spec table were silently dropped from the output (the
    "missing cells" symptom).

    Parameters:
        Soup: a BeautifulSoup document for the listing page.

    Returns:
        (first_table, second_table); either slot is None when the page
        has fewer tables.
    """
    tables = Soup.findAll("table")
    first_table = tables[0] if len(tables) >= 1 else None
    second_table = tables[1] if len(tables) >= 2 else None
    if first_table is None:
        print("No table found.")
    return first_table, second_table

def create_df_row(link, gallary_image, price_tag, price, contact_tag, contact_num, first_table, second_table,AgencyCar,dic):
    """Build one output row (a plain dict) for the DataFrame.

    Fixed columns come first, then every key/value from ``dic``, then the
    key/value pairs scraped from the two spec tables (first <td> is the
    column name, second <td> the value).
    """
    row = {
        "link": link,
        "img": gallary_image,
        "car_model": price,
        "price": price_tag,
        "contact_name": contact_tag,
        "contact_num": contact_num,
        "newListing": True,
        "isItAvailable": True,
        "agency car": AgencyCar,
    }
    row.update(dic)
    # Both tables contribute rows the same way; skip whichever is absent.
    for table in (first_table, second_table):
        if table is None:
            continue
        for tr in table.findAll("tr"):
            cells = tr.findAll('td')
            row[cells[0].text.strip()] = cells[1].text.strip()
    return row

# Load the previous scrape so existing listings can be updated in place.
df = pd.read_excel("output.xlsx")
final = df.to_dict("records")

# Reset the per-run bookkeeping columns (scalar assignment broadcasts
# over every row, same result as building a list of df.shape[0] copies).
df["newListing"] = ""
df["isItAvailable"] = False

print(df.shape)

# Known listing URLs from the previous run, used for duplicate detection.
links = df["link"].tolist()

import json

# Collect new listings here and append them once at the end:
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0,
# and appending row-by-row is quadratic anyway.
new_rows = []

for page in range(START, 100):
    page_url = start_url.format(page)
    print(page_url)
    response = requests.get(page_url)
    if response.status_code != 200:
        print(f"invalid status: {response.status_code} for {page_url}")
        continue

    Soup1 = soup(response.text, "html.parser")
    cards_div = Soup1.find("div", attrs={"id": "cards"})
    card_nodes = cards_div.findAll("div", attrs={"class": "card overflow-hidden"})
    print(len(card_nodes))

    for card in card_nodes:
        href = card.find("a").get("href")
        if not href:
            print("No car link found!")
            continue
        car_link = f"{base_url}{href}"
        print(car_link)

        if car_link in links:
            # Listing already known: refresh its price and availability
            # flags in place.  Use .at instead of chained indexing
            # (df["col"][ind] = ...), which is unreliable and deprecated.
            print("already")
            ind = links.index(car_link)
            prev = df.at[ind, "price"]
            # A missing price is read back from Excel as NaN (float),
            # which has no .strip().
            oldprice = prev.strip() if isinstance(prev, str) else ''
            oldprice1 = df.at[ind, "oldPrice"]
            curprice = card.find("div", attrs={"class": "price"}).text.strip()
            if oldprice != curprice:
                # Keep a running "old - new" price history in oldPrice.
                if pd.isna(oldprice1):
                    df.at[ind, "oldPrice"] = oldprice + " - " + curprice
                else:
                    df.at[ind, "oldPrice"] = oldprice1 + " - " + curprice
                df.at[ind, "price"] = curprice
            df.at[ind, "newListing"] = ''
            df.at[ind, "isItAvailable"] = ''
            continue

        # New listing: fetch its detail page and scrape everything.
        car_res = requests.get(car_link)
        car_html = car_res.text
        Soup = soup(car_html, 'html.parser')
        sub_tree = lxml.html.fromstring(car_html)

        gallary_image = get_gallery_image(sub_tree)
        price_tag, price = get_price_metadata(sub_tree)
        contact_tag, contact_num = get_contact_info(Soup)
        first_table, second_table = scrap_tables(Soup)

        # The highlighted "agency" badge only appears for agency cars.
        div = Soup.find("div", attrs={"class": "px-3 text-primary font-weight-bold"})
        AgencyCar = "yes" if div else "no"

        dic = {}
        p = Soup.find("p", attrs={"class": "text-word-break"})
        if p:
            dic["Description"] = p.text.strip()

        # "key: value" attribute divs; skip anything without a colon.
        for px in Soup.findAll("div", attrs={"class": "px-3"}):
            parts = px.text.strip().split(":")
            if len(parts) >= 2:
                dic[parts[0].strip()] = parts[1].strip()

        # "posted ... <a>date</a>"-style footer: the text minus the link
        # text is the key, the link text the value.
        footer = Soup.find("div", attrs={"class": "d-inline-flex text-end fs--1"})
        if footer:
            try:
                val = footer.find("a").text.strip()
                key = footer.text.strip().replace(val, "")
                dic[key] = val
            except AttributeError:
                pass

        new_rows.append(create_df_row(car_link, gallary_image, price_tag, price,
                                      contact_tag, contact_num, first_table,
                                      second_table, AgencyCar, dic))

    # A short page means we reached the last page of results.
    if len(card_nodes) < 48:
        break

if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

# Blank out NaNs so Excel cells come out empty rather than "nan".
df = df.fillna("")

import datetime
# Timestamp of this run, truncated to whole seconds.
# NOTE(review): `date` is computed but not used below — confirm whether
# it is needed elsewhere.
date = str(datetime.datetime.now()).split(".")[0]

df.to_excel("output.xlsx", index=False)
buran write Jan-27-2023, 10:02 AM:
Please, when post code use proper BBCode tags - e.g. python, not quote
Reply


Possibly Related Threads…
Thread Author Replies Views Last Post
  Code Help, web scraping non uniform lists(ul) luke_m 4 3,314 Apr-22-2021, 05:16 PM
Last Post: luke_m
  scraping code nexuz89 0 1,507 Sep-28-2020, 12:16 PM
Last Post: nexuz89
  In need of web scraping code! kolbyng 1 1,735 Sep-21-2020, 06:02 AM
Last Post: buran
  error in code web scraping alexisbrunaux 5 3,792 Aug-19-2020, 02:31 AM
Last Post: alexisbrunaux
  scraping from a website that hides source code PIWI_Protein 1 1,959 Mar-27-2020, 05:08 PM
Last Post: Larz60+

Forum Jump:

User Panel Messages

Announcements
Announcement #1 8/1/2020
Announcement #2 8/2/2020
Announcement #3 8/6/2020