Hey everyone
I've got this Python scraping code that was made for me; it worked perfectly for a few months, but lately it misses some parameters while scraping.
You can see the missing cells starting from row 94.
![[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]](https://fiverr-res.cloudinary.com/image/upload/f_auto,q_auto/v1/secured-attachments/messaging_message/attachment/1991fc9dd15c5593624cf9403c49fee3-1674414016322/image.png?__cld_token__=exp=1674835006~hmac=da91313fdd7ddcdb00049e204d0bbc5a00855d8b2ddd3c7df5d007d42f23a1a9)
i'll be glad to get some help here with that. thanks :-)
I've got this Python scraping code that was made for me; it worked perfectly for a few months, but lately it misses some parameters while scraping.
You can see the missing cells starting from row 94.
![[Image: image.png?__cld_token__=exp=1674835006~h...d42f23a1a9]](https://fiverr-res.cloudinary.com/image/upload/f_auto,q_auto/v1/secured-attachments/messaging_message/attachment/1991fc9dd15c5593624cf9403c49fee3-1674414016322/image.png?__cld_token__=exp=1674835006~hmac=da91313fdd7ddcdb00049e204d0bbc5a00855d8b2ddd3c7df5d007d42f23a1a9)
i'll be glad to get some help here with that. thanks :-)
# -*- coding: utf-8 -*-
"""ad.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/17lhluDBQi5LVjkQ8kPXNezFnqpEvU17P

Scrape car listings from https://www.ad.co.il, merge them with the previous
run stored in output.xlsx (tracking price changes and availability), and
write the updated spreadsheet back to output.xlsx.
"""
import datetime
import json
import time

import requests
import lxml.html
from bs4 import BeautifulSoup as soup
import pandas as pd

# Search filter: year 2022, price 5,000-120,000, mileage band 5,000.
start_url = "https://www.ad.co.il/car?rp264=2022,2022&rp270=5000,120000&rp271=5000,5000&pageindex={}"
base_url = "https://www.ad.co.il"
START, END = (1, 8)


def get_gallery_image(tree):
    """Return the listing's main gallery image URL, or "" when absent.

    (The original returned None on a miss; "" keeps the column uniformly
    string-typed, matching the other scraper helpers.)
    """
    img = tree.cssselect("img.main-gallery-image.justify-content-center")
    if not img:
        print("Image not found")
        return ""
    return f"https:{img[0].attrib.get('src')}"


def get_price_metadata(tree):
    """Return two header-card strings from the listing page.

    Missing values come back as "". Note the caller stores the first item
    under "price" and the second under "car_model" (the card order is
    reversed before returning).
    """
    cards = tree.cssselect("div.d-flex.justify-content-between h2.card-title")
    metadata = [card.text_content().strip() for card in cards]
    metadata.reverse()
    metadata += ["", ""]  # pad so we can always return a 2-tuple
    return metadata[0], metadata[1]


def get_contact_info(Soup):
    """Return (seller_name, seller_phone) from the page's JSON-LD metadata.

    Either value is "" when the structured-data block is missing, is not
    valid JSON, or lacks the offers/seller fields.
    """
    script = Soup.find("script", attrs={"type": "application/ld+json"})
    if not script:
        return "", ""
    try:
        seller = json.loads(script.string)["offers"]["seller"]
    except (TypeError, ValueError, KeyError):
        # Empty script text, malformed JSON, or no offers/seller entry.
        return "", ""
    contact_name = seller.get("name", "")
    try:
        contact_num = seller["contactPoint"]["telephone"]
    except (TypeError, KeyError):
        contact_num = ""
    return contact_name, contact_num


def scrap_tables(Soup):
    """Return the first two spec <table> elements of a listing page.

    Either element may be None when the page has fewer tables.

    Bug fix: the original never assigned the second table (it stayed None
    forever), so every parameter living in the second spec table was
    silently dropped from the output -- the "missing cells" symptom.
    """
    tables = Soup.findAll("table")
    if not tables:
        print("No table found.")
        return None, None
    second_table = tables[1] if len(tables) > 1 else None
    return tables[0], second_table


def _copy_table_rows(table, dictionary):
    """Copy each row's first <td> (key) / second <td> (value) into *dictionary*."""
    if table is None:
        return
    for tr in table.findAll("tr"):
        tds = tr.findAll('td')
        if len(tds) >= 2:  # skip malformed rows instead of raising IndexError
            dictionary[tds[0].text.strip()] = tds[1].text.strip()


def create_df_row(link, gallary_image, price_tag, price, contact_tag,
                  contact_num, first_table, second_table, AgencyCar, dic):
    """Assemble one spreadsheet row (a plain dict) for a newly seen listing.

    *dic* holds extra key/value pairs scraped from the page; both spec
    tables contribute their rows as additional columns.
    """
    dictionary = {
        "link": link,
        "img": gallary_image,
        # NOTE: kept as in the original -- the reversed header cards mean
        # price_tag/price land in these apparently-swapped columns.
        "car_model": price,
        "price": price_tag,
        "contact_name": contact_tag,
        "contact_num": contact_num,
        "newListing": True,
        "isItAvailable": True,
        "agency car": AgencyCar,
    }
    dictionary.update(dic)
    _copy_table_rows(first_table, dictionary)
    _copy_table_rows(second_table, dictionary)
    return dictionary


# ---------------------------------------------------------------------------
# Script body: load the previous run, re-scrape the search pages, update.
# ---------------------------------------------------------------------------
df = pd.read_excel("output.xlsx")
print(df.shape)

# Reset the per-run flags: rows start as "not seen yet in this run".
df["newListing"] = [''] * df.shape[0]
df["isItAvailable"] = [False] * df.shape[0]
links = list(df["link"])

new_rows = []  # listings not in the previous output; appended once at the end

for i in range(START, 100):
    temp_url = start_url.format(i)
    print(temp_url)
    response = requests.get(temp_url)
    if response.status_code != 200:
        print(f"invalid status: {response.status_code} for {temp_url}")
        continue

    page = soup(response.text, "html.parser")
    cards_div = page.find("div", attrs={"id": "cards"})
    car_cards = (cards_div.findAll("div", attrs={"class": "card overflow-hidden"})
                 if cards_div else [])
    print(len(car_cards))

    for card in car_cards:
        href = card.find("a").get("href")
        if not href:
            print("No car link found!")
            continue
        car_link = f"{base_url}{href}"
        print(car_link)

        if car_link in links:
            # Known listing: refresh price / availability flags in place.
            print("already")
            ind = links.index(car_link)
            oldprice = df.at[ind, "price"]
            oldprice = oldprice.strip() if isinstance(oldprice, str) else ''
            history = df.at[ind, "oldPrice"]
            price_div = card.find("div", attrs={"class": "price"})
            curprice = price_div.text.strip() if price_div else ''
            if oldprice != curprice:
                # Append the change to the running "old - new" price history.
                # .at replaces the chained assignments (df[col][ind] = ...)
                # that newer pandas versions no longer write to the frame.
                if pd.isna(history):
                    df.at[ind, "oldPrice"] = oldprice + " - " + curprice
                else:
                    df.at[ind, "oldPrice"] = history + " - " + curprice
                df.at[ind, "price"] = curprice
            df.at[ind, "newListing"] = ''
            df.at[ind, "isItAvailable"] = ''
            continue

        # New listing: fetch the detail page and scrape everything.
        car_html = requests.get(car_link).text
        Soup = soup(car_html, 'html.parser')
        sub_tree = lxml.html.fromstring(car_html)

        gallary_image = get_gallery_image(sub_tree)
        price_tag, price = get_price_metadata(sub_tree)
        contact_tag, contact_num = get_contact_info(Soup)
        first_table, second_table = scrap_tables(Soup)

        agency_div = Soup.find("div", attrs={"class": "px-3 text-primary font-weight-bold"})
        AgencyCar = "yes" if agency_div else "no"

        dic = {}
        desc = Soup.find("p", attrs={"class": "text-word-break"})
        if desc:
            dic["Description"] = desc.text.strip()

        # "key: value" snippets scattered over the page.
        for px in Soup.findAll("div", attrs={"class": "px-3"}):
            parts = px.text.strip().split(":")
            if len(parts) >= 2:
                dic[parts[0].strip()] = parts[1].strip()

        # Linked attribute (e.g. region): the anchor text is the value and
        # the remaining div text is the key.
        linked = Soup.find("div", attrs={"class": "d-inline-flex text-end fs--1"})
        if linked:
            anchor = linked.find("a")
            if anchor:
                val = anchor.text.strip()
                dic[linked.text.strip().replace(val, "")] = val

        new_rows.append(create_df_row(car_link, gallary_image, price_tag, price,
                                      contact_tag, contact_num, first_table,
                                      second_table, AgencyCar, dic))

    if len(car_cards) < 48:  # a short page means the last results page
        break

# DataFrame.append was removed in pandas 2.0 -- collect rows, concat once.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
df = df.fillna("")

df.to_excel("output.xlsx", index=False)
buran wrote Jan-27-2023, 10:02 AM:
Please, when posting code, use proper BBCode tags - e.g.
Please, when posting code, use proper BBCode tags - e.g. the
[python]
tag, not [quote].