Python Forum
Scrape A tags from a website - Printable Version

+- Python Forum (https://python-forum.io)
+-- Forum: Python Coding (https://python-forum.io/forum-7.html)
+--- Forum: Web Scraping & Web Development (https://python-forum.io/forum-13.html)
+--- Thread: Scrape A tags from a website (/thread-5644.html)



Scrape A tags from a website - Prince_Bhatia - Oct-14-2017

hi,

i am trying to scrape https://www.efinancialcareers.com/search?page=1&sortBy=POSTED_DESC&searchMode=DEFAULT_SEARCH&jobSearchId=MDI5MjFDRTZDRjUzMDlENDNGRTVCRTY2Q0YwMkY2MTkuMTUwODAxNTYyMjMxNC4xNDk2MDcwODY3&updateEmitter=LOCATION&filterGroupForm.includeRefreshed=true&filterGroupForm.datePosted=OTHER&filterGroupForm.locationRadiusUnit=MILES&filterGroupForm.locationRadius=DISTANCE_FIVE

in this link i am trying to scrape the hidden jobs id:

jobs id are just below class_="jobPreview well which has class anchor
but when i run my code it exports csv without jobs id
can someone please tell what is wrong?
below is my code:
import requests
from bs4 import BeautifulSoup
import csv
import json
import time
import os
import datetime

url="https://www.efinancialcareers.com/search?page=1&sortBy=POSTED_DESC&searchMode=DEFAULT_SEARCH&jobSearchId=RUJFMEZDNjA2RTJEREJEMDcyMzlBQ0YyMEFDQjc1MjUuMTQ4NTE5MDY3NTI0Ni4tMTQ1Mjc4ODU3NQ%3D%3D&updateEmitter=SORT_BY&filterGroupForm.includeRefreshed=true&filterGroupForm.datePosted=OTHER"
final_data = []

r = requests.get(url)
data = r.text

if r.status_code == 200:
    print("Webiste is Perfect to Scrape, Please proceed and block yourself in JAIL")
else:
    quit

datetime.datetime.now()

soup = BeautifulSoup(data, "html.parser")
get_details = soup.find_all(class_="jobPreview well")
#while len(get_details)>0:
for jobs in get_details:
    title = jobs.find_all("h3")
    for job in title:
        geta = job.find_all("a")
        onlya = ""
        links = ""
        sublist=[]
        for a in geta:
            onlya = a.text
            getid = a.get("id")
            sublist.append(onlya)
            #sublist.append(getid)
        for link in geta:
            links = link.get("href")
            sublist.append(links)
            print(sublist)
    a_lst = jobs.find_all("a")
    for i in a_lst:
        an = i.find_all(class_="anchor")
        getid = ""
        for anch in an:
            getid = anch.get("id")
            sublist.append(getid)
            print(sublist)
        sublist.append(final_data)
filename = "jobs.csv"
with open("./"+filename, "w")as csvfile:
    csvfile = csv.writer(csvfile, delimiter=",")
    csvfile.writerow("")
    for i in range(0, len(final_data)):
        csvfile.writerow(final_data[i])



RE: Scrape A tags from a website - metulburr - Oct-15-2017

Quote:
    a_lst = jobs.find_all("a")
    for i in a_lst:
        an = i.find_all(class_="anchor")
You know you can search a tags by classname?

    a_lst = jobs.find_all('a', {'class':"anchor"})
    for tag in a_lst:
        print(tag.get('id'))
Output:
Webiste is Perfect to Scrape, Please proceed and block yourself in JAIL job2315296 job2150174 job2265680 job1788218 job2269486 job2212561 job2232475 job2297258 job2232473 job2247908 job2301531 job2268850 job2315276 job2315275 job2315270 job1977058 job2268835 job1977043 job1977065 job1977066 job2051545 job2103521 job2268838
Your code looks a bit overcomplicated. Here is an example of obtaining the same things, but is much more readable. You only have to use find_all to get all the jobs section, but after that you can use find() to get the child tag of that section which makes it much more readable

import requests
from bs4 import BeautifulSoup

URL = 'https://www.efinancialcareers.com/search?page=1&sortBy=POSTED_DESC&searchMode=DEFAULT_SEARCH&jobSearchId=RUJFMEZDNjA2RTJEREJEMDcyMzlBQ0YyMEFDQjc1MjUuMTQ4NTE5MDY3NTI0Ni4tMTQ1Mjc4ODU3NQ%3D%3D&updateEmitter=SORT_BY&filterGroupForm.includeRefreshed=true&filterGroupForm.datePosted=OTHER'
res = requests.get(URL)
soup = BeautifulSoup(res.text, 'html.parser')
jobs = soup.find_all('li', {'class':'jobPreview well'})
for job in jobs:
    print(job.a['id']) #ID
    print(job.h3.a['href']) #link
    print(job.h3.a['title']) #title
job.h3.a['title'] is really like
job.find('h3').find('a')['title'] but more simple

Then you just write it to the file how you want it.