Jun-13-2024, 05:14 PM
I have the below code to scrape this site (https://oig.hhs.gov/reports-and-publications/all-reports-and-publications/) and extract to a CSV.
It works well to scrape each title and URL. I want it to also scrape the content under "audit", "HHS agency" and "date" for each title, but I can't seem to code it right given all three elements are in a grid.
Any suggestions? Thanks
It works well to scrape each title and URL. I want it to also scrape the content under "audit", "HHS agency" and "date" for each title, but I can't seem to code it right given all three elements are in a grid.
Any suggestions? Thanks
"""Scrape report titles and URLs from the HHS OIG listing pages into a CSV.

Walks the paginated "all reports and publications" listing, collects each
report's title and absolute URL, and writes them to titles_with_urls.csv.
"""
import csv
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://oig.hhs.gov"
LISTING_URL = "https://oig.hhs.gov/reports-and-publications/all-reports-and-publications/?page={}"
MAX_PAGES = 100


def scrape_titles(max_pages=MAX_PAGES):
    """Return a list of [title, absolute_url] pairs from up to max_pages pages.

    Stops early at the first listing page that contains no "h2 a" title
    links, which is taken to mean the listing has been exhausted.
    NOTE(review): the "audit" / "HHS agency" / "date" grid cells the poster
    asks about are not extracted here — their selectors can't be confirmed
    from this code alone; inspect the page markup to add them.
    """
    rows = []
    page = 1
    while page <= max_pages:
        print(f"Scraping page {page}...")
        # timeout= prevents an unresponsive server from hanging the script
        # forever; raise_for_status() surfaces HTTP errors (404/500/...)
        # instead of silently parsing an error page.
        response = requests.get(LISTING_URL.format(page), timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        page_titles = soup.select("h2 a")
        if not page_titles:
            print("No more pages to scrape. Exiting...")
            break  # no titles found — assume there are no more pages

        for anchor in page_titles:
            title_text = anchor.text.strip()
            # .get('href') can return None for a malformed anchor; treat
            # that as an empty path rather than crashing.
            href = anchor.get('href') or ""
            # urljoin handles both relative ("/x") and absolute
            # ("https://...") hrefs correctly, so no startswith("/")
            # special case is needed.
            full_url = urljoin(BASE_URL, href)
            rows.append([title_text, full_url])
            print(f"Scraped title: {title_text}, URL: {full_url}")
        page += 1
    return rows


def write_csv(rows, path='titles_with_urls.csv'):
    """Write [title, url] rows to `path` with a header line."""
    with open(path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "URL"])  # header
        writer.writerows(rows)


if __name__ == "__main__":
    write_csv(scrape_titles())
    print("All titles and URLs have been scraped and saved to titles_with_urls.csv.")