Python Forum

Full Version: Scrape medical information from MedlinePlus
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hi friends
I am trying to achieve the following tasks:

1-Scrape the list of diseases from the MedlinePlus Medical Encyclopedia page.
2-For each disease, navigate to its page and extract the relevant information (name, symptoms, treatment).
3-Store this information in a structured format (e.g., a dictionary or a DataFrame) for later use in the chatbot.

I would greatly appreciate any comments on the following program, which ChatGPT generated to achieve the above tasks:

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# URL of the MedlinePlus Medical Encyclopedia landing page — the starting
# point from which article links are discovered.
base_url = "https://medlineplus.gov/encyclopedia.html"

# Function to get the list of disease links from the main page
def get_disease_links(base_url):
    """Collect MedlinePlus encyclopedia article URLs linked from *base_url*.

    Parameters
    ----------
    base_url : str
        Page whose anchors are scanned; hrefs starting with
        '/ency/article' are treated as disease articles.

    Returns
    -------
    list[str]
        Absolute article URLs, de-duplicated, in first-seen order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # Timeout so a stalled connection cannot hang the scraper forever;
    # raise_for_status avoids silently parsing an error page as HTML.
    response = requests.get(base_url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Index pages repeat the same article link in several places, so
    # track hrefs already seen and keep each URL only once.
    disease_links = []
    seen = set()
    for link in soup.find_all('a', href=True):
        href = link['href']
        if href.startswith('/ency/article') and href not in seen:
            seen.add(href)
            disease_links.append("https://medlineplus.gov" + href)

    return disease_links

# Function to extract disease information from a given disease page
def extract_disease_info(url):
    """Scrape the name, symptoms, and treatment text from one article page.

    Parameters
    ----------
    url : str
        Absolute URL of a MedlinePlus encyclopedia article.

    Returns
    -------
    dict
        Keys 'name', 'symptoms', 'treatment'; the latter two are empty
        strings when no matching <h2> section is found on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no <h1> title to use as the disease name.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # fail fast instead of parsing an error page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract disease name. Guard explicitly: soup.find() returns None on a
    # miss, and the original `.text` access would raise a bare AttributeError.
    title = soup.find('h1')
    if title is None:
        raise ValueError(f"No <h1> title found on {url}")
    name = title.get_text(strip=True)

    # Extract symptoms and treatment from the first paragraph after the
    # matching <h2>. find_next('p') can be None for a trailing header, so
    # skip those rather than crash. NOTE(review): only the first paragraph
    # of each section is captured — later paragraphs are dropped; confirm
    # that is enough detail for the chatbot.
    symptoms, treatment = "", ""
    for header in soup.find_all('h2'):
        heading = header.get_text()
        paragraph = header.find_next('p')
        if paragraph is None:
            continue
        if 'Symptoms' in heading:
            symptoms = paragraph.get_text(strip=True)
        if 'Treatment' in heading:
            treatment = paragraph.get_text(strip=True)

    return {"name": name, "symptoms": symptoms, "treatment": treatment}

# Main script: crawl every article link, collect its record, and dump the
# results to CSV. Per-page failures are logged and skipped so one bad page
# cannot abort the whole run.
if __name__ == "__main__":
    disease_links = get_disease_links(base_url)
    all_disease_info = []

    for link in disease_links:
        try:
            record = extract_disease_info(link)
        except Exception as e:
            print(f"Failed to extract info from {link}: {e}")
        else:
            all_disease_info.append(record)
            print(f"Extracted info for: {record['name']}")
            # Polite crawl delay between successful requests.
            time.sleep(1)

    # Persist everything that was scraped in one structured table.
    df = pd.DataFrame(all_disease_info)
    df.to_csv('diseases_info.csv', index=False)
    print("Saved disease information to 'diseases_info.csv'.")