Python Forum

Full Version: I am getting a valueError. And not sure why? My goal is to visualize the correlation
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
Hi all

I am just learning python. I am trying to visualize the correlation matrix of the S&P 500 stock data but I am getting a ValueError: could not convert string to float: '2000-01-03'.

Here is the coding:

import os
import pandas as pd
import requests
import pickle
import yfinance as yf
from pandas_datareader import data as pdr
import bs4 as bs
import datetime as dt
import matplotlib.pyplot as plt
from matplotlib import style
import numpy as np

# Function to save the S&P 500 tickers from the Wikipedia page

def save_sp500_tickers():
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    resp = requests.get(url)
    soup = bs.BeautifulSoup(resp.text, "html.parser")
    table = soup.find("table", {"id": "constituents"})
    tickers = []

    for row in table.find_all("tr")[1:]:
        ticker = row.find_all("td")[0].text.strip()
        ticker = str(ticker.replace('.', '-'))
        tickers.append(ticker)

    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)

    print(tickers)
    return tickers

# Function to retrieve stock data from Yahoo Finance
def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)

    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')

    start = dt.datetime(2000, 1, 1)
    end = dt.datetime(2016, 12, 31)

    for ticker in tickers:
        print(ticker)
        if not os.path.exists(f'stock_dfs/{ticker}.csv'):
            data = yf.download(ticker, start=start, end=end)
            data.to_csv(f'stock_dfs/{ticker}.csv')
        else:
            print(f'Already have {ticker}')

# Function to compile the stock data into a single DataFrame
import csv

def compile_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)

    main_df = pd.DataFrame()

    for count, ticker in enumerate(tickers):
        with open(f'stock_dfs/{ticker.replace(".", "_")}.csv', 'r') as file:
            reader = csv.reader(file)
            next(reader)  # Skip the header row

            df = pd.DataFrame(reader, columns=['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume'])
            df['Date'] = pd.to_datetime(df['Date'])
            df.set_index('Date', inplace=True)
            df[['Open', 'High', 'Low', 'Close', 'Adj Close']] = df[['Open', 'High', 'Low', 'Close', 'Adj Close']].astype(float)
            df['Volume'] = df['Volume'].astype(object)

            df.drop(['Open', 'High', 'Low', 'Close'], axis=1, inplace=True)
            df.rename(columns={'Adj Close': ticker, 'Volume': f'{ticker}_Volume'}, inplace=True)

            if main_df.empty:
                main_df = df
            else:
                main_df = main_df.join(df, how='outer')

        if count % 10 == 0:
            print(count)

    print(main_df.head())
    main_df.to_csv('sp500_joined_close.csv')

# Function to visualize the correlation matrix of the stock data
def visualize_data():
    df = pd.read_csv('sp500_joined_close.csv')
    df_corr = df.corr()
    print(df_corr.head())

    data = df_corr.values
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn)
    fig.colorbar(heatmap)
    ax.set_xticks(np.arange(data.shape[1]) + 0.5, minor=False)
    ax.set_yticks(np.arange(data.shape[0]) + 0.5, minor=False)
    ax.invert_yaxis()
    ax.xaxis.tick_top()

    column_labels = df_corr.columns
    row_labels = df_corr.index

    ax.set_xticklabels(column_labels, rotation=90)
    ax.set_yticklabels(row_labels)
    plt.tight_layout()
    plt.show()

save_sp500_tickers()
get_data_from_yahoo()
compile_data()
visualize_data()
Thank you for your helps!!!