Dec-11-2023, 05:33 AM
Hi all
I am just learning python. I am trying to visualize the correlation matrix of the S&P 500 stock data but I am getting a ValueError: could not convert string to float: '2000-01-03'.
Here is the coding:
I am just learning python. I am trying to visualize the correlation matrix of the S&P 500 stock data but I am getting a ValueError: could not convert string to float: '2000-01-03'.
Here is the coding:
import os import pandas as pd import requests import pickle import yfinance as yf from pandas_datareader import data as pdr import bs4 as bs import datetime as dt import matplotlib.pyplot as plt from matplotlib import style import numpy as np # Function to save the S&P 500 tickers from the Wikipedia page def save_sp500_tickers(): url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies" resp = requests.get(url) soup = bs.BeautifulSoup(resp.text, "html.parser") table = soup.find("table", {"id": "constituents"}) tickers = [] for row in table.find_all("tr")[1:]: ticker = row.find_all("td")[0].text.strip() ticker = str(ticker.replace('.', '-')) tickers.append(ticker) with open("sp500tickers.pickle", "wb") as f: pickle.dump(tickers, f) print(tickers) return tickers # Function to retrieve stock data from Yahoo Finance def get_data_from_yahoo(reload_sp500=False): if reload_sp500: tickers = save_sp500_tickers() else: with open("sp500tickers.pickle", "rb") as f: tickers = pickle.load(f) if not os.path.exists('stock_dfs'): os.makedirs('stock_dfs') start = dt.datetime(2000, 1, 1) end = dt.datetime(2016, 12, 31) for ticker in tickers: print(ticker) if not os.path.exists(f'stock_dfs/{ticker}.csv'): data = yf.download(ticker, start=start, end=end) data.to_csv(f'stock_dfs/{ticker}.csv') else: print(f'Already have {ticker}') # Function to compile the stock data into a single DataFrame import csv def compile_data(): with open("sp500tickers.pickle", "rb") as f: tickers = pickle.load(f) main_df = pd.DataFrame() for count, ticker in enumerate(tickers): with open(f'stock_dfs/{ticker.replace(".", "_")}.csv', 'r') as file: reader = csv.reader(file) next(reader) # Skip the header row df = pd.DataFrame(reader, columns=['Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']) df['Date'] = pd.to_datetime(df['Date']) df.set_index('Date', inplace=True) df[['Open', 'High', 'Low', 'Close', 'Adj Close']] = df[['Open', 'High', 'Low', 'Close', 'Adj Close']].astype(float) df['Volume'] = df['Volume'].astype(object) df.drop(['Open', 'High', 'Low', 'Close'], axis=1, inplace=True) df.rename(columns={'Adj Close': ticker, 'Volume': f'{ticker}_Volume'}, inplace=True) if main_df.empty: main_df = df else: main_df = main_df.join(df, how='outer') if count % 10 == 0: print(count) print(main_df.head()) main_df.to_csv('sp500_joined_close.csv') # Function to visualize the correlation matrix of the stock data def visualize_data(): df = pd.read_csv('sp500_joined_close.csv') df_corr = df.corr() print(df_corr.head()) data = df_corr.values fig = plt.figure() ax = fig.add_subplot(1, 1, 1) heatmap = ax.pcolor(data, cmap=plt.cm.RdYlGn) fig.colorbar(heatmap) ax.set_xticks(np.arange(data.shape[1]) + 0.5, minor=False) ax.set_yticks(np.arange(data.shape[0]) + 0.5, minor=False) ax.invert_yaxis() ax.xaxis.tick_top() column_labels = df_corr.columns row_labels = df_corr.index ax.set_xticklabels(column_labels, rotation=90) ax.set_yticklabels(row_labels) plt.tight_layout() plt.show() save_sp500_tickers() get_data_from_yahoo() compile_data() visualize_data()Thank you for your helps!!!