May-19-2018, 12:17 PM
"""Fit a linear regression on a CSV dataset and print test-set predictions.

The first four columns of the CSV are used as features and the fifth as the
target. One of the feature columns, "State", is categorical and is converted
to integer codes before training.
"""
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

# Location of the dataset used when read_dataset() is called without a path.
DEFAULT_CSV_PATH = "C:\\Users\\BADSHAH\\PycharmProjects\\edureka1.csv"

# Integer code assigned to each value of the categorical "State" column.
STATE_CODES = {"California": 2, "New York": 1, "Florida": 3}


def read_dataset(path=DEFAULT_CSV_PATH):
    """Load the CSV at *path* and split it into features and target.

    The "State" column is mapped to the integer codes in ``STATE_CODES``
    *before* the feature matrix is extracted, so the returned features no
    longer contain the raw state strings.

    Args:
        path: CSV file to read. Defaults to ``DEFAULT_CSV_PATH``.

    Returns:
        A tuple ``(x, y)``: ``x`` is a NumPy array holding the first four
        columns, ``y`` is a pandas Series holding the fifth column.

    Raises:
        ValueError: If "State" contains a value missing from ``STATE_CODES``.
    """
    df = pd.read_csv(path)

    # Fail loudly on an unmapped state instead of letting map() turn it into
    # NaN and surface later as an unrelated error inside the estimator.
    unknown = list(set(df["State"].dropna()) - set(STATE_CODES))
    if unknown:
        raise ValueError("Unmapped State values: {}".format(unknown))

    # Encode on the frame itself; encoding a copy would leave the features
    # extracted below untouched.
    df["State"] = df["State"].map(STATE_CODES)

    x = df.iloc[:, 0:4].values
    y = df.iloc[:, 4]
    print(x.shape)
    return x, y


def main():
    """Train the model and print the test-set features with predictions."""
    # Read the dataset.
    x, y = read_dataset()

    # Shuffle the dataset.
    x, y = shuffle(x, y, random_state=1)

    # Break the data into train and test parts.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.20, random_state=5
    )
    print(x_train.shape)
    print(x_test.shape)
    print(y_train.shape)
    print(y_test.shape)

    lm = LinearRegression()
    lm.fit(x_train, y_train)
    y_test_predict = lm.predict(x_test)

    # One row per test sample: the four feature values followed by the
    # prediction. (The feature matrix is data here, not the index.)
    cf = pd.DataFrame(x_test)
    cf["predicted"] = y_test_predict
    print(cf)


if __name__ == "__main__":
    main()