Technically not my code, i am learning by taking code of various website and trying to understand how they work. This one was about housing prices.
thanks
Error Message
data_description.txt
sample_submission.csv
test.csv
train.csv
Error:
---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
~/AnacondaProjects/House5/house5a.py in <module>()
23 import pandas as pd
24 import numpy as np
---> 25 import matplotlib.pyplot as plt
26 import seaborn as sns
27 from sklearn import ensemble, tree, linear_model
~/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py in <module>()
29 from cycler import cycler
30 import matplotlib
---> 31 import matplotlib.colorbar
32 from matplotlib import style
33 from matplotlib import _pylab_helpers, interactive
~/anaconda3/lib/python3.6/site-packages/matplotlib/colorbar.py in <module>()
34 import matplotlib.collections as collections
35 import matplotlib.colors as colors
---> 36 import matplotlib.contour as contour
37 import matplotlib.cm as cm
38 import matplotlib.gridspec as gridspec
~/anaconda3/lib/python3.6/site-packages/matplotlib/contour.py in <module>()
18 import matplotlib.colors as colors
19 import matplotlib.collections as mcoll
---> 20 import matplotlib.font_manager as font_manager
21 import matplotlib.text as text
22 import matplotlib.cbook as cbook
~/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py in <module>()
53 import logging
54
---> 55 from matplotlib import afm, cbook, ft2font, rcParams, get_cachedir
56 from matplotlib.compat import subprocess
57 from matplotlib.fontconfig_pattern import (
ImportError: dlopen(/Users/xUsernamex/anaconda3/lib/python3.6/site-packages/matplotlib/ft2font.cpython-36m-darwin.so, 2): Library not loaded: @rpath/libfreetype.6.dylib
Referenced from: /Users/xUsernamex/anaconda3/lib/python3.6/site-packages/matplotlib/ft2font.cpython-36m-darwin.so
Reason: Incompatible library version: ft2font.cpython-36m-darwin.so requires version 22.0.0 or later, but libfreetype.6.dylib provides version 21.0.0
(base) xUsernamexs-Mac:House5 xUsernamex$
Code
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 14 18:31:46 2018
@author: xUsernamex
"""
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the "../input/" directory.
# For example, runninga this (by clicking run or pressing Shift+Enter) will list the files in the input directory
from subprocess import check_output
print(check_output(["ls", "/Users/xUsernamex/AnacondaProjects/House5/input"]).decode("utf8"))
# Any results you write to the current directory are saved as output
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import ensemble, tree, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.utils import shuffle
from IPython import get_ipython
get_ipython().run_line_magic('matplotlib', 'inline')
import warnings
warnings.filterwarnings('ignore')
train = pd.read_csv('/Users/xUsernamex/AnacondaProjects/House5/input/train.csv')
test = pd.read_csv('/Users/xUsernamex/AnacondaProjects/House5/input/test.csv')
train.describe()
train.head(3)
test.head(3)
train.shape,test.shape
#check for dupes for Id
idsUnique = len(set(train.Id))
idsTotal = train.shape[0]
idsdupe = idsTotal - idsUnique
print(idsdupe)
#drop id col
train.drop(['Id'],axis =1,inplace=True)
#correlation matrix
corrmat = train.corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True);
# most correlated features
corrmat = train.corr()
top_corr_features = corrmat.index[abs(corrmat["SalePrice"])>0.5]
plt.figure(figsize=(10,10))
g = sns.heatmap(train[top_corr_features].corr(),annot=True,cmap="RdYlGn")
sns.barplot(train.OverallQual,train.SalePrice)
sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
sns.pairplot(train[cols], size = 2.5)
plt.show();
from scipy import stats
from scipy.stats import norm, skew #for some statistics
sns.distplot(train['SalePrice'] , fit=norm);
# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')
fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()
train.SalePrice = np.log1p(train.SalePrice )
y = train.SalePrice
plt.scatter(y =train.SalePrice,x = train.GrLivArea,c = 'black')
plt.show()
#we can see the outlier in the below image
train_nas = train.isnull().sum()
train_nas = train_nas[train_nas>0]
train_nas.sort_values(ascending=False)
test_nas = test.isnull().sum()
test_nas = test_nas[test_nas>0]
test_nas.sort_values(ascending = False)
print("Find most important features relative to target")
corr = train.corr()
corr.sort_values(["SalePrice"], ascending = False, inplace = True)
print(corr.SalePrice)
#this you can see at the time of heatmap als
# Differentiate numerical features (minus the target) and categorical features
categorical_features = train.select_dtypes(include=['object']).columns
categorical_features
numerical_features = train.select_dtypes(exclude = ["object"]).columns
# Differentiate numerical features (minus the target) and categorical features
categorical_features = train.select_dtypes(include = ["object"]).columns
numerical_features = train.select_dtypes(exclude = ["object"]).columns
numerical_features = numerical_features.drop("SalePrice")
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]
# Handle remaining missing values for numerical features by using median as replacement
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
from scipy.stats import skew
skewness = train_num.apply(lambda x: skew(x))
skewness.sort_values(ascending=False)
skewness = skewness[abs(skewness)>0.5]
skewness.index
skew_features = train[skewness.index]
skew_features.columns
#we can treat skewness of a feature with the help fof log transformation.so we'll apply the same here.
skew_features = np.log1p(skew_features)
# Create dummy features for categorical values via one-hot encoding
train_cat.shape
train_cat = pd.get_dummies(train_cat)
train_cat.shape
train_cat.head()
str(train_cat.isnull().values.sum())
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer
import matplotlib.pyplot as plt
import seaborn as sns
train = pd.concat([train_cat,train_num],axis=1)
train.shape
#split the data to train the model
X_train,X_test,y_train,y_test = train_test_split(train,y,test_size = 0.3,random_state= 0)
X_train.shape,X_test.shape,y_train.shape,y_test.shape
X_train.head(3)
n_folds = 5
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
scorer = make_scorer(mean_squared_error,greater_is_better = False)
def rmse_CV_train(model):
kf = KFold(n_folds,shuffle=True,random_state=42).get_n_splits(train.values)
rmse = np.sqrt(-cross_val_score(model,X_train,y_train,scoring ="neg_mean_squared_error",cv=kf))
return (rmse)
def rmse_CV_test(model):
kf = KFold(n_folds,shuffle=True,random_state=42).get_n_splits(train.values)
rmse = np.sqrt(-cross_val_score(model,X_test,y_test,scoring ="neg_mean_squared_error",cv=kf))
return (rmse)
lr = LinearRegression()
lr.fit(X_train,y_train)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train',rmse_CV_train(lr).mean())
print('rmse on train',rmse_CV_test(lr).mean())
#plot between predicted values and residuals
plt.scatter(train_pre, train_pre - y_train, c = "blue", label = "Training data")
plt.scatter(test_pre,test_pre - y_test, c = "black", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()
# Plot predictions - Real values
plt.scatter(train_pre, y_train, c = "blue", label = "Training data")
plt.scatter(test_pre, y_test, c = "black", label = "Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()
ridge = RidgeCV(alphas = [0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
ridge.fit(X_train,y_train)
alpha = ridge.alpha_
print('best alpha',alpha)
print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas = [alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4],cv = 5)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)
print("Ridge RMSE on Training set :", rmse_CV_train(ridge).mean())
print("Ridge RMSE on Test set :", rmse_CV_test(ridge).mean())
y_train_rdg = ridge.predict(X_train)
y_test_rdg = ridge.predict(X_test)
X_train.shape
coef = pd.Series(ridge.coef_, index = X_train.columns)
print("Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")
# Plot residuals
plt.scatter(y_train_rdg, y_train_rdg - y_train, c = "blue", label = "Training data")
plt.scatter(y_test_rdg, y_test_rdg - y_test, c = "black", marker = "v", label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc = "upper left")
plt.hlines(y = 0, xmin = 10.5, xmax = 13.5, color = "red")
plt.show()
# Plot predictions - Real values
plt.scatter(y_train_rdg, y_train, c = "blue", label = "Training data")
plt.scatter(y_test_rdg, y_test, c = "black", label = "Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc = "upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c = "red")
plt.show()