Technically this is not my code; I am learning by taking code from various websites and trying to understand how it works. This one is about housing prices.
thanks
Error Message
data_description.txt
sample_submission.csv
test.csv
train.csv
thanks
Error Message
data_description.txt
sample_submission.csv
test.csv
train.csv
Error:---------------------------------------------------------------------------
ImportError Traceback (most recent call last)
~/AnacondaProjects/House5/house5a.py in <module>()
23 import pandas as pd
24 import numpy as np
---> 25 import matplotlib.pyplot as plt
26 import seaborn as sns
27 from sklearn import ensemble, tree, linear_model
~/anaconda3/lib/python3.6/site-packages/matplotlib/pyplot.py in <module>()
29 from cycler import cycler
30 import matplotlib
---> 31 import matplotlib.colorbar
32 from matplotlib import style
33 from matplotlib import _pylab_helpers, interactive
~/anaconda3/lib/python3.6/site-packages/matplotlib/colorbar.py in <module>()
34 import matplotlib.collections as collections
35 import matplotlib.colors as colors
---> 36 import matplotlib.contour as contour
37 import matplotlib.cm as cm
38 import matplotlib.gridspec as gridspec
~/anaconda3/lib/python3.6/site-packages/matplotlib/contour.py in <module>()
18 import matplotlib.colors as colors
19 import matplotlib.collections as mcoll
---> 20 import matplotlib.font_manager as font_manager
21 import matplotlib.text as text
22 import matplotlib.cbook as cbook
~/anaconda3/lib/python3.6/site-packages/matplotlib/font_manager.py in <module>()
53 import logging
54
---> 55 from matplotlib import afm, cbook, ft2font, rcParams, get_cachedir
56 from matplotlib.compat import subprocess
57 from matplotlib.fontconfig_pattern import (
ImportError: dlopen(/Users/xUsernamex/anaconda3/lib/python3.6/site-packages/matplotlib/ft2font.cpython-36m-darwin.so, 2): Library not loaded: @rpath/libfreetype.6.dylib
Referenced from: /Users/xUsernamex/anaconda3/lib/python3.6/site-packages/matplotlib/ft2font.cpython-36m-darwin.so
Reason: Incompatible library version: ft2font.cpython-36m-darwin.so requires version 22.0.0 or later, but libfreetype.6.dylib provides version 21.0.0
(base) xUsernamexs-Mac:House5 xUsernamex$
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
House-price regression walkthrough (train.csv / test.csv).

Created on Mon May 14 18:31:46 2018

@author: xUsernamex

Steps performed by this script:
  1. Load the train/test CSV files and list the input directory.
  2. Plot correlation heatmaps, a bar plot, a pair plot and the SalePrice
     distribution.
  3. Log-transform the target (log1p), fill numeric NAs with the median,
     log-transform skewed numeric features and one-hot encode categoricals.
  4. Fit LinearRegression and RidgeCV, print cross-validated RMSE and plot
     residuals / predictions.
"""

import os
import warnings
from subprocess import check_output

import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from IPython import get_ipython
from scipy import stats
from scipy.stats import norm, skew  # for some statistics
from sklearn import ensemble, tree, linear_model
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# get_ipython() returns None when the file is run with plain `python`, so the
# inline-plot magic is only requested when an IPython shell is actually present.
_ipython = get_ipython()
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')

warnings.filterwarnings('ignore')

# Input data files live in this directory.
INPUT_DIR = '/Users/xUsernamex/AnacondaProjects/House5/input'

# List the files in the input directory.
print(check_output(["ls", INPUT_DIR]).decode("utf8"))


def _numeric_corr(frame):
    """Return the pairwise correlation matrix of the numeric columns of *frame*."""
    # Selecting numeric columns explicitly keeps .corr() from being handed
    # object (string) columns.
    return frame.select_dtypes(include=[np.number]).corr()


# ---------------------------------------------------------------------------
# Load data
# ---------------------------------------------------------------------------
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))

# These bare expressions only display something in an interactive cell.
train.describe()
train.head(3)
test.head(3)
train.shape, test.shape

# check for dupes for Id
idsUnique = len(set(train.Id))
idsTotal = train.shape[0]
idsdupe = idsTotal - idsUnique
print(idsdupe)

# drop id col
train.drop(['Id'], axis=1, inplace=True)

# ---------------------------------------------------------------------------
# Exploratory plots
# ---------------------------------------------------------------------------
# correlation matrix
corrmat = _numeric_corr(train)
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True)

# most correlated features (|corr with SalePrice| > 0.5)
top_corr_features = corrmat.index[abs(corrmat["SalePrice"]) > 0.5]
plt.figure(figsize=(10, 10))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap="RdYlGn")

# x/y are passed by keyword so the call does not depend on positional order.
sns.barplot(x=train.OverallQual, y=train.SalePrice)

sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# NOTE(review): newer seaborn releases appear to have renamed `size` to
# `height`; confirm against the installed version before changing it.
sns.pairplot(train[cols], size=2.5)
plt.show()

sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: "\m" and "\s" are not valid escape sequences in a normal literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
           loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# ---------------------------------------------------------------------------
# Target transform and missing-value overview
# ---------------------------------------------------------------------------
# From here on SalePrice (and therefore y) is on the log1p scale.
train['SalePrice'] = np.log1p(train['SalePrice'])
y = train.SalePrice

plt.scatter(y=train.SalePrice, x=train.GrLivArea, c='black')
plt.show()
# we can see the outlier in the image above

train_nas = train.isnull().sum()
train_nas = train_nas[train_nas > 0].sort_values(ascending=False)

test_nas = test.isnull().sum()
test_nas = test_nas[test_nas > 0].sort_values(ascending=False)

print("Find most important features relative to target")
corr = _numeric_corr(train)
corr.sort_values(["SalePrice"], ascending=False, inplace=True)
print(corr.SalePrice)
# this you can also see at the time of the heatmap

# ---------------------------------------------------------------------------
# Feature preparation
# ---------------------------------------------------------------------------
# Differentiate numerical features (minus the target) and categorical features
categorical_features = train.select_dtypes(include=["object"]).columns
numerical_features = train.select_dtypes(exclude=["object"]).columns
numerical_features = numerical_features.drop("SalePrice")
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]

# Handle remaining missing values for numerical features by using median as replacement
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))

# Skewed features (|skew| > 0.5) are log-transformed. The transformed columns
# are written back into train_num; previously the result was computed and then
# discarded, so the transform never reached the model.
# NOTE(review): log1p assumes these columns contain no values <= -1; confirm.
skewness = train_num.apply(lambda x: skew(x))
skewness = skewness[abs(skewness) > 0.5]
skew_features = np.log1p(train_num[skewness.index])
train_num[skewness.index] = skew_features

# Create dummy features for categorical values via one-hot encoding
train_cat = pd.get_dummies(train_cat)
train_cat.shape
train_cat.head()
str(train_cat.isnull().values.sum())

train = pd.concat([train_cat, train_num], axis=1)
train.shape

# split the data to train the model
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
X_train.head(3)

# ---------------------------------------------------------------------------
# Cross-validation helpers
# ---------------------------------------------------------------------------
n_folds = 5
scorer = make_scorer(mean_squared_error, greater_is_better=False)


def _rmse_cv(model, features, target):
    """Return the per-fold RMSE of *model* on (*features*, *target*).

    The KFold splitter itself is handed to cross_val_score. Passing the result
    of ``KFold(...).get_n_splits(...)`` instead would pass a plain integer and
    drop the shuffle / random_state settings.
    """
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, features, target,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)


def rmse_CV_train(model):
    """Return the per-fold cross-validated RMSE of *model* on X_train / y_train."""
    return _rmse_cv(model, X_train, y_train)


def rmse_CV_test(model):
    """Return the per-fold cross-validated RMSE of *model* on X_test / y_test."""
    return _rmse_cv(model, X_test, y_test)


# ---------------------------------------------------------------------------
# Linear regression
# ---------------------------------------------------------------------------
lr = LinearRegression()
lr.fit(X_train, y_train)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train', rmse_CV_train(lr).mean())
print('rmse on test', rmse_CV_test(lr).mean())

# plot between predicted values and residuals
plt.scatter(train_pre, train_pre - y_train, c="blue", label="Training data")
plt.scatter(test_pre, test_pre - y_test, c="black", label="Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
plt.show()

# Plot predictions - Real values
plt.scatter(train_pre, y_train, c="blue", label="Training data")
plt.scatter(test_pre, y_test, c="black", label="Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
plt.show()

# ---------------------------------------------------------------------------
# Ridge regression
# ---------------------------------------------------------------------------
# First pass: coarse grid of regularization strengths.
ridge = RidgeCV(alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print('best alpha', alpha)

# Second pass: finer grid centered on the alpha found above.
print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8,
                        alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05,
                        alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3,
                        alpha * 1.35, alpha * 1.4],
                cv=5)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)
print("Ridge RMSE on Training set :", rmse_CV_train(ridge).mean())
print("Ridge RMSE on Test set :", rmse_CV_test(ridge).mean())
y_train_rdg = ridge.predict(X_train)
y_test_rdg = ridge.predict(X_test)

X_train.shape
coef = pd.Series(ridge.coef_, index=X_train.columns)
print("Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")

# Plot residuals
plt.scatter(y_train_rdg, y_train_rdg - y_train, c="blue", label="Training data")
plt.scatter(y_test_rdg, y_test_rdg - y_test, c="black", marker="v", label="Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
plt.show()

# Plot predictions - Real values
plt.scatter(y_train_rdg, y_train, c="blue", label="Training data")
plt.scatter(y_test_rdg, y_test, c="black", label="Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
plt.show()