Code error - new user - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Data Science (https://python-forum.io/forum-44.html) +--- Thread: Code error - new user (/thread-10146.html) |
Code error - new user - CarlG - May-15-2018 I am getting this error, and for the life of me, can't figure it out. I would appreciate your kind help. Loving Python, but it seems temperamental for a new user. Thanks ! ImportError: dlopen(/Users/xxx/anaconda3/lib/python3.6/site-packages/matplotlib/ft2font.cpython-36m-darwin.so, 2): Library not loaded: @rpath/libfreetype.6.dylib Referenced from: /Users/xxx/anaconda3/lib/python3.6/site-packages/matplotlib/ft2font.cpython-36m-darwin.so Reason: Incompatible library version: ft2font.cpython-36m-darwin.so requires version 22.0.0 or later, but libfreetype.6.dylib provides version 21.0.0 RE: Code error - new user - CarlG - May-15-2018 Pleaseeeeee RE: Code error - new user - buran - May-15-2018 Post your code in python tags + full traceback in error tags RE: Code error - new user - CarlG - May-16-2018 Technically not my code, I am learning by taking code from various websites and trying to understand how they work. This one was about housing prices. thanks Error Message data_description.txt sample_submission.csv test.csv train.csv Code #!/usr/bin/env python3 # -*- coding: utf-8 -*- """ Created on Mon May 14 18:31:46 2018 @author: xUsernamex """ # This Python 3 environment comes with many helpful analytics libraries installed # It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python # For example, here's several helpful packages to load in import numpy as np # linear algebra import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) # Input data files are available in the "../input/" directory. 
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
# Any results you write to the current directory are saved as output

# --- Imports -----------------------------------------------------------------
# All imports used by this part of the script, gathered in one place.  Every
# name the original script imported is still imported (some are unused here
# but may be needed by code further down).
import warnings
from subprocess import check_output

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython import get_ipython
from scipy import stats
from scipy.stats import norm, skew  # for some statistics
from sklearn import ensemble, tree, linear_model
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
from sklearn.model_selection import KFold, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# List the files in the input directory.
print(check_output(["ls", "/Users/xUsernamex/AnacondaProjects/House5/input"]).decode("utf8"))

# get_ipython() returns None when the script is not run under IPython/Jupyter,
# so only request inline plotting when a shell is actually available.
_ipython = get_ipython()
if _ipython is not None:
    _ipython.run_line_magic('matplotlib', 'inline')

warnings.filterwarnings('ignore')

# --- Load data ---------------------------------------------------------------
train = pd.read_csv('/Users/xUsernamex/AnacondaProjects/House5/input/train.csv')
test = pd.read_csv('/Users/xUsernamex/AnacondaProjects/House5/input/test.csv')

# Bare expressions below only display something in an interactive session.
train.describe()
train.head(3)
test.head(3)
train.shape, test.shape

# Check for duplicate Ids.
idsUnique = len(set(train.Id))
idsTotal = train.shape[0]
idsdupe = idsTotal - idsUnique
print(idsdupe)

# Drop the Id column.
train.drop(['Id'], axis=1, inplace=True)

# --- Exploratory plots -------------------------------------------------------
# Correlation matrix.  Object (string) columns are excluded explicitly so the
# call does not depend on DataFrame.corr() silently dropping them.
corrmat = train.select_dtypes(exclude=["object"]).corr()
f, ax = plt.subplots(figsize=(20, 9))
sns.heatmap(corrmat, vmax=.8, annot=True)

# Most correlated features (|correlation with SalePrice| > 0.5).
top_corr_features = corrmat.index[abs(corrmat["SalePrice"]) > 0.5]
plt.figure(figsize=(10, 10))
g = sns.heatmap(train[top_corr_features].corr(), annot=True, cmap="RdYlGn")

# x/y are passed by keyword rather than by position.
sns.barplot(x=train.OverallQual, y=train.SalePrice)

sns.set()
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# NOTE(review): newer seaborn releases appear to call this parameter `height`
# instead of `size` -- confirm against the installed seaborn version.
sns.pairplot(train[cols], size = 2.5)
plt.show()

sns.distplot(train['SalePrice'], fit=norm)

# Get the fitted parameters used by the function.
(mu, sigma) = norm.fit(train['SalePrice'])
print('\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Raw string: "\m" and "\s" are not valid escape sequences in a normal literal.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

fig = plt.figure()
res = stats.probplot(train['SalePrice'], plot=plt)
plt.show()

# --- Target transform --------------------------------------------------------
# Work with log(1 + SalePrice) from here on; y is the regression target.
train.SalePrice = np.log1p(train.SalePrice)
y = train.SalePrice

plt.scatter(y=train.SalePrice, x=train.GrLivArea, c='black')
plt.show()
# We can see the outlier in the image above.

# --- Missing values overview -------------------------------------------------
train_nas = train.isnull().sum()
train_nas = train_nas[train_nas > 0]
train_nas.sort_values(ascending=False)

test_nas = test.isnull().sum()
test_nas = test_nas[test_nas > 0]
test_nas.sort_values(ascending=False)

print("Find most important features relative to target")
corr = train.select_dtypes(exclude=["object"]).corr()
corr.sort_values(["SalePrice"], ascending=False, inplace=True)
print(corr.SalePrice)
# This can also be seen in the heatmap above.

# --- Feature preparation -----------------------------------------------------
# Differentiate numerical features (minus the target) and categorical features.
categorical_features = train.select_dtypes(include=["object"]).columns
numerical_features = train.select_dtypes(exclude=["object"]).columns
numerical_features = numerical_features.drop("SalePrice")
print("Numerical features : " + str(len(numerical_features)))
print("Categorical features : " + str(len(categorical_features)))
train_num = train[numerical_features]
train_cat = train[categorical_features]

# Handle remaining missing values for numerical features by using median as replacement.
print("NAs for numerical features in train : " + str(train_num.isnull().values.sum()))
train_num = train_num.fillna(train_num.median())
print("Remaining NAs for numerical features in train : " + str(train_num.isnull().values.sum()))

# Keep only features whose absolute skewness exceeds 0.5.
skewness = train_num.apply(lambda x: skew(x))
skewness.sort_values(ascending=False)
skewness = skewness[abs(skewness) > 0.5]
skewness.index
skew_features = train[skewness.index]
skew_features.columns
# We can treat skewness of a feature with the help of a log transformation, so we apply it here.
# NOTE(review): the transformed frame is only stored in `skew_features`; it is
# never written back into `train_num`, so the models below are fitted on the
# untransformed numerical features.  Confirm whether that is intended.
skew_features = np.log1p(skew_features)

# Create dummy features for categorical values via one-hot encoding.
train_cat.shape
train_cat = pd.get_dummies(train_cat)
train_cat.shape
train_cat.head()
str(train_cat.isnull().values.sum())

# Recombine encoded categorical and imputed numerical features.
train = pd.concat([train_cat, train_num], axis=1)
train.shape

# Split the data to train the model.
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.3, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
X_train.head(3)

# --- Cross-validated RMSE helpers --------------------------------------------
n_folds = 5
scorer = make_scorer(mean_squared_error, greater_is_better=False)


def _rmse_cv(model, features, target):
    """Return the per-fold RMSE of *model* on (*features*, *target*).

    The KFold splitter itself is handed to cross_val_score.  Calling
    ``.get_n_splits()`` on it (as the script did before) yields only the
    integer number of folds, which drops ``shuffle`` and ``random_state``.
    """
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, features, target,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)


def rmse_CV_train(model):
    """Return the array of cross-validated RMSE values on the training split."""
    return _rmse_cv(model, X_train, y_train)


def rmse_CV_test(model):
    """Return the array of cross-validated RMSE values on the held-out split."""
    return _rmse_cv(model, X_test, y_test)


# --- Plain linear regression -------------------------------------------------
lr = LinearRegression()
lr.fit(X_train, y_train)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train', rmse_CV_train(lr).mean())
# Label fixed: this line reports the test split, not the training split.
print('rmse on test', rmse_CV_test(lr).mean())

# Plot between predicted values and residuals.
plt.scatter(train_pre, train_pre - y_train, c="blue", label="Training data")
plt.scatter(test_pre, test_pre - y_test, c="black", label="Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
plt.show()

# Plot predictions - Real values
plt.scatter(train_pre, y_train, c="blue", label="Training data")
plt.scatter(test_pre, y_test, c="black", label="Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Real values")
plt.legend(loc="upper left")
plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
plt.show()

# --- Ridge regression --------------------------------------------------------
# First pass: coarse grid of regularisation strengths.
ridge = RidgeCV(alphas=[0.01, 0.03, 0.06, 0.1, 0.3, 0.6, 1, 3, 6, 10, 30, 60])
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print('best alpha', alpha)

# Second pass: finer grid centred on the best alpha from the first pass.
print("Try again for more precision with alphas centered around " + str(alpha))
ridge = RidgeCV(alphas=[alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8,
                        alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05,
                        alpha * 1.1, alpha * 1.15, alpha * 1.25, alpha * 1.3,
                        alpha * 1.35, alpha * 1.4], cv=5)
ridge.fit(X_train, y_train)
alpha = ridge.alpha_
print("Best alpha :", alpha)
print("Ridge RMSE on Training set :", rmse_CV_train(ridge).mean())
print("Ridge RMSE on Test set :", rmse_CV_test(ridge).mean())
y_train_rdg = ridge.predict(X_train)
y_test_rdg = ridge.predict(X_test)
X_train.shape

coef = pd.Series(ridge.coef_, index=X_train.columns)
print("Ridge picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")

# Plot residuals
plt.scatter(y_train_rdg, y_train_rdg - y_train, c="blue", label="Training data")
plt.scatter(y_test_rdg, y_test_rdg - y_test, c="black", marker="v", label="Validation data")
plt.title("Linear regression with Ridge regularization")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
plt.show()
# Plot
predictions - Real values plt.scatter(y_train_rdg, y_train, c = "blue", label = "Training data") plt.scatter(y_test_rdg, y_test, c = "black", label = "Validation data") plt.title("Linear regression with Ridge regularization") plt.xlabel("Predicted values") plt.ylabel("Real values") plt.legend(loc = "upper left") plt.plot([10.5, 13.5], [10.5, 13.5], c = "red") plt.show() RE: Code error - new user - CarlG - May-20-2018 pleaseeee |