Python Forum

May I know how to modify my Python program so that I can obtain the accuracy vs. number of neighbours plot shown in the attached image file?

# read in the iris data
from sklearn.datasets import load_iris
iris = load_iris()
# create X (features) and y (response)
X = iris.data
y = iris.target

from sklearn.neighbors import KNeighborsClassifier
# NOTE: these two tuples are never read; k1 and k2 are rebound as loop
# variables in the for-loops further down.
k1 = (1, 2, 3, 4, 5, 6, 7, 8, 9)
k2 = (10, 15, 20, 25, 30, 35, 40)
# fit a 10-neighbor classifier on the full dataset and predict on that same
# data (i.e. this measures training accuracy, not testing accuracy)
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X, y)
y_pred = knn.predict(X)

from sklearn import metrics
# NOTE: the accuracy computed here is neither assigned nor printed
metrics.accuracy_score(y,y_pred)
# NOTE: this classifier is replaced inside the loops below before it is fitted
knn = KNeighborsClassifier(n_neighbors=1)
# NOTE(review): newer scikit-learn releases expose train_test_split from
# sklearn.model_selection; confirm this module path exists in the installed
# version.
from sklearn.cross_validation import train_test_split
# hold out 40% of the samples as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# import Matplotlib (scientific plotting library)
import matplotlib.pyplot as plt
import numpy as np
# K values to try: range(1, 9) yields K=1 through K=8 (8 values) and
# range(10, 40) yields K=10 through K=39 (30 values)
k1_range = range(1, 9)
k2_range = range(10, 40)
# create an empty Python list (not a dictionary) that collects the accuracies
scores = []

# record the testing accuracy for every K in k1_range
for k1 in k1_range:
         knn = KNeighborsClassifier(n_neighbors=k1, metric='minkowski', p=2)
         knn.fit(X_train, y_train)
         y_pred = knn.predict(X_test)
         scores.append(metrics.accuracy_score(y_test, y_pred))
         
# record the testing accuracy for every K in k2_range
# NOTE: results are appended to the SAME list as above, so after this loop
# scores holds 8 + 30 = 38 entries
for k2 in k2_range:
         knn = KNeighborsClassifier(n_neighbors=k2, metric='minkowski', p=2)
         knn.fit(X_train, y_train)
         y_pred = knn.predict(X_test)
         scores.append(metrics.accuracy_score(y_test, y_pred))         

# plot the relationship between K and testing accuracy
# plt.plot(x_axis, y_axis)
# NOTE: scores has 38 entries, which matches neither k1_range (8 values) nor
# k2_range (30 values), so the x and y lengths passed to plt.plot differ
plt.plot(k1_range, scores)
plt.yticks(np.arange(0.93, 0.98, 0.03))
plt.plot(k2_range, scores)
# NOTE: both plots are drawn on the same axes, so this second yticks call
# overrides the ticks set above
plt.yticks(np.arange(0.91, 0.98, 0.03))
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')

The error message is -

runfile('C:/Users/HSIPL/Desktop/Homework 8 Solution draft.py', wdir='C:/Users/HSIPL/Desktop')
Traceback (most recent call last):

  File "<ipython-input-31-1ba40d3637a3>", line 1, in <module>
    runfile('C:/Users/HSIPL/Desktop/Homework 8 Solution draft.py', wdir='C:/Users/HSIPL/Desktop')

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 668, in runfile
    execfile(filename, namespace)

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\spyder_kernels\customize\spydercustomize.py", line 108, in execfile
    exec(compile(f.read(), filename, 'exec'), namespace)

  File "C:/Users/HSIPL/Desktop/Homework 8 Solution draft.py", line 45, in <module>
    plt.plot(k1_range, scores)

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\matplotlib\pyplot.py", line 3363, in plot
    ret = ax.plot(*args, **kwargs)

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\matplotlib\__init__.py", line 1867, in inner
    return func(ax, *args, **kwargs)

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\matplotlib\axes\_axes.py", line 1528, in plot
    for line in self._get_lines(*args, **kwargs):

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\matplotlib\axes\_base.py", line 406, in _grab_next_args
    for seg in self._plot_args(this, kwargs):

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\matplotlib\axes\_base.py", line 383, in _plot_args
    x, y = self._xy_from_xy(x, y)

  File "C:\Users\HSIPL\Anaconda3\lib\site-packages\matplotlib\axes\_base.py", line 242, in _xy_from_xy
    "have shapes {} and {}".format(x.shape, y.shape))

ValueError: x and y must have same first dimension, but have shapes (8,) and (38,)

Please refer to the attached image file -

[Image: o8oNB.jpg]

Please help me with this case.

You definitely need to use separate score-accumulation lists, e.g. scores1 and scores2. Right now you are appending all results to the same list, scores, so it keeps growing and ends up with a length that matches neither k1_range nor k2_range.

May I know how to write the correct and complete code for that part?

You need to restructure your code significantly. All import statements should be moved to the beginning of the file/document; each part of your code should solve one particular problem and be clear for understanding.

Hide/Show

import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier


# ------------- Data loading section ------------
# Load the iris dataset.
iris = load_iris()

# -----------------------------------------------


# ----------- Data preparation section ----------

# Feature matrix (X) and response vector (y).
X, y = iris.data, iris.target

# Split into training and test sets; 40% of the samples are held out for
# testing and random_state=0 is passed to train_test_split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)
# -----------------------------------------------



# ---- Classifier parameter initialization ------

# Neighbor counts to evaluate: 1..9 in steps of 1, then 10..40 in steps of 5.
k1_range = range(1, 10)
k2_range = range(10, 41, 5)

# The distance metric and its power could also be named here, e.g.
# metric_type = 'minkowski' and m_power = 2.
# Note: the Minkowski metric with power 2 is the Euclidean metric.

# -----------------------------------------------


# ----- main computational block goes here ------

def knn_test_accuracies(k_values, X_train, X_test, y_train, y_test):
    """Return the test-set accuracy of a k-NN classifier for each k.

    For every k in ``k_values`` a fresh ``KNeighborsClassifier`` is fitted on
    the training data and scored on the test data with
    ``metrics.accuracy_score``.

    Args:
        k_values: iterable of neighbor counts to evaluate.
        X_train: training features.
        X_test: test features.
        y_train: training labels.
        y_test: test labels.

    Returns:
        A list of accuracies, one per entry of ``k_values`` and in the same
        order, so it can be plotted directly against ``k_values``.
    """
    accuracies = []
    for k in k_values:
        # Minkowski distance with p=2 is the Euclidean distance.
        knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        # `metrics` is sklearn.metrics and must be imported at the top of
        # the file (`from sklearn import metrics`).
        accuracies.append(metrics.accuracy_score(y_test, y_pred))
    return accuracies


# One separate score list per range, so each list has the same length as the
# range it is plotted against.
scores1 = knn_test_accuracies(k1_range, X_train, X_test, y_train, y_test)
scores2 = knn_test_accuracies(k2_range, X_train, X_test, y_train, y_test)

# -----------------------------------------------
            
    
# ----------- plotting obtained results ---------
def plot_accuracy(k_values, scores, yticks):
    """Draw accuracy against number of neighbors in a new figure.

    Args:
        k_values: neighbor counts used for the x axis.
        scores: accuracies for the y axis, one per entry of ``k_values``.
        yticks: tick positions to use on the y axis.
    """
    plt.figure()
    plt.plot(k_values, scores)
    plt.yticks(yticks)
    # Label both axes on every figure (the x label was previously only set
    # on the second one).
    plt.xlabel('Number of neighbors')
    plt.ylabel('Accuracy')


plot_accuracy(k1_range, scores1, np.arange(0.93, 0.98, 0.03))
plot_accuracy(k2_range, scores2, np.arange(0.91, 0.98, 0.03))
# Display both figures.
plt.show()

# -----------------------------------------------

You still need to tweak the code: add a title to each figure and do some refactoring,
e.g. the explicit metric arguments can be dropped, because "minkowski" with p=2 is the Euclidean distance (which is the default).

vokoyo

scidam

vokoyo

scidam