Python Forum

Full Version: Method of k-nearest neighbors
You're currently viewing a stripped down version of our content. View the full version with proper formatting.
The task is to implement one of two methods for classifying statistical data: the support vector machine (SVM, i.e. the method of support vectors) or the k-nearest neighbors method.
The work must cover the following steps:
• Visualize the initial data as a scatter plot
• Generate a data model
• Train the model
• Test the model

Here is my code, but it does not fully meet the requirements of this task. Please help me change it.
from __future__ import division
import pandas as pd
# Location of the UCI Iris data set, read below as a header-less CSV file.
url = (
    r'https://archive.ics.uci.edu/ml/'
    'machine-learning-databases/iris/iris.data'
)
# The file carries no header row, so the column labels are supplied here.
# NOTE: the "Lenght" spelling is kept on purpose; later code selects the
# feature columns by these exact labels.
df = pd.read_csv(
    url,
    header=None,
    names=[
        u'Sepal.Lenght, cm',
        u'Sepal.Width, cm',
        u'Petal.Lenght, cm',
        u'Petal.Width, cm',
        'Class',
    ],
)
import numpy as np
def test_and_train(df, proportion):
    mask = np.random.rand(len(df)) < proportion
    return df[mask], df[~mask]
# Random hold-out split: each row lands in ``train`` when its random draw is
# below 0.67 and in ``test`` otherwise.
train, test = test_and_train(df, proportion=0.67)
 
from math import sqrt
def euclidean_distance(instance1, instance2):
    """Return the Euclidean (straight-line) distance between two points.

    Both arguments are iterables of numbers that are compared coordinate by
    coordinate. ``zip`` stops at the shorter input, so surplus trailing
    coordinates of the longer one are ignored.
    """
    squared_total = sum((a - b) ** 2 for a, b in zip(instance1, instance2))
    return sqrt(squared_total)
import operator
def get_neighbours(instance, train, k):
    """Return the ``k`` training rows that lie closest to ``instance``.

    Every column of ``train`` except the last one is used as a feature; the
    class labels are read from the ``'Class'`` column.

    Parameters:
        instance: iterable of feature values for the point being classified.
        train: data frame of training rows.
        k: maximum number of neighbours to return.

    Returns:
        A list of ``(distance, class_label)`` tuples ordered by increasing
        distance, holding at most ``k`` entries.
    """
    # ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` performs the same
    # positional "all columns but the last" selection.
    feature_rows = train.iloc[:, :-1].values
    distances = [euclidean_distance(instance, row) for row in feature_rows]
    labelled = zip(distances, train[u'Class'].values)
    # Sort on the distance only, so labels never take part in comparisons.
    return sorted(labelled, key=operator.itemgetter(0))[:k]
 
from collections import Counter
def get_response(neigbours):
    """Return the class label that occurs most often among the neighbours.

    Parameters:
        neigbours: sequence of ``(distance, class_label)`` pairs, in the
            shape produced by ``get_neighbours``.

    Returns:
        The most frequent class label. ``Counter.most_common`` keeps
        first-seen order for equal counts (Python 3.7+), so a tie goes to
        the label that appears earliest in ``neigbours``.

    Raises:
        IndexError: if ``neigbours`` is empty.
    """
    # Vote on the labels alone. Counting whole (distance, label) pairs makes
    # nearly every entry unique, which turns the vote into "take the first
    # neighbour" and ignores k.
    votes = Counter(label for _, label in neigbours)
    return votes.most_common(1)[0][0]
 
def get_predictions(train, test, k):
    """Predict a class label for every row of ``test``.

    Every column of ``test`` except the last one is used as a feature. For
    each row the ``k`` nearest rows of ``train`` are looked up with
    ``get_neighbours`` and reduced to one label with ``get_response``.

    Parameters:
        train: data frame of training rows.
        test: data frame of rows to classify.
        k: number of neighbours consulted per row.

    Returns:
        A list of predicted labels, one per row of ``test``, in row order.
    """
    # ``DataFrame.ix`` was removed in pandas 1.0; ``iloc`` performs the same
    # positional "all columns but the last" selection.
    feature_rows = test.iloc[:, :-1].values
    return [get_response(get_neighbours(row, train, k)) for row in feature_rows]
 
 
 
def mean(instance):
    """Return the arithmetic mean of the items in ``instance``.

    ``instance`` must support both ``sum`` and ``len``. Booleans count as
    1 and 0, so the mean of a list of booleans is the fraction that is True.
    Raises ``ZeroDivisionError`` when ``instance`` is empty.
    """
    total = sum(instance)
    count = len(instance)
    return total / count
def get_accuracy(test, predictions):
    """Return the fraction of ``test`` rows that were predicted correctly.

    The ``'Class'`` column of ``test`` is compared with ``predictions``
    position by position, and the share of matching pairs is returned.
    """
    actual = test[u'Class'].values
    matches = [truth == guess for truth, guess in zip(actual, predictions)]
    return mean(matches)
# Score the hand-written k-NN classifier with k=5 on the held-out rows.
# The result is printed because a bare expression at module level is silently
# discarded when the file runs as a script.
print("Manual k-NN (k=5) accuracy: %f"
      % get_accuracy(test, get_predictions(train, test, 5)))
 
import pylab as pl
from sklearn.neighbors import KNeighborsClassifier
import pylab as pl
# Feature columns handed to scikit-learn; 'Class' is the prediction target.
variables = [
    u'Sepal.Lenght, cm',
    u'Sepal.Width, cm',
    u'Petal.Lenght, cm',
    u'Petal.Width, cm',
]

# Fit one classifier per odd k from 1 to 49 and record its accuracy on the
# held-out rows.
results = []
for n in range(1, 51, 2):
    clf = KNeighborsClassifier(n_neighbors=n)
    clf.fit(train[variables], train[u'Class'])
    preds = clf.predict(test[variables])
    # 1 where the prediction equals the true class, 0 elsewhere.
    correct = np.where(preds == test[u'Class'], 1, 0)
    accuracy = correct.sum() / float(len(test))
    print("Neighbors: %d, Accuracy: %3f" % (n, accuracy))
    results.append((n, accuracy))

# Plot accuracy as a function of k.
results = pd.DataFrame(results, columns=["n", "accuracy"])
pl.plot(results["n"], results["accuracy"])
pl.title("Accuracy with Increasing K")
pl.show()
Please repost your code with the formatting removed (use Ctrl+Shift+V to paste) and wrap it in python tags. See the BBCode help.