May-07-2017, 06:26 PM
The task is to implement one of two methods for classifying statistical data: the support vector machine (SVM) method or the k-nearest neighbors (kNN) method.
The work requires the following steps:
• Visualize the initial data as a scatter plot
• Build a data model
• Train the model
• Test the model
Here is the code, but it does not fully meet the requirements of this task. Please help me change it.
The work requires the following steps:
• Visualize the initial data as a scatter plot
• Build a data model
• Train the model
• Test the model
Here is the code, but it does not fully meet the requirements of this task. Please help me change it.
from __future__ import division

import operator
from collections import Counter
from math import sqrt

import numpy as np
import pandas as pd

# k-nearest-neighbours classification of the Iris data set.
#
# The script (1) shows the raw data as a scatter plot, (2) splits it randomly
# into a training and a test part, (3) classifies the test part with a
# hand-written kNN and reports its accuracy, and (4) compares against
# scikit-learn's KNeighborsClassifier for a range of k values.

# Location of the Iris data file: four numeric columns followed by the label.
url = r'https://archive.ics.uci.edu/ml/' \
      'machine-learning-databases/iris/iris.data'

# Names given to the four feature columns.  They are used as DataFrame keys,
# so the spelling is kept exactly as in the original script.
variables = [u'Sepal.Lenght, cm', u'Sepal.Width, cm',
             u'Petal.Lenght, cm', u'Petal.Width, cm']


def test_and_train(df, proportion):
    """Randomly split *df* into a training part and a test part.

    Every row is assigned independently: it goes to the first returned frame
    when a uniform random draw is below *proportion*, otherwise to the
    second.  The resulting sizes therefore vary from call to call.

    Returns:
        A ``(train, test)`` tuple of DataFrames.
    """
    mask = np.random.rand(len(df)) < proportion
    return df[mask], df[~mask]


# The name starts with "test_", but this is a helper and not a test case;
# this keeps test collectors such as pytest from trying to run it.
test_and_train.__test__ = False


def euclidean_distance(instance1, instance2):
    """Return the Euclidean distance between two equal-length sequences."""
    return sqrt(sum((a - b) ** 2 for a, b in zip(instance1, instance2)))


def get_neighbours(instance, train, k):
    """Return the *k* training rows closest to *instance*.

    All columns of *train* except the last one are treated as features; the
    label is read from the ``'Class'`` column.

    Returns:
        A list of ``(distance, class_label)`` tuples, nearest first.
    """
    # ``DataFrame.ix`` no longer exists in current pandas; ``iloc`` is the
    # positional equivalent of the original ``train.ix[:, :-1]``.
    features = train.iloc[:, :-1].values
    distances = [euclidean_distance(instance, row) for row in features]
    labelled = zip(distances, train[u'Class'].values)
    return sorted(labelled, key=operator.itemgetter(0))[:k]


def get_response(neigbours):
    """Return the majority class among ``(distance, class_label)`` tuples.

    Only the labels are counted.  Counting the whole tuples, as the earlier
    version did, makes every neighbour unique (distances rarely repeat), so
    the vote degenerated into "class of the nearest neighbour".

    On Python 3.7+ ``Counter.most_common`` keeps first-seen order for equal
    counts, so with input sorted by distance a tie goes to the nearer class.

    Raises:
        IndexError: if *neigbours* is empty.
    """
    votes = Counter(label for _, label in neigbours)
    return votes.most_common(1)[0][0]


def get_predictions(train, test, k):
    """Predict a class for every row of *test* using kNN over *train*.

    The last column of *test* is ignored (it is assumed to hold the label).
    """
    return [get_response(get_neighbours(row, train, k))
            for row in test.iloc[:, :-1].values]


def mean(instance):
    """Return the arithmetic mean of a non-empty sequence."""
    return sum(instance) / len(instance)


def get_accuracy(test, predictions):
    """Return the fraction of *predictions* equal to ``test['Class']``."""
    actual = test[u'Class'].values
    return mean([truth == guess for truth, guess in zip(actual, predictions)])


def _plot_scatter(df, x_column, y_column):
    """Draw a scatter plot of two feature columns, one series per class."""
    import pylab as pl

    pl.figure()
    for label, group in df.groupby(u'Class'):
        pl.scatter(group[x_column], group[y_column], label=label)
    pl.xlabel(x_column)
    pl.ylabel(y_column)
    pl.title("Iris data")
    pl.legend()


def _sklearn_accuracy_by_k(train, test):
    """Fit scikit-learn kNN for odd k in 1..49 and return accuracy per k."""
    from sklearn.neighbors import KNeighborsClassifier

    results = []
    for n in range(1, 51, 2):
        clf = KNeighborsClassifier(n_neighbors=n)
        clf.fit(train[variables], train[u'Class'])
        preds = clf.predict(test[variables])
        accuracy = np.where(preds == test[u'Class'], 1, 0).sum() / float(len(test))
        print("Neighbors: %d, Accuracy: %3f" % (n, accuracy))
        results.append([n, accuracy])
    return pd.DataFrame(results, columns=["n", "accuracy"])


def main():
    """Run the whole experiment: load, visualise, train, test and compare."""
    # Plotting is imported lazily so the helper functions above can be
    # imported without a plotting backend installed.
    import pylab as pl

    df = pd.read_csv(url, header=None)
    df.columns = variables + ['Class']

    # Step 1: visualise the initial data (petal length against petal width).
    _plot_scatter(df, variables[2], variables[3])

    # Steps 2-3: build the train/test split; kNN "training" is just keeping
    # the training rows around.
    train, test = test_and_train(df, 0.67)

    # Step 4: test the hand-written classifier.  The result used to be
    # computed and thrown away; report it instead.
    own_accuracy = get_accuracy(test, get_predictions(train, test, 5))
    print("Hand-written kNN (k=5) accuracy: %.3f" % own_accuracy)

    # Reference: scikit-learn's implementation over a range of k.
    results = _sklearn_accuracy_by_k(train, test)
    pl.figure()
    pl.plot(results.n, results.accuracy)
    pl.title("Accuracy with Increasing K")
    pl.show()


if __name__ == "__main__":
    main()