机器学习-01-KNN

学习来源：日撸 Java 三百行（51-60天，kNN 与 NB）_闵帆的博客-CSDN博客_knn算法java实现

机器学习的本质：猜。因为其有着不确定性，所以导致大家都在别人方案上不断提高准确度。

51 KNN 分类器 51.1 KNN为什么要归一化：不同数据之间因为单位不同，导致数值差距十分大，容易导致预测结果被某项数据主导，所以需要进行数据的归一化。 51.2 KNN的特点：

1.简单。没有学习过程，又被称为惰性学习

2.本源。KNN算法和人类认识事物的行为一样，比如，路上看到一只鸟但它像鸭子，我们会认为它就是鸭子。但它会飞是属于另一种生物，说明我们认识也有误差。

3.效果好。对于很多数据，KNN的表现都挺好的。

4.适应性强。可以用于分类，回归。应用于各种数据。

5.可扩展性强。设计不同的度量方法，可以获得意想不到的效果。

6.一般需要对数据归一化。

7.复杂度高。对于每个测试数据，复杂度为：O((m + k)n)。

51.3 算法过程：

1.数据加载。

2.将数据划分训练集和测试机（需要把数据随机打乱）。

3.对于每一个测试数据，计算其与训练集的所有数据距离，取出最短的K个数据。

4.在K个数据中，各分类数量最多的即为对于该测试数据的预测。

5.最后计算并输出预测准确度：准确个数/测试数。

代码：

package 日撸Java300行_51_60;

import java.io.FileReader;
import java.util.Arrays;
import java.util.Random;

import weka.core.*;




public class KnnClassification {

	
	public static final int MANHATTAN = 0;

	
	public static final int EUCLIDEAN = 1;

	
	public int distanceMeasure = EUCLIDEAN;

	
	public static final Random random = new Random();

	
	int numNeighbors = 7;

	
	Instances dataset;

	
	int[] trainingSet;

	
	int[] testingSet;

	
	int[] predictions;

	
	public KnnClassification(String paraFilename) {
		try {
			FileReader fileReader = new FileReader(paraFilename);
			dataset = new Instances(fileReader);
			// The last attribute is the decision class.
			dataset.setClassIndex(dataset.numAttributes() - 1);
			fileReader.close();
		} catch (Exception ee) {
			System.out.println("Error occurred while trying to read '" + paraFilename
					+ "' in KnnClassification constructor.rn" + ee);
			System.exit(0);
		} // Of try
	}// Of the first constructor

	
	public static int[] getRandomIndices(int paraLength) {
		int[] resultIndices = new int[paraLength];

		// Step 1. Initialize.
		for (int i = 0; i < paraLength; i++) {
			resultIndices[i] = i;
		} // Of for i

		// Step 2. Randomly swap.
		int tempFirst, tempSecond, tempValue;
		for (int i = 0; i < paraLength; i++) {
			// Generate two random indices.
			tempFirst = random.nextInt(paraLength);
			tempSecond = random.nextInt(paraLength);

			// Swap.
			tempValue = resultIndices[tempFirst];
			resultIndices[tempFirst] = resultIndices[tempSecond];
			resultIndices[tempSecond] = tempValue;
		} // Of for i

		return resultIndices;
	}// Of getRandomIndices

	
	public void splitTrainingTesting(double paraTrainingFraction) {
		int tempSize = dataset.numInstances();
		int[] tempIndices = getRandomIndices(tempSize);
		int tempTrainingSize = (int) (tempSize * paraTrainingFraction);

		trainingSet = new int[tempTrainingSize];
		testingSet = new int[tempSize - tempTrainingSize];

		for (int i = 0; i < tempTrainingSize; i++) {
			trainingSet[i] = tempIndices[i];
		} // Of for i

		for (int i = 0; i < tempSize - tempTrainingSize; i++) {
			testingSet[i] = tempIndices[tempTrainingSize + i];
		} // Of for i
	}// Of splitTrainingTesting

	
	public void predict() {
		predictions = new int[testingSet.length];
		for (int i = 0; i < predictions.length; i++) {
			predictions[i] = predict(testingSet[i]);
		} // Of for i
	}// Of predict

	
	public int predict(int paraIndex) {
		int[] tempNeighbors = computeNearests(paraIndex);
		int resultPrediction = simpleVoting(tempNeighbors);

		return resultPrediction;
	}// Of predict

	
	public double distance(int paraI, int paraJ) {
		double resultDistance = 0;
		double tempDifference;
		switch (distanceMeasure) {
		case MANHATTAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
				if (tempDifference < 0) {
					resultDistance -= tempDifference;
				} else {
					resultDistance += tempDifference;
				} // Of if
			} // Of for i
			break;

		case EUCLIDEAN:
			for (int i = 0; i < dataset.numAttributes() - 1; i++) {
				tempDifference = dataset.instance(paraI).value(i) - dataset.instance(paraJ).value(i);
				resultDistance += tempDifference * tempDifference;
			} // Of for i
			break;
		default:
			System.out.println("Unsupported distance measure: " + distanceMeasure);
		}// Of switch

		return resultDistance;
	}// Of distance

	
	public double getAccuracy() {
		// A double divides an int gets another double.
		double tempCorrect = 0;
		for (int i = 0; i < predictions.length; i++) {
			if (predictions[i] == dataset.instance(testingSet[i]).classValue()) {
				tempCorrect++;
			} // Of if
		} // Of for i

		return tempCorrect / testingSet.length;
	}// Of getAccuracy

	
	public int[] computeNearests(int paraCurrent) {
		int[] resultNearests = new int[numNeighbors];
		boolean[] tempSelected = new boolean[trainingSet.length];
		double tempDistance;
		double tempMinimalDistance;
		int tempMinimalIndex = 0;

		// Select the nearest paraK indices.
		for (int i = 0; i < numNeighbors; i++) {
			tempMinimalDistance = Double.MAX_VALUE;

			for (int j = 0; j < trainingSet.length; j++) {
				if (tempSelected[j]) {
					continue;
				} // Of if

				tempDistance = distance(paraCurrent, trainingSet[j]);
				if (tempDistance < tempMinimalDistance) {
					tempMinimalDistance = tempDistance;
					tempMinimalIndex = j;
				} // Of if
			} // Of for j

			resultNearests[i] = trainingSet[tempMinimalIndex];
			tempSelected[tempMinimalIndex] = true;
		} // Of for i

		System.out.println("The nearest of " + paraCurrent + " are: " + Arrays.toString(resultNearests));
		return resultNearests;
	}// Of computeNearests

	
	public int simpleVoting(int[] paraNeighbors) {
		int[] tempVotes = new int[dataset.numClasses()];
		for (int i = 0; i < paraNeighbors.length; i++) {
			tempVotes[(int) dataset.instance(paraNeighbors[i]).classValue()]++;
		} // Of for i

		int tempMaximalVotingIndex = 0;
		int tempMaximalVoting = 0;
		for (int i = 0; i < dataset.numClasses(); i++) {
			if (tempVotes[i] > tempMaximalVoting) {
				tempMaximalVoting = tempVotes[i];
				tempMaximalVotingIndex = i;
			} // Of if
		} // Of for i

		return tempMaximalVotingIndex;
	}// Of simpleVoting

	
	public static void main(String args[]) {
		KnnClassification tempClassifier = new KnnClassification("D:\data\iris.arff");
		tempClassifier.splitTrainingTesting(0.8);
		tempClassifier.predict();
		System.out.println("The accuracy of the classifier is: " + tempClassifier.getAccuracy());
	}// Of main

}// Of class KnnClassification

截图：

机器学习-01-KNN

Java相关栏目本月热门文章