学习来源日撸 Java 三百行(51-60天,kNN 与 NB)_闵帆的博客——CSDN博客
1. 针对符号型数据的 NB 算法。NB 全称为 Naive Bayes,是基于概率论的分类算法。
数据集:
@relation weather.symbolic
@attribute outlook {sunny, overcast, rainy}
@attribute temperature {hot, mild, cool}
@attribute humidity {high, normal}
@attribute windy {TRUE, FALSE}
@attribute play {yes, no}
@data
sunny,hot,high,FALSE,no
sunny,hot,high,TRUE,no
overcast,hot,high,FALSE,yes
rainy,mild,high,FALSE,yes
rainy,cool,normal,FALSE,yes
rainy,cool,normal,TRUE,no
overcast,cool,normal,TRUE,yes
sunny,mild,high,FALSE,no
sunny,cool,normal,FALSE,yes
rainy,mild,normal,FALSE,yes
sunny,mild,normal,TRUE,yes
overcast,mild,high,TRUE,yes
overcast,hot,normal,FALSE,yes
rainy,mild,high,TRUE,no
2.理论基础
2.1条件概率公式:
P(AB) = P(A)P(B|A) (1)
其中:
P(A) 表示事件 A 发生的概率;
P(AB) 表示事件 A 和 B 同时发生的概率;
P(B|A) 表示在事件 A 发生的情况下,事件 B 发生的概率。
2.2 独立性假设。令 $\mathbf{x} = (x_1, x_2, \dots, x_m)$ 表示数据集中数据的前四个属性的组合,令 $D_i$ 表示最后一个属性(类别)。由(1)式可知
$$P(D_i \mid \mathbf{x}) = \frac{P(\mathbf{x} D_i)}{P(\mathbf{x})} = \frac{P(D_i)\,P(\mathbf{x} \mid D_i)}{P(\mathbf{x})} \tag{2}$$
假设各属性之间是独立的:
$$P(\mathbf{x} \mid D_i) = \prod_{j=1}^{m} P(x_j \mid D_i) \tag{3}$$
由(2)(3)式可得:
$$P(D_i \mid \mathbf{x}) = \frac{P(D_i)\prod_{j=1}^{m} P(x_j \mid D_i)}{P(\mathbf{x})} \tag{4}$$
由于我们是根据概率的大小来进行分类,且 P(x) 对所有的数据集都是相同的,所以只需要比较数据集中各项数据对应在(4)式中分子的大小就能分类。
2.3 拉普拉斯平滑。在(4)式中有一个问题。在
$$\prod_{j=1}^{m} P(x_j \mid D_i) \tag{5}$$
这一连乘式子中,当某一项为 0 时,会导致整个结果都为 0。因此为了使它不为 0,令
$$P^L(x_j \mid D_i) = \frac{n P(x_j D_i) + 1}{n P(D_i) + v_j} \tag{6}$$
其中,$n$ 是数据集项目的个数,$v_j$ 表示第 j 个属性可能的取值个数。同理,对类别先验($k$ 为类别个数):
$$P^L(D_i) = \frac{n P(D_i) + 1}{n + k} \tag{7}$$
最后的预测方案为:
$$d(\mathbf{x}) = \arg\max_{1 \le i \le k} \left[\log P^L(D_i) + \sum_{j=1}^{m} \log P^L(x_j \mid D_i)\right] \tag{8}$$
使用log一是因为log函数单调,不影响概率大小的比较;二是将连乘转换为连加,防止溢出。
3. 以下是代码部分

package JavaDay13;
import weka.core.*;
import java.io.FileReader;
import java.util.Arrays;
/**
 * Naive Bayes classifier for symbolic (nominal) data with Laplacian smoothing,
 * and for numerical data under the Gaussian assumption. The last attribute of
 * the dataset is taken as the class label. Classification maximizes the log
 * pseudo-probability log P^L(D_i) + sum_j log P^L(x_j | D_i), i.e. Eq. (8).
 */
public class NaiveBayes {
    /**
     * Parameters (mean and standard deviation) of a one-dimensional Gaussian
     * distribution, used for numerical attributes. Declared static because it
     * never references the enclosing NaiveBayes instance.
     */
    private static class GaussianParameters {
        double mu;    // Mean of the attribute values within one class.
        double sigma; // Standard deviation of the attribute values within one class.

        public GaussianParameters(double paraMu, double paraSigma) {
            mu = paraMu;
            sigma = paraSigma;
        }//Of the constructor

        public String toString() {
            return "(" + mu + ", " + sigma + ")";
        }//Of toString
    }//Of GaussianParameters

    // The whole dataset; the last attribute is the class label.
    Instances dataset;
    // Number of distinct class labels (k in Eq. (7)).
    int numClasses;
    // Number of instances (n in Eqs. (6) and (7)).
    int numInstances;
    // Number of conditional (non-class) attributes (m in Eq. (3)).
    int numConditions;
    // Predicted class index of each instance, filled by classify().
    int[] predicts;
    // P(D_i): plain class prior distribution.
    double[] classDistribution;
    // P^L(D_i): class prior with Laplacian smoothing, Eq. (7).
    double[] classDistributionLaplacian;
    // Raw joint counts n(x_j ^ D_i), indexed [class][attribute][value].
    double[][][] conditionalCounts;
    // P^L(x_j | D_i): Laplacian-smoothed conditional probabilities, Eq. (6).
    double[][][] conditionalProbabilitiesLaplacian;
    // Gaussian parameters per [class][attribute], for numerical data.
    GaussianParameters[][] gaussianParameters;
    // Which kind of data this learner handles: NOMINAL or NUMERICAL.
    int dataType;
    public static final int NOMINAL = 0;
    public static final int NUMERICAL = 1;

    /**
     * Read the dataset from an ARFF file and initialize the basic sizes.
     *
     * @param paraFilename the ARFF file to load; the program exits on failure.
     */
    public NaiveBayes(String paraFilename) {
        dataset = null;
        try {
            FileReader fileReader = new FileReader(paraFilename);
            dataset = new Instances(fileReader);
            fileReader.close();
        } catch (Exception ee) {
            // Fixed: the separator was the literal text "rn"; it should be a line break.
            System.out.println("Cannot read the file: " + paraFilename + "\r\n" + ee);
            System.exit(0);
        }//Of try

        dataset.setClassIndex(dataset.numAttributes() - 1);
        numConditions = dataset.numAttributes() - 1;
        numInstances = dataset.numInstances();
        numClasses = dataset.attribute(numConditions).numValues();
    }//Of the constructor

    /**
     * Set the data type: NOMINAL or NUMERICAL.
     *
     * @param paraDataType one of NOMINAL / NUMERICAL.
     */
    public void setDataType(int paraDataType) {
        dataType = paraDataType;
    }//Of setDataType

    /**
     * Calculate the class prior distribution, both the plain one and the
     * Laplacian-smoothed one of Eq. (7).
     */
    public void calculateClassDistribution() {
        classDistribution = new double[numClasses];
        classDistributionLaplacian = new double[numClasses];

        double[] tempCounts = new double[numClasses];
        for (int i = 0; i < numInstances; i++) {
            int tempClassValue = (int) dataset.instance(i).classValue();
            tempCounts[tempClassValue]++;
        }//Of for i

        for (int i = 0; i < numClasses; i++) {
            classDistribution[i] = tempCounts[i] / numInstances;
            // Eq. (7): (n P(D_i) + 1) / (n + k).
            classDistributionLaplacian[i] = (tempCounts[i] + 1) / (numInstances + numClasses);
        }//Of for i

        System.out.println("Class distribution: " + Arrays.toString(classDistribution));
        System.out.println(
                "Class distribution Laplacian: " + Arrays.toString(classDistributionLaplacian));
    }//Of calculateClassDistribution

    /**
     * Count the joint occurrences of (attribute value, class) and compute the
     * Laplacian-smoothed conditional probabilities of Eq. (6). Nominal data only.
     */
    public void calculateConditionalProbabilities() {
        conditionalCounts = new double[numClasses][numConditions][];
        conditionalProbabilitiesLaplacian = new double[numClasses][numConditions][];

        // Allocate space: the third dimension depends on the attribute's value count.
        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                int tempNumValues = (int) dataset.attribute(j).numValues();
                conditionalCounts[i][j] = new double[tempNumValues];
                conditionalProbabilitiesLaplacian[i][j] = new double[tempNumValues];
            }//Of for j
        }//Of for i

        // Count the numbers.
        int[] tempClassCounts = new int[numClasses];
        for (int i = 0; i < numInstances; i++) {
            int tempClass = (int) dataset.instance(i).classValue();
            tempClassCounts[tempClass]++;
            for (int j = 0; j < numConditions; j++) {
                int tempValue = (int) dataset.instance(i).value(j);
                conditionalCounts[tempClass][j][tempValue]++;
            }//Of for j
        }//Of for i

        // Now for the real probability with Laplacian smoothing, Eq. (6):
        // (count + 1) / (classCount + v_j).
        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                int tempNumValues = (int) dataset.attribute(j).numValues();
                for (int k = 0; k < tempNumValues; k++) {
                    conditionalProbabilitiesLaplacian[i][j][k] = (conditionalCounts[i][j][k] + 1)
                            / (tempClassCounts[i] + tempNumValues);
                }//Of for k
            }//Of for j
        }//Of for i

        // Fixed: the message said "probabilities" but printed the raw counts.
        System.out.println("Conditional probabilities: "
                + Arrays.deepToString(conditionalProbabilitiesLaplacian));
    }//Of calculateConditionalProbabilities

    /**
     * Estimate per-class, per-attribute Gaussian parameters (mean and standard
     * deviation) for numerical data.
     * NOTE(review): the method name keeps the original spelling ("Gausssian")
     * because external callers may depend on it.
     */
    public void calculateGausssianParameters() {
        gaussianParameters = new GaussianParameters[numClasses][numConditions];

        double[] tempValuesArray = new double[numInstances];
        int tempNumValues = 0;
        double tempSum = 0;

        for (int i = 0; i < numClasses; i++) {
            for (int j = 0; j < numConditions; j++) {
                tempSum = 0;

                // Obtain values for this class only.
                tempNumValues = 0;
                for (int k = 0; k < numInstances; k++) {
                    if ((int) dataset.instance(k).classValue() != i) {
                        continue;
                    }//Of if

                    tempValuesArray[tempNumValues] = dataset.instance(k).value(j);
                    tempSum += tempValuesArray[tempNumValues];
                    tempNumValues++;
                }//Of for k

                // Obtain parameters: sample mean and (population) standard deviation.
                double tempMu = tempSum / tempNumValues;

                double tempSigma = 0;
                for (int k = 0; k < tempNumValues; k++) {
                    tempSigma += (tempValuesArray[k] - tempMu) * (tempValuesArray[k] - tempMu);
                }//Of for k
                tempSigma /= tempNumValues;
                tempSigma = Math.sqrt(tempSigma);

                gaussianParameters[i][j] = new GaussianParameters(tempMu, tempSigma);
            }//Of for j
        }//Of for i

        System.out.println(Arrays.deepToString(gaussianParameters));
    }//Of calculateGausssianParameters

    /**
     * Classify all instances of the dataset, storing results in predicts.
     */
    public void classify() {
        predicts = new int[numInstances];
        for (int i = 0; i < numInstances; i++) {
            predicts[i] = classify(dataset.instance(i));
        }//Of for i
    }//Of classify

    /**
     * Classify one instance according to the configured data type.
     *
     * @param paraInstance the instance to classify.
     * @return the predicted class index, or -1 if the data type is unknown.
     */
    public int classify(Instance paraInstance) {
        if (dataType == NOMINAL) {
            return classifyNominal(paraInstance);
        } else if (dataType == NUMERICAL) {
            return classifyNumerical(paraInstance);
        }//Of if

        return -1;
    }//Of classify

    /**
     * Classify a nominal instance with Eq. (8):
     * argmax_i [log P^L(D_i) + sum_j log P^L(x_j | D_i)].
     *
     * @param paraInstance the instance to classify.
     * @return the predicted class index.
     */
    public int classifyNominal(Instance paraInstance) {
        // Find the biggest log pseudo-probability. NEGATIVE_INFINITY is a safe
        // initial value (the original -10000 could exceed a real log-probability).
        double tempBiggest = Double.NEGATIVE_INFINITY;
        int resultBestIndex = 0;
        for (int i = 0; i < numClasses; i++) {
            double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
            for (int j = 0; j < numConditions; j++) {
                int tempAttributeValue = (int) paraInstance.value(j);
                // Fixed: use the Laplacian-smoothed conditional probability of
                // Eq. (6). The original added log(rawCount) - log(prior), which
                // is not a probability and yields log(0) = -Infinity whenever a
                // count is zero — exactly what smoothing is meant to prevent.
                tempPseudoProbability += Math
                        .log(conditionalProbabilitiesLaplacian[i][j][tempAttributeValue]);
            }//Of for j

            if (tempBiggest < tempPseudoProbability) {
                tempBiggest = tempPseudoProbability;
                resultBestIndex = i;
            }//Of if
        }//Of for i

        return resultBestIndex;
    }//Of classifyNominal

    /**
     * Classify a numerical instance under the Gaussian assumption. Constant
     * terms shared by all classes (e.g. log sqrt(2 pi)) are dropped since they
     * do not affect the argmax.
     *
     * @param paraInstance the instance to classify.
     * @return the predicted class index.
     */
    public int classifyNumerical(Instance paraInstance) {
        // Find the biggest log pseudo-probability.
        double tempBiggest = Double.NEGATIVE_INFINITY;
        int resultBestIndex = 0;
        for (int i = 0; i < numClasses; i++) {
            double tempPseudoProbability = Math.log(classDistributionLaplacian[i]);
            for (int j = 0; j < numConditions; j++) {
                double tempAttributeValue = paraInstance.value(j);
                double tempSigma = gaussianParameters[i][j].sigma;
                double tempMu = gaussianParameters[i][j].mu;

                // log of the Gaussian density, up to a class-independent constant.
                tempPseudoProbability += -Math.log(tempSigma) - (tempAttributeValue - tempMu)
                        * (tempAttributeValue - tempMu) / (2 * tempSigma * tempSigma);
            }//Of for j

            if (tempBiggest < tempPseudoProbability) {
                tempBiggest = tempPseudoProbability;
                resultBestIndex = i;
            }//Of if
        }//Of for i

        return resultBestIndex;
    }//Of classifyNumerical

    /**
     * Compute the training accuracy: fraction of instances whose prediction
     * matches the actual class. Requires classify() to have been called.
     *
     * @return accuracy in [0, 1].
     */
    public double computeAccuracy() {
        double tempCorrect = 0;
        for (int i = 0; i < numInstances; i++) {
            if (predicts[i] == (int) dataset.instance(i).classValue()) {
                tempCorrect++;
            }//Of if
        }//Of for i

        double resultAccuracy = tempCorrect / numInstances;
        return resultAccuracy;
    }//Of computeAccuracy

    /**
     * Test the classifier on nominal data.
     */
    public static void testNominal() {
        System.out.println("Hello, Naive Bayes. I only want to test the nominal data.");
        String tempFilename = "D:/mushroom.arff";

        NaiveBayes tempLearner = new NaiveBayes(tempFilename);
        tempLearner.setDataType(NOMINAL);
        tempLearner.calculateClassDistribution();
        tempLearner.calculateConditionalProbabilities();
        tempLearner.classify();

        System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
    }//Of testNominal

    /**
     * Test the classifier on numerical data with the Gaussian assumption.
     */
    public static void testNumerical() {
        System.out.println(
                "Hello, Naive Bayes. I only want to test the numerical data with Gaussian assumption.");
        //String tempFilename = "D:/data/iris.arff";
        String tempFilename = "D:/iris-imbalance.arff";

        NaiveBayes tempLearner = new NaiveBayes(tempFilename);
        tempLearner.setDataType(NUMERICAL);
        tempLearner.calculateClassDistribution();
        tempLearner.calculateGausssianParameters();
        tempLearner.classify();

        System.out.println("The accuracy is: " + tempLearner.computeAccuracy());
    }//Of testNumerical

    /**
     * The entrance of the program.
     */
    public static void main(String[] args) {
        testNominal();
    }//Of main
}//Of class NaiveBayes
运行结果



