e.g. the probability of rain is 0.5 ----- P(A)
the probability of a traffic jam is 0.8 ----- P(B)
the probability of a traffic jam given rain is 0.95 ----- P(B|A)
the probability of rain once we see a traffic jam: P(A|B) = (P(B|A)*P(A))/P(B) = 0.59
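A quick check of the same arithmetic in Python (the variable names are mine):
# Bayes' theorem: P(A|B) = P(B|A) * P(A) / P(B)
p_a = 0.5           # P(rain)
p_b = 0.8           # P(traffic jam)
p_b_given_a = 0.95  # P(traffic jam | rain)
p_a_given_b = p_b_given_a * p_a / p_b
print(round(p_a_given_b, 2))  # 0.59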
import numpy as np
X = np.array([[0,1,0,1],
              [1,1,1,0],
              [0,1,1,0],
              [0,0,0,1],
              [0,1,1,0],
              [0,1,0,1],
              [1,0,0,1]])
y = np.array([0,1,1,0,1,0,0])
counts = {}
for label in np.unique(y):
    counts[label] = X[y==label].sum(axis=0)
print("feature counts:\n{}".format(counts))
import numpy as np
from sklearn.naive_bayes import BernoulliNB
X = np.array([[0,1,0,1],
              [1,1,1,0],
              [0,1,1,0],
              [0,0,0,1],
              [0,1,1,0],
              [0,1,0,1],
              [1,0,0,1]])
y = np.array([0,1,1,0,1,0,0])
counts = {}
for label in np.unique(y):
    counts[label] = X[y==label].sum(axis=0)
clf = BernoulliNB()
clf.fit(X,y)
Next_Day = [[0,0,1,0]]
pre = clf.predict(Next_Day)
if pre == [1]:
    print("rain")
else:
    print("sunny")
The forecast said no rain, but it turned cloudy; the model leans toward the rain class and outputs rain.
Another_Day = [[1,1,0,1]]
pre2 = clf.predict(Another_Day)
Classifying another day that is windy and muggy outputs sunny.
pre = clf.predict(Next_Day)
print(clf.predict_proba(Next_Day))
Check how confident the prediction is: predict_proba gives the probabilities of no rain and rain for the first day.
Bernoulli Naive Bayes
It is based on the Bernoulli distribution, also known as the binomial or 0-1 distribution. Let's test it with a concrete example:
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
X,y = make_blobs(n_samples=500,centers=5,random_state=8)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=8)
nb = BernoulliNB()
nb.fit(X_train,y_train)
print("得分:{:.3f}".format(nb.score(X_test,y_test)))
Let's look at how it works through a plot:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
X,y = make_blobs(n_samples=500,centers=5,random_state=8)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=8)
nb = BernoulliNB()
nb.fit(X_train,y_train)
print("得分:{:.3f}".format(nb.score(X_test,y_test)))
import matplotlib.pyplot as plt
x_min,x_max = X[:,0].min()-0.5,X[:,0].max()+0.5
y_min,y_max = X[:,1].min()-0.5,X[:,1].max()+0.5
xx,yy = np.meshgrid(np.arange(x_min,x_max,.02),np.arange(y_min,y_max,.02))
z = nb.predict(np.c_[(xx.ravel(),yy.ravel())]).reshape(xx.shape)
plt.pcolormesh(xx,yy,z,cmap=plt.cm.Pastel1)
plt.scatter(X_train[:,0],X_train[:,1],c=y_train,cmap=plt.cm.cool,edgecolors='k')
plt.scatter(X_test[:,0],X_test[:,1],c=y_test,cmap=plt.cm.cool,marker='*',edgecolors='k')
plt.xlim(xx.min(),xx.max())
plt.ylim(yy.min(),yy.max())
plt.title('Classifier:BernoulliNB')
plt.show()
The model's decision rule: because BernoulliNB binarizes each feature at 0 by default, a sample whose two features are both >= 0 is assigned to one class, and so on for the other sign combinations; all remaining data falls into the other classes.
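A minimal sketch of what that binarization means (the binarize parameter is real sklearn API; the toy values are mine):
import numpy as np
from sklearn.naive_bayes import BernoulliNB
# BernoulliNB(binarize=0.0) is the default: every feature is thresholded
# at 0 before fitting, so only the sign of each feature matters.
X_cont = np.array([[ 1.5,  2.0],
                   [-0.3,  1.2],
                   [ 0.7, -2.1],
                   [-1.0, -0.5]])
y = np.array([0, 0, 1, 1])
clf = BernoulliNB(binarize=0.0)
clf.fit(X_cont, y)
# What the model actually sees internally:
print((X_cont > 0).astype(int))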
Gaussian Naive Bayes
An algorithm for features that follow a normal (Gaussian) distribution.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(X_train,y_train)
print('Score: {:.3f}'.format(gnb.score(X_test,y_test)))
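Under the hood, GaussianNB just stores a per-class mean and variance for every feature and scores new points with the normal density. A small peek at the fitted parameters (theta_ and var_ are real sklearn attributes; var_ was called sigma_ in sklearn versions before 1.0):
# Per-class feature means and variances learned by GaussianNB
print(gnb.theta_)  # shape (n_classes, n_features): means
print(gnb.var_)    # shape (n_classes, n_features): variances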
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
X,y = make_blobs(n_samples=500,centers=5,random_state=8)
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=8)
gnb = GaussianNB()
gnb.fit(X_train,y_train)
print('Score: {:.3f}'.format(gnb.score(X_test,y_test)))
x_min,x_max = X[:,0].min()-0.5,X[:,0].max()+0.5
y_min,y_max = X[:,1].min()-0.5,X[:,1].max()+0.5
xx,yy = np.meshgrid(np.arange(x_min,x_max,.02),np.arange(y_min,y_max,.02))
z = gnb.predict(np.c_[xx.ravel(),yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx,yy,z,cmap=plt.cm.Pastel1)
plt.scatter(X_train[:,0],X_train[:,1],c=y_train,cmap=plt.cm.cool,edgecolors='k')
plt.scatter(X_test[:,0],X_test[:,1],c=y_test,cmap=plt.cm.cool,marker='*',edgecolors='k')
plt.xlim(xx.min(),xx.max())
plt.ylim(yy.min(),yy.max())
plt.title('Classifier:GaussianNB')
plt.show()
The classification is much more complex than with the Bernoulli model.
Multinomial Naive Bayes
The input X must be non-negative, so the data has to be preprocessed first.
Use MinMaxScaler() to preprocess the data:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
mnb = MultinomialNB()
mnb.fit(X_train_scaled,y_train)
print("得分:{:.3f}".format(mnb.score(X_test,y_test)))
The multinomial model classifies these data much worse, putting most samples into the wrong class, because it is only suited to non-negative discrete (count) features.
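For contrast, here is the kind of input MultinomialNB is designed for: word counts. A minimal sketch (the toy sentences and labels are made up):
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
docs = ["free prize money now",
        "meeting schedule for monday",
        "win free money",
        "project meeting notes"]
labels = [1, 0, 1, 0]  # 1 = spam-like, 0 = normal
vec = CountVectorizer()
X_counts = vec.fit_transform(docs)  # non-negative integer counts
mnb = MultinomialNB()
mnb.fit(X_counts, labels)
print(mnb.predict(vec.transform(["free money prize"])))  # expect [1]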
Project 1 ---- Tumor diagnosis
Load the dataset with its 30 features and print the keys:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()
print(cancer.keys())
print('Tumor classes:',cancer['target_names'])
print('\nTumor features:\n',cancer['feature_names'])
Model it with Gaussian Naive Bayes:
X,y = cancer.data,cancer.target
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=38)
print('Training set shape:',X_train.shape)
print('Test set shape:',X_test.shape)
426 samples in the training set and 143 in the test set.
gnb = GaussianNB()
gnb.fit(X_train,y_train)
print('Training set score: {:.3f}'.format(gnb.score(X_train,y_train)))
print('Test set score: {:.3f}'.format(gnb.score(X_test,y_test)))
Now let the model predict a single sample:
print('Model prediction: {}'.format(gnb.predict([X[312]])))
print('True class of the sample:',y[312])
This sample is a benign tumor.
Plot the learning curve:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
cancer = load_breast_cancer()
#print(cancer.keys())
#print('Tumor classes:',cancer['target_names'])
#print('\nTumor features:\n',cancer['feature_names'])
X,y = cancer.data,cancer.target
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=38)
#print('Training set shape:',X_train.shape)
#print('Test set shape:',X_test.shape)
gnb = GaussianNB()
gnb.fit(X_train,y_train)
#print('Training set score: {:.3f}'.format(gnb.score(X_train,y_train)))
#print('Test set score: {:.3f}'.format(gnb.score(X_test,y_test)))
#print('Model prediction: {}'.format(gnb.predict([X[312]])))
#print('True class of the sample:',y[312])
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(estimator,title,X,y,ylim=None,cv=None,
                        n_jobs=1,train_sizes=np.linspace(.1,1.0,5)):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes,train_scores,test_scores = learning_curve(
        estimator,X,y,cv=cv,n_jobs=n_jobs,train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores,axis=1)
    test_scores_mean = np.mean(test_scores,axis=1)
    plt.grid()
    plt.plot(train_sizes,train_scores_mean,'o-',color="r",
             label="Training score")
    plt.plot(train_sizes,test_scores_mean,'o-',color="g",
             label="Cross-validation score")
    plt.legend(loc="lower right")
    return plt
title = "Learning Curves (Naive Bayes)"
cv = ShuffleSplit(n_splits=100,test_size=0.2,random_state=0)
estimator = GaussianNB()
plot_learning_curve(estimator,title,X,y,ylim=(0.9,1.01),cv=cv,n_jobs=4)
plt.show()
As the number of samples grows, the model's training score gradually drops, because there is more and more data for the model to fit.
Project 2 ---- Text classification with Python
def loadDataSet():
    postingList = [['my','dog','has','flea','problems','help','please'],
                   ['maybe','not','take','him','to','dog','park','stupid'],
                   ['my','dalmation','is','so','cute','I','love','him'],
                   ['stop','posting','stupid','worthless','garbage'],
                   ['mr','licks','ate','my','steak','how','to','stop','him'],
                   ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]
    return postingList,classVec
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
def setOfWords2Vec(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
listOPosts,listClasses = loadDataSet()
myVocabList =createVocabList(listOPosts)
print(myVocabList)
print(setOfWords2Vec(myVocabList,listOPosts[0]))
print(setOfWords2Vec(myVocabList,listOPosts[3]))
Next, let's see how to turn these numbers into probabilities.
Pseudocode for the function:
Count the number of documents in each class
For each training document:
    For each class:
        If a token appears in the document -> increment the count for that token
        Increment the count of total tokens
For each class:
    For each token:
        Divide the token count by the total token count to get the conditional probability
Return the conditional probability for each class
from numpy import zeros  # numpy arrays for the vector arithmetic below
def trainNBO(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = zeros(numWords)
    p1Num = zeros(numWords)
    p0Denom = 0.0
    p1Denom = 0.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = p1Num/p1Denom
    p0Vect = p0Num/p0Denom
    return p0Vect,p1Vect,pAbusive
listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
trainMat = []
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
p0V,p1V,pAb = trainNBO(trainMat,listClasses)
print(pAb)
print('\n')
print(p0V)
print('\n')
print(p1V)
Next, let's fix a couple of defects in this function:
When computing p(w0|1)p(w1|1)p(w2|1), if any single probability is 0 the final product is 0 as well. To reduce this effect, initialize every word count to 1 and the denominators to 2 (Laplace smoothing).
Modify lines 4 and 5 of trainNBO():
p0Num = ones(numWords)
p1Num = ones(numWords)
p0Denom = 2.0
p1Denom = 2.0
The other problem is underflow: multiplying many very small numbers together eventually rounds to 0.
We can instead take the natural logarithm of the product, since ln(a*b) = ln(a) + ln(b); modify the two lines before the return statement accordingly, then apply this to the classifier below.
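A quick numeric illustration of the underflow problem (the numbers are arbitrary):
import numpy as np
# 1000 conditional probabilities of 0.01 each
probs = np.full(1000, 0.01)
print(np.prod(probs))         # 0.0 -- the product underflows to zero
print(np.sum(np.log(probs)))  # about -4605.17 -- finite and comparable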
from numpy import log
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify*p1Vec)+log(pClass1)
    p0 = sum(vec2Classify*p0Vec)+log(1.0-pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
from numpy import array
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNBO(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid','garbage']
    thisDoc = array(setOfWords2Vec(myVocabList, testEntry))
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pAb))
classifyNB() takes four inputs: the vector to classify, vec2Classify, plus the three probabilities computed by trainNBO(). The second function, testingNB(), is a convenience wrapper around all of these operations.
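Calling the wrapper gives a quick check (assuming trainNBO() has been updated with the smoothing and log fixes above; the expected output matches the book's example):
testingNB()
# expected:
# ['love', 'my', 'dalmation'] classified as : 0
# ['stupid', 'garbage'] classified as : 1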
So far we have treated the presence or absence of each word as a feature; this can be described as the set-of-words model.
If a word appears more than once in a document, that count may carry information that mere presence or absence cannot express; this approach is known as the bag-of-words model (see the quick comparison after the function below).
Filtering spam with Naive Bayes
def bagOfWords2VecMN(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
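A quick comparison of the two encodings on a post with a repeated word (the toy vocabulary and post are mine):
vocab = ['dog', 'stupid', 'my']
post = ['stupid', 'dog', 'stupid']
print(setOfWords2Vec(vocab, post))    # [1, 1, 0] -- presence only
print(bagOfWords2VecMN(vocab, post))  # [1, 2, 0] -- counts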
mySent = 'This book is the best book on Python on M.L. I have ever laid eyes upon.'
print(mySent.split())
import re
regEx = re.compile(r'\W+')  # split on runs of non-word characters
listOfTokens = regEx.split(mySent)
print(listOfTokens)
emailText = open(r'F:\python\machinelearninginaction\Ch04\email\ham\6.txt').read()
listOfTokens = regEx.split(emailText)
print(listOfTokens)
Now let's test the complete code:
import random
from numpy import *
import re
def loadDataSet():
    postingList = [['my','dog','has','flea','problems','help','please'],
                   ['maybe','not','take','him','to','dog','park','stupid'],
                   ['my','dalmation','is','so','cute','I','love','him'],
                   ['stop','posting','stupid','worthless','garbage'],
                   ['mr','licks','ate','my','steak','how','to','stop','him'],
                   ['quit','buying','worthless','dog','food','stupid']]
    classVec = [0,1,0,1,0,1]
    return postingList,classVec
def createVocabList(dataSet):
    vocabSet = set([])
    for document in dataSet:
        vocabSet = vocabSet | set(document)  # union of the two sets
    return list(vocabSet)
def setOfWords2Vec(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] = 1
        else:
            print("the word: %s is not in my Vocabulary!" % word)
    return returnVec
def bagOfWords2VecMN(vocabList,inputSet):
    returnVec = [0]*len(vocabList)
    for word in inputSet:
        if word in vocabList:
            returnVec[vocabList.index(word)] += 1
    return returnVec
def trainNBO(trainMatrix,trainCategory):
    numTrainDocs = len(trainMatrix)
    numWords = len(trainMatrix[0])
    pAbusive = sum(trainCategory)/float(numTrainDocs)
    p0Num = ones(numWords)
    p1Num = ones(numWords)
    p0Denom = 2.0
    p1Denom = 2.0
    for i in range(numTrainDocs):
        if trainCategory[i] == 1:
            p1Num += trainMatrix[i]
            p1Denom += sum(trainMatrix[i])
        else:
            p0Num += trainMatrix[i]
            p0Denom += sum(trainMatrix[i])
    p1Vect = log(p1Num/p1Denom)
    p0Vect = log(p0Num/p0Denom)
    return p0Vect,p1Vect,pAbusive
def classifyNB(vec2Classify,p0Vec,p1Vec,pClass1):
    p1 = sum(vec2Classify*p1Vec)+log(pClass1)
    p0 = sum(vec2Classify*p0Vec)+log(1.0-pClass1)
    if p1 > p0:
        return 1
    else:
        return 0
def testingNB():
    listOPosts,listClasses = loadDataSet()
    myVocabList = createVocabList(listOPosts)
    trainMat = []
    for postinDoc in listOPosts:
        trainMat.append(bagOfWords2VecMN(myVocabList,postinDoc))
    p0V,p1V,pAb = trainNBO(array(trainMat),array(listClasses))
    testEntry = ['love', 'my', 'dalmation']
    thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pAb))
    testEntry = ['stupid','garbage']
    thisDoc = array(bagOfWords2VecMN(myVocabList, testEntry))
    print(testEntry, 'classified as :', classifyNB(thisDoc, p0V, p1V, pAb))
def textParse(bigString):
    listOfTokens = re.split(r'\W+', bigString)
    return [tok.lower() for tok in listOfTokens if len(tok) > 2]
def spamTest():
    docList = []
    classList = []
    fullText = []
    for i in range(1,26):
        wordList = textParse(open(r'F:\python\machinelearninginaction\Ch04\email\spam\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(open(r'F:\python\machinelearninginaction\Ch04\email\ham\%d.txt' % i).read())
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    trainingSet = list(range(50))
    testSet = []
    for i in range(10):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(setOfWords2Vec(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNBO(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = setOfWords2Vec(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is :",float(errorCount)/len(testSet))
listOPosts,listClasses = loadDataSet()
myVocabList = createVocabList(listOPosts)
#print(myVocabList)
#print(setOfWords2Vec(myVocabList,listOPosts[0]))
#print(setOfWords2Vec(myVocabList,listOPosts[3]))
trainMat = []
"""
for postinDoc in listOPosts:
    trainMat.append(setOfWords2Vec(myVocabList,postinDoc))
"""
#p0V,p1V,pAb = trainNBO(trainMat,listClasses)
#print(pAb)
#print('\n')
#print(p0V)
#print('\n')
#print(p1V)
#testingNB()
mySent = 'This book is the best book on Python on M.L. I have ever laid eyes upon.'
print(mySent.split())
regEx = re.compile(r'\W+')
listOfTokens = regEx.split(mySent)
print(listOfTokens)
print([tok for tok in listOfTokens if len(tok) > 0])
print([tok.lower() for tok in listOfTokens if len(tok) > 0])
emailText = open(r'F:\python\machinelearninginaction\Ch04\email\ham\6.txt').read()
listOfTokens = regEx.split(emailText)
print(listOfTokens)
textParse() takes a big string and parses it into a list of tokens, dropping tokens of fewer than three characters and converting everything to lowercase.
spamTest() automates the Naive Bayes spam classifier: it loads the text files from the spam and ham folders and parses them into word lists, then builds a test set and a training set. Ten emails are chosen at random for the test set and removed from the training set.
trainingSet is a list of integers from 0 to 49. Randomly selecting part of the data for training and holding out the rest for testing is called hold-out cross-validation.
The for loop iterates over every document in the training set, building a word vector for each email from the vocabulary with setOfWords2Vec(); these vectors are used in trainNBO() to compute the probabilities needed for classification. It then iterates over the test set, classifying each email; every misclassification increments the error count.
Let's try the process described above (see the sketch below).
spamTest() prints the classification error rate on the 10 randomly chosen emails. Because the selection is random, the output can differ from run to run; when an error occurs, the word list of the misclassified document is reported.
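Running it is just a call; since the hold-out split is random, it is worth repeating a few times to get a feel for the average error rate (the loop is mine):
# Each call draws a fresh random train/test split and prints its own error rate.
for _ in range(10):
    spamTest()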
Using a Naive Bayes classifier to reveal regional attitudes from personal ads
Using an RSS feed parser
def calcMostFreq(vocabList, fullText):
    import operator
    freqDict = {}
    for token in vocabList:
        freqDict[token] = fullText.count(token)
    sortedFreq = sorted(freqDict.items(), key=operator.itemgetter(1), reverse=True)
    return sortedFreq[:30]
def localWords(feed1, feed0):
    docList = []
    classList = []
    fullText = []
    minLen = min(len(feed1['entries']), len(feed0['entries']))
    for i in range(minLen):
        wordList = textParse(feed1['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(1)
        wordList = textParse(feed0['entries'][i]['summary'])
        docList.append(wordList)
        fullText.extend(wordList)
        classList.append(0)
    vocabList = createVocabList(docList)
    top30Words = calcMostFreq(vocabList,fullText)
    for pairW in top30Words:
        if pairW[0] in vocabList: vocabList.remove(pairW[0])
    trainingSet = list(range(2*minLen))
    testSet = []
    for i in range(20):
        randIndex = int(random.uniform(0,len(trainingSet)))
        testSet.append(trainingSet[randIndex])
        del(trainingSet[randIndex])
    trainMat = []
    trainClasses = []
    for docIndex in trainingSet:
        trainMat.append(bagOfWords2VecMN(vocabList,docList[docIndex]))
        trainClasses.append(classList[docIndex])
    p0V,p1V,pSpam = trainNBO(array(trainMat),array(trainClasses))
    errorCount = 0
    for docIndex in testSet:
        wordVector = bagOfWords2VecMN(vocabList,docList[docIndex])
        if classifyNB(array(wordVector),p0V,p1V,pSpam) != classList[docIndex]:
            errorCount += 1
    print("the error rate is: ",float(errorCount)/len(testSet))
    return vocabList,p0V,p1V
def getTopWords(ny,sf):
    vocabList,p0V,p1V = localWords(ny,sf)
    topNY = []
    topSF = []
    for i in range(len(p0V)):
        if p0V[i] > -6.0: topSF.append((vocabList[i],p0V[i]))
        if p1V[i] > -6.0: topNY.append((vocabList[i],p1V[i]))
    sortedSF = sorted(topSF,key=lambda pair:pair[1],reverse=True)
    print("SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**SF**")
    for item in sortedSF:
        print(item[0])
    sortedNY = sorted(topNY, key=lambda pair: pair[1], reverse=True)
    print("NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**NY**")
    for item in sortedNY:
        print(item[0])
getTopWords() takes two RSS feeds as input, trains and tests the Naive Bayes classifier via localWords(), and gets back the probability vectors it uses. It then creates two lists to store (word, probability) tuples.
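A sketch of driving the pair end to end, following the book's example (feedparser is a real third-party library; the Craigslist RSS URLs are the ones used in Machine Learning in Action and are no longer live, so treat them as placeholders for any two region-specific feeds):
import feedparser
# Two region-specific personals feeds (defunct; substitute any two RSS feeds)
ny = feedparser.parse('http://newyork.craigslist.org/stp/index.rss')
sf = feedparser.parse('http://sfbay.craigslist.org/stp/index.rss')
vocabList, pSF, pNY = localWords(ny, sf)  # prints the error rate
getTopWords(ny, sf)                       # prints the most telling words per region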



