这是我想出的办法。这是一个非常简单的解决方案;由于我要对高度不平衡的数据集进行分类,它依赖于一个自定义的准确率指标(称为 weightedAccuracy)。不过,如有需要,应该很容易对其进行扩展。
import pandas
from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split


def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 for a zero denominator."""
    if not denominator:
        return 0.0
    return float(numerator) / float(denominator)


def _new_forest():
    """Build an unfitted forest with the hyper-parameters shared by every run."""
    return RandomForestClassifier(n_estimators=500, min_samples_split=2,
                                  n_jobs=-1, verbose=1)


def get_enhanced_confusion_matrix(actuals, predictions, labels,
                                  sensitivity_weight=0.9,
                                  specificity_weight=0.1):
    """Enhance confusion_matrix by adding sensitivity and specificity metrics.

    Args:
        actuals: True class labels.
        predictions: Predicted class labels, aligned with ``actuals``.
        labels: Labels used to index the matrix. ``labels[0]`` is treated as
            the negative class and ``labels[1]`` as the positive class.
        sensitivity_weight: Weight of sensitivity in the weighted accuracy.
        specificity_weight: Weight of specificity in the weighted accuracy.

    Returns:
        A tuple ``(cm, sensitivity, specificity, weightedAccuracy)`` where
        ``cm`` is the matrix returned by ``sklearn.metrics.confusion_matrix``.
        A ratio whose denominator is zero (the class has no true samples)
        is reported as 0.0 instead of raising ZeroDivisionError.
    """
    cm = confusion_matrix(actuals, predictions, labels=labels)
    # Rows are true labels, columns are predicted labels.
    sensitivity = _safe_ratio(cm[1][1], cm[1][0] + cm[1][1])
    specificity = _safe_ratio(cm[0][0], cm[0][0] + cm[0][1])
    weightedAccuracy = ((sensitivity * sensitivity_weight)
                        + (specificity * specificity_weight))
    return cm, sensitivity, specificity, weightedAccuracy


iris = datasets.load_iris()
x = pandas.DataFrame(iris.data, columns=['var1', 'var2', 'var3', 'var4'])
y = pandas.Series(iris.target, name='target')
response, _ = pandas.factorize(y)

# Split once with a fixed seed and reuse the same partition for every feature
# subset, so the scores in rfeMatrix are compared on identical rows.
xTrain, xTest, yTrain, yTest = train_test_split(
    x, response, test_size=.25, random_state=36583)

print("building the first forest")
rf = _new_forest()
rf.fit(xTrain, yTrain)

# Features ranked from most to least important according to the full model.
importances = pandas.DataFrame(
    {'name': x.columns, 'imp': rf.feature_importances_}
).sort_values('imp', ascending=False).reset_index(drop=True)

# NOTE: iris has three classes; with labels [0, 1] the confusion matrix only
# covers classes 0 and 1.
cm, sensitivity, specificity, weightedAccuracy = get_enhanced_confusion_matrix(
    yTest, rf.predict(xTest), [0, 1])
numFeatures = len(x.columns)

# Collect one row per evaluated model and build the frame once at the end
# (DataFrame.append no longer exists in current pandas).
rfeRows = [{'numFeatures': numFeatures,
            'weightedAccuracy': weightedAccuracy,
            'sensitivity': sensitivity,
            'specificity': specificity}]

print("running RFE on %d features" % numFeatures)
# The full feature set was scored above, so only the proper subsets remain.
for i in range(1, numFeatures):
    varsUsed = importances['name'].iloc[:i].tolist()
    print("now using %d of %s features" % (len(varsUsed), numFeatures))
    rf = _new_forest()
    rf.fit(xTrain[varsUsed], yTrain)
    cm, sensitivity, specificity, weightedAccuracy = get_enhanced_confusion_matrix(
        yTest, rf.predict(xTest[varsUsed]), [0, 1])
    print("\n" + str(cm))
    print('the sensitivity is %d percent' % (sensitivity * 100))
    print('the specificity is %d percent' % (specificity * 100))
    print('the weighted accuracy is %d percent' % (weightedAccuracy * 100))
    rfeRows.append({'numFeatures': len(varsUsed),
                    'weightedAccuracy': weightedAccuracy,
                    'sensitivity': sensitivity,
                    'specificity': specificity})

rfeMatrix = pandas.DataFrame(rfeRows)
print("\n" + str(rfeMatrix))

# Among the best-scoring models, prefer the one that needs the fewest features.
maxAccuracy = rfeMatrix.weightedAccuracy.max()
maxAccuracyFeatures = int(
    rfeMatrix.numFeatures[rfeMatrix.weightedAccuracy == maxAccuracy].min())
featuresUsed = importances['name'].iloc[:maxAccuracyFeatures].tolist()
print("the final features used are %s" % featuresUsed)


