package test_tfidf
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
import org.apache.spark.sql.SparkSession
//import utils.SparkUtils
/**
 * Demonstrates TF-IDF feature extraction with Spark MLlib.
 *
 * Steps: tokenize sentences, hash the tokens into term-frequency vectors,
 * then rescale those vectors with IDF. Each intermediate DataFrame is printed.
 *
 * Created by zcy on 18-1-4.
 */
object TFIDFDemo {

  /**
   * Runs the demo against a Spark session configured with master `local[4]`.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("tf-idf").master("local[4]").getOrCreate()
    try {
      import spark.implicits._ // implicit conversions

      // Three labelled sample sentences.
      val sentenceData = spark.createDataFrame(Seq(
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
      )).toDF("label", "sentence")

      // Tokenize: "sentence" -> "words"
      val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
      println("wordsData----------------")
      val wordsData = tokenizer.transform(sentenceData)
      wordsData.show(3)

      // Compute TF: "words" -> "rawFeatures"
      println("featurizedData----------------")
      val hashingTF = new HashingTF()
        .setInputCol("words")
        .setOutputCol("rawFeatures")
        .setNumFeatures(2000) // 2000 hash buckets, i.e. the feature dimension
      val featurizedData = hashingTF.transform(wordsData)
      featurizedData.show(3)

      // Compute IDF: fit on the TF vectors, then rescale "rawFeatures" -> "features"
      println("recaledData----------------")
      val idf = new IDF().setInputCol("rawFeatures").setOutputCol("features")
      val idfModel = idf.fit(featurizedData)
      val rescaledData = idfModel.transform(featurizedData)
      rescaledData.show(3)

      println("----------------")
      rescaledData.select("features", "label").take(3).foreach(println)
    } finally {
      // Release the session's resources even if a stage above fails.
      spark.stop()
    }
  }
}
上面TF转换特征向量的代码通过 setNumFeatures(2000) 设置了哈希桶数,即特征向量的维度,因此每个文本用一个2000维的特征向量表示。
5.2 调用K-means模型
// Trains a k-means model.
// Configures KMeans with `k` clusters and a fixed seed (1L), then fits it on `rescaledData`.
// `k`, `rescaledData` and the `KMeans` import are not defined in this snippet; they must
// be supplied by the surrounding code.
// NOTE(review): presumably `rescaledData` is the IDF output with a "features" column — confirm.
println("creating kmeans model ...")
val kmeans = new KMeans().setK(k).setSeed(1L)
val model = kmeans.fit(rescaledData)
// evaluate clustering by computing Within Set Sum of Squared Errors.
// The cost is computed on the same dataset the model was fitted on.
// NOTE(review): `computeCost` is reportedly deprecated in Spark 2.4 and removed in 3.0
// (ClusteringEvaluator / model.summary.trainingCost are the replacements) — confirm
// against the Spark version in use.
println("calculating wssse ...")
val WSSSE = model.computeCost(rescaledData)
println(s"Within Set Sum of Squared Errors = $WSSSE")
5.3 评价方式
假设最终得到的文件和预测结果如下:
val t = List(
("121.txt",0),("122.txt",0),("123.txt",3),("124.txt",0),("125.txt",0),("126.txt",1),
("221.txt",3),("222.txt",4),("223.txt",3),("224.txt",3),("225.txt",3),("226.txt",1),
("421.txt",4),("422.txt",4),("4.txt",3),("41.txt",3),("43.txt",4),("426.txt",1)