使用sample抽样获取数据倾斜的key
/**
 * Demo job that estimates which keys are skewed (over-represented) in an RDD
 * by counting words on a random sample instead of on the full data set.
 *
 * The sample is drawn without a fixed seed, so the printed result can differ
 * between runs.
 */
object SparkDataSkew {

  /**
   * Builds a local SparkContext, samples a small in-memory word list,
   * counts occurrences per word in the sample and prints the two most
   * frequent `(count, word)` pairs to standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf()
      .setMaster("local")
      .setAppName("data_skew_key")
    val sparkContext = new SparkContext(sparkConf)

    // Always release the context, even if a stage of the job fails.
    try {
      val rdd: RDD[String] = sparkContext.makeRDD(
        List("spark", "flink", "hbase", "mysql", "spark", "spark", "spark",
          "flink", "flink"),
        2
      )

      rdd
        // Sample without replacement; each element is picked with probability 0.5.
        .sample(withReplacement = false, fraction = 0.5)
        // Word count over the sampled elements.
        .map((_, 1))
        .reduceByKey(_ + _)
        // Swap to (count, word) so the count becomes the sort key.
        .map { case (word, count) => (count, word) }
        // Highest counts first.
        .sortByKey(ascending = false)
        // Keep the two most frequent keys of the sample.
        .take(2)
        .foreach(println)
    } finally {
      sparkContext.stop()
    }
  }
}