# Notes on some simple PySpark programs.
# WordCount: read a file from HDFS and count the words.
from pyspark import SparkContext, SparkConf
"""
Pyspark Word Count Demo
"""
def sorted_all_result(wc_rdd):
    """
    Collect the full word-count RDD to the driver and print all
    (word, count) pairs ordered by count, highest first.

    :param wc_rdd: RDD of (word, count) tuples
    :return: None (result is printed, not returned)
    """
    pairs = wc_rdd.collect()
    # In-place sort on the driver-side copy; descending by count.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    print(pairs)
def sorted_top_n_result(wc_rdd, n):
    """
    Print (and return) the top-n (word, count) pairs by count, descending.

    Bug fix: ``RDD.takeOrdered()`` returns the *smallest* n elements under
    the given key, so the original key ``tuple2[1]`` actually produced the
    bottom-n words. Negating the count makes takeOrdered yield the true
    top-n by count, matching the documented intent.

    :param wc_rdd: RDD of (word, count) tuples
    :param n: number of pairs to take
    :return: list of the top-n (word, count) pairs, highest count first
    """
    top_n_list = wc_rdd.takeOrdered(n, key=lambda tuple2: -tuple2[1])
    print(top_n_list)
    return top_n_list
def main():
    """
    Run a word count against a standalone Spark cluster.

    Reads a text file from HDFS, strips '.' and ',', lowercases, splits
    on single spaces, counts word occurrences, and prints the top 3.

    NOTE(review): master and HDFS addresses are hard-coded to a local
    test cluster (192.168.61.128) — adjust for other environments.
    """
    conf = SparkConf().setAppName("PysparkDemo01").setMaster("spark://192.168.61.128:7077")
    sc = SparkContext(conf=conf)
    try:
        rdd = sc.textFile("hdfs://192.168.61.128:9000/data/wc.txt")
        wc_rdd = rdd.flatMap(
            lambda line: str(line).replace(".", "").replace(",", "").lower().split(" ")
        ).map(lambda word: (word, 1)).reduceByKey(lambda x, y: x + y)
        sorted_top_n_result(wc_rdd, 3)
    finally:
        # Always release cluster resources, even if the job fails
        # (original never called stop(), leaking the SparkContext).
        sc.stop()


if __name__ == "__main__":
    main()



