我相信您需要使用窗口函数:按 user_id 分区、按 score 降序排序,
为每一行计算排名,然后过滤结果以仅保留每个用户的前两个值。
from pyspark.sql.window import Windowfrom pyspark.sql.functions import rank, colwindow = Window.partitionBy(df['user_id']).orderBy(df['score'].desc())df.select('*', rank().over(window).alias('rank')) .filter(col('rank') <= 2) .show() #+-------+---------+-----+----+#|user_id|object_id|score|rank|#+-------+---------+-----+----+#| user_1| object_1| 3| 1|#| user_1| object_2| 2| 2|#| user_2| object_2| 6| 1|#| user_2| object_1| 5| 2|#+-------+---------+-----+----+通常,官方编程指南是开始学习Spark的好地方。
示例数据准备:
# Build the sample DataFrame used above. Assumes a live SparkContext `sc`
# and SQLContext `sqlContext` (e.g. a pyspark shell session).
rdd = sc.parallelize([
    ("user_1", "object_1", 3),
    ("user_1", "object_2", 2),
    ("user_2", "object_1", 5),
    ("user_2", "object_2", 2),
    ("user_2", "object_2", 6),
])
# Fixed: the method is createDataFrame (capital F) — the original
# `createDataframe` raises AttributeError at runtime.
df = sqlContext.createDataFrame(rdd, ["user_id", "object_id", "score"])


