ff = d.select(['dnum']).dropDuplicates()
ff.count()
ff.show()
fff = d.select(['dnum']).distinct()

2、withColumn、lit、col
withColumn：新增一列（若已存在同名列则替换该列）
lit：用常量（字面量）值创建一列
col：按列名引用某一列
import pyspark.sql.functions as F
# Add a constant "date" column holding target_date on every row.
temp_df = temp_df.withColumn("date", F.lit(target_date))
# Strip literal "[" characters from the tags column. The pattern argument is a
# regular expression, so the bracket has to be escaped: a bare "[" is an
# unterminated character class and is rejected by the regex engine.
movie_feature_df = movie_feature_df.withColumn('tags', F.regexp_replace(F.col('tags'), r"\[", ""))
3、unionByName、groupBy
# Read the daily partitions for the `args.range` days ending at target_date
# and stack them into a single DataFrame.
play_video_df = None
for i in range(args.range):
    # Step back i days from target_date to pick the partition to read.
    t = target_date - datetime.timedelta(days=i)
    temp_df = spark.sql(
        "select * from ***album where year=%s and month=%s and day=%s" % (t.year, t.month, t.day))
    # NOTE(review): every partition is stamped with target_date rather than
    # its own partition date `t` -- confirm this is intended.
    temp_df = temp_df.withColumn("date", F.lit(target_date))
    if play_video_df is None:
        # First partition read: nothing to union with yet.
        play_video_df = temp_df
    else:
        # unionByName lines columns up by name instead of by position.
        play_video_df = play_video_df.unionByName(temp_df)
# Stays None when args.range is 0, because the loop body never runs.
target_df = play_video_df
# For each (dnum, aid) pair keep the largest finish_rate, exposed under the
# same column name "finish_rate".
target_groupped_movie_df = (
    target_movie_df
    .groupBy("dnum", "aid")
    .agg(F.max("finish_rate").alias("finish_rate"))
)



