创建样本数据:
from pyspark.sql import Row

# Build a one-row sample DataFrame whose last column (col4) is an array.
# NOTE: assumes `sc` (SparkContext) and `spark` (SparkSession) already exist,
# as they do in the pyspark shell -- confirm for your environment.
x = [Row(col1="xx", col2="yy", col3="zz", col4=[123, 234])]
rdd = sc.parallelize(x)
# The method is `createDataFrame` (capital F); `createDataframe` does not
# exist on SparkSession and raises AttributeError.
df = spark.createDataFrame(rdd)
df.show()
# +----+----+----+----------+
# |col1|col2|col3|      col4|
# +----+----+----+----------+
# |  xx|  yy|  zz|[123, 234]|
# +----+----+----+----------+
使用 `getItem` 从数组列中提取元素;在你的实际场景中,请将 `col4` 替换为 `collect_set(TIMESTAMP)`:
# Split the array column into scalar columns: element 0 overwrites col4 in
# place, element 1 is appended as the new column col5.
array_col = df["col4"]
first_item = array_col.getItem(0)
second_item = array_col.getItem(1)

# Both expressions were built from the original array column above, so
# replacing col4 in the second step does not affect the value of col5.
df = df.withColumn("col5", second_item)
df = df.withColumn("col4", first_item)
df.show()
# +----+----+----+----+----+
# |col1|col2|col3|col4|col5|
# +----+----+----+----+----+
# |  xx|  yy|  zz| 123| 234|
# +----+----+----+----+----+


