您可以创建一个 udf 来连接数组/列表中的元素，然后将其应用于 test_123 列：
from pyspark.sql.functions import udf, coljoin_udf = udf(lambda x: ",".join(x))df.withColumn("test_123", join_udf(col("test_123"))).show()+----+----------------+|uuid| test_123|+----+----------------+| 1|test,test2,test3|| 2|test4,test,test6|| 3|test6,test9,t55o|+----+----------------+初始数据帧从以下位置创建:
# FIX: the original import line omitted IntegerType, StringType and
# ArrayType even though the schema below uses them (NameError).
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    ArrayType,
)

# Schema: uuid (int) plus test_123, an array of strings.
schema = StructType([
    StructField("uuid", IntegerType(), True),
    StructField("test_123", ArrayType(StringType(), True), True),
])

rdd = sc.parallelize([
    [1, ["test", "test2", "test3"]],
    [2, ["test4", "test", "test6"]],
    [3, ["test6", "test9", "t55o"]],
])

# FIX: the method is createDataFrame (capital F); the original
# "createDataframe" raises AttributeError on a SparkSession.
df = spark.createDataFrame(rdd, schema)
df.show()
# +----+--------------------+
# |uuid|            test_123|
# +----+--------------------+
# |   1|[test, test2, test3]|
# |   2|[test4, test, test6]|
# |   3|[test6, test9, t55o]|
# +----+--------------------+


