
Common Spark SQL operations (updating)

import org.apache.spark.sql.functions._

val myDF = sqlContext.parquetFile("hdfs:/to/my/file.parquet")
val coder: (Int => String) = (arg: Int) => if (arg < 100) "little" else "big"
val sqlfunc = udf(coder)
myDF.withColumn("Code", sqlfunc(col("Amt")))

// `minusHours` is assumed to be defined elsewhere; note that datediff(current_date, current_date) is always 0
(1 to minusHours).toDF("hour_num").withColumn("a", datediff(current_date, current_date)).show


import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

val schema =
          StructType(
            StructField("imps", IntegerType, true) ::
              StructField("clks", LongType, true) :: Nil)

// clks is LongType, so the second value in each Row must be a Long
val row1 = Row(1, 2L)
val row2 = Row(2, 3L)
val row3 = Row(2, 2L)
val row4 = Row(3, 3L)
val row5 = Row(3, 2L)
val row6 = Row(3, 3L)
val row7 = Row(4, 2L)
val row8 = Row(5, 3L)

val df = sqlContext.createDataFrame(sc.parallelize(List(row1, row2, row3, row4, row5, row6, row7, row8)), schema)

df.groupBy("imps").agg(count("imps")).show

+----+-----------+
|imps|COUNT(imps)|
+----+-----------+
|   1|          1|
|   2|          2|
|   3|          3|
|   4|          1|
|   5|          1|
+----+-----------+
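On recent APIs the generated column name (`COUNT(imps)`) can be replaced with an explicit alias; a minimal sketch of the same aggregation, assuming the `df` built above:

```scala
import org.apache.spark.sql.functions.count

// Same aggregation, but with a readable column name instead of COUNT(imps).
df.groupBy("imps")
  .agg(count("imps").as("cnt"))
  .orderBy("imps")
  .show()
```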

==============================================

import org.apache.spark.sql.functions._

val myDF = sqlContext.parquetFile("hdfs:/to/my/file.parquet")
val coder: (Int => String) = (arg: Int) => {if (arg < 100) "little" else "big"}
val sqlfunc = udf(coder)

myDF.withColumn("Code", sqlfunc(col("Amt")))

withColumn seems to be the right way to add a column, but it fails on v1.4.1.
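If `withColumn` misbehaves on 1.4.1, a workaround is to `select` all existing columns plus the derived one; a sketch under the same assumptions (`myDF` with an integer `Amt` column):

```scala
import org.apache.spark.sql.functions.{col, udf}

// Same "little"/"big" coder as above.
val coder: (Int => String) = (arg: Int) => if (arg < 100) "little" else "big"
val sqlfunc = udf(coder)

// select(col("*"), ...) appends the derived column — equivalent to withColumn.
val withCode = myDF.select(col("*"), sqlfunc(col("Amt")).as("Code"))
```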

======

import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

val schema =
          StructType(
            StructField("imps", LongType, true) ::
              StructField("clks", LongType, true) ::
              StructField("cost", DoubleType, true) ::
              StructField("spending", DoubleType, true) :: Nil)

//        val people =
//          sc.textFile("examples/src/main/resources/people.txt").map(
//            _.split(",")).map(p => Row(p(0), p(1).trim.toInt))

// savePath is assumed to be defined elsewhere
val row = Row(null, null, null, null)
val d1 = sqlContext.createDataFrame(sc.parallelize(List(row)), schema)
d1.write.parquet(savePath)

val schema2 =
          StructType(
            StructField("id", StringType, true) :: Nil)

val row2 = Row("1")
val d2 = sqlContext.createDataFrame(sc.parallelize(List(row2)), schema2)

d1.withColumn("c2", d2.col("id"))          // ==> That doesn't work!
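`withColumn` can only reference columns of the DataFrame it is called on, which is why the line above fails. To glue a column from another DataFrame, the two frames need a common join key. One sketch (an assumption of this note, not the only way): give each frame a synthetic row index via `zipWithIndex` and join on it.

```scala
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StructField, StructType}

// Append a synthetic row_idx column to a DataFrame.
def withRowIdx(df: org.apache.spark.sql.DataFrame) = {
  val indexed = df.rdd.zipWithIndex.map { case (row, idx) => Row.fromSeq(row.toSeq :+ idx) }
  val schema = StructType(df.schema.fields :+ StructField("row_idx", LongType, false))
  sqlContext.createDataFrame(indexed, schema)
}

// Join the two frames on the synthetic index, then drop it.
val combined = withRowIdx(d1).join(withRowIdx(d2), "row_idx").drop("row_idx")
```

This relies on both frames having the same row count; the pairing is by partition-wise order, which is only meaningful if that order is.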

case class UserDefinedFunction(f: AnyRef, dataType: DataType) extends Product with Serializable

Experimental

A user-defined function. To create one, use the udf functions in functions. As an example:

import sqlContext._

// Defined a UDF that returns true or false based on some numeric score.

val predict = udf((score: Double) => if (score > 0.5) true else false)

// Projects a column that adds a prediction column based on the score column.

df.select( predict(df("score")) )

That works!

import sqlContext.implicits._     // needed so that `$` in the code below is recognized

dfOffline.select($"member_id", formatDate($"create_time") as "costDate", memberCastoneDay($"goods_price", $"goods_num") as "cost").registerTempTable("tmpMiddleData1")
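`$"x"` is just shorthand enabled by the implicits import; without it, the same projection can be written with `col(...)`. A sketch of the equivalent form (`formatDate` and `memberCastoneDay` are UDFs assumed to be defined elsewhere):

```scala
import org.apache.spark.sql.functions.col

// Equivalent projection without the $ interpolator.
dfOffline.select(
  col("member_id"),
  formatDate(col("create_time")).as("costDate"),
  memberCastoneDay(col("goods_price"), col("goods_num")).as("cost")
).registerTempTable("tmpMiddleData1")
```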


Source: https://www.mshxw.com/it/317677.html