Powered By Blogger

Tuesday, December 13, 2022

Array of structs

 scala> import spark.implicits._

import spark.implicits._


scala> import org.apache.spark.sql.types._

import org.apache.spark.sql.types._


scala> import org.apache.spark.sql._

import org.apache.spark.sql._


scala>  val arrayStructData = Seq(

     |       Row("James",List(Row("Java","XX",120),Row("Scala","XA",300))),

     |       Row("Michael",List(Row("Java","XY",200),Row("Scala","XB",500))),

     |       Row("Robert",List(Row("Java","XZ",400),Row("Scala","XC",250))),

     |       Row("Washington",null)

     |     )

arrayStructData: Seq[org.apache.spark.sql.Row] = List([James,List([Java,XX,120], [Scala,XA,300])], [Michael,List([Java,XY,200], [Scala,XB,500])], [Robert,List([Java,XZ,400], [Scala,XC,250])], [Washington,null])



    val arrayStructSchema = new StructType().add("name",StringType)

      .add("booksIntersted",ArrayType(new StructType()

        .add("name",StringType)

        .add("author",StringType)

        .add("pages",IntegerType)))



arrayStructSchema: org.apache.spark.sql.types.StructType = StructType(StructField(name,StringType,true), StructField(booksIntersted,ArrayType(StructType(StructField(name,StringType,true), StructField(author,StringType,true), StructField(pages,IntegerType,true)),true),true))


scala>     val df = spark.createDataFrame(spark.sparkContext

     |         .parallelize(arrayStructData),arrayStructSchema)

df: org.apache.spark.sql.DataFrame = [name: string, booksIntersted: array<struct<name:string,author:string,pages:int>>]


scala>     df.printSchema()

root

 |-- name: string (nullable = true)

 |-- booksIntersted: array (nullable = true)

 |    |-- element: struct (containsNull = true)

 |    |    |-- name: string (nullable = true)

 |    |    |-- author: string (nullable = true)

 |    |    |-- pages: integer (nullable = true)



scala>     df.show(false)

+----------+-----------------------------------+

|name      |booksIntersted                     |

+----------+-----------------------------------+

|James     |[[Java, XX, 120], [Scala, XA, 300]]|

|Michael   |[[Java, XY, 200], [Scala, XB, 500]]|

|Robert    |[[Java, XZ, 400], [Scala, XC, 250]]|

|Washington|null                               |

+----------+-----------------------------------+


Thursday, December 8, 2022

unionByName with an empty DataFrame

 import spark.implicits._

import org.apache.spark.sql.types.
import org.apache.spark.sql._

val data = Seq(("James","Sales",34), ("Michael","Sales",56),
("Robert","Sales",30), ("Maria","Finance",24) )
val df1 = data.toDF("name","dept","age")
df1.printSchema()



val schema = StructType(
StructField("name", StringType, true) ::
StructField("dept", StringType, false) ::
StructField("age", IntegerType, false) :: Nil)


val df = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)


val merged_df = df1.unionByName(df)
merged_df.show(false)

val merged_df2 = df.unionByName(df1)
merged_df2.show(false)