Powered By Blogger

Sunday, April 19, 2020

Fill (Substitute) null Values in a Spark DataFrame


import spark.implicits._
// Sample rows; the last two deliberately contain nulls in the "hour" and
// "item" positions so that the effect of na.fill is visible.
val sampleRows: Seq[(String, String, Int, String, String, String)] = Seq(
  ("2019-12-09", "11", 1, "L1", "I11", "2018"),
  ("2019-12-09", "11", 2, "L2", "I10", "2018"),
  ("2019-12-09", "11", 3, "L3", "I4", "2018"),
  ("2019-12-09", "11", 4, "L4", "I4", "2015"),
  ("2019-12-09", "11", 5, "L5", "I4", "2019"),
  ("2019-12-09", "11", 6, "L6", null, "2014"),
  ("2019-12-09", null, 7, "L7", null, "2013")
)

// Build the input DataFrame with explicit column names.
val sourceDF = sampleRows.toDF("date", "hour", "order", "line", "item", "time")
sourceDF.show(truncate = false)

// Placeholder written into null cells, and the columns it is applied to.
val nullPlaceholder = "-99999"
val columnsToFill = Seq("hour", "item")

// Replace nulls in the selected columns only; other columns are left untouched.
val filledDF = sourceDF.na.fill(nullPlaceholder, columnsToFill)
filledDF.show(truncate = false)



scala> sourceDF.show(false)
+----------+----+-----+----+----+----+
|date      |hour|order|line|item|time|
+----------+----+-----+----+----+----+
|2019-12-09|11  |1    |L1  |I11 |2018|
|2019-12-09|11  |2    |L2  |I10 |2018|
|2019-12-09|11  |3    |L3  |I4  |2018|
|2019-12-09|11  |4    |L4  |I4  |2015|
|2019-12-09|11  |5    |L5  |I4  |2019|
|2019-12-09|11  |6    |L6  |null|2014|
|2019-12-09|null|7    |L7  |null|2013|
+----------+----+-----+----+----+----+


scala> val filledDF = sourceDF.na.fill("-99999",Seq("hour","item"))
filledDF: org.apache.spark.sql.DataFrame = [date: string, hour: string ... 4 more fields]


scala> filledDF.show(false)
+----------+------+-----+----+------+----+
|date      |hour  |order|line|item  |time|
+----------+------+-----+----+------+----+
|2019-12-09|11    |1    |L1  |I11   |2018|
|2019-12-09|11    |2    |L2  |I10   |2018|
|2019-12-09|11    |3    |L3  |I4    |2018|
|2019-12-09|11    |4    |L4  |I4    |2015|
|2019-12-09|11    |5    |L5  |I4    |2019|
|2019-12-09|11    |6    |L6  |-99999|2014|
|2019-12-09|-99999|7    |L7  |-99999|2013|
+----------+------+-----+----+------+----+


scala>

No comments:

Post a Comment