
Monday, May 4, 2020

Filter data in Spark using isin

Spark's Column.isin takes a varargs sequence of values and returns true when the column's value matches any of them, the equivalent of SQL's IN clause. The spark-shell session below builds a small orders DataFrame and keeps only the rows whose order_number appears in a given list.

scala> :paste
// Entering paste mode (ctrl-D to finish)

import java.sql.Date
import org.apache.spark.sql._
import spark.implicits._

// Build a small orders DataFrame: one row per order line
val inputDF: DataFrame = Seq(
  ("50104044", "order1", "orderline11", "2020-03", Date.valueOf("2020-03-17")),
  ("50104279", "order2", "orderline21", "2020-03", Date.valueOf("2020-03-18")),
  ("50104279", "order22", "orderline221", "2020-03", Date.valueOf("2020-03-19")),
  ("50102271", "order3", "orderline31", "2020-02", Date.valueOf("2020-02-11")),
  ("50104279", "order4", "orderline41", "2020-02", Date.valueOf("2020-02-11")),
  ("50104279", "order5", "orderline51", "2020-01", Date.valueOf("2020-01-11"))
).toDF("key", "order_number", "order_line_key", "order_mon", "order_date")

// Keep only the rows whose order_number appears in the list;
// isin expands the list as varargs, like SQL's IN clause
val items = List("order1", "order2", "order3")
val filterdf = inputDF.filter($"order_number".isin(items: _*))
filterdf.show(false)

// Exiting paste mode, now interpreting.

+--------+------------+--------------+---------+----------+
|key     |order_number|order_line_key|order_mon|order_date|
+--------+------------+--------------+---------+----------+
|50104044|order1      |orderline11   |2020-03  |2020-03-17|
|50104279|order2      |orderline21   |2020-03  |2020-03-18|
|50102271|order3      |orderline31   |2020-02  |2020-02-11|
+--------+------------+--------------+---------+----------+

import java.sql.Date
import org.apache.spark.sql._
import spark.implicits._
inputDF: org.apache.spark.sql.DataFrame = [key: string, order_number: string ... 3 more fields]
items: List[String] = List(order1, order2, order3)
filterdf: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [key: string, order_number: string ... 3 more fields]

scala>
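A few related patterns are worth knowing. The minimal sketch below reuses the inputDF and items values defined above (variable names like excludedDF are just illustrative; output omitted). Negating the predicate with ! gives NOT IN semantics, Spark 2.4+ adds Column.isInCollection, which takes the collection itself rather than a varargs expansion, and the same filter can be written in SQL through a temp view.

// NOT IN: keep rows whose order_number is not in the list
val excludedDF = inputDF.filter(!$"order_number".isin(items: _*))
excludedDF.show(false)

// Spark 2.4+: pass the collection directly, no varargs expansion needed
val filteredDF2 = inputDF.filter($"order_number".isInCollection(items))
filteredDF2.show(false)

// SQL equivalent of the isin filter
inputDF.createOrReplaceTempView("orders")
spark.sql("select * from orders where order_number in ('order1', 'order2', 'order3')").show(false)

For a very large value list, a broadcast join against a DataFrame of the keys usually scales better than one big isin expression.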