Powered By Blogger

Sunday, November 24, 2019

DataFrame Sample code

/**
 * Small Spark SQL demo: loads a comma-separated text file of people into a typed
 * Dataset and runs a few basic queries (schema print, select, filter, group-by,
 * derived column) against it.
 */
object DataFrameExample {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInputPath =
    "/Users/basan/workspace-spark/sparkDemo1/spark-data/friends-data.csv"

  /** One row of the input file. */
  final case class Person(ID: Int, name: String, age: Int, numFriends: Int)

  /**
   * Parses one comma-separated line into a [[Person]].
   *
   * Fields are read positionally as `ID,name,age,numFriends`. Surrounding
   * whitespace on each field is trimmed so that rows such as `"1, Will, 33, 385"`
   * parse instead of failing in `toInt`.
   *
   * @param line a single line of the input file
   * @return the parsed person
   * @throws NumberFormatException          if ID, age or numFriends is not an integer
   * @throws ArrayIndexOutOfBoundsException if the line has fewer than four fields
   */
  def mapper(line: String): Person = {
    val fields = line.split(",").map(_.trim)
    Person(fields(0).toInt, fields(1), fields(2).toInt, fields(3).toInt)
  }

  /**
   * Entry point.
   *
   * @param args optional; `args(0)` overrides the input file path. When absent,
   *             the built-in default path is used.
   */
  def main(args: Array[String]): Unit = {

    // Only log errors from anything under the "org" logger hierarchy.
    Logger.getLogger("org").setLevel(Level.ERROR)

    val inputPath = args.headOption.getOrElse(DefaultInputPath)

    val spark = SparkSession.builder
      .appName("Data frame test")
      .master("local[*]")
      //needed only in windows check it
      // .config("spark.sql.warehouse.dir", "/Users/basan/Documents/Spark/tempwarehouse")
      //.config("spark.driver.allowMultipleContexts", "true")
      .getOrCreate()

    // Stop the session even when one of the jobs below throws.
    try {
      // Brings the encoders needed by toDS() into scope.
      import spark.implicits._

      val lines = spark.sparkContext.textFile(inputPath)

      // Cached because the same Dataset is queried several times below.
      val people = lines.map(mapper).toDS().cache()

      println("Inferred schema")
      people.printSchema()

      println("select the name column")
      people.select("name").show()

      // NOTE(review): the message says "over 21" but the predicate keeps only
      // ages strictly below 21 (21-year-olds are dropped too) - confirm intent.
      println("Filter out anyone over 21")
      people.filter(people("age") < 21).show()

      println("group by age")
      people.groupBy("age").count().show()

      println("make everyone 10 years older")
      val olderAge = people("age") + 10
      people.select(people("name"), olderAge).show()
      people.select(people("name"), olderAge.alias("newcolumn")).show()
    } finally {
      spark.stop()
    }
  }

}

No comments:

Post a Comment