Powered By Blogger

Sunday, October 6, 2019

Converting string date value to Date object

scala> import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.IntegerType

scala> import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StringType

scala> import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructType

scala> import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructField

scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row

scala> import scala.collection.JavaConversions._
import scala.collection.JavaConversions._

scala>     val data = Seq(("Java", 20000 , "2012-02-05"), ("Python", 100000 , "2012-02-06"), ("Scala", 3000 , "2012-02-07"))
data: Seq[(String, Int, String)] = List((Java,20000,2012-02-05), (Python,100000,2012-02-06), (Scala,3000,2012-02-07))

scala>     val tableColumns = List(("language", "string"), ("people", "integer"), ("registerdate", "String"))
tableColumns: List[(String, String)] = List((language,string), (people,integer), (registerdate,String))

scala>     var schema = new StructType
schema: org.apache.spark.sql.types.StructType = StructType()

scala>     for (i <- tableColumns)
     |       schema = schema.add(i._1, i._2)

scala>       val rowData = data.map(attributes => Row(attributes._1, attributes._2 ,  attributes._3))
rowData: Seq[org.apache.spark.sql.Row] = List([Java,20000,2012-02-05], [Python,100000,2012-02-06], [Scala,3000,2012-02-07])

scala>       var dfFromData4 = spark.createDataFrame(rowData,schema)
dfFromData4: org.apache.spark.sql.DataFrame = [language: string, people: int ... 1 more field]

scala>       dfFromData4.show(2)
+--------+------+------------+
|language|people|registerdate|
+--------+------+------------+
|    Java| 20000|  2012-02-05|
|  Python|100000|  2012-02-06|
+--------+------+------------+
only showing top 2 rows


scala>

scala>         import org.apache.spark.sql.functions.to_date
import org.apache.spark.sql.functions.to_date

scala>         val dateFormat = "yyyy-dd-MM"
dateFormat: String = yyyy-dd-MM

scala>        var dfFromData5 = dfFromData4.withColumn("registerdate2", to_date(col("registerdate"), dateFormat))
dfFromData5: org.apache.spark.sql.DataFrame = [language: string, people: int ... 2 more fields]

scala>        dfFromData5.show(3)
+--------+------+------------+-------------+
|language|people|registerdate|registerdate2|
+--------+------+------------+-------------+
|    Java| 20000|  2012-02-05|   2012-05-02|
|  Python|100000|  2012-02-06|   2012-06-02|
|   Scala|  3000|  2012-02-07|   2012-07-02|
+--------+------+------------+-------------+


scala> dfFromData5.printSchema()
root
 |-- language: string (nullable = true)
 |-- people: integer (nullable = true)
 |-- registerdate: string (nullable = true)
 |-- registerdate2: date (nullable = true)

1 comment:

  1. // Build a DataFrame with a string date column, then convert that column to DateType in place.
    import org.apache.spark.sql.types.StructType
    import org.apache.spark.sql.Row
    import org.apache.spark.sql.functions.{col, to_date}
    // Explicit converters instead of the deprecated implicit JavaConversions:
    // createDataFrame(rows, schema) takes a java.util.List[Row], so we call .asJava explicitly.
    import scala.collection.JavaConverters._

    // Sample rows: (language, user count, registration date as an ISO yyyy-MM-dd string).
    val data = Seq(("Java", 20000 , "2012-02-05"), ("Python", 100000 , "2012-02-06"), ("Scala", 3000 , "2012-02-07"))
    // Column name -> Spark SQL type name pairs used to build the schema.
    val tableColumns = List(("language", "string"), ("people", "integer"), ("registerdate", "string"))
    // Fold the column definitions into a StructType instead of mutating a var.
    val schema = tableColumns.foldLeft(new StructType) { case (s, (name, tpe)) => s.add(name, tpe) }
    val rowData = data.map { case (lang, people, date) => Row(lang, people, date) }
    val dfFromData4 = spark.createDataFrame(rowData.asJava, schema)
    dfFromData4.show(2)

    import org.apache.spark.sql.functions.to_date
    // FIX: the data is ISO yyyy-MM-dd. The original pattern "yyyy-dd-MM" silently swapped
    // month and day (the post's own output shows 2012-02-05 parsed as 2012-05-02).
    val dateFormat = "yyyy-MM-dd"
    // Reusing the existing column name overwrites "registerdate", converting string -> date in place.
    val dfFromData5 = dfFromData4.withColumn("registerdate", to_date(col("registerdate"), dateFormat))
    dfFromData5.show(3)
    dfFromData5.printSchema()

    Reusing the existing column name in withColumn replaces the old string column with the new date-typed values in place (instead of adding a second column).

    ReplyDelete