scala> import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.IntegerType
scala> import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StringType
scala> import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructType
scala> import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructField
scala> import org.apache.spark.sql.Row
import org.apache.spark.sql.Row
scala> import scala.collection.JavaConversions._
import scala.collection.JavaConversions._
scala> val data = Seq(("Java", 20000 , "2012-02-05"), ("Python", 100000 , "2012-02-06"), ("Scala", 3000 , "2012-02-07"))
data: Seq[(String, Int, String)] = List((Java,20000,2012-02-05), (Python,100000,2012-02-06), (Scala,3000,2012-02-07))
scala> val tableColumns = List(("language", "string"), ("people", "integer"), ("registerdate", "String"))
tableColumns: List[(String, String)] = List((language,string), (people,integer), (registerdate,String))
scala> var schema = new StructType
schema: org.apache.spark.sql.types.StructType = StructType()
scala> for (i <- tableColumns)
     | schema = schema.add(i._1, i._2)
scala> val rowData = data.map(attributes => Row(attributes._1, attributes._2 , attributes._3))
rowData: Seq[org.apache.spark.sql.Row] = List([Java,20000,2012-02-05], [Python,100000,2012-02-06], [Scala,3000,2012-02-07])
scala> var dfFromData4 = spark.createDataFrame(rowData,schema)
dfFromData4: org.apache.spark.sql.DataFrame = [language: string, people: int ... 1 more field]
scala> dfFromData4.show(2)
+--------+------+------------+
|language|people|registerdate|
+--------+------+------------+
| Java| 20000| 2012-02-05|
| Python|100000| 2012-02-06|
+--------+------+------------+
only showing top 2 rows
scala>
scala> import org.apache.spark.sql.functions.{col, to_date}
import org.apache.spark.sql.functions.{col, to_date}
scala> val dateFormat = "yyyy-dd-MM"
dateFormat: String = yyyy-dd-MM
scala> var dfFromData5 = dfFromData4.withColumn("registerdate2", to_date(col("registerdate"), dateFormat))
dfFromData5: org.apache.spark.sql.DataFrame = [language: string, people: int ... 2 more fields]
scala> dfFromData5.show(3)
+--------+------+------------+-------------+
|language|people|registerdate|registerdate2|
+--------+------+------------+-------------+
| Java| 20000| 2012-02-05| 2012-05-02|
| Python|100000| 2012-02-06| 2012-06-02|
| Scala| 3000| 2012-02-07| 2012-07-02|
+--------+------+------------+-------------+
scala> dfFromData5.printSchema()
root
|-- language: string (nullable = true)
|-- people: integer (nullable = true)
|-- registerdate: string (nullable = true)
|-- registerdate2: date (nullable = true)
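The schema loop works because StructType.add accepts a column name and a type-name string, so each (name, type) pair in tableColumns becomes a field. For reference, the same schema could be written with explicit StructField objects; this is only a sketch, and schema2 is an illustrative name that does not appear in the session above:

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Equivalent to the add-in-a-loop schema: declare each field explicitly
val schema2 = StructType(Seq(
  StructField("language", StringType, nullable = true),
  StructField("people", IntegerType, nullable = true),
  StructField("registerdate", StringType, nullable = true)
))

The same code follows as a plain listing, this time overwriting the registerdate column in place.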
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.Row
import scala.collection.JavaConversions._
val data = Seq(("Java", 20000 , "2012-02-05"), ("Python", 100000 , "2012-02-06"), ("Scala", 3000 , "2012-02-07"))
val tableColumns = List(("language", "string"), ("people", "integer"), ("registerdate", "String"))
var schema = new StructType
for (i <- tableColumns)
  schema = schema.add(i._1, i._2)
val rowData = data.map(attributes => Row(attributes._1, attributes._2 , attributes._3))
var dfFromData4 = spark.createDataFrame(rowData,schema)
dfFromData4.show(2)
import org.apache.spark.sql.functions.{col, to_date}
val dateFormat = "yyyy-dd-MM"
var dfFromData5 = dfFromData4.withColumn("registerdate", to_date(col("registerdate"), dateFormat))
dfFromData5.show(3)
dfFromData5.printSchema()
Because withColumn is given the existing column name here, to_date overwrites the old registerdate column, updating its values to the new date data type.
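If the register dates were already in Spark's default yyyy-MM-dd layout, a plain cast to DateType would be another way to change the column's type in place, without a format string. A minimal sketch, assuming the dfFromData4 frame from above (dfCast is just an illustrative name):

import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.DateType

// Overwrite the string column with the result of a cast (assumes yyyy-MM-dd input)
val dfCast = dfFromData4.withColumn("registerdate", col("registerdate").cast(DateType))
dfCast.printSchema()  // registerdate: date (nullable = true)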