Add a derived column based on an existing column's value — here, replacing nulls in `tracking_number` with a sentinel value using `when(...).otherwise(...)`
scala> val df1 = Seq(
| (
| "track1",
| "2019-11-18T16:11:28",
| "linekey1",
| "item1",
| "123"
| ),
| (
| "track2",
| "2019-11-18T16:11:28",
| "linekey2",
| "item2",
| null
| )).toDF(
| "number",
| "source_modified_timestamp",
| "line_key",
| "item_no",
| "tracking_number"
| )
df1: org.apache.spark.sql.DataFrame = [number: string, source_modified_timestamp: string ... 3 more fields]
scala> df1.show
+------+-------------------------+--------+-------+---------------+
|number|source_modified_timestamp|line_key|item_no|tracking_number|
+------+-------------------------+--------+-------+---------------+
|track1| 2019-11-18T16:11:28|linekey1| item1| 123|
|track2| 2019-11-18T16:11:28|linekey2| item2| null|
+------+-------------------------+--------+-------+---------------+
scala> df1.withColumn("tracking_number_rm", when($"tracking_number".isNull, lit("-999999")).otherwise(lit($"tracking_number"))).show
+------+-------------------------+--------+-------+---------------+------------------+
|number|source_modified_timestamp|line_key|item_no|tracking_number|tracking_number_rm|
+------+-------------------------+--------+-------+---------------+------------------+
|track1| 2019-11-18T16:11:28|linekey1| item1| 123| 123|
|track2| 2019-11-18T16:11:28|linekey2| item2| null| -999999|
+------+-------------------------+--------+-------+---------------+------------------+
scala> df1.withColumn("tracking_number_rm", when($"tracking_number".isNull, "-1111").otherwise($"tracking_number")).show
+------+-------------------------+--------+-------+---------------+------------------+
|number|source_modified_timestamp|line_key|item_no|tracking_number|tracking_number_rm|
+------+-------------------------+--------+-------+---------------+------------------+
|track1| 2019-11-18T16:11:28|linekey1| item1| 123| 123|
|track2| 2019-11-18T16:11:28|linekey2| item2| null| -1111|
+------+-------------------------+--------+-------+---------------+------------------+
// NOTE(review): the session below is a verbatim duplicate of the one above (likely a copy/scrape artifact); content retained unchanged.
scala> val df1 = Seq(
| (
| "track1",
| "2019-11-18T16:11:28",
| "linekey1",
| "item1",
| "123"
| ),
| (
| "track2",
| "2019-11-18T16:11:28",
| "linekey2",
| "item2",
| null
| )).toDF(
| "number",
| "source_modified_timestamp",
| "line_key",
| "item_no",
| "tracking_number"
| )
df1: org.apache.spark.sql.DataFrame = [number: string, source_modified_timestamp: string ... 3 more fields]
scala> df1.show
+------+-------------------------+--------+-------+---------------+
|number|source_modified_timestamp|line_key|item_no|tracking_number|
+------+-------------------------+--------+-------+---------------+
|track1| 2019-11-18T16:11:28|linekey1| item1| 123|
|track2| 2019-11-18T16:11:28|linekey2| item2| null|
+------+-------------------------+--------+-------+---------------+
scala> df1.withColumn("tracking_number_rm", when($"tracking_number".isNull, lit("-999999")).otherwise(lit($"tracking_number"))).show
+------+-------------------------+--------+-------+---------------+------------------+
|number|source_modified_timestamp|line_key|item_no|tracking_number|tracking_number_rm|
+------+-------------------------+--------+-------+---------------+------------------+
|track1| 2019-11-18T16:11:28|linekey1| item1| 123| 123|
|track2| 2019-11-18T16:11:28|linekey2| item2| null| -999999|
+------+-------------------------+--------+-------+---------------+------------------+
scala> df1.withColumn("tracking_number_rm", when($"tracking_number".isNull, "-1111").otherwise($"tracking_number")).show
+------+-------------------------+--------+-------+---------------+------------------+
|number|source_modified_timestamp|line_key|item_no|tracking_number|tracking_number_rm|
+------+-------------------------+--------+-------+---------------+------------------+
|track1| 2019-11-18T16:11:28|linekey1| item1| 123| 123|
|track2| 2019-11-18T16:11:28|linekey2| item2| null| -1111|
+------+-------------------------+--------+-------+---------------+------------------+
No comments:
Post a Comment