
Saturday, November 9, 2019

Spark word count

The example below reads a text file into an RDD, splits each line into words, maps each word to a (word, 1) pair, and sums the counts per word with reduceByKey.

val rdd1 = sc.textFile("/spark_data/spark1-test.txt")   // read the file as an RDD of lines
val rdd2 = rdd1.flatMap(x => x.split(" "))              // split each line into words
val rdd3 = rdd2.map(word => (word, 1))                  // pair each word with a count of 1
val rdd4 = rdd3.reduceByKey((x, y) => x + y)            // sum the counts for each word
rdd4.collect
//val rdd4 = rdd3.reduceByKey(_ + _)                    // shorthand for the same reduce
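
The same pipeline can also be chained into a single expression. The sketch below adds one extra step, sortBy, to order the results by frequency, highest first; the variable name counts is illustrative:

val counts = sc.textFile("/spark_data/spark1-test.txt")
  .flatMap(line => line.split(" "))
  .map(word => (word, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, ascending = false)   // order by count, descending
counts.collect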


The step-by-step version, run in spark-shell:

scala> val rdd1 = sc.textFile("/spark_data/spark1-test.txt")
rdd1: org.apache.spark.rdd.RDD[String] = /spark_data/spark1-test.txt MapPartitionsRDD[6] at textFile at <console>:27

scala> val rdd2 = rdd1.flatMap(x=> x.split(" "))
rdd2: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[7] at flatMap at <console>:29

scala> val rdd3 = rdd2.map(word => (word,1))
rdd3: org.apache.spark.rdd.RDD[(String, Int)] = MapPartitionsRDD[8] at map at <console>:31

scala> val rdd4 = rdd3.reduceByKey((x,y)=> {x+y})
rdd4: org.apache.spark.rdd.RDD[(String, Int)] = ShuffledRDD[9] at reduceByKey at <console>:33

scala> rdd4.collect
res2: Array[(String, Int)] = Array((hive,1), (am,1), (training,1), (attending,1), (hello,1), (pig,1), (spark,1), (I,1), (Basan,1))

scala>
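
For comparison, a minimal sketch of the same count with the Dataset API available since Spark 2.x, assuming the spark SparkSession that spark-shell provides; the variable names lines and counts are illustrative:

import spark.implicits._                                         // encoders for flatMap/groupByKey
val lines  = spark.read.textFile("/spark_data/spark1-test.txt")  // Dataset[String]
val counts = lines.flatMap(_.split(" ")).groupByKey(identity).count()
counts.show()

Here groupByKey(identity).count() is the Dataset analogue of map(word => (word, 1)).reduceByKey(_ + _) on the RDD.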
