Pair RDD Operations: Flat Map
// NOTE(review): this call appears before any `x` is defined in these notes —
// it likely belongs after one of the examples below; verify against the original.
x.collect()
Flat Map
// Split each input sentence into words; flatMap flattens the per-sentence
// word arrays into one RDD of individual words.
val x = sc.parallelize(List("spark rdd example", "sample example"))
val y = x.flatMap(line => line.split(" "))
Map
// Pair each word with an initial count of 1, yielding a pair RDD of (word, 1).
val z = y.map(word => (word, 1))
Filter
// Build an RDD of the integers 1 through 10 (default partitioning).
// NOTE(review): the heading says "Filter" but no filter call appears here —
// a step like `x.filter(_ % 2 == 0)` seems to have been lost from the notes.
val x = sc.parallelize(1.to(10))
Or with an explicit number of partitions:
Reduce
// Sum the integers 1..10, distributed across 2 partitions.
val x = sc.parallelize(1 to 10, 2)
// reduce is an ACTION: it returns a plain Int, not an RDD.
val y = x.reduce((a, b) => a + b)
// Fix: `y` is an Int and has no collect() method — the original
// `y.collect()` would not compile; print the scalar result instead.
println(y)
ReduceByKey
// Count occurrences per key by summing the 1-values sharing each key.
val x = sc.parallelize(Array(("a", 1), ("b", 1), ("a", 1), ("a", 1), ("b", 1), ("b", 1), ("b", 1), ("b", 1)))
// Fix: reduceByKey's lambda receives two VALUES for the same key — the key
// itself is never passed in. The original `(key, value)` naming was misleading.
val y = x.reduceByKey((a, b) => a + b)
y.collect()
SortByKey
// Sort the pair RDD by key, ascending (the default, made explicit here).
val y = x.sortByKey(ascending = true)
y.collect()
Joins
// Pair RDD of (brand, profit) for the join example.
val salesprofit = sc.parallelize(Array(("Cadbury's", 3.5),("Nestle", 2.8),("Mars", 2.5), ("Thorton's", 2.2)));
// NOTE(review): `join` is never defined in these notes — a second pair RDD and
// a line like `val join = salesprofit.join(other)` appear to be missing;
// verify against the original example.
join.collect();
Spark SQL
// Load a JSON file into a DataFrame and demonstrate basic DataFrame operations.
val df = sqlContext.read.json("/Users/syedrizvi/Desktop/HadoopExamples/Spark/sample.json")
df.show()
df.printSchema()
// Fix: the original used typographic quotes (“name”), which do not compile
// in Scala source — replaced with plain ASCII double quotes.
df.select("name").show()
// Select the name column plus age incremented by 1.
df.select(df("name"), df("age") + 1).show()
// Keep only rows with age greater than 21.
df.filter(df("age") > 21).show()
// Count rows per distinct age.
df.groupBy("age").count().show()
// Build a DataFrame from a text file of "name,age" lines via the Person case class.
// Fix: the original was garbled by a hard line wrap splitting `.map` into
// ".m" / "ap(" — reconstructed here as a single readable method chain.
val peopleDF = spark.sparkContext
  .textFile("/Users/syedrizvi/Desktop/HadoopExamples/Spark/people.txt")
  .map(_.split(","))
  .map(attributes => Person(attributes(0), attributes(1).trim.toInt))
  .toDF()
// Register the DataFrame as a temp view so it can be queried by name in SQL.
peopleDF.createOrReplaceTempView("people")
// Run SQL against the "people" temp view registered earlier.
val teenagersDF = spark.sql("SELECT name, age FROM people WHERE age BETWEEN 13 AND 19")
// NOTE(review): in a real source file these imports must precede their use —
// `spark.implicits._` is required before calling .toDF(); the ordering here
// follows the notes as written.
import spark.implicits._
import spark.sql
// `sql` (imported above) is shorthand for spark.sql(...); show(false) prints
// column values untruncated.
sql("select current_database()").show(false)
Spark Streaming
To run the example from source, first import the required Spark Streaming classes:
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}