IDEA environment (local mode):
package wordcount

import org.apache.spark.{SparkConf, SparkContext}

object wordCountScala extends App {
  // setMaster("local") runs the job locally inside IDEA; drop it when submitting to a cluster
  val conf = new SparkConf().setAppName("Wordcount").setMaster("local")
  val sc = new SparkContext(conf)
  val line = sc.textFile("D:\\win7远程\\14期大数据潭州课件\\第三阶段:实时开发(plus)\\2020-0105-Spark-SQL\\数据\\wordcount.txt")
  // split lines into words, pair each word with 1, then sum the 1s per word
  val result = line.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  result.foreach(println)
  sc.stop()
}
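To see what each stage of the pipeline produces, here is a minimal sketch (reusing the sc defined above; the sample lines are made up for illustration):

// same pipeline on a tiny in-memory dataset
val sample = sc.parallelize(Seq("hello spark", "hello world"))
val words  = sample.flatMap(_.split(" "))   // "hello", "spark", "hello", "world"
val pairs  = words.map((_, 1))              // ("hello",1), ("spark",1), ("hello",1), ("world",1)
val counts = pairs.reduceByKey(_ + _)       // ("hello",2), ("spark",1), ("world",1)
counts.collect().foreach(println)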
HDFS environment (cluster mode):
package wordcount

import org.apache.spark.{SparkConf, SparkContext}

object wordCountScala_HDFS extends App {
  // no setMaster here: the master URL is supplied by spark-submit
  val conf = new SparkConf().setAppName("Wordcount")
  val sc = new SparkContext(conf)
  val line = sc.textFile("hdfs://bigdata166:9000/testdata/wordcount.txt")
  val result = line.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
  // prints on the executors, not the driver (see the pitfall notes below)
  result.foreach(println)
  // collect() brings the results back to the driver so they can be printed there
  result.collect().foreach(println)
  sc.stop()
}
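Instead of printing, the counts can also be written back to HDFS. A minimal sketch (the output path is hypothetical; saveAsTextFile fails if the path already exists):

// persist one text file per partition under the given output directory
result.saveAsTextFile("hdfs://bigdata166:9000/testdata/wordcount_out")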
Submit the packaged jar to the standalone cluster:
[root@bigdata166 bin]# ./spark-submit --master spark://bigdata166:7077 --class wordcount.wordCountScala_HDFS ../testjar/scalaTest-1.0-SNAPSHOT.jar
Other pitfalls:
Sometimes, if a spark-shell session is left running, submitted jobs can run out of memory; the console will then prompt you to check the Spark UI page.
With collect() the results are shipped back to the driver, so they show up in the driver's log/console; a bare foreach(println) runs on the executors, so its output apparently lands in each executor's stdout instead.
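A minimal sketch of the difference, reusing the result RDD from the job above:

// runs on the executors: each partition prints to that executor's stdout
result.foreach(println)
// collect() ships all results to the driver first, so this prints in the
// driver's console/log (only safe when the result set is small)
result.collect().foreach(println)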
