Spark Streaming and Kafka Integration Example

The pom file:



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.cll</groupId>
    <artifactId>day01</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>aliyun</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>jboss</id>
            <url>http://repository.jboss.com/nexus/content/groups/public</url>
        </repository>
    </repositories>

    <properties>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <scala.compat.version>2.11</scala.compat.version>
        <hadoop.version>2.7.4</hadoop.version>
        <spark.version>2.2.0</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive-thriftserver_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <!-- Spark Streaming + Kafka integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.4</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.3.1</version>
        </dependency>
        <dependency>
            <groupId>com.typesafe</groupId>
            <artifactId>config</artifactId>
            <version>1.3.3</version>
        </dependency>
        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.38</version>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <!-- compiles Java sources -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.5.1</version>
            </plugin>
            <!-- compiles Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                        <configuration>
                            <args>
                                <arg>-dependencyfile</arg>
                                <arg>${project.build.directory}/.scala_dependencies</arg>
                            </args>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-surefire-plugin</artifactId>
                <version>2.18.1</version>
                <configuration>
                    <useFile>false</useFile>
                    <disableXmlReport>true</disableXmlReport>
                    <includes>
                        <include>**/*Test.*</include>
                        <include>**/*Suite.*</include>
                    </includes>
                </configuration>
            </plugin>
            <!-- builds a fat jar at package time -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass></mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
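
With this build section, mvn package compiles both the Java and Scala sources and the shade plugin bundles everything into a fat jar (with the jar signature files stripped so the merged jar verifies), which can then be submitted to a cluster with spark-submit. The mainClass entry is left empty here, so the main class must be named on the command line.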

Code:

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SparkStreamingKafka {
  def main(args: Array[String]): Unit = {
    // 1. Create the StreamingContext
    val conf: SparkConf = new SparkConf().setAppName("wc").setMaster("local[*]")
    val sc: SparkContext = new SparkContext(conf)
    sc.setLogLevel("WARN")

    val ssc = new StreamingContext(sc, Seconds(5))
    ssc.checkpoint("./ssc")

    // 2. Prepare the connection parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "node02:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "ssc",
      // earliest: if a partition has a committed offset, consume from it; otherwise consume from the beginning
      // latest: if a partition has a committed offset, consume from it; otherwise consume only newly produced data for that partition
      // none: if every partition has a committed offset, consume from those offsets; if any partition lacks one, throw an exception

      // "latest" here means: resume from the committed offset when one exists,
      // otherwise start from newly arriving data
      "auto.offset.reset" -> "latest",
      // false means offsets are committed manually; by default Spark Streaming saves them
      // to the checkpoint, but they can also be stored elsewhere, e.g. in MySQL/Redis
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // 3. Declare the topics to subscribe to
    val topics = Array("spark_kafka")
    // 4. Connect to the corresponding partitions and receive data
    val recordDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent, // location strategy: Spark and Kafka usually don't run on the same machine, so use the strategy the source code strongly recommends
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams) // consumer strategy: also the one the source code strongly recommends
    )

    // 5. Extract the value
    // the DStream carries the lines that were sent; _ is one record, i.e. a ConsumerRecord[String, String]
    val valueDStream: DStream[String] = recordDStream.map(_.value())

    // 6. Word count
    val resDStream: DStream[(String, Int)] = valueDStream.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    // 7. Output
    resDStream.print()

    // 8. Start the computation
    ssc.start()
    // 9. Wait for termination
    ssc.awaitTermination()
  }

}
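
Since enable.auto.commit is false, the offsets above survive only through the checkpoint directory. As a minimal sketch of the manual alternative the comment mentions, the HasOffsetRanges/CanCommitOffsets API of spark-streaming-kafka-0-10 can commit offsets back to Kafka after each batch completes; this variant would replace steps 5-7 above (persisting the same OffsetRange values to MySQL/Redis follows the same shape, but with your own storage code):

import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges}

// recordDStream is the InputDStream created in step 4 above
recordDStream.foreachRDD { rdd =>
  // capture this batch's offset ranges before any transformation
  val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

  // process the batch (the same word count, executed per batch)
  rdd.map(_.value())
    .flatMap(_.split(" "))
    .map((_, 1))
    .reduceByKey(_ + _)
    .collect()
    .foreach(println)

  // commit only after the batch's output has completed
  recordDStream.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
}

Note that commitAsync is asynchronous and Kafka offset commits are not transactional, so the output step should be idempotent for exactly-once semantics.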
