Introduction to Aggregation Operators in Flink

Preface

Aggregation operators are among the most commonly used parts of the Flink API. Aggregation here means comparison-style computation performed on top of grouped data. The following simple examples demonstrate how the aggregation operators are used and what to watch out for.

Aggregation operator examples

Because the flow of a Flink program is fairly fixed (obtain the execution environment => get the data source => apply transformations => emit the results), the example code uses the template method pattern so the boilerplate can be reused.

First, define a generic stream interface:

package com.tml.common;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public interface StreamService<T> {

    StreamExecutionEnvironment getEnv();

    DataStream<T> getSource(StreamExecutionEnvironment env);
}

Then abstract a template:

package com.tml.common;

import com.tml.msg.CommonMsg;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.connector.datagen.source.DataGeneratorSource;
import org.apache.flink.connector.datagen.source.GeneratorFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;


public abstract class AbsStreamCommonService<T> implements StreamService<T> {


    public void processStream(Integer parallelism) throws Exception {
        StreamExecutionEnvironment env = getEnv();
        env.setParallelism(parallelism);
        DataStream<T> stream = getSource(env);
        handle(stream);
        env.execute();
    }

    public abstract void handle(DataStream<T> source);

    @Override
    public StreamExecutionEnvironment getEnv() {

        return StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration());
    }

    public DataStream<String> getSourceFromSocket(StreamExecutionEnvironment environment) {
        return environment.socketTextStream("43.139.114.233", 9999);
    }

    public DataStream<CommonMsg> getSourceFromCollection(StreamExecutionEnvironment environment) {
        DataStreamSource<CommonMsg> source = environment.fromElements(
                new CommonMsg("11", "hello world", 11L),
                new CommonMsg("11", "hello flink", 3L),
                new CommonMsg("12", "hello kitty", 13L),
                new CommonMsg("13", "hello world", 12L),
                new CommonMsg("11", "hello java", 23L));

        return source;
    }

    public DataStream<Long> getSourceFromDataGenerator(StreamExecutionEnvironment environment) {
        DataGeneratorSource<Long> dataGeneratorSource =
                new DataGeneratorSource<>((GeneratorFunction<Long, Long>) o -> o, 100000L, RateLimiterStrategy.perSecond(2), Types.LONG);
        return environment.fromSource(dataGeneratorSource, WatermarkStrategy.noWatermarks(), "dataGeneratorSource", Types.LONG);
    }


}

Note: using StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(new Configuration()) gives you Flink's web UI for the local job, at http://localhost:8081 by default, which makes it easy to inspect the job's execution parameters. This approach is intended for local debugging and learning.

For example:

[Figure 1: the Flink web UI for the locally running job]
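
Incidentally, if port 8081 is already occupied, the web UI can be moved to another port via the Configuration that is passed in. A minimal sketch, assuming the standard "rest.port" configuration key (the key behind Flink's RestOptions.PORT):

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class LocalEnvWithCustomPort {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // "rest.port" controls the web UI port; 8081 is the default
        conf.setString("rest.port", "8082");
        StreamExecutionEnvironment env =
                StreamExecutionEnvironment.createLocalEnvironmentWithWebUI(conf);
        // build and execute the job as usual ...
    }
}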

The corresponding pom.xml dependencies:


<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.tml</groupId>
  <artifactId>flink-demo</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>flink-demo</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <flink.version>1.18.0</flink.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-java</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-streaming-java</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-clients</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-runtime-web</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-files</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.apache.flink</groupId>
      <artifactId>flink-connector-datagen</artifactId>
      <version>${flink.version}</version>
    </dependency>

    <dependency>
      <groupId>org.projectlombok</groupId>
      <artifactId>lombok</artifactId>
      <version>1.18.20</version>
    </dependency>
  </dependencies>
</project>

keyBy

package com.tml.operator.aggregation;

import com.tml.common.AbsStreamCommonService;
import com.tml.msg.CommonMsg;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;


public class KeyByDemo extends AbsStreamCommonService<CommonMsg> {

    public static void main(String[] args) throws Exception {
        new KeyByDemo().processStream(4);
    }

    @Override
    public void handle(DataStream<CommonMsg> stream) {
        /**
         * keyBy returns a KeyedStream.
         * 1. keyBy is not a transformation operator: it only repartitions the data, and its parallelism cannot be set.
         * 2. keyBy: groups vs. partitions.
         *    keyBy groups the data and guarantees that records of the same group land in the same partition.
         *    A partition corresponds to one subtask, and a single partition may hold data from several groups.
         */
        KeyedStream<CommonMsg, String> keyBy = stream.keyBy((KeySelector<CommonMsg, String>) CommonMsg::getId, TypeInformation.of(String.class));
        keyBy.print();
    }


    @Override
    public DataStream<CommonMsg> getSource(StreamExecutionEnvironment env) {
        return super.getSourceFromCollection(env);
    }
}

The data source is a bounded collection whose elements are created directly in the program. The output:

2> CommonMsg(id=11, msg=hello world, time=11)
2> CommonMsg(id=11, msg=hello flink, time=3)
2> CommonMsg(id=11, msg=hello java, time=23)
1> CommonMsg(id=12, msg=hello kitty, time=13)
3> CommonMsg(id=13, msg=hello world, time=12)

As the output shows, after keyBy's grouping, records with the same key are executed in the same partition.
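
Which subtask a key lands on is deterministic: Flink hashes the key into a key group (default maxParallelism is 128) and maps key groups onto subtasks. As a sketch only, Flink's internal KeyGroupRangeAssignment utility can reproduce the routing above (the "2>" prefix printed by print() is the 1-based subtask index):

import org.apache.flink.runtime.state.KeyGroupRangeAssignment;

public class KeyRoutingSketch {
    public static void main(String[] args) {
        int maxParallelism = 128; // Flink's default
        int parallelism = 4;      // matches processStream(4) above
        for (String key : new String[]{"11", "12", "13"}) {
            // 0-based index of the subtask that will process this key
            int subtask = KeyGroupRangeAssignment.assignKeyToParallelOperator(key, maxParallelism, parallelism);
            System.out.println("key " + key + " -> subtask " + (subtask + 1));
        }
    }
}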

sum/min/minBy/max/maxBy

These are the most basic aggregation operators.

package com.tml.operator.aggregation;

import com.tml.common.AbsStreamCommonService;
import com.tml.msg.CommonMsg;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;


public class SimpleAggregateDemo extends AbsStreamCommonService<CommonMsg> {

    public static void main(String[] args) throws Exception {
        new SimpleAggregateDemo().processStream(1);
    }


    @Override
    public void handle(DataStream<CommonMsg> stream) {
        KeyedStream<CommonMsg, String> keyStream = stream.keyBy((KeySelector<CommonMsg, String>) CommonMsg::getId, TypeInformation.of(String.class));
        // aggregate with sum
        //SingleOutputStreamOperator<CommonMsg> time = keyStream.sum("time");
        //SingleOutputStreamOperator<CommonMsg> min = keyStream.min("time");
        /**
         * The difference between max and maxBy:
         * max does not reassign the non-compared fields, while maxBy updates them
         * with the values of the record that wins the comparison.
         */
        SingleOutputStreamOperator<CommonMsg> minBy = keyStream.minBy("time");
        //min.print();
        minBy.print();
    }

    @Override
    public DataStream<CommonMsg> getSource(StreamExecutionEnvironment env) {
        return super.getSourceFromCollection(env);
    }
}

First, the output of the minBy operator:

CommonMsg(id=11, msg=hello world, time=11)
CommonMsg(id=11, msg=hello flink, time=3)
CommonMsg(id=12, msg=hello kitty, time=13)
CommonMsg(id=13, msg=hello world, time=12)
CommonMsg(id=11, msg=hello flink, time=3)

Now swap the aggregation call to min() and compare the program's output:

CommonMsg(id=11, msg=hello world, time=11)
CommonMsg(id=11, msg=hello world, time=3)
CommonMsg(id=12, msg=hello kitty, time=13)
CommonMsg(id=13, msg=hello world, time=12)
CommonMsg(id=11, msg=hello world, time=3)

Comparing the two outputs shows the difference between min and minBy:

min does not reassign the non-compared fields, while minBy updates them with the values of the record that holds the minimum.

The same applies to max and maxBy.
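
The min semantics can be made concrete by writing the equivalent reduce by hand. A minimal sketch, assuming CommonMsg is a Lombok-style POJO with a setTime setter, using the keyStream from SimpleAggregateDemo:

// min("time") behaves roughly like this: keep the first record of the group
// and only update the compared field; minBy, by contrast, returns the whole
// record that actually holds the minimum.
SingleOutputStreamOperator<CommonMsg> minLike = keyStream.reduce((acc, cur) -> {
    if (cur.getTime() < acc.getTime()) {
        acc.setTime(cur.getTime()); // min: only the compared field is updated
    }
    return acc;
});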

reduce

package com.tml.operator.aggregation;

import com.tml.common.AbsStreamCommonService;
import com.tml.msg.CommonMsg;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class ReduceDemo extends AbsStreamCommonService<CommonMsg> {

    public static void main(String[] args) throws Exception {
        new ReduceDemo().processStream(1);

    }

    @Override
    public void handle(DataStream<CommonMsg> source) {
        KeyedStream<CommonMsg, String> stream = source.keyBy((KeySelector<CommonMsg, String>) CommonMsg::getId, TypeInformation.of(String.class));

        /**
         * The reduce function is very flexible: the aggregation logic can be tailored freely to the business requirements.
         * While a group contains only a single record, reduce is not invoked: with one record there is
         * nothing to compare against, so running reduce would be pointless.
         */
        SingleOutputStreamOperator<CommonMsg> reduce = stream.reduce((t1, t2) -> {
            System.out.println("t1==>" + t1);
            System.out.println("t2==>" + t2);
            // keep the key, take the latest msg, and accumulate the time field
            CommonMsg commonMsg = new CommonMsg(t1.getId(), t2.getMsg(), t1.getTime() + t2.getTime());

            return commonMsg;
        });

        reduce.print();
    }

    @Override
    public DataStream<CommonMsg> getSource(StreamExecutionEnvironment env) {
        return super.getSourceFromCollection(env);
    }
}

The output:

CommonMsg(id=11, msg=hello world, time=11)
t1==>CommonMsg(id=11, msg=hello world, time=11)
t2==>CommonMsg(id=11, msg=hello flink, time=3)
CommonMsg(id=11, msg=hello flink, time=14)
CommonMsg(id=12, msg=hello kitty, time=13)
CommonMsg(id=13, msg=hello world, time=12)
t1==>CommonMsg(id=11, msg=hello flink, time=14)
t2==>CommonMsg(id=11, msg=hello java, time=23)
CommonMsg(id=11, msg=hello java, time=37) 

The output shows that reduce is very flexible: it can combine two records in whatever way the business requires. And, as expected, a group that has received only one record does not trigger the reduce function.
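
As a further illustration of that flexibility, minBy/maxBy themselves can be expressed as a one-line reduce. A sketch, using the keyed stream from ReduceDemo above:

// maxBy("time") expressed as a reduce: keep whichever record has the larger time
SingleOutputStreamOperator<CommonMsg> maxByLike = stream.reduce(
        (t1, t2) -> t1.getTime() >= t2.getTime() ? t1 : t2);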

richFunction

package com.tml.operator.aggregation;

import com.tml.common.AbsStreamCommonService;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * A rich function adds some extra capabilities.
 * It provides lifecycle methods such as open() and close():
 * open() is called once when each subtask starts;
 * close() is called once when each subtask finishes. If the Flink program crashes, close() is not called,
 * but cancelling the job from the web UI does invoke close() normally.
 *
 * It also exposes a runtime context: getRuntimeContext() gives access to key information about the running task.
 * close() is a good place to release resources, fire callback notifications, and run other hook logic.
 */
public class RichFunctionDemo extends AbsStreamCommonService<String> {
    public static void main(String[] args) throws Exception {
        new RichFunctionDemo().processStream(1);
    }

    @Override
    public void handle(DataStream<String> stream) {
        SingleOutputStreamOperator<String> map = stream.map(new RichMapFunction<String, String>() {

            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
                RuntimeContext context = getRuntimeContext();
                String taskName = context.getTaskName();
                int subtasks = context.getNumberOfParallelSubtasks();
                System.out.println("taskName: " + taskName + ", subtasks: " + subtasks + " call open()");
            }

            @Override
            public void close() throws Exception {
                super.close();
                RuntimeContext context = getRuntimeContext();
                String taskName = context.getTaskName();
                int subtasks = context.getNumberOfParallelSubtasks();
                System.out.println("taskName: " + taskName + ", subtasks: " + subtasks + " call close()");
            }

            @Override
            public String map(String value) throws Exception {
                return "(" + value + ")";
            }
        }, TypeInformation.of(String.class));

        map.print();
    }

    @Override
    public DataStream<String> getSource(StreamExecutionEnvironment env) {
        return super.getSourceFromSocket(env);
    }
}

Before running the program, start the socket server first; nc is used here (e.g. nc -lk 9999 on the source host). For details, see Flink实时统计单词【入门】.

[Figure 2: running nc as the socket input]

The output:

taskName: Source: Socket Stream -> Map -> Sink: Print to Std. Out, subtasks: 1 call open()
(hello kitty)
(hello flink)
taskName: Source: Socket Stream -> Map -> Sink: Print to Std. Out, subtasks: 1 call close()
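
A typical use of these lifecycle hooks is managing a per-subtask resource: open it once in open(), release it in close(). A sketch with a hypothetical JDBC connection (the URL and credentials are placeholders, not part of the demo project):

import java.sql.Connection;
import java.sql.DriverManager;

import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;

public class DbEnrichFunction extends RichMapFunction<String, String> {

    private transient Connection connection;

    @Override
    public void open(Configuration parameters) throws Exception {
        // hypothetical connection details, created once per subtask
        connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/demo", "user", "pwd");
    }

    @Override
    public String map(String value) throws Exception {
        // look up / enrich the record via the connection here ...
        return value;
    }

    @Override
    public void close() throws Exception {
        // release the resource when the subtask finishes or the job is cancelled
        if (connection != null) {
            connection.close();
        }
    }
}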

Summary

  • These aggregation operators build on keyBy: only after the data has been grouped can the subsequent aggregations run.
  • min/minBy and max/maxBy differ subtly: the former do not reassign the non-compared fields, while the latter update them with the values of the winning record.
  • The reduce operator works on pairs of records and is extremely flexible.
  • richFunction is not an aggregation function, but it is included here because rich functions enable a lot of extra functionality: open() is called once when the corresponding subtask starts and close() once when it finishes, which makes them a good place for monitoring or hook notifications.

The code examples have been uploaded to GitHub; feel free to take a look!
