taskmanager.memory.process.size=8g
- flink_jobmanager_numRegisteredTaskManagers # 在线TM数量
- flink_jobmanager_numRunningJobs # 运行中作业数
- flink_taskmanager_status_jvm_memory_used # JVM内存使用
- flink_taskmanager_network_buffer_pool_usage # 网络缓冲区使用率
2. Checkpoint 与状态管理
配置优化
execution.checkpointing.interval: 30s # 检查点间隔
execution.checkpointing.timeout: 10m # 超时时间
execution.checkpointing.min-pause: 5s # 两次检查点之间的最小停顿时间
state.checkpoints.num-retained: 3 # 保留最近3个检查点
状态后端选择
state.backend: rocksdb
state.backend.rocksdb.localdir: /data/rocksdb # 本地存储路径
# 例如Kafka有16个分区
env.set_parallelism(16)
内存配置
taskmanager.memory.process.size: 12g
taskmanager.memory.task.heap.size: 8g # 任务堆内存
taskmanager.memory.network.max: 1g # 网络缓冲区上限
execution.checkpointing.timeout: 20m
env.setRestartStrategy(RestartStrategies.fixedDelayRestart(
3, // 最多重试3次
Time.of(10, TimeUnit.SECONDS) // 每次重试间隔10秒
));
.keyBy(...)
.window(TumblingEventTimeWindows.of(Time.seconds(10)))
.process(new MyHeavyProcessFunction())
.setParallelism(32) // 单独提高并行度
优化数据倾斜:
// 两阶段聚合解决数据倾斜
.map(new AddRandomKeyFunction()) // 添加随机前缀
.keyBy(...)
.window(...)
.aggregate(new PartialAggregate())
.keyBy(...)
.window(...)
.aggregate(new FinalAggregate())
taskmanager.memory.task.heap.size: 10g
state.backend.rocksdb.memory.managed: true # 使用Flink管理的内存
taskmanager.memory.managed.fraction: 0.4 # 管理内存占比40%
减少状态大小:
StateTtlConfig ttlConfig = StateTtlConfig
.newBuilder(Time.hours(1)) // 状态1小时过期
.setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
.build();
valueStateDescriptor.enableTimeToLive(ttlConfig);
groups:
- name: flink.rules
rules:
- alert: FlinkJobFailed
expr: flink_jobmanager_job_numFailedJobs > 0
for: 1m
labels:
severity: critical
annotations:
summary: "Flink Job 失败 (instance {{ $labels.instance }})"
description: "Job {{ $labels.job_name }} 失败,原因: {{ $labels.error_message }}"
2. 故障自愈脚本
#!/bin/bash
# Restart a failed Flink job from its most recent savepoint.
#
# SAVEPOINT_PATH and JOB_JAR are placeholders; override them from the
# environment before running the script.
SAVEPOINT_PATH="${SAVEPOINT_PATH:-last-savepoint-path}"
JOB_JAR="${JOB_JAR:-/path/to/job.jar}"

# `flink list -r` prints running jobs only, so a FAILED job never appears in it.
# `-a` also prints terminated jobs; each line is expected to look like
# "<date> <time> : <jobId> : <name> (<STATE>)", which makes the job ID field 4.
# NOTE(review): terminated jobs remain in this listing for some time, so guard
# against resubmitting for the same failure when this runs on a schedule.
JOB_ID=$(flink list -a | grep "(FAILED)" | awk '{print $4}' | head -n 1)
if [ -n "$JOB_ID" ]; then
  # A FAILED job is already in a terminal state, so there is nothing to cancel.
  # Submit detached (-d) so the script returns once the job has been accepted.
  if flink run -d -s "$SAVEPOINT_PATH" "$JOB_JAR"; then
    echo "已重启失败作业: $JOB_ID"
  fi
fi
3. 资源弹性扩缩
# K8s部署Flink时的HPA配置
apiVersion: autoscaling/v2 # autoscaling/v2beta2 已在 Kubernetes 1.26 中移除,1.23+ 请使用 v2
kind: HorizontalPodAutoscaler
metadata:
name: flink-taskmanager-hpa
spec:
scaleTargetRef:
apiVersion: apps/v1
kind: Deployment
name: flink-taskmanager
minReplicas: 3
maxReplicas: 10
metrics:
- type: Resource
resource:
name: cpu
target:
type: Utilization
averageUtilization: 70
taskmanager.memory.network.min: 512mb # 提高网络缓冲区内存下限(约 16384 个 32KB 缓冲区;旧的 taskmanager.network.numberOfBuffers 已废弃)
taskmanager.memory.network.fraction: 0.15 # 网络内存占比15%
2. JVM 参数优化
env.java.opts.taskmanager: "-XX:+UseG1GC -XX:MaxGCPauseMillis=200 -XX:+HeapDumpOnOutOfMemoryError"
3. 状态压缩
// Configure the RocksDB state backend with incremental checkpoints and block compression.
// Needs: org.apache.flink.contrib.streaming.state.RocksDBStateBackend,
//        org.apache.flink.contrib.streaming.state.RocksDBOptionsFactory,
//        org.rocksdb.DBOptions, org.rocksdb.ColumnFamilyOptions, org.rocksdb.CompressionType,
//        java.util.Collection.
// The second constructor argument enables incremental checkpoints; the backend has no setter for it.
RocksDBStateBackend backend = new RocksDBStateBackend("hdfs:///flink/checkpoints", true);
// RocksDB options are customised through a RocksDBOptionsFactory (Flink 1.10+).
backend.setRocksDBOptions(new RocksDBOptionsFactory() {
    @Override
    public DBOptions createDBOptions(DBOptions currentOptions, Collection<AutoCloseable> handlesToClose) {
        // Keep the original intent of skipping fsync; setAllowOsBuffer no longer exists in RocksDB.
        return currentOptions.setUseFsync(false);
    }

    @Override
    public ColumnFamilyOptions createColumnOptions(ColumnFamilyOptions currentOptions, Collection<AutoCloseable> handlesToClose) {
        // Compression is configured per column family in RocksDB.
        return currentOptions.setCompressionType(CompressionType.LZ4_COMPRESSION);
    }
});
env.setStateBackend(backend);
security.kerberos.login.use-ticket-cache: false # 使用 keytab 登录,不依赖本地票据缓存
security.kerberos.login.keytab: /path/to/keytab
security.kerberos.login.principal: flink-user@EXAMPLE.COM
2. REST API 鉴权
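# 注意:开源版 Flink 的 REST 接口没有内置的用户名/密码鉴权,下面的 rest.auth.* 配置需要由反向代理(如 Nginx)或定制发行版提供;原生方案可使用 security.ssl.rest.authentication-enabled 开启 SSL 双向认证
rest.auth.type: basic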
rest.auth.basic.realm: Flink REST API
rest.auth.basic.users: admin:password123
3. 网络隔离
# 通过防火墙限制访问
iptables -A INPUT -p tcp --dport 8081 -s trusted-ip-range -j ACCEPT
iptables -A INPUT -p tcp --dport 8081 -j DROP
从稳定版本(如 1.13)升级到 1.14:
# 1. 保存当前作业的Savepoint
flink savepoint <jobId> hdfs:///flink/savepoints/upgrade-sp
# 2. 停止Flink集群
bin/stop-cluster.sh
# 3. 替换Flink二进制文件
mv flink-1.13.6 flink-1.13.6.bak # 保留旧版本以便复制配置和回滚,确认升级成功后再删除
tar xzf flink-1.14.4-bin-scala_2.12.tgz
# 4. 复制原有配置
cp flink-1.13.6.bak/conf/* flink-1.14.4/conf/
# 5. 启动新集群
cd flink-1.14.4
bin/start-cluster.sh
# 6. 从Savepoint恢复作业
flink run -s hdfs:///flink/savepoints/upgrade-sp /path/to/job.jar
验证升级结果:
# 检查作业状态
flink list -r
# 查看Web UI确认指标正常
high-availability: zookeeper
high-availability.cluster-id: /flink-cluster-1
high-availability.zookeeper.quorum: zoo1:2181,zoo2:2181,zoo3:2181
high-availability.storageDir: hdfs:///flink/ha/
2. 多 JobManager 部署
# conf/masters(每行一个 JobManager,格式为 host:webui-port)
jobmanager1:8081
jobmanager2:8081
jobmanager3:8081
# 启动所有JM(同时会启动 conf/workers 中配置的 TaskManager)
bin/start-cluster.sh
通过以上策略,可以构建稳定、高效的 Flink 运维体系,快速响应并解决各类生产问题。建议定期进行故障演练(如模拟 TaskManager 崩溃),验证应急预案的有效性。