在分布式系统架构中,网络抖动、服务瞬时过载、数据库死锁等临时性故障时有发生。本文将通过真实项目案例,深入讲解Spring Boot项目中如何正确实施重试机制,避免因简单粗暴的重试引发雪崩效应。
以下是使用Mermaid语法绘制的重试架构图和决策流程图,可直接嵌入Markdown文档使用:
/**
 * Updates the given order; the call is declared retryable for
 * {@code OptimisticLockingFailureException}, with {@code maxAttempts} set to 3.
 *
 * @param order the order to update
 */
@Retryable(
    value = OptimisticLockingFailureException.class,
    maxAttempts = 3
)
public void updateWithOptimisticLock(Order order) {
    // Update operation that carries the version number
}
# Resilience4j retry settings for the "paymentApi" instance.
# The keys must be nested as shown so that they resolve to
# resilience4j.retry.instances.paymentApi.*; at column 0 they would be
# unrelated top-level keys.
resilience4j:
  retry:
    instances:
      paymentApi:
        # NOTE(review): newer Resilience4j releases name this key `maxAttempts`;
        # confirm against the library version in use.
        maxRetryAttempts: 5
        # Wait between attempts.
        waitDuration: 500ms
        # Only these exception types trigger a retry.
        retryExceptions:
          - org.springframework.web.client.ResourceAccessException
/**
 * Consumes messages from {@code order.queue}.
 *
 * <p>Retry is declared with {@code @Retryable}: {@code maxAttempts} is 3 and the
 * backoff starts at a 1000 ms delay with a multiplier of 2.
 *
 * <p>NOTE(review): retry for Rabbit listeners can alternatively be configured on
 * the listener container (e.g. the {@code spring.rabbitmq.listener.simple.retry.*}
 * properties); confirm which mechanism this project intends to use.
 *
 * @param message the order message to handle
 */
@RabbitListener(queues = "order.queue")
@Retryable(maxAttempts = 3,
    backoff = @Backoff(delay = 1000, multiplier = 2))
public void handleOrderMessage(OrderMessage message) {
    // Message handling logic
}
/**
 * Payment service whose payment call is declared retryable and paired with a
 * recovery method.
 */
@Service
public class PaymentService {

    /**
     * Processes a payment request.
     *
     * <p>Declared retryable for {@code PaymentTimeoutException}, with
     * {@code maxAttempts} set to 4 and a backoff of 1000 ms delay and
     * multiplier 2.
     *
     * @param request the payment request
     * @return the payment result
     */
    @Retryable(
        maxAttempts = 4,
        include = {PaymentTimeoutException.class},
        backoff = @Backoff(delay = 1000, multiplier = 2)
    )
    public PaymentResult processPayment(PaymentRequest request) {
        // Payment processing logic
    }

    /**
     * Recovery method (marked {@code @Recover}) that accepts a
     * {@code PaymentTimeoutException}.
     *
     * @param e the timeout exception handed to the recovery method
     * @return the fallback payment result
     */
    @Recover
    public PaymentResult fallbackProcessPayment(PaymentTimeoutException e) {
        // Fallback handling
    }
}
/**
 * Deducts stock for the given inventory request.
 *
 * <p>The method carries four annotations that all reference the
 * {@code inventoryService} instance name: circuit breaker, rate limiter, retry
 * and bulkhead. The retry annotation names {@code fallback} as its fallback
 * method; that method is not part of this snippet.
 *
 * <p>NOTE(review): the order in which these decorators wrap the call is
 * presumably governed by the library's aspect-order configuration rather than
 * by the declaration order here; confirm before relying on it.
 *
 * @param request the stock deduction request
 * @return the inventory response
 */
@CircuitBreaker(name = "inventoryService")
@RateLimiter(name = "inventoryService")
@Retry(name = "inventoryService", fallbackMethod = "fallback")
@Bulkhead(name = "inventoryService")
public InventoryResponse deductStock(InventoryRequest request) {
// Stock deduction logic
}
/**
 * Remote call combining a Hystrix command configuration with a retry declaration.
 *
 * <p>The Hystrix command sets {@code execution.isolation.thread.timeoutInMilliseconds}
 * to 3000 and the thread pool {@code coreSize} to 20; the retry declaration sets
 * {@code maxAttempts} to 3.
 *
 * <p>NOTE(review): how the Hystrix timeout and the retry attempts nest depends on
 * the order in which the two aspects are applied; confirm.
 *
 * @return the response of the remote service
 */
@HystrixCommand(
    commandProperties = @HystrixProperty(
        name = "execution.isolation.thread.timeoutInMilliseconds",
        value = "3000"),
    threadPoolProperties = @HystrixProperty(name = "coreSize", value = "20"))
@Retryable(maxAttempts = 3)
public ServiceResponse remoteCall() {
    // Remote call
}
/**
 * Runs the business logic for {@code bizId} unless an idempotency marker for
 * it already exists, then records the marker.
 *
 * <p>Declared retryable with {@code maxAttempts} set to 3 and transactional.
 *
 * <p>NOTE(review): the exists-then-save sequence is not atomic by itself;
 * presumably a unique constraint on the idempotency table guards against
 * concurrent duplicates. Verify.
 *
 * @param bizId business identifier used as the idempotency key
 */
@Retryable(maxAttempts = 3)
@Transactional
public void processWithIdempotent(String bizId) {
    // Consult the idempotency table first
    boolean alreadyProcessed = idempotentRepository.exists(bizId);
    if (!alreadyProcessed) {
        // Business logic

        // Record the idempotency marker
        idempotentRepository.save(bizId);
    }
}
# Use a random backoff algorithm
# NOTE(review): the three keys below set an initial interval, a multiplier and a
# maximum interval; no randomization/jitter key is present. Confirm whether
# random backoff is actually enabled elsewhere.
# NOTE(review): presumably these keys are bound by a project-defined
# configuration class; verify that something reads the spring.retry.backoff.* prefix.
spring.retry.backoff.initial-interval=500ms
spring.retry.backoff.multiplier=1.5
spring.retry.backoff.max-interval=5000ms
/**
 * Feign client configuration exposing a retryer bean and a request-options bean.
 */
@Configuration
public class FeignConfig {

    /**
     * Builds the default Feign retryer from the three values 100, 1000 and 3.
     *
     * @return the retryer bean
     */
    @Bean
    public Retryer feignRetryer() {
        // Retryer.Default(period, maxPeriod, maxAttempts)
        final long period = 100;
        final long maxPeriod = 1000;
        final int maxAttempts = 3;
        return new Retryer.Default(period, maxPeriod, maxAttempts);
    }

    /**
     * Builds request options with two 5-second timeouts and the final flag set to true.
     *
     * @return the request options bean
     */
    @Bean
    public Request.Options options() {
        // Request.Options(connectTimeout, unit, readTimeout, unit, followRedirects)
        final long timeoutSeconds = 5;
        final boolean followRedirects = true;
        return new Request.Options(
            timeoutSeconds, TimeUnit.SECONDS,
            timeoutSeconds, TimeUnit.SECONDS,
            followRedirects);
    }
}
/**
 * Scheduled synchronization job.
 *
 * <p>Annotated with a fixed delay of 30000, a scheduler lock named
 * {@code syncJob} with {@code lockAtLeastFor = "10s"}, and a retry declaration
 * with {@code maxAttempts} set to 5.
 *
 * <p>NOTE(review): no {@code lockAtMostFor} is given here, so the lock's upper
 * bound presumably comes from a default configured elsewhere; confirm.
 */
@Scheduled(fixedDelay = 30000)
@SchedulerLock(name = "syncJob", lockAtLeastFor = "10s")
@Retryable(maxAttempts = 5)
public void distributedSyncJob() {
// Distributed task logic
}
# Named retry policies. Each policy sets an attempt limit and a backoff
# (initial wait, multiplier, upper bound). The keys must be nested as shown;
# at column 0 `max-attempts` and `backoff` would be duplicated top-level keys.
# NOTE(review): `default` and `critical` are assumed to be sibling policies
# under `policies`; confirm against the class that binds this prefix.
retry:
  policies:
    default:
      max-attempts: 3
      backoff:
        initial: 1s
        multiplier: 2
        max: 10s
    critical:
      max-attempts: 5
      backoff:
        initial: 500ms
        multiplier: 1.5
        max: 5s
/**
 * Creates the retry registry and publishes its metrics to the given meter registry.
 *
 * <p>The default retry configuration uses {@code maxAttempts} 3 and a wait
 * duration of 500 ms. Metrics are bound explicitly through
 * {@code TaggedRetryMetrics}, because the retry configuration builder offers no
 * metrics switch; this also puts the injected {@code meterRegistry} to use.
 *
 * @param meterRegistry the meter registry that receives the retry metrics
 * @return the retry registry bean
 */
@Bean
public RetryRegistry retryRegistry(MeterRegistry meterRegistry) {
    RetryConfig defaultConfig = RetryConfig.custom()
        .maxAttempts(3)
        .waitDuration(Duration.ofMillis(500))
        .build();
    RetryRegistry registry = RetryRegistry.of(defaultConfig);
    // Fully qualified: requires the resilience4j-micrometer module on the classpath.
    io.github.resilience4j.micrometer.tagged.TaggedRetryMetrics
        .ofRetryRegistry(registry)
        .bindTo(meterRegistry);
    return registry;
}
以下场景应谨慎使用重试:未做幂等保护的写操作(如扣款、下单),参数校验失败、权限不足等确定性的业务错误,以及下游服务已处于过载或熔断状态时的调用。
重试决策流程图:当异常发生时,首先判断是否属于瞬时故障,再检查当前重试次数,最后决定是否继续重试或进入熔断状态。
通过合理的重试策略配置,配合熔断机制、限流措施,可以在分布式系统中构建具有弹性的服务架构。但切记:重试不是万能药,不当使用会放大故障影响。建议在生产环境实施前,进行充分的故障注入测试。