This article takes a deep look at high-availability architecture design for Elasticsearch under high concurrency and large data volumes. It draws on real-world cases such as e-commerce search and log analytics, and provides actionable technical solutions with Java implementations.
- Redundancy: at least a 3-node cluster with 1 replica per shard (see the index-settings sketch below)
- Fault isolation: role separation (Master / Data / Ingest / Coordinating)
- Horizontal scaling: add data nodes dynamically
- Disaster recovery: cross-datacenter deployment plus scheduled snapshots
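As a minimal sketch of the redundancy principle, the replica count can be set per index at creation time. This assumes the same RestHighLevelClient used throughout this article; the `IndexProvisioner` class and the `products` index name are illustrative.

```java
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.client.indices.CreateIndexRequest;
import org.elasticsearch.common.settings.Settings;

public class IndexProvisioner {

    private final RestHighLevelClient esClient;

    public IndexProvisioner(RestHighLevelClient esClient) {
        this.esClient = esClient;
    }

    /** Create the products index with 3 primary shards and 1 replica each,
     *  so every shard survives the loss of a single node. */
    public void createProductsIndex() throws java.io.IOException {
        CreateIndexRequest request = new CreateIndexRequest("products")
                .settings(Settings.builder()
                        .put("index.number_of_shards", 3)
                        .put("index.number_of_replicas", 1));
        esClient.indices().create(request, RequestOptions.DEFAULT);
    }
}
```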
Scenario requirements:

- Peak QPS of 100,000+
- 99.9% of requests answered in under 200ms
- Zero data loss
# Production cluster configuration (8 nodes)
Nodes 1-3: dedicated master nodes (3 dedicated masters to prevent split-brain)
Nodes 4-8: data nodes (32 cores / 64GB RAM / SSD; coordinating role kept separate)
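Node roles themselves are declared in each node's elasticsearch.yml rather than from Java, but the layout above can be verified at runtime through the low-level REST client's `_cat/nodes` endpoint. A hedged sketch (the class name and the column selection are assumptions):

```java
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class ClusterTopologyChecker {

    private final RestClient lowLevelClient;

    public ClusterTopologyChecker(RestClient lowLevelClient) {
        this.lowLevelClient = lowLevelClient;
    }

    /** Print each node's name and roles (abbreviated, e.g. m = master-eligible, d = data). */
    public void printNodeRoles() throws java.io.IOException {
        Request request = new Request("GET", "/_cat/nodes");
        request.addParameter("h", "name,node.role,heap.percent,disk.used_percent");
        request.addParameter("v", "true");
        Response response = lowLevelClient.performRequest(request);
        System.out.println(EntityUtils.toString(response.getEntity()));
    }
}
```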
public class ProductSearchService {

    // Local cache (Caffeine)
    private final Cache<String, SearchResult> localCache = Caffeine.newBuilder()
            .maximumSize(10_000)
            .expireAfterWrite(1, TimeUnit.MINUTES)
            .build();

    // Redis distributed cache
    private final RedisTemplate<String, Object> redisTemplate;

    // ES client
    private final RestHighLevelClient esClient;

    // Resilience4j circuit-breaker registry
    private final CircuitBreakerRegistry circuitBreakerRegistry;

    public ProductSearchService(RedisTemplate<String, Object> redisTemplate,
                                RestHighLevelClient esClient,
                                CircuitBreakerRegistry circuitBreakerRegistry) {
        this.redisTemplate = redisTemplate;
        this.esClient = esClient;
        this.circuitBreakerRegistry = circuitBreakerRegistry;
    }

    public SearchResult searchProducts(String keyword, int page) {
        // 1. Check the local cache
        String cacheKey = "search:" + keyword + ":" + page;
        SearchResult cached = localCache.getIfPresent(cacheKey);
        if (cached != null) return cached;

        // 2. Check the Redis cache
        SearchResult redisResult = (SearchResult) redisTemplate.opsForValue().get(cacheKey);
        if (redisResult != null) {
            localCache.put(cacheKey, redisResult);
            return redisResult;
        }

        // 3. Circuit-breaker protection; degrade to static data when ES is unavailable
        CircuitBreaker circuitBreaker = circuitBreakerRegistry.circuitBreaker("esSearch");
        try {
            return circuitBreaker.executeSupplier(() -> doSearch(cacheKey, keyword, page));
        } catch (Exception e) {
            return fallbackSearch(keyword, page, e);
        }
    }

    private SearchResult doSearch(String cacheKey, String keyword, int page) {
        try {
            // 4. ES query
            SearchRequest request = new SearchRequest("products");
            SearchSourceBuilder sourceBuilder = new SearchSourceBuilder()
                    .query(QueryBuilders.matchQuery("name", keyword))
                    .from((page - 1) * 10)
                    .size(10);
            request.source(sourceBuilder);
            SearchResponse response = esClient.search(request, RequestOptions.DEFAULT);

            // 5. Process the result and populate both cache tiers
            SearchResult result = processResponse(response);
            redisTemplate.opsForValue().set(cacheKey, result, 5, TimeUnit.MINUTES);
            localCache.put(cacheKey, result);
            return result;
        } catch (IOException e) {
            // executeSupplier only accepts unchecked exceptions
            throw new UncheckedIOException(e);
        }
    }

    // Degradation strategy: return static placeholder data
    private SearchResult fallbackSearch(String keyword, int page, Throwable t) {
        return new SearchResult(Collections.emptyList(), 0, page, 10);
    }
}
| Optimization | Implementation | Impact |
|---|---|---|
| Query routing | Custom routing=product_id (see the sketch below) | 80% fewer shard queries |
| Filesystem cache | Reserve 50% of memory for the OS file cache | 3x faster queries |
| Index splitting | Split indices by date (e.g. products_2023Q2) | Smaller individual indices |
| Query parallelization | Use the async search API | 40% lower response time |
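As a sketch of the routing optimization in the table above, the same routing value is supplied at index time and at query time, so a search only hits the shard that owns the product instead of fanning out to every shard. The `products` index follows the earlier example; the `product_id` field and the class name are assumptions.

```java
import java.util.Map;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public class RoutedProductAccess {

    /** Index a product using its id as the routing value,
     *  so all documents for that id land on a single shard. */
    public IndexRequest buildIndexRequest(String productId, String name) {
        return new IndexRequest("products")
                .id(productId)
                .routing(productId)   // custom routing = product_id
                .source(Map.of("product_id", productId, "name", name));
    }

    /** Search with the same routing value: only one shard is queried
     *  instead of all shards in the index. */
    public SearchRequest buildSearchRequest(String productId) {
        SearchRequest request = new SearchRequest("products")
                .routing(productId);
        request.source(new SearchSourceBuilder()
                .query(QueryBuilders.termQuery("product_id", productId)));
        return request;
    }
}
```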
Scenario characteristics:

- TB-scale writes per day
- 180-day retention period
- Real-time analytics plus offline reporting
public class LogIngester {

    private static final Logger log = LoggerFactory.getLogger(LogIngester.class);
    private static final int BATCH_SIZE = 1000;

    private final RestHighLevelClient esClient;
    private final BulkProcessor bulkProcessor;

    public LogIngester(RestHighLevelClient esClient) {
        this.esClient = esClient;
        this.bulkProcessor = BulkProcessor.builder(
                (request, bulkListener) ->
                        esClient.bulkAsync(request, RequestOptions.DEFAULT, bulkListener),
                new BulkProcessor.Listener() {
                    @Override
                    public void beforeBulk(long executionId, BulkRequest request) {}

                    @Override
                    public void afterBulk(long executionId, BulkRequest request, BulkResponse response) {
                        if (response.hasFailures()) {
                            // Log the failures
                            log.error("Bulk failure: {}", response.buildFailureMessage());
                            // Retry only the items that failed
                            retryFailedItems(request, response);
                        }
                    }

                    @Override
                    public void afterBulk(long executionId, BulkRequest request, Throwable failure) {
                        // The whole bulk request failed: persist it to a local file for replay
                        saveToLocalFile(request);
                    }
                })
                .setBulkActions(BATCH_SIZE)
                .setFlushInterval(TimeValue.timeValueSeconds(5))
                .build();
    }

    public void ingestLog(LogEntry entry) {
        IndexRequest request = new IndexRequest("logs-2023-06")
                .source(JSON.toJSONString(entry), XContentType.JSON);
        bulkProcessor.add(request);
    }

    // Retry logic: re-queue the failed items from the original bulk request
    // (in production, cap the number of retries to avoid an endless loop)
    private void retryFailedItems(BulkRequest request, BulkResponse response) {
        for (BulkItemResponse item : response.getItems()) {
            if (item.isFailed()) {
                // getItemId() is the item's position inside the original bulk request;
                // the cast is safe because this ingester only submits index requests
                bulkProcessor.add((IndexRequest) request.requests().get(item.getItemId()));
            }
        }
    }
}
public class IndexLifecycleManager {

    private final RestHighLevelClient esClient;

    public IndexLifecycleManager(RestHighLevelClient esClient) {
        this.esClient = esClient;
    }

    public void setupLogsPolicy() throws IOException {
        // 1. Create the lifecycle policy (hot -> warm -> cold -> delete);
        //    constructor signatures assume the HLRC 7.13+ ILM classes
        Map<String, Phase> phases = new HashMap<>();
        phases.put("hot", new Phase("hot", TimeValue.ZERO,
                Map.of(RolloverAction.NAME, new RolloverAction(null, null, null, 50_000L)))); // roll over at 50k docs
        phases.put("warm", new Phase("warm", new TimeValue(30, TimeUnit.DAYS),
                Map.of(ShrinkAction.NAME, new ShrinkAction(1, null),            // shrink to a single shard
                       ForceMergeAction.NAME, new ForceMergeAction(1))));        // force-merge to one segment
        phases.put("cold", new Phase("cold", new TimeValue(90, TimeUnit.DAYS),
                Map.of(SearchableSnapshotAction.NAME, new SearchableSnapshotAction("backup-repo"))));
        phases.put("delete", new Phase("delete", new TimeValue(180, TimeUnit.DAYS),
                Map.of(DeleteAction.NAME, new DeleteAction())));

        PutLifecyclePolicyRequest putRequest =
                new PutLifecyclePolicyRequest(new LifecyclePolicy("logs_policy", phases));
        esClient.indexLifecycle().putLifecyclePolicy(putRequest, RequestOptions.DEFAULT);

        // 2. Attach the policy to an index template covering logs-*
        Settings templateSettings = Settings.builder()
                .put("index.lifecycle.name", "logs_policy")
                .put("index.lifecycle.rollover_alias", "logs") // rollover requires a write alias
                .build();
        ComposableIndexTemplate indexTemplate = new ComposableIndexTemplate(
                List.of("logs-*"),
                new Template(templateSettings, null, null),
                null, null, null, null);
        PutComposableIndexTemplateRequest templateRequest =
                new PutComposableIndexTemplateRequest()
                        .name("logs_template")
                        .indexTemplate(indexTemplate);
        esClient.indices().putIndexTemplate(templateRequest, RequestOptions.DEFAULT);
    }
}
public class CrossDCWriter {

    private static final Logger log = LoggerFactory.getLogger(CrossDCWriter.class);

    // One client per datacenter / cluster
    private final List<RestHighLevelClient> clients;
    private final List<String> monitoredIndices;
    private final ExecutorService executor = Executors.newFixedThreadPool(3);

    public CrossDCWriter(List<RestHighLevelClient> clients, List<String> monitoredIndices) {
        this.clients = clients;
        this.monitoredIndices = monitoredIndices;
    }

    public void writeDocument(String index, String id, Map<String, Object> source) {
        IndexRequest request = new IndexRequest(index).id(id).source(source);

        // Write to all clusters asynchronously and in parallel
        List<Future<Boolean>> futures = new ArrayList<>();
        for (RestHighLevelClient client : clients) {
            futures.add(executor.submit(() -> {
                try {
                    client.index(request, RequestOptions.DEFAULT);
                    return true;
                } catch (IOException e) {
                    log.error("Write to cluster failed", e);
                    return false;
                }
            }));
        }

        // Collect the results
        int successCount = 0;
        for (Future<Boolean> future : futures) {
            try {
                if (future.get()) successCount++;
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
            } catch (ExecutionException e) {
                log.error("Cross-DC write task failed", e);
            }
        }

        // The write is only considered successful if a majority of clusters accepted it
        if (successCount < clients.size() / 2 + 1) {
            throw new WriteConsistencyException("Write quorum not satisfied");
        }
    }

    // Periodic data-consistency check
    @Scheduled(fixedRate = 3600000) // hourly
    public void verifyDataConsistency() {
        for (String index : monitoredIndices) {
            // Compare documents across clusters using the scroll API
            compareClusterData(clients.get(0), clients.get(1), index);
        }
    }
}
| Category | Key metric | Alert threshold |
|---|---|---|
| Node health | JVM heap usage | >75% |
| Indexing performance | Write latency | >1000ms |
| Query performance | 99th-percentile query time | >500ms |
| System resources | CPU usage | >85% for 5 minutes |
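One way such checks could be wired up in code is to poll `_nodes/stats` through the low-level REST client and compare each node's JVM heap figure with the 75% threshold above. A minimal sketch, assuming Jackson is on the classpath for JSON parsing and that the actual alerting hook lives elsewhere:

```java
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class HeapUsageMonitor {

    private static final int HEAP_ALERT_THRESHOLD = 75; // percent

    private final RestClient lowLevelClient;
    private final ObjectMapper mapper = new ObjectMapper();

    public HeapUsageMonitor(RestClient lowLevelClient) {
        this.lowLevelClient = lowLevelClient;
    }

    /** Poll JVM heap usage for every node and flag any node above the alert threshold. */
    public void checkHeapUsage() throws java.io.IOException {
        Response response = lowLevelClient.performRequest(new Request("GET", "/_nodes/stats/jvm"));
        JsonNode nodes = mapper.readTree(EntityUtils.toString(response.getEntity())).get("nodes");

        nodes.fields().forEachRemaining(entry -> {
            String nodeName = entry.getValue().get("name").asText();
            int heapPercent = entry.getValue().path("jvm").path("mem").path("heap_used_percent").asInt();
            if (heapPercent > HEAP_ALERT_THRESHOLD) {
                // Replace with the real alerting integration (assumption: handled elsewhere)
                System.out.printf("ALERT: node %s heap usage %d%% exceeds %d%%%n",
                        nodeName, heapPercent, HEAP_ALERT_THRESHOLD);
            }
        });
    }
}
```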
public class AutoScalingService {

    private static final Logger log = LoggerFactory.getLogger(AutoScalingService.class);

    @Autowired
    private CloudClusterManager cloudManager;
    @Autowired
    private RestHighLevelClient esClient;
    @Autowired
    private LoadPredictor loadPredictor;

    @Scheduled(fixedDelay = 60000)
    public void checkAndScale() {
        try {
            // 1. Fetch the cluster status
            ClusterHealthResponse health = esClient.cluster().health(
                    new ClusterHealthRequest(), RequestOptions.DEFAULT);

            // 2. Decide whether to scale out
            if (health.getStatus() == ClusterHealthStatus.RED
                    || getPendingTasks() > 100
                    || getNodeLoad() > 0.8) {
                // Trigger the scale-out
                int newNodeCount = calculateRequiredNodes();
                cloudManager.scaleDataNodes(newNodeCount);
                // Re-enable shard rebalancing so the new nodes pick up load
                enableShardRebalancing();
            }

            // 3. Decide whether to scale in
            if (getNodeLoad() < 0.3 && health.getStatus() == ClusterHealthStatus.GREEN) {
                // Exclude nodes that joined within the last 30 minutes
                List<String> candidateNodes = getNodesOlderThan(30, TimeUnit.MINUTES);
                if (!candidateNodes.isEmpty()) {
                    // Drain shards off the node before decommissioning it
                    relocateShards(candidateNodes.get(0));
                    cloudManager.removeNode(candidateNodes.get(0));
                }
            }
        } catch (IOException e) {
            log.error("Cluster health check failed, skipping this scaling cycle", e);
        }
    }

    private int calculateRequiredNodes() {
        // Size the cluster from a load forecast, targeting ~70% utilization per node
        double predictedLoad = loadPredictor.predictNextHourLoad();
        return (int) Math.ceil(predictedLoad / 0.7);
    }
}
+---------------------------+
|  Network-layer firewall   |
+-------------+-------------+
              |
+-------------+-------------+
|   Application-layer WAF   |
+-------------+-------------+
              |
+-------------+-------------+
|  Elasticsearch security   |
|  - RBAC                   |
|  - TLS encryption         |
|  - Audit logging          |
+---------------------------+
public class SecureESClientBuilder {

    public RestHighLevelClient buildSecureClient() {
        // 1. Configure the SSL context from the truststore
        SSLContext sslContext = loadSSLContext("es-keystore.jks", "password");

        // 2. Build the credentials provider (in production, load credentials from a secret store)
        CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
        credentialsProvider.setCredentials(
                AuthScope.ANY,
                new UsernamePasswordCredentials("app_user", "securePass123!"));

        // 3. Create the secured client (HTTPS + basic auth)
        return new RestHighLevelClient(
                RestClient.builder(new HttpHost("es-cluster.example.com", 9243, "https"))
                        .setHttpClientConfigCallback(httpClientBuilder ->
                                httpClientBuilder
                                        .setSSLContext(sslContext)
                                        .setDefaultCredentialsProvider(credentialsProvider))
                        .setRequestConfigCallback(requestConfigBuilder ->
                                requestConfigBuilder
                                        .setConnectTimeout(5000)
                                        .setSocketTimeout(60000))
        );
    }

    private SSLContext loadSSLContext(String keystorePath, String password) {
        try (InputStream is = Files.newInputStream(Paths.get(keystorePath))) {
            KeyStore truststore = KeyStore.getInstance("jks");
            truststore.load(is, password.toCharArray());
            return SSLContexts.custom()
                    .loadTrustMaterial(truststore, null)
                    .build();
        } catch (GeneralSecurityException | IOException e) {
            throw new IllegalStateException("Failed to load SSL context", e);
        }
    }
}
public class SnapshotManager {

    private final RestHighLevelClient esClient;

    public SnapshotManager(RestHighLevelClient esClient) {
        this.esClient = esClient;
    }

    @Scheduled(cron = "0 0 2 * * ?") // every day at 02:00
    public void createDailySnapshot() throws IOException {
        // 1. Create the snapshot repository if it does not exist yet
        if (!snapshotRepositoryExists("backup_repo")) {
            createS3Repository("backup_repo");
        }

        // 2. Take the snapshot
        String snapshotName = "snapshot-" + LocalDate.now().format(DateTimeFormatter.ISO_DATE);
        CreateSnapshotRequest request = new CreateSnapshotRequest("backup_repo", snapshotName)
                .indices("products", "logs-*")
                .waitForCompletion(true);
        esClient.snapshot().create(request, RequestOptions.DEFAULT);

        // 3. Verify snapshot integrity
        verifySnapshot("backup_repo", snapshotName);

        // 4. Clean up old snapshots (keep the last 7 days)
        deleteOldSnapshots("backup_repo", 7);
    }

    private void createS3Repository(String repoName) throws IOException {
        // All repository options belong in the settings block
        Settings settings = Settings.builder()
                .put("bucket", "my-es-backups")
                .put("region", "ap-east-1")
                .put("client", "default")
                .put("base_path", "elasticsearch/backups")
                .build();
        PutRepositoryRequest request = new PutRepositoryRequest(repoName)
                .type("s3")
                .settings(settings)
                .verify(true);
        esClient.snapshot().createRepository(request, RequestOptions.DEFAULT);
    }
}
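Snapshots are only half of the disaster-recovery story; restores should be rehearsed as well. Below is a hedged sketch of a restore path, assuming the same `backup_repo` repository and the HLRC snapshot client; restoring under a renamed index avoids clashing with the live one.

```java
import org.elasticsearch.action.admin.cluster.snapshots.restore.RestoreSnapshotRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;

public class SnapshotRestorer {

    private final RestHighLevelClient esClient;

    public SnapshotRestorer(RestHighLevelClient esClient) {
        this.esClient = esClient;
    }

    /** Restore the products index from a named snapshot into products_restored,
     *  so the restored copy can be verified before it is swapped in. */
    public void restoreProducts(String snapshotName) throws java.io.IOException {
        RestoreSnapshotRequest request = new RestoreSnapshotRequest("backup_repo", snapshotName)
                .indices("products")
                .renamePattern("(.+)")
                .renameReplacement("$1_restored")
                .waitForCompletion(true);
        esClient.snapshot().restore(request, RequestOptions.DEFAULT);
    }
}
```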
| Data volume | Nodes | Shard size | Total shards |
|---|---|---|---|
| <100GB | 3 | 30-50GB | 3-5 |
| 100GB-1TB | 5-7 | 30-50GB | 10-20 |
| 1TB-10TB | 10-15 | 30-50GB | 50-100 |
| >10TB | 20+ | 30-50GB | 200+ |
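The sizing rule behind the table above boils down to dividing the expected data volume by a target shard size in the 30-50GB band. A tiny sketch of that arithmetic (the 40GB target and class name are assumptions):

```java
public final class ShardSizing {

    private static final double TARGET_SHARD_SIZE_GB = 40.0; // middle of the 30-50GB band

    private ShardSizing() {}

    /** Estimate the number of primary shards for a given total data volume. */
    public static int estimatePrimaryShards(double dataVolumeGb) {
        return (int) Math.max(1, Math.ceil(dataVolumeGb / TARGET_SHARD_SIZE_GB));
    }

    public static void main(String[] args) {
        // 2TB of data -> roughly 50 primary shards, matching the 1TB-10TB row above
        System.out.println(estimatePrimaryShards(2_000));
    }
}
```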
1. Node offline:
   - Check network connectivity
   - Check the JVM out-of-memory logs
   - Verify available disk space
2. Cluster turns red:
   - `GET /_cluster/allocation/explain?pretty`
3. Slow queries:
   - Analyze slow queries with the Profile API
   - Check filesystem cache utilization
   - Optimize index mappings (disable fields you do not need)
4. Write throughput stalls:
   - Check for bulk queue backlog
   - Monitor merge operations
   - Tune refresh_interval (see the sketch after this checklist)
5. Production checklist:
   - Disable wildcard index deletion: `action.destructive_requires_name: true`
   - Configure scheduled snapshots
   - Set disk watermark thresholds
   - Enable the security module
   - Configure monitoring and alerting
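As referenced in item 4 above, the refresh interval can be relaxed during heavy ingestion and restored afterwards, trading search freshness for write throughput. A minimal sketch using the HLRC index-settings API (the class name is illustrative):

```java
import org.elasticsearch.action.admin.indices.settings.put.UpdateSettingsRequest;
import org.elasticsearch.client.RequestOptions;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.common.settings.Settings;

public class RefreshIntervalTuner {

    private final RestHighLevelClient esClient;

    public RefreshIntervalTuner(RestHighLevelClient esClient) {
        this.esClient = esClient;
    }

    /** Relax the refresh interval while a heavy bulk load is running. */
    public void relaxRefresh(String index) throws java.io.IOException {
        UpdateSettingsRequest request = new UpdateSettingsRequest(index)
                .settings(Settings.builder()
                        .put("index.refresh_interval", "30s")); // or "-1" to disable refresh entirely
        esClient.indices().putSettings(request, RequestOptions.DEFAULT);
    }

    /** Restore the default 1s refresh interval once the load has finished. */
    public void restoreRefresh(String index) throws java.io.IOException {
        UpdateSettingsRequest request = new UpdateSettingsRequest(index)
                .settings(Settings.builder()
                        .put("index.refresh_interval", "1s"));
        esClient.indices().putSettings(request, RequestOptions.DEFAULT);
    }
}
```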
Coming up next: "Getting Started with the Elasticsearch RESTful API: Compound Queries and Filters"