数据路由问题:如何确定数据在哪个库哪个表
跨库查询问题:需要查询多个库/表时的数据合并
分页排序问题:跨库分页和排序的复杂性
事务一致性:跨库事务保证
聚合计算:跨库的SUM、COUNT等聚合操作
方案 | 实现方式 | 优点 | 缺点 | 适用场景 |
---|---|---|---|---|
客户端分片 | 应用层实现路由逻辑 | 简单直接,无额外依赖 | 业务耦合度高,维护成本高 | 简单分片场景 |
代理中间件 | MyCat、ShardingProxy等 | 对应用透明,支持复杂查询 | 性能瓶颈,单点故障 | 大型企业应用 |
ORM框架集成 | ShardingSphere-JDBC | 轻量级,无代理部署 | 侵入业务代码,语言限制 | Java应用 |
分布式数据库 | TiDB、CockroachDB | 自动分片,强一致性 | 学习成本高,生态限制 | 需要强一致性的场景 |
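<!-- ShardingSphere-JDBC 5.x 的 artifactId 以 shardingsphere-jdbc-core- 开头;sharding-jdbc-* 是 4.x 的命名 -->
<dependency>
    <groupId>org.apache.shardingsphere</groupId>
    <artifactId>shardingsphere-jdbc-core-spring-boot-starter</artifactId>
    <version>5.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.shardingsphere</groupId>
    <artifactId>shardingsphere-jdbc-core-spring-namespace</artifactId>
    <version>5.1.0</version>
</dependency>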
2.2 分片规则配置
# application-sharding.yml
# ShardingSphere-JDBC sharding rules: two data sources (ds0, ds1), each holding
# four physical tables orders_0..orders_3.
spring:
  shardingsphere:
    datasource:
      names: ds0,ds1
      ds0:
        type: com.zaxxer.hikari.HikariDataSource
        driver-class-name: com.mysql.cj.jdbc.Driver
        jdbc-url: jdbc:mysql://localhost:3306/ds0
        username: root
        password: root
      ds1:
        type: com.zaxxer.hikari.HikariDataSource
        driver-class-name: com.mysql.cj.jdbc.Driver
        jdbc-url: jdbc:mysql://localhost:3306/ds1
        username: root
        password: root
    rules:
      sharding:
        tables:
          orders:
            actual-data-nodes: ds$->{0..1}.orders_$->{0..3} # 2 databases x 4 tables
            # The database is chosen by user_id, the table by order_id.
            database-strategy:
              standard:
                sharding-column: user_id
                sharding-algorithm-name: database-inline
            table-strategy:
              standard:
                sharding-column: order_id
                sharding-algorithm-name: table-inline
            key-generate-strategy:
              column: order_id
              key-generator-name: snowflake
        sharding-algorithms:
          database-inline:
            type: INLINE
            props:
              algorithm-expression: ds$->{user_id % 2}
          table-inline:
            type: INLINE
            props:
              algorithm-expression: orders_$->{order_id % 4}
        key-generators:
          snowflake:
            type: SNOWFLAKE
            props:
              worker-id: 123
              # order_id is both the generated key and the table sharding column.
              # Without a vibration offset, snowflake keys generated in different
              # milliseconds tend to give the same remainder under "% 4", skewing
              # rows into one table. Recommended value is (table count - 1).
              max-vibration-offset: 3
/**
 * Spring Data repository for {@code Order} entities.
 *
 * <p>NOTE(review): the ID type is assumed to be {@code Long} because {@code orderId} is passed
 * as {@code Long} below - confirm against the {@code Order} entity's {@code @Id} field.
 */
@Repository
public interface OrderRepository extends JpaRepository<Order, Long> {

    /**
     * Finds one order by order ID and user ID.
     *
     * <p>Both sharding columns are supplied, so the query can be routed to a single
     * database and table.
     *
     * @param orderId value matched against {@code o.orderId}
     * @param userId value matched against {@code o.userId}
     * @return the result of the JPQL query for that pair
     */
    @Query("SELECT o FROM Order o WHERE o.orderId = :orderId AND o.userId = :userId")
    Order findByOrderIdAndUserId(@Param("orderId") Long orderId,
                                 @Param("userId") Long userId);

    /**
     * Usage example that simply delegates to {@link #findByOrderIdAndUserId(Long, Long)}.
     *
     * <p>Declared {@code default} because an interface method may only carry a body when it
     * is a default (or static/private) method.
     *
     * @param orderId order ID to look up
     * @param userId owner of the order
     * @return whatever {@link #findByOrderIdAndUserId(Long, Long)} returns
     */
    default Order getOrderDetails(Long orderId, Long userId) {
        return findByOrderIdAndUserId(orderId, userId);
    }
}
2.3.2 范围查询(可能跨多个库表)
/**
 * Range-style order lookups that may touch more than one physical table or database.
 */
@Service
public class OrderService {

    @Autowired
    private OrderRepository orderRepository;

    /**
     * Returns all orders of one user.
     *
     * <p>{@code userId} selects the database, but no {@code order_id} is given, so the
     * query may still span several tables inside that database.
     *
     * @param userId owner of the orders
     * @return the orders returned by {@code OrderRepository.findByUserId}
     */
    public List<Order> getOrdersByUser(Long userId) {
        return orderRepository.findByUserId(userId);
    }

    /**
     * Returns the orders created between two instants.
     *
     * <p>No sharding column is part of the condition, so every database and table is scanned.
     *
     * @param startDate lower bound passed to the repository
     * @param endDate upper bound passed to the repository
     * @return the orders returned by {@code OrderRepository.findByCreateTimeBetween}
     */
    public List<Order> getOrdersByDateRange(Date startDate, Date endDate) {
        return orderRepository.findByCreateTimeBetween(startDate, endDate);
    }
}
/**
 * Paged order queries: classic offset paging and ID-based cursor paging.
 */
@Service
public class OrderQueryService {

    @Autowired
    private OrderRepository orderRepository;

    /**
     * Returns one page of orders sorted by {@code createTime} descending.
     *
     * @param userId when non-null, restricts the query to that user (routed to one database);
     *               when null, the query pages across all databases
     * @param page zero-based page index
     * @param size page size
     * @return the requested page
     */
    public Page<Order> getOrdersByPage(Long userId, int page, int size) {
        Pageable pageable = PageRequest.of(page, size, Sort.by("createTime").descending());
        if (userId != null) {
            // Single-user query: routed to one database.
            return orderRepository.findByUserId(userId, pageable);
        }
        // Global query: paging across databases.
        return orderRepository.findAll(pageable);
    }

    /**
     * Cursor-style paging that avoids deep offsets: returns up to {@code size} orders whose
     * order ID is greater than {@code lastOrderId}, in ascending order-ID order.
     *
     * <p>The row limit is passed as a {@code Pageable}; a bare {@code int} argument would be
     * treated as an additional query parameter by a derived query method. The method name uses
     * the {@code orderId} property, matching the JPQL used by
     * {@code OrderRepository.findByOrderIdAndUserId}.
     * NOTE(review): {@code OrderRepository} must declare
     * {@code List<Order> findByOrderIdGreaterThanOrderByOrderIdAsc(Long, Pageable)}.
     *
     * @param lastOrderId order ID of the last row of the previous page
     * @param size maximum number of rows to return
     * @return the next slice of orders
     */
    public List<Order> getOrdersByCursor(Long lastOrderId, int size) {
        return orderRepository.findByOrderIdGreaterThanOrderByOrderIdAsc(
                lastOrderId, PageRequest.of(0, size));
    }
}
2.4.2 聚合查询
/**
 * Aggregate statistics over orders.
 */
@Service
public class OrderStatisticsService {

    @Autowired
    private OrderRepository orderRepository;

    /**
     * Returns the total order amount of one user.
     *
     * @param userId owner of the orders
     * @return the summed amount, or {@link BigDecimal#ZERO} when the repository yields
     *         {@code null} (as a SQL {@code SUM} does when no row matches)
     */
    public BigDecimal getTotalAmountByUser(Long userId) {
        return zeroIfNull(orderRepository.sumAmountByUserId(userId));
    }

    /**
     * Computes count, total, average and maximum order amount for a creation-time range.
     *
     * <p>An empty range yields a count of 0 and {@link BigDecimal#ZERO} for the three amounts.
     * The average is skipped in that case because dividing by a zero count would throw
     * {@link ArithmeticException}, and {@link Map#of} rejects {@code null} values.
     *
     * @param startDate lower bound passed to the repository
     * @param endDate upper bound passed to the repository
     * @return an immutable map with keys {@code totalCount}, {@code totalAmount},
     *         {@code avgAmount} and {@code maxAmount}
     */
    public Map<String, Object> getOrderStats(Date startDate, Date endDate) {
        // 1. Number of orders.
        long totalCount = orderRepository.countByCreateTimeBetween(startDate, endDate);
        // 2. Total amount.
        BigDecimal totalAmount =
                zeroIfNull(orderRepository.sumAmountByCreateTimeBetween(startDate, endDate));
        // 3. Average amount, rounded half-up to two decimals.
        BigDecimal avgAmount = totalCount == 0
                ? BigDecimal.ZERO
                : totalAmount.divide(BigDecimal.valueOf(totalCount), 2, RoundingMode.HALF_UP);
        // 4. Largest single order amount.
        BigDecimal maxAmount =
                zeroIfNull(orderRepository.maxAmountByCreateTimeBetween(startDate, endDate));

        return Map.of(
                "totalCount", totalCount,
                "totalAmount", totalAmount,
                "avgAmount", avgAmount,
                "maxAmount", maxAmount
        );
    }

    /** Maps a {@code null} aggregate result to zero. */
    private static BigDecimal zeroIfNull(BigDecimal value) {
        return value != null ? value : BigDecimal.ZERO;
    }
}
# Configure the binding-table relationship
# NOTE(review): binding presumably requires order_items to be sharded with the same
# rule as orders; no table rule for order_items is visible in this snippet.
spring:
  shardingsphere:
    rules:
      sharding:
        binding-tables:
          - orders,order_items # bind orders and their order items
// Binding-table query example
// NOTE(review): this snippet is truncated - the interface has no type arguments on
// JpaRepository and the method declaration below stops after its return type.
@Repository
public interface OrderRepository extends JpaRepository {
// Fetches an order together with its line items by joining Order to its items.
// NOTE(review): only orderId is filtered; the user_id sharding column is absent,
// so this presumably cannot be routed to a single database - confirm.
@Query("SELECT o, i FROM Order o JOIN o.items i WHERE o.orderId = :orderId")
List
2.5.2 广播表(全局字典表)
# Configure broadcast tables
spring:
  shardingsphere:
    rules:
      sharding:
        broadcast-tables:
          - regions # the region table is a broadcast table
// Query that uses a broadcast table
/**
 * Combines an order with the name of its region, read from the {@code regions} lookup table.
 */
@Service
public class OrderService {

    // Declared here because getOrderWithRegion uses it.
    @Autowired
    private OrderRepository orderRepository;

    @Autowired
    private RegionRepository regionRepository;

    /**
     * Loads an order and resolves its region name.
     *
     * @param orderId order ID to look up
     * @param userId owner of the order
     * @return the DTO, or {@code null} when no order matches; {@code regionName} is
     *         {@code null} when the order's region code has no matching region row
     */
    public OrderDTO getOrderWithRegion(Long orderId, Long userId) {
        Order order = orderRepository.findByOrderIdAndUserId(orderId, userId);
        if (order == null) {
            return null;
        }
        // The region may be missing; map through the Optional instead of dereferencing null.
        String regionName = regionRepository.findById(order.getRegionCode())
                .map(Region::getName)
                .orElse(null);
        return OrderDTO.builder()
                .orderId(order.getOrderId())
                .amount(order.getAmount())
                .regionName(regionName)
                .build();
    }
}
order_id
mod-long-order
4
3.2 MyCat分页查询优化
/* Plain OFFSET pagination - performs poorly */
SELECT * FROM orders ORDER BY create_time DESC LIMIT 100000, 10;
/* Optimized pagination - page by ID; :lastId is the order_id of the previous page's last row */
SELECT * FROM orders
WHERE order_id > :lastId
ORDER BY order_id ASC
LIMIT 10;
3.3 MyCat跨库JOIN实现
/* MyCat handles the cross-database JOIN automatically.
   NOTE(review): presumably this only holds when users is a global table or shares
   the sharding rule of orders - confirm against the MyCat version in use. */
SELECT
o.order_id, o.amount, u.username, u.email
FROM
orders o
JOIN
users u ON o.user_id = u.user_id
WHERE
o.create_time > '2023-01-01';
// Distributed ID generation based on Snowflake
/**
 * Generates IDs through a shared {@code Snowflake} instance and maps IDs to shard indexes.
 */
public class DistributedIdGenerator {

    private static final Snowflake SHARDING_ID_GENERATOR =
            new Snowflake(123); // workerId

    /**
     * Returns the next ID from the shared generator.
     *
     * @return the value of {@code Snowflake.nextId()}
     */
    public static long generateId() {
        return SHARDING_ID_GENERATOR.nextId();
    }

    /**
     * Maps an ID to a shard index in {@code [0, totalShards)}.
     *
     * <p>Uses a floor modulus so that a negative {@code id} still yields a valid, non-negative
     * index; for non-negative IDs the result equals {@code id % totalShards}.
     * NOTE(review): if the low bits of the ID are a per-millisecond sequence, the remainder
     * may be unevenly distributed - confirm against the Snowflake implementation in use.
     *
     * @param id the ID to map
     * @param totalShards number of shards, must be positive
     * @return the shard index
     * @throws IllegalArgumentException if {@code totalShards} is not positive
     */
    public static int extractShardId(long id, int totalShards) {
        if (totalShards <= 0) {
            throw new IllegalArgumentException("totalShards must be positive: " + totalShards);
        }
        return (int) Math.floorMod(id, (long) totalShards);
    }
}
4.2 跨库分页深度优化
@Service
public class OrderPaginationService {
// 分页查询优化方案
public PageResult optimizedPageQuery(OrderQuery query, int page, int size) {
// 方案1:基于索引键的游标分页
if (query.getLastOrderId() != null) {
return cursorBasedPagination(query, page, size);
}
// 方案2:基于时间范围的分页
if (query.getStartTime() != null && query.getEndTime() != null) {
return timeRangePagination(query, page, size);
}
// 方案3:基于分片并行查询
return parallelShardQuery(query, page, size);
}
// 游标分页实现
private PageResult cursorBasedPagination(OrderQuery query, int page, int size) {
List orders = orderRepository.findByOrderIdGreaterThan(
query.getLastOrderId(), PageRequest.of(0, size));
Long nextCursor = !orders.isEmpty() ? orders.get(orders.size() - 1).getOrderId() : null;
return new PageResult<>(orders, nextCursor);
}
// 时间范围分页
private PageResult timeRangePagination(OrderQuery query, int page, int size) {
// 按时间分片查询
List timeShards = splitTimeRange(query.getStartTime(), query.getEndTime(), 4);
List result = new ArrayList<>();
for (DateRange range : timeShards) {
List shardOrders = orderRepository.findByCreateTimeBetween(
range.getStart(), range.getEnd(),
PageRequest.of(0, size * timeShards.size()));
result.addAll(shardOrders);
}
// 在内存中排序分页
return sortAndPaginate(result, query, page, size);
}
// 分片并行查询
private PageResult parallelShardQuery(OrderQuery query, int page, int size) {
List>> futures = new ArrayList<>();
int totalShards = 4; // 假设4个分片
for (int shard = 0; shard < totalShards; shard++) {
int finalShard = shard;
futures.add(CompletableFuture.supplyAsync(() ->
queryShard(query, finalShard, totalShards, size * totalShards)
));
}
// 等待所有分片完成
List allOrders = futures.stream()
.map(CompletableFuture::join)
.flatMap(List::stream)
.collect(Collectors.toList());
// 在内存中排序分页
return sortAndPaginate(allOrders, query, page, size);
}
// 辅助方法:内存排序分页
private PageResult sortAndPaginate(List orders, OrderQuery query, int page, int size) {
// 1. 排序
orders.sort(Comparator.comparing(Order::getCreateTime).reversed());
// 2. 分页
int total = orders.size();
int fromIndex = Math.min(page * size, total);
int toIndex = Math.min(fromIndex + size, total);
List pageList = orders.subList(fromIndex, toIndex);
return new PageResult<>(pageList, total, page, size);
}
}
/**
 * REST endpoint that creates an order inside a Seata global transaction.
 *
 * <p>NOTE(review): {@code inventoryService} and {@code paymentService} are used below but are
 * not declared in this snippet; they must be injected for the class to compile.
 */
@RestController
@RequestMapping("/orders")
public class OrderController {

    @Autowired
    private OrderService orderService;

    /**
     * Creates the order, reduces stock and creates the payment record as one global transaction.
     *
     * @param request the order to create
     * @return HTTP 200 with the created order
     */
    @GlobalTransactional // Seata global-transaction annotation
    @PostMapping
    public ResponseEntity<Order> createOrder(@RequestBody OrderRequest request) {
        // 1. Create the order.
        Order order = orderService.createOrder(request);
        // 2. Reduce stock (possibly in a different database).
        inventoryService.reduceStock(request.getProductId(), request.getQuantity());
        // 3. Create the payment record (possibly in a different database).
        paymentService.createPayment(order.getOrderId(), order.getAmount());
        return ResponseEntity.ok(order);
    }
}
4.3.2 最终一致性方案(基于消息队列)
@Service
public class OrderCreationService {
@Autowired
private RabbitTemplate rabbitTemplate;
@Transactional
public Order createOrderWithEvent(OrderRequest request) {
// 1. 创建订单(本地事务)
Order order = orderRepository.save(convertToOrder(request));
// 2. 发送领域事件
OrderCreatedEvent event = new OrderCreatedEvent(
order.getOrderId(),
request.getProductId(),
request.getQuantity()
);
rabbitTemplate.convertAndSend("order-exchange", "order.created", event);
return order;
}
// 库存服务消费者
@RabbitListener(queues = "inventory-queue")
public void handleOrderCreatedEvent(OrderCreatedEvent event) {
try {
inventoryService.reduceStock(event.getProductId(), event.getQuantity());
} catch (Exception e) {
// 重试或补偿机制
handleInventoryError(event);
}
}
}
高基数:分片键应具有大量不同值,避免数据倾斜
均匀分布:确保数据均匀分布在各个分片
业务相关性:选择常用查询条件作为分片键
避免热点:采用范围分片时,避免使用单调递增的列(如自增ID、时间戳)作为分片键,否则新写入会集中在最新的分片上
避免全表扫描:
-- Not recommended: no sharding-key condition
SELECT * FROM orders WHERE status = 'PAID';
-- Recommended: includes the sharding key
SELECT * FROM orders WHERE user_id = 123 AND status = 'PAID';
分页优化:
-- Not recommended: deep OFFSET pagination
SELECT * FROM orders ORDER BY id LIMIT 1000000, 20;
-- Recommended: ID-based pagination; :lastId is the id of the last row of the previous page.
-- A literal "id > 1000000" equals the OFFSET query only if ids are gap-free and start at 1.
SELECT * FROM orders WHERE id > :lastId ORDER BY id LIMIT 20;
索引设计:
每个分片表都需要独立索引
复合索引包含分片键
全局索引使用单独服务维护
读写分离:
# ShardingSphere read/write splitting configuration
# NOTE(review): `replica-query` is presumably the rule name of early 5.0.0 pre-releases;
# later 5.x versions call the rule `readwrite-splitting` and use different property
# names - confirm against the ShardingSphere version in use.
spring:
  shardingsphere:
    rules:
      replica-query:
        data-sources:
          pr_ds:
            primary-data-source-name: ds_primary
            replica-data-source-names:
              - ds_replica_0
              - ds_replica_1
            # Reference the balancer defined below; otherwise it is declared but never used.
            load-balancer-name: round_robin
        load-balancers:
          round_robin:
            type: ROUND_ROBIN
/**
 * Replays Canal binlog row events into ClickHouse: deletes are forwarded as deletes keyed by
 * the key columns, every other row event is forwarded as an upsert of the after-image.
 * All rows are written to the {@code orders} table.
 */
public class CanalBinlogProcessor {

    /**
     * Parses the space-separated {@code yyyy-MM-dd HH:mm:ss[.fraction]} form.
     * {@code LocalDateTime.parse(String)} only accepts the ISO form with a {@code T} separator.
     * NOTE(review): assumes Canal delivers DATETIME values in MySQL's textual form - confirm.
     */
    private static final java.time.format.DateTimeFormatter MYSQL_DATETIME =
            new java.time.format.DateTimeFormatterBuilder()
                    .appendPattern("yyyy-MM-dd HH:mm:ss")
                    .optionalStart()
                    .appendFraction(java.time.temporal.ChronoField.NANO_OF_SECOND, 0, 9, true)
                    .optionalEnd()
                    .toFormatter(java.util.Locale.ROOT);

    @Autowired
    private ClickhouseService clickhouseService;

    /**
     * Processes one Canal entry; entries that are not row data are ignored.
     *
     * @param entry the binlog entry
     * @throws java.io.UncheckedIOException if the entry payload cannot be parsed
     * @throws IllegalStateException if a delete event carries no key column
     */
    public void processBinlogEvent(CanalEntry.Entry entry) {
        if (entry.getEntryType() != CanalEntry.EntryType.ROWDATA) {
            return;
        }
        CanalEntry.RowChange rowChange;
        try {
            rowChange = CanalEntry.RowChange.parseFrom(entry.getStoreValue());
        } catch (java.io.IOException e) {
            // parseFrom declares a checked protobuf exception (an IOException subtype);
            // wrap it so the method signature stays free of checked exceptions.
            throw new java.io.UncheckedIOException("Failed to parse Canal row change", e);
        }
        boolean isDelete = rowChange.getEventType() == CanalEntry.EventType.DELETE;
        for (CanalEntry.RowData rowData : rowChange.getRowDatasList()) {
            if (isDelete) {
                handleDelete(rowData.getBeforeColumnsList());
            } else {
                handleUpsert(rowData.getAfterColumnsList());
            }
        }
    }

    /** Upserts one row built from all columns of the after-image. */
    private void handleUpsert(List<CanalEntry.Column> columns) {
        Map<String, Object> data = new HashMap<>();
        for (CanalEntry.Column column : columns) {
            data.put(column.getName(), convertValue(column));
        }
        clickhouseService.upsert("orders", data);
    }

    /** Deletes the row identified by the key columns of the before-image. */
    private void handleDelete(List<CanalEntry.Column> columns) {
        Map<String, Object> where = new HashMap<>();
        for (CanalEntry.Column column : columns) {
            if (column.getIsKey()) {
                where.put(column.getName(), convertValue(column));
            }
        }
        if (where.isEmpty()) {
            // An empty condition could match every row; refuse rather than risk a mass delete.
            throw new IllegalStateException("Delete event for 'orders' carries no key column");
        }
        clickhouseService.delete("orders", where);
    }

    /**
     * Converts a column's textual value according to its MySQL type.
     * The type string may carry a length and modifiers (for example {@code bigint(20) unsigned}
     * or {@code decimal(10,2)}), so only its leading type name is matched. Unsigned integers
     * are widened so that values above the signed range still parse. Unlisted types are
     * returned as the original string.
     */
    private Object convertValue(CanalEntry.Column column) {
        if (column.getIsNull()) {
            return null;
        }
        String mysqlType = column.getMysqlType().trim().toLowerCase(java.util.Locale.ROOT);
        boolean unsigned = mysqlType.contains("unsigned");
        String value = column.getValue();
        switch (baseType(mysqlType)) {
            case "int":
                return unsigned ? (Object) Long.parseLong(value) : (Object) Integer.parseInt(value);
            case "bigint":
                return unsigned ? (Object) new java.math.BigInteger(value) : (Object) Long.parseLong(value);
            case "decimal":
                return new BigDecimal(value);
            case "datetime":
                return LocalDateTime.parse(value, MYSQL_DATETIME);
            default:
                return value;
        }
    }

    /** Returns the leading run of letters of a lower-cased MySQL type, e.g. "int" for "int(11)". */
    private static String baseType(String mysqlType) {
        int end = 0;
        while (end < mysqlType.length() && Character.isLetter(mysqlType.charAt(end))) {
            end++;
        }
        return mysqlType.substring(0, end);
    }
}
关键优化策略:
随着云原生和分布式技术的发展,分库分表查询将越来越智能化,开发者应关注:
TiDB(HTAP架构):
-- Real-time analytical query: top 10 users by order amount over the last 7 days
SELECT user_id, SUM(amount)
FROM orders
WHERE create_time > NOW() - INTERVAL 7 DAY
GROUP BY user_id
ORDER BY SUM(amount) DESC
LIMIT 10;
CockroachDB(全球分布式):
-- Query restricted to one geographic region
SELECT * FROM orders
WHERE region = 'us-east'
ORDER BY create_time DESC
LIMIT 100;
/* Traditional form */
SELECT * FROM orders WHERE user_id = 123 AND product_id = 456;
/* Chosen automatically by a smart optimizer */
-- May be rewritten as:
SELECT * FROM orders_by_product WHERE product_id = 456 AND user_id = 123;
{
"order_id": 123456,
"user_id": 789,
"items": [
{"product_id": "P1001", "quantity": 2},
{"product_id": "P2002", "quantity": 1}
],
"shipping_address": {
"city": "Beijing",
"street": "Main St"
}
}
-- JSON query: filter orders on the city field inside shipping_address
SELECT * FROM orders
WHERE JSON_EXTRACT(shipping_address, '$.city') = 'Beijing';
分库分表查询的核心在于如何高效地定位数据位置并合并查询结果。本文介绍了多种实现方案:
基于ShardingSphere-JDBC:轻量级客户端方案,适合Java应用
基于MyCat中间件:代理层方案,对应用透明
分布式数据库方案:TiDB等原生分布式数据库
实时分析分离架构:将分析查询导向数仓
精心设计分片键
智能查询优化器
HTAP混合负载处理
多模型数据支持
云原生分布式数据库
优化分页查询(游标分页)
合理使用绑定表和广播表
读写分离减轻负载
最终一致性替代分布式事务
选择合适的分库分表查询方案需要根据业务规模、团队技能和未来扩展性综合考虑,平衡查询性能、开发成本和系统复杂度。