Docker零停机部署通过容器编排实现,核心思想是:同时运行多个API容器实例,由负载均衡器分发请求,更新时分批替换容器,使服务始终有健康实例可用。
传统部署 vs Docker部署:
传统方式:
[服务器] → [IIS] → [API应用]
- 应用与系统紧耦合
- 环境一致性难保证
- 扩展和回滚复杂
Docker方式:
[服务器] → [Docker引擎] → [API容器1, API容器2, API容器3]
- 应用完全隔离
- 环境一致性保证
- 扩展和回滚简单
用户请求流程:
[客户端] → [Nginx负载均衡器] → [API容器群组]
详细架构:
[用户请求:80]
↓
[Nginx容器:80] ← 负载均衡和反向代理
↓ 轮询分发
┌─────────┼─────────┼─────────┐
↓ ↓ ↓ ↓
[API-1] [API-2] [API-3] [API-4]
[容器] [容器] [容器] [容器]
docker-compose.yml
version: "3.8"

services:
  # Nginx reverse proxy / load balancer in front of the API replicas
  nginx:
    image: nginx:alpine
    container_name: api-nginx
    restart: unless-stopped
    depends_on:
      - api
    networks:
      - api-network
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      - ./nginx/ssl:/etc/nginx/ssl:ro
    healthcheck:
      # `nginx -t` validates the configuration file
      test: ["CMD", "nginx", "-t"]
      interval: 30s
      timeout: 10s
      retries: 3

  # API application service (multiple instances)
  api:
    image: my-api:latest
    build:
      context: .
      dockerfile: Dockerfile
    networks:
      - api-network
    environment:
      - ASPNETCORE_ENVIRONMENT=Production
      - ASPNETCORE_URLS=http://+:5000
      - ConnectionStrings__DefaultConnection=${DB_CONNECTION_STRING}
    volumes:
      - ./logs:/app/logs
      - ./config:/app/config:ro
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    # NOTE(review): update_config / rollback_config are orchestrator
    # settings; confirm they take effect outside Swarm mode.
    deploy:
      replicas: 4              # run 4 instances
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      update_config:
        parallelism: 1         # update one container at a time
        delay: 30s             # pause 30 seconds between batches
        failure_action: rollback
        monitor: 60s           # watch each batch for 60 seconds
        max_failure_ratio: 0.2
      rollback_config:
        parallelism: 1
        delay: 10s
        failure_action: pause
        monitor: 60s

  # Database (optional, only when the database is containerized)
  database:
    image: mcr.microsoft.com/mssql/server:2019-latest
    container_name: api-database
    restart: unless-stopped
    networks:
      - api-network
    environment:
      - ACCEPT_EULA=Y
      - SA_PASSWORD=${SA_PASSWORD}
      - MSSQL_DB=${DB_NAME}
    volumes:
      - db-data:/var/opt/mssql

  # Redis cache (optional)
  redis:
    image: redis:alpine
    container_name: api-redis
    restart: unless-stopped
    command: redis-server --appendonly yes
    networks:
      - api-network
    volumes:
      - redis-data:/data

volumes:
  db-data:
  redis-data:

networks:
  api-network:
    driver: bridge
Dockerfile
# Multi-stage build: build stage
FROM mcr.microsoft.com/dotnet/sdk:6.0 AS build
WORKDIR /src

# Copy only the project file first so the restore layer stays cached
# until the project's dependencies change
COPY ["MyAPI.csproj", "."]
RUN dotnet restore "MyAPI.csproj"

# Copy the remaining sources and compile
COPY . .
RUN dotnet build "MyAPI.csproj" -c Release -o /app/build

# Publish stage
FROM build AS publish
RUN dotnet publish "MyAPI.csproj" -c Release -o /app/publish

# Runtime stage
FROM mcr.microsoft.com/dotnet/aspnet:6.0 AS final
WORKDIR /app

# Bind the app to port 5000 inside the image itself. The HEALTHCHECK and
# EXPOSE below both assume 5000, so the image must not depend on the caller
# passing ASPNETCORE_URLS (a plain `docker run -p 8000:5000` does not).
# An explicit -e / compose `environment` value still overrides this default.
ENV ASPNETCORE_URLS=http://+:5000

# curl is required by the health check
RUN apt-get update \
    && apt-get install -y --no-install-recommends curl \
    && rm -rf /var/lib/apt/lists/*

# Copy the published output
COPY --from=publish /app/publish .

# Run as a non-root user (security best practice)
RUN groupadd -r apigroup && useradd -r -g apigroup apiuser
RUN chown -R apiuser:apigroup /app
USER apiuser

# Container-level health check against the API health endpoint
HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \
    CMD curl -f http://localhost:5000/api/health || exit 1

# Port the application listens on
EXPOSE 5000

# Start the application
ENTRYPOINT ["dotnet", "MyAPI.dll"]
nginx/nginx.conf
events {
    worker_connections 1024;
}

http {
    # Upstream group for the API containers
    upstream api_backend {
        # least_conn;  # uncomment to balance by least connections
        server api:5000 max_fails=3 fail_timeout=30s weight=1;

        # NOTE(review): all replicas are reached through the single service
        # name `api`; confirm that replicas started after nginx has loaded
        # this file are picked up without a reload.
        # To pin specific instances instead, list them explicitly
        # (host names below are illustrative):
        # server api_1:5000 max_fails=3 fail_timeout=30s;
        # server api_2:5000 max_fails=3 fail_timeout=30s;
        # server api_3:5000 max_fails=3 fail_timeout=30s;
        # server api_4:5000 max_fails=3 fail_timeout=30s;
    }

    # Access-log format, including which upstream served the request
    log_format main '$remote_addr - $remote_user [$time_local] "$request" '
                    '$status $body_bytes_sent "$http_referer" '
                    '"$http_user_agent" "$http_x_forwarded_for" '
                    'upstream_addr=$upstream_addr '
                    'upstream_response_time=$upstream_response_time';

    access_log /var/log/nginx/access.log main;
    error_log /var/log/nginx/error.log;

    # Basic settings
    sendfile on;
    tcp_nopush on;
    tcp_nodelay on;
    keepalive_timeout 65;
    client_max_body_size 50M;

    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_types text/plain text/css application/json application/javascript;

    # Plain HTTP virtual host
    server {
        listen 80;
        server_name localhost;

        # API routes
        location /api/ {
            proxy_pass http://api_backend;

            # Forwarded request headers
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto $scheme;

            # Timeouts
            proxy_connect_timeout 5s;
            proxy_send_timeout 60s;
            proxy_read_timeout 60s;

            # Retry on another upstream server for these failures
            proxy_next_upstream error timeout http_500 http_502 http_503 http_504;
            proxy_next_upstream_tries 3;
            proxy_next_upstream_timeout 30s;
        }

        # Health-check endpoint, proxied to the API's own health route
        location /health {
            proxy_pass http://api_backend/api/health;
            proxy_set_header Host $host;
            access_log off;
        }

        # Admin endpoint
        location /admin {
            root /usr/share/nginx/html;
            try_files /admin.html =404;
        }

        # Static files (if any)
        location /static/ {
            root /usr/share/nginx/html;
            expires 1y;
            add_header Cache-Control "public, immutable";
        }

        # Default error page
        error_page 500 502 503 504 /50x.html;
        location = /50x.html {
            root /usr/share/nginx/html;
        }
    }

    # HTTPS virtual host (optional)
    server {
        listen 443 ssl http2;
        server_name localhost;

        ssl_certificate /etc/nginx/ssl/cert.pem;
        ssl_certificate_key /etc/nginx/ssl/key.pem;
        ssl_protocols TLSv1.2 TLSv1.3;
        ssl_ciphers HIGH:!aNULL:!MD5;

        location /api/ {
            proxy_pass http://api_backend;
            proxy_set_header Host $host;
            proxy_set_header X-Real-IP $remote_addr;
            proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
            proxy_set_header X-Forwarded-Proto https;
        }
    }
}
初始化Swarm集群
# Initialize Swarm mode on this host
docker swarm init
# Show the join command for worker nodes (when there are several servers)
docker swarm join-token worker
docker-stack.yml
version: '3.8'

services:
  # Nginx load balancer in front of the API service
  nginx:
    image: nginx:alpine
    ports:
      - "80:80"
      - "443:443"
    volumes:
      - ./nginx/nginx.conf:/etc/nginx/nginx.conf:ro
      # The mounted nginx.conf loads /etc/nginx/ssl/cert.pem and key.pem for
      # its 443 listener, so the certificate directory must be mounted too.
      - ./nginx/ssl:/etc/nginx/ssl:ro
    networks:
      - api-overlay
    deploy:
      replicas: 2
      placement:
        constraints:
          - node.role == manager
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      update_config:
        parallelism: 1
        delay: 10s
        failure_action: rollback

  # API application service
  api:
    image: my-api:latest
    networks:
      - api-overlay
    environment:
      - ASPNETCORE_ENVIRONMENT=Production
      - ASPNETCORE_URLS=http://+:5000
    deploy:
      replicas: 6              # run 6 instances
      placement:
        max_replicas_per_node: 3
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
      update_config:
        parallelism: 2         # update two containers at a time
        delay: 30s             # pause between batches
        failure_action: rollback
        monitor: 120s
        max_failure_ratio: 0.1
      rollback_config:
        parallelism: 2
        delay: 0s
        failure_action: pause
        monitor: 120s
      resources:
        limits:
          cpus: '1.0'
          memory: 512M
        reservations:
          cpus: '0.5'
          memory: 256M
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

networks:
  api-overlay:
    driver: overlay
    attachable: true
# Deploy the service stack
docker stack deploy -c docker-stack.yml api-stack
# Inspect service status
docker service ls
docker service ps api-stack_api
# Update the service (zero downtime)
docker service update --image my-api:v2.0.0 api-stack_api
# Roll the service back
docker service rollback api-stack_api
# Scale the service
docker service scale api-stack_api=10
# Remove the service stack
docker stack rm api-stack
docker-compose-traefik.yml
version: "3.8"

services:
  # Traefik reverse proxy
  traefik:
    image: traefik:v2.9
    container_name: traefik
    restart: unless-stopped
    networks:
      - traefik-network
    ports:
      - "80:80"
      - "443:443"
      - "8080:8080"  # Traefik dashboard
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - ./traefik/ssl:/ssl:ro
    command:
      # Docker provider; containers must opt in via labels
      - --providers.docker=true
      - --providers.docker.exposedbydefault=false
      # Entry points
      - --entrypoints.web.address=:80
      - --entrypoints.websecure.address=:443
      # Dashboard
      - --api.dashboard=true
      - --api.insecure=true
      # Logging
      - --log.level=INFO
      - --accesslog=true
      # Ping endpoint used by the health check below
      - --ping=true
    healthcheck:
      test: ["CMD", "traefik", "healthcheck", "--ping"]
      interval: 30s
      timeout: 10s
      retries: 3

  # API service
  api:
    image: my-api:latest
    build: .
    networks:
      - traefik-network
    environment:
      - ASPNETCORE_ENVIRONMENT=Production
      - ASPNETCORE_URLS=http://+:5000
    deploy:
      replicas: 4
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5000/api/health"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
    labels:
      # Opt this service in to Traefik
      - "traefik.enable=true"
      # HTTP router
      - "traefik.http.routers.api.rule=Host(`api.localhost`) && PathPrefix(`/api`)"
      - "traefik.http.routers.api.entrypoints=web"
      - "traefik.http.services.api.loadbalancer.server.port=5000"
      # Load-balancer health check
      - "traefik.http.services.api.loadbalancer.healthcheck.path=/api/health"
      - "traefik.http.services.api.loadbalancer.healthcheck.interval=30s"
      - "traefik.http.services.api.loadbalancer.healthcheck.timeout=10s"
      # Load-balancing strategy
      - "traefik.http.services.api.loadbalancer.sticky=false"
      # HTTPS router on the websecure entry point (optional)
      - "traefik.http.routers.api-secure.rule=Host(`api.localhost`) && PathPrefix(`/api`)"
      - "traefik.http.routers.api-secure.entrypoints=websecure"
      - "traefik.http.routers.api-secure.tls=true"

networks:
  traefik-network:
    driver: bridge
# 1. Build the image for the new version
docker build -t my-api:v2.0.0 .
# 2. Smoke-test the new image on host port 8000
docker run -d --name test-api -p 8000:5000 my-api:v2.0.0
curl http://localhost:8000/api/health
# 3. Stop and remove the test container
docker stop test-api && docker rm test-api
# Method 1: rolling update with Docker Compose
# First change the image version in docker-compose.yml:
# image: my-api:v2.0.0
# Run the rolling update: scale up to 6 without recreating existing containers
docker-compose up -d --no-deps --scale api=6 --no-recreate api
# Wait for the new containers to start and pass their health checks
sleep 60
# Scale back down to the target instance count
docker-compose up -d --no-deps --scale api=4 api
# Method 2: blue-green deployment
# Start the new version's containers first (the old version keeps running)
docker-compose -f docker-compose.yml -f docker-compose.new.yml up -d api-new
# Wait until the health checks pass
./wait-for-health.sh api-new
# Point the Nginx configuration at the new version
./switch-nginx-upstream.sh api-new
# Stop and remove the old version's containers
docker-compose stop api
docker-compose rm -f api
# Verify the deployment
curl -H "Host: api.localhost" http://localhost/api/health
# Show container status
docker-compose ps
# Follow the logs
docker-compose logs -f api
# Monitor resource usage
docker stats
# 1. Build the image for the new version
docker build -t my-api:v2.0.0 .
# 2. Push it to the image registry (production)
docker push my-api:v2.0.0
# 3. Run the rolling update: two tasks per batch, 30s between batches,
#    roll back automatically on failure
docker service update \
--image my-api:v2.0.0 \
--update-parallelism 2 \
--update-delay 30s \
--update-failure-action rollback \
--update-monitor 60s \
api-stack_api
# 4. Watch the update progress
watch docker service ps api-stack_api
# 5. Verify the result
curl http://localhost/api/health
# 6. Roll back immediately if something is wrong
docker service rollback api-stack_api
wait-for-health.sh
#!/bin/bash
# Polls the containers of a Compose service until every one of them reports
# the Docker health status "healthy".
#
# Usage: wait-for-health.sh [service] [max_attempts] [sleep_seconds]
#   service       Compose service name (default: api)
#   max_attempts  number of polling rounds (default: 30)
#   sleep_seconds pause between rounds (default: 10)
# Exit status: 0 once all containers are healthy, 1 on timeout.
SERVICE_NAME=${1:-api}
MAX_ATTEMPTS=${2:-30}
SLEEP_INTERVAL=${3:-10}

echo "等待 $SERVICE_NAME 服务健康检查通过..."

for i in $(seq 1 "$MAX_ATTEMPTS"); do
    echo "第 $i 次检查..."

    # Inspect each container individually. Counting with `grep -c healthy`
    # would also count "unhealthy", and piping an empty ID list into
    # `xargs docker inspect` would invoke docker with no arguments.
    TOTAL_COUNT=0
    HEALTHY_COUNT=0
    for container_id in $(docker-compose ps -q "$SERVICE_NAME"); do
        TOTAL_COUNT=$((TOTAL_COUNT + 1))
        # Containers without a health check yield "none" instead of a
        # template error, and are not counted as healthy.
        health=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$container_id" 2>/dev/null)
        if [ "$health" = "healthy" ]; then
            HEALTHY_COUNT=$((HEALTHY_COUNT + 1))
        fi
    done

    echo "健康容器数: $HEALTHY_COUNT / $TOTAL_COUNT"

    # Require at least one container so an empty service never "passes"
    if [ "$TOTAL_COUNT" -gt 0 ] && [ "$HEALTHY_COUNT" -eq "$TOTAL_COUNT" ]; then
        echo "✅ 所有 $SERVICE_NAME 容器健康检查通过!"
        exit 0
    fi

    # No need to sleep after the final attempt
    if [ "$i" -lt "$MAX_ATTEMPTS" ]; then
        echo "⏳ 等待 $SLEEP_INTERVAL 秒后重试..."
        sleep "$SLEEP_INTERVAL"
    fi
done

echo "❌ 健康检查超时失败!"
exit 1
deploy.sh
#!/bin/bash
# Builds a new image version of the API service and rolls it out with
# Docker Compose by scaling up, waiting for health checks, then scaling
# back down. Any failing step triggers rollback().
#
# Usage: deploy.sh [version]   (version defaults to "latest")
set -e  # abort on the first failing command

# Configuration
NEW_VERSION=${1:-latest}
SERVICE_NAME="api"
COMPOSE_FILE="docker-compose.yml"
BACKUP_TAG="backup-$(date +%Y%m%d-%H%M%S)"
# Set to 1 once this run has rewritten the compose file, so rollback never
# restores a stale .bak left behind by an earlier run.
COMPOSE_UPDATED=0

echo " 开始部署 $SERVICE_NAME 服务到版本 $NEW_VERSION"

# Restores the previous compose file and brings the service back up with it.
rollback() {
    trap - ERR  # a failure inside the rollback must not re-enter it
    echo "❌ 部署失败,开始回滚..."
    # Without this restore the compose file still names the new image and
    # the "rollback" would simply redeploy the version that just failed.
    if [ "$COMPOSE_UPDATED" -eq 1 ] && [ -f "$COMPOSE_FILE.bak" ]; then
        mv -f "$COMPOSE_FILE.bak" "$COMPOSE_FILE"
    fi
    docker-compose -f "$COMPOSE_FILE" up -d --no-deps "$SERVICE_NAME"
    echo "✅ 回滚完成"
    exit 1
}

# Run rollback whenever a command fails
trap rollback ERR

# 1. Tag the image currently in use so it can be restored manually
echo " 备份当前版本..."
docker tag $(docker-compose images -q "$SERVICE_NAME" | head -1) "my-api:$BACKUP_TAG"

# 2. Build the new version
echo " 构建新版本镜像..."
docker build -t "my-api:$NEW_VERSION" .

# 3. Point the compose file at the new image (original kept as .bak)
echo " 更新配置文件..."
sed -i.bak "s|image: my-api:.*|image: my-api:$NEW_VERSION|g" "$COMPOSE_FILE"
COMPOSE_UPDATED=1

# 4. Scale up with the new image. --no-recreate keeps the existing
#    containers serving traffic while the additional ones start.
echo " 执行滚动更新..."
docker-compose up -d --no-deps --scale "$SERVICE_NAME=6" --no-recreate "$SERVICE_NAME"

# 5. Wait for the health checks
echo " 等待健康检查..."
./wait-for-health.sh "$SERVICE_NAME" 30 10

# 6. Scale back down to the target instance count
echo " 调整实例数量..."
docker-compose up -d --no-deps --scale "$SERVICE_NAME=4" "$SERVICE_NAME"

# 7. Final verification through the public endpoint
echo "✅ 验证部署结果..."
sleep 10
curl -f http://localhost/api/health || rollback

# 8. Clean up
echo " 清理旧版本容器..."
docker system prune -f

echo " 部署完成! 版本: $NEW_VERSION"
echo " 备份版本: $BACKUP_TAG (可用于紧急回滚)"
monitor.sh
#!/bin/bash
# Periodically prints the state and health of every container of a Compose
# service, then probes the public API health endpoint. Runs until interrupted.
#
# Usage: monitor.sh [service] [interval_seconds]
#   service           Compose service name (default: api)
#   interval_seconds  pause between rounds (default: 30)
SERVICE_NAME=${1:-api}
INTERVAL=${2:-30}

echo " 开始监控 $SERVICE_NAME 服务 (间隔: ${INTERVAL}s)"
echo "按 Ctrl+C 停止监控"
echo "----------------------------------------"

while true; do
    timestamp=$(date '+%Y-%m-%d %H:%M:%S')

    # Collect the container IDs of the service
    containers=$(docker-compose ps -q "$SERVICE_NAME")
    # Count non-empty lines only: `echo "" | wc -l` would report 1 container
    # when the service has none.
    total_count=$(printf '%s\n' "$containers" | grep -c .)
    healthy_count=0

    echo "[$timestamp] 检查 $total_count 个 $SERVICE_NAME 容器:"

    for container in $containers; do
        name=$(docker inspect --format='{{.Name}}' "$container" | sed 's|/||')
        status=$(docker inspect --format='{{.State.Status}}' "$container")
        # Containers without a health check report "no-health-check"
        health=$(docker inspect --format='{{if .State.Health}}{{.State.Health.Status}}{{else}}no-health-check{{end}}' "$container")

        # A container with no health check is counted as healthy
        if [ "$health" = "healthy" ] || [ "$health" = "no-health-check" ]; then
            healthy_count=$((healthy_count + 1))
            echo " ✅ $name: $status ($health)"
        else
            echo " ❌ $name: $status ($health)"
        fi
    done

    echo " 健康比例: $healthy_count/$total_count"

    # Probe the API endpoint
    if curl -s -f http://localhost/api/health > /dev/null; then
        echo " API端点: ✅ 正常"
    else
        echo " API端点: ❌ 异常"
    fi

    echo "----------------------------------------"
    sleep "$INTERVAL"
done
对比维度 | Docker + Nginx | ARR负载均衡 | 蓝绿部署 |
---|---|---|---|
环境一致性 | ⭐⭐⭐⭐⭐ 完全一致 | ⭐⭐⭐ 可能有差异 | ⭐⭐⭐⭐ 基本一致 |
部署速度 | ⭐⭐⭐⭐ 较快 | ⭐⭐⭐ 中等 | ⭐⭐⭐⭐ 较快 |
回滚速度 | ⭐⭐⭐⭐ 快速 | ⭐⭐ 较慢 | ⭐⭐⭐⭐⭐ 极快 |
资源利用 | ⭐⭐⭐⭐ 高效 | ⭐⭐⭐⭐ 高效 | ⭐⭐ 资源浪费 |
扩展能力 | ⭐⭐⭐⭐⭐ 极强 | ⭐⭐⭐⭐ 较强 | ⭐⭐⭐ 中等 |
技术门槛(⭐越多门槛越高) | ⭐⭐⭐⭐ 需要学习Docker | ⭐⭐ 相对简单 | ⭐⭐⭐ 中等 |
跨平台性 | ⭐⭐⭐⭐⭐ 完全跨平台 | ⭐ Windows专用 | ⭐⭐⭐ 平台相关 |
开发运维一体 | ⭐⭐⭐⭐⭐ DevOps友好 | ⭐⭐⭐ 传统运维 | ⭐⭐⭐⭐ 较好 |
环境一致性:
- 开发、测试、生产环境完全一致
- 消除"在我机器上正常"问题
- 版本控制包含完整环境
资源效率:
- 容器启动速度快(秒级)
- 资源占用少(共享OS内核)
- 弹性扩展能力强
部署灵活性:
- 支持多种部署策略
- 易于实现自动化
- 支持多云部署
生态系统:
- 丰富的镜像库
- 强大的编排工具
- 完善的监控方案
⚠️ 学习成本:
- 需要掌握Docker概念
- 容器编排相对复杂
- 网络和存储配置
⚠️ 调试复杂:
- 容器内调试相对困难
- 日志管理需要统一
- 网络排查复杂
⚠️ 存储持久化:
- 数据持久化需要规划
- 状态管理相对复杂
- 备份策略需要考虑
✅ 微服务架构:
- 多个独立服务需要部署
- 服务间需要隔离
- 需要独立扩展不同服务
✅ 云原生应用:
- 需要在多个云平台部署
- 追求DevOps最佳实践
- 需要容器编排能力
✅ 开发团队现代化:
- 团队愿意学习新技术
- 开发和运维协作紧密
- 追求自动化程度
✅ 频繁部署需求:
- 敏捷开发,需要频繁发布
- 需要快速回滚能力
- 多环境部署需求
❌ 传统企业环境:
- 团队技术栈偏传统
- 安全审计要求严格
- 变更管理流程复杂
❌ 单体应用:
- 应用架构简单
- 部署频率不高
- 现有方案已经稳定
❌ 资源受限环境:
- 服务器配置较低
- 网络带宽受限
- 运维人员不足
任务清单:
□ 学习Docker基础概念
□ 编写Dockerfile
□ 本地容器化测试
□ 解决依赖和配置问题
□ 制定镜像管理策略
目标:
- 应用成功容器化
- 本地Docker环境运行正常
- 团队掌握基础操作
任务清单:
□ 选择编排方案(Compose/Swarm/K8s)
□ 配置负载均衡器
□ 实现健康检查
□ 配置日志和监控
□ 编写部署脚本
目标:
- 完整的编排配置
- 自动化部署流程
- 监控和告警机制
任务清单:
□ 生产环境准备
□ 安全配置加固
□ 性能调优
□ 灾备方案制定
□ 团队培训
目标:
- 生产环境稳定运行
- 零停机部署验证
- 应急预案完备
任务清单:
□ 监控数据分析
□ 性能优化调整
□ 自动化程度提升
□ 安全策略完善
□ 最佳实践总结
目标:
- 系统稳定性持续提升
- 部署效率不断优化
- 团队能力持续增强
技术因素:
业务因素:
现代化团队/云原生应用:
选择 Docker + Kubernetes/Swarm
传统企业/Windows环境:
选择 ARR负载均衡 或 蓝绿部署
小团队/简单应用:
从 ARR负载均衡 开始,逐步向Docker演进
Docker为零停机部署提供了最现代化和灵活的解决方案,是未来的发展趋势!